[lxc-devel] [lxd/master] seccomp: implement redirection to fuse

brauner on Github lxc-bot at linuxcontainers.org
Tue Nov 12 22:25:11 UTC 2019


A non-text attachment was scrubbed...
Name: not available
Type: text/x-mailbox
Size: 710 bytes
Desc: not available
URL: <http://lists.linuxcontainers.org/pipermail/lxc-devel/attachments/20191112/8a1faf10/attachment-0001.bin>
-------------- next part --------------
From 00811cbb3b68840f9796e2fa939f411601ebb970 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner at ubuntu.com>
Date: Tue, 12 Nov 2019 23:14:13 +0100
Subject: [PATCH 1/4] seccomp: implement redirection to fuse

This allows intercepting mount syscalls for selected filesystems and
redirecting them to their corresponding fuse implementation.

A new key
security.syscalls.intercept.mount.fuse=<fstype>=<fuse-binary>
is added.

A filesystem cannot appear in both security.syscalls.intercept.mount.fuse and
security.syscalls.intercept.mount.allowed.

Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
---
 lxd/container.go        |   6 ++
 lxd/main_forksyscall.go |  31 +++++--
 lxd/seccomp/seccomp.go  | 190 +++++++++++++++++++++++++++++++++++-----
 shared/container.go     |   1 +
 shared/util.go          |   2 +
 5 files changed, 199 insertions(+), 31 deletions(-)

diff --git a/lxd/container.go b/lxd/container.go
index 8d5d3c457b..f61f286d01 100644
--- a/lxd/container.go
+++ b/lxd/container.go
@@ -24,6 +24,7 @@ import (
 	deviceConfig "github.com/lxc/lxd/lxd/device/config"
 	"github.com/lxc/lxd/lxd/instance/instancetype"
 	"github.com/lxc/lxd/lxd/operations"
+	"github.com/lxc/lxd/lxd/seccomp"
 	"github.com/lxc/lxd/lxd/state"
 	storagePools "github.com/lxc/lxd/lxd/storage"
 	storageDrivers "github.com/lxc/lxd/lxd/storage/drivers"
@@ -168,6 +169,11 @@ func containerValidConfig(sysOS *sys.OS, config map[string]string, profile bool,
 		return fmt.Errorf("security.syscalls.whitelist is mutually exclusive with security.syscalls.blacklist*")
 	}
 
+	err, _ := seccomp.SeccompSyscallInterceptMountFilter(config)
+	if err != nil {
+		return err
+	}
+
 	if expanded && (config["security.privileged"] == "" || !shared.IsTrue(config["security.privileged"])) && sysOS.IdmapSet == nil {
 		return fmt.Errorf("LXD doesn't have a uid/gid allocation. In this mode, only privileged containers are supported")
 	}
diff --git a/lxd/main_forksyscall.go b/lxd/main_forksyscall.go
index 04738aa6d9..6b2e1ac5d1 100644
--- a/lxd/main_forksyscall.go
+++ b/lxd/main_forksyscall.go
@@ -381,6 +381,7 @@ static void mount_emulate(void)
 {
 	__do_close_prot_errno int mnt_fd = -EBADF;
 	char *source = NULL, *shiftfs = NULL, *target = NULL, *fstype = NULL;
+	bool use_fuse;
 	uid_t uid = -1, fsuid = -1;
 	gid_t gid = -1, fsgid = -1;
 	int ret;
@@ -389,28 +390,40 @@ static void mount_emulate(void)
 	const void *data;
 
 	pid = atoi(advance_arg(true));
-	source = advance_arg(true);
-	target = advance_arg(true);
-	fstype = advance_arg(true);
-	flags = atoi(advance_arg(true));
-	shiftfs = advance_arg(true);
+	use_fuse = (atoi(advance_arg(true)) == 1);
+	if (!use_fuse) {
+		source = advance_arg(true);
+		target = advance_arg(true);
+		fstype = advance_arg(true);
+		flags = atoi(advance_arg(true));
+		shiftfs = advance_arg(true);
+	}
 	uid = atoi(advance_arg(true));
 	gid = atoi(advance_arg(true));
 	fsuid = atoi(advance_arg(true));
 	fsgid = atoi(advance_arg(true));
-	data = advance_arg(false);
+	if (!use_fuse)
+		data = advance_arg(false);
 
 	mnt_fd = preserve_ns(getpid(), "mnt");
 	if (mnt_fd < 0)
 		_exit(EXIT_FAILURE);
 
+	if (use_fuse)
+		attach_userns(pid);
+
 	if (!acquire_basic_creds(pid))
 		_exit(EXIT_FAILURE);
 
 	if (!acquire_final_creds(pid, uid, gid, fsuid, fsgid))
 		_exit(EXIT_FAILURE);
 
-	if (strcmp(shiftfs, "true") == 0) {
+	if (use_fuse) {
+		const char *cmd = advance_arg(true);
+		ret = system(cmd);
+		if (ret)
+			_exit(EXIT_FAILURE);
+	} else if (strcmp(shiftfs, "true") == 0) {
 		char template[] = P_tmpdir "/.lxd_tmp_mount_XXXXXX";
 
 		// Create basic mount in container's mount namespace.
@@ -525,12 +538,12 @@ type cmdForksyscall struct {
 func (c *cmdForksyscall) Command() *cobra.Command {
 	// Main subcommand
 	cmd := &cobra.Command{}
-	cmd.Use = "forksyscall <syscall> <PID> <path> <mode> <dev>"
+	cmd.Use = "forksyscall <syscall> <PID> [...]"
 	cmd.Short = "Perform syscall operations"
 	cmd.Long = `Description:
   Perform syscall operations
 
-  This set of internal commands are used for all seccom-based container syscall
+  This set of internal commands is used for all seccomp-based container syscall
   operations.
 `
 	cmd.RunE = c.Run
diff --git a/lxd/seccomp/seccomp.go b/lxd/seccomp/seccomp.go
index 65ff54b70f..eef84aeb68 100644
--- a/lxd/seccomp/seccomp.go
+++ b/lxd/seccomp/seccomp.go
@@ -45,6 +45,7 @@ import (
 #include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
+#include <sys/mount.h>
 #include <sys/socket.h>
 #include <sys/stat.h>
 #include <sys/syscall.h>
@@ -1176,6 +1177,77 @@ type MountArgs struct {
 	shift  bool
 }
 
+// mountFlagsToOptMap maps mount flag values (some combined with MS_REC) to the text mountFlagsToOpts emits for them.
+var mountFlagsToOptMap = map[C.ulong]string{
+	C.MS_BIND:        "bind",
+	C.ulong(0):       "defaults",
+	C.MS_LAZYTIME:    "lazytime",
+	C.MS_MANDLOCK:    "mand",
+	C.MS_NOATIME:     "noatime",
+	C.MS_NODEV:       "nodev",
+	C.MS_NODIRATIME:  "nodiratime",
+	C.MS_NOEXEC:      "noexec",
+	C.MS_NOSUID:      "nosuid",
+	C.MS_RELATIME:    "relatime",
+	C.MS_REMOUNT:     "remount",
+	C.MS_RDONLY:      "ro",
+	C.MS_STRICTATIME: "strictatime",
+	C.MS_SYNCHRONOUS: "sync",
+	C.MS_PRIVATE:     "--make-private",
+	C.MS_SHARED:      "--make-shared",
+	C.MS_SLAVE:       "--make-slave",
+	C.MS_UNBINDABLE:  "--make-unbindable",
+
+	C.MS_REC | C.MS_BIND:       "rbind",
+	C.MS_REC | C.MS_PRIVATE:    "--make-rprivate",
+	C.MS_REC | C.MS_SHARED:     "--make-rshared",
+	C.MS_REC | C.MS_SLAVE:      "--make-rslave",
+	C.MS_REC | C.MS_UNBINDABLE: "--make-runbindable",
+}
+
+// mountFlagsToOpts translates mount flag bits into text for a mount command.
+// It returns the comma-separated option list (for "-o") and the string of
+// "--"-prefixed arguments, each preceded by a space. Bits that have no entry
+// in mountFlagsToOptMap are ignored.
+func mountFlagsToOpts(flags C.ulong) (string, string) {
+	opts := ""
+	args := ""
+	var msRec C.ulong = (flags & C.MS_REC)
+
+	// MS_REC is not looked up on its own; it only selects the recursive entries.
+	flags = (flags &^ C.MS_REC)
+
+	// The increment lives in the loop header so that "continue" always moves
+	// on to the next bit instead of spinning forever on an unmapped flag.
+	for bit := C.ulong(0); bit < (4*8 - 1); bit++ {
+		var flagKey C.ulong = (1 << bit)
+		if (flags & flagKey) == 0 {
+			continue
+		}
+
+		// Bind and propagation flags have recursive variants keyed with MS_REC.
+		switch flagKey {
+		case C.MS_BIND, C.MS_PRIVATE, C.MS_SHARED, C.MS_SLAVE, C.MS_UNBINDABLE:
+			flagKey |= msRec
+		}
+
+		optOrArg := mountFlagsToOptMap[flagKey]
+		if optOrArg == "" {
+			continue
+		}
+
+		if strings.HasPrefix(optOrArg, "--") {
+			args = fmt.Sprintf("%s %s", args, optOrArg)
+		} else if opts == "" {
+			opts = optOrArg
+		} else {
+			opts = fmt.Sprintf("%s,%s", opts, optOrArg)
+		}
+	}
+
+	return opts, args
+}
+
 // HandleMountSyscall handles mount syscalls.
 func (s *Server) HandleMountSyscall(c Instance, siov *Iovec) int {
 	ctx := log.Ctx{"container": c.Name(),
@@ -1252,7 +1324,8 @@ func (s *Server) HandleMountSyscall(c Instance, siov *Iovec) int {
 		args.data = C.GoString(&cBuf[0])
 	}
 
-	if !s.MountSyscallValid(c, &args) {
+	ok, fuseBinary := s.MountSyscallValid(c, &args)
+	if !ok {
 		ctx["syscall_continue"] = "true"
 		C.seccomp_notify_update_response(siov.resp, 0, C.uint32_t(seccompUserNotifFlagContinue))
 		return 0
@@ -1265,20 +1338,55 @@ func (s *Server) HandleMountSyscall(c Instance, siov *Iovec) int {
 		return 0
 	}
 
-	_, _, err = shared.RunCommandSplit(nil, util.GetExecPath(),
-		"forksyscall",
-		"mount",
-		fmt.Sprintf("%d", args.pid),
-		fmt.Sprintf("%s", args.source),
-		fmt.Sprintf("%s", args.target),
-		fmt.Sprintf("%s", args.fstype),
-		fmt.Sprintf("%d", args.flags),
-		fmt.Sprintf("%t", args.shift),
-		fmt.Sprintf("%d", nsuid),
-		fmt.Sprintf("%d", nsgid),
-		fmt.Sprintf("%d", nsfsuid),
-		fmt.Sprintf("%d", nsfsgid),
-		fmt.Sprintf("%s", args.data))
+	if fuseBinary != "" {
+		addOpts, addArgs := mountFlagsToOpts(C.ulong(args.flags))
+
+		fuseCmd := fmt.Sprintf("mount.fuse %s#%s %s", fuseBinary, args.source, args.target)
+
+		if addArgs != "" {
+			fuseCmd = fmt.Sprintf("%s %s", fuseCmd, addArgs)
+		}
+
+		if args.data != "" || addOpts != "" {
+			fuseCmd = fmt.Sprintf("%s -o", fuseCmd)
+			if args.data != "" && addOpts != "" {
+				fuseCmd = fmt.Sprintf("%s %s,%s", fuseCmd, args.data, addOpts)
+			} else if args.data != "" {
+				fuseCmd = fmt.Sprintf("%s %s", fuseCmd, args.data)
+			} else {
+				fuseCmd = fmt.Sprintf("%s %s", fuseCmd, addOpts)
+			}
+		}
+
+		logger.Errorf("AAAA: %s", fuseCmd)
+		ctx["fuse_cmd"] = fuseCmd
+		_, _, err = shared.RunCommandSplit(nil, util.GetExecPath(),
+			"forksyscall",
+			"mount",
+			fmt.Sprintf("%d", args.pid),
+			fmt.Sprintf("%d", 1),
+			fmt.Sprintf("%d", nsuid),
+			fmt.Sprintf("%d", nsgid),
+			fmt.Sprintf("%d", nsfsuid),
+			fmt.Sprintf("%d", nsfsgid),
+			fmt.Sprintf("%s", fuseCmd))
+	} else {
+		_, _, err = shared.RunCommandSplit(nil, util.GetExecPath(),
+			"forksyscall",
+			"mount",
+			fmt.Sprintf("%d", args.pid),
+			fmt.Sprintf("%d", 0),
+			fmt.Sprintf("%s", args.source),
+			fmt.Sprintf("%s", args.target),
+			fmt.Sprintf("%s", args.fstype),
+			fmt.Sprintf("%d", args.flags),
+			fmt.Sprintf("%t", args.shift),
+			fmt.Sprintf("%d", nsuid),
+			fmt.Sprintf("%d", nsgid),
+			fmt.Sprintf("%d", nsfsuid),
+			fmt.Sprintf("%d", nsfsgid),
+			fmt.Sprintf("%s", args.data))
+	}
 	if err != nil {
 		ctx["syscall_continue"] = "true"
 		C.seccomp_notify_update_response(siov.resp, 0, C.uint32_t(seccompUserNotifFlagContinue))
@@ -1390,16 +1498,54 @@ func MountSyscallFilter(config map[string]string) []string {
 	return fs
 }
 
-// MountSyscallValid checks whether this is a mount syscall we intercept.
-func (s *Server) MountSyscallValid(c Instance, args *MountArgs) bool {
-	fsList := MountSyscallFilter(c.ExpandedConfig())
-	for _, fs := range fsList {
-		if fs == args.fstype {
-			return true
+// SeccompSyscallInterceptMountFilter parses the mount interception keys into a map of filesystem type to fuse binary ("" when none is set).
+func SeccompSyscallInterceptMountFilter(config map[string]string) (error, map[string]string) {
+	if !shared.IsTrue(config["security.syscalls.intercept.mount"]) {
+		return nil, map[string]string{}
+
+	}
+
+	fsMap := map[string]string{}
+	fsFused := strings.Split(config["security.syscalls.intercept.mount.fuse"], ",")
+	if len(fsFused) > 0 && fsFused[0] != "" {
+		for _, ent := range fsFused {
+			fsfuse := strings.Split(ent, "=")
+			if len(fsfuse) != 2 {
+				return fmt.Errorf("security.syscalls.intercept.mount.fuse is not of the form 'filesystem=fuse-binary': %s", ent), map[string]string{}
+			}
+
+			// fsfuse[0] == filesystem type that is ok to mount
+			// fsfuse[1] == fuse binary used to mount that filesystem type
+			fsMap[fsfuse[0]] = fsfuse[1]
 		}
 	}
 
-	return false
+	fsAllowed := strings.Split(config["security.syscalls.intercept.mount.allowed"], ",")
+	if len(fsAllowed) > 0 && fsAllowed[0] != "" {
+		for _, allowedfs := range fsAllowed {
+			if fsMap[allowedfs] != "" {
+				return fmt.Errorf("Filesystem %s cannot appear in security.syscalls.intercept.mount.allowed and security.syscalls.intercept.mount.fuse", allowedfs), map[string]string{}
+			}
+
+			fsMap[allowedfs] = ""
+		}
+	}
+
+	return nil, fsMap
+}
+
+// MountSyscallValid reports whether the mount's filesystem type is one we
+// intercept, together with the fuse binary configured for it ("" if none).
+func (s *Server) MountSyscallValid(c Instance, args *MountArgs) (bool, string) {
+	err, fsMap := SeccompSyscallInterceptMountFilter(c.ExpandedConfig())
+	if err != nil {
+		// A configuration that fails to parse intercepts nothing.
+		return false, ""
+	}
+
+	// A missing key yields ("", false), which is exactly the negative result.
+	fuseBinary, intercepted := fsMap[args.fstype]
+	return intercepted, fuseBinary
+}
 
 // MountSyscallShift checks whether this mount syscall needs shiftfs.
diff --git a/shared/container.go b/shared/container.go
index cb04e09141..719aac857f 100644
--- a/shared/container.go
+++ b/shared/container.go
@@ -300,6 +300,7 @@ var KnownContainerConfigKeys = map[string]func(value string) error{
 	"security.syscalls.intercept.mknod":         IsBool,
 	"security.syscalls.intercept.mount":         IsBool,
 	"security.syscalls.intercept.mount.allowed": IsAny,
+	"security.syscalls.intercept.mount.fuse":    IsAny,
 	"security.syscalls.intercept.mount.shift":   IsBool,
 	"security.syscalls.intercept.setxattr":      IsBool,
 	"security.syscalls.whitelist":               IsAny,
diff --git a/shared/util.go b/shared/util.go
index e307f985fe..8bb03ab470 100644
--- a/shared/util.go
+++ b/shared/util.go
@@ -29,6 +29,7 @@ import (
 
 	"github.com/lxc/lxd/shared/cancel"
 	"github.com/lxc/lxd/shared/ioprogress"
+	"github.com/lxc/lxd/shared/logger"
 	"github.com/lxc/lxd/shared/units"
 )
 
@@ -991,6 +992,7 @@ func DownloadFileHash(httpClient *http.Client, useragent string, progress func(p
 		}
 
 		result := fmt.Sprintf("%x", hashFunc.Sum(nil))
+		logger.Errorf("Hashing for download from url %s. Got hash: %s. Expected hash: %s", url, result, hash)
 		if result != hash {
 			return -1, fmt.Errorf("Hash mismatch for %s: %s != %s", url, result, hash)
 		}

From dab3ee3cfe2401081d9954ce3ada2d40632f6dd2 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner at ubuntu.com>
Date: Tue, 12 Nov 2019 23:18:37 +0100
Subject: [PATCH 2/4] api: add container_syscall_intercept_mount_fuse extension

Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
---
 doc/api-extensions.md | 7 ++++++-
 shared/version/api.go | 1 +
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/doc/api-extensions.md b/doc/api-extensions.md
index ca09e50e41..b78ffb32c8 100644
--- a/doc/api-extensions.md
+++ b/doc/api-extensions.md
@@ -871,4 +871,9 @@ elevated permissions.
 Adds support for importing/exporting of images/backups using SquashFS file system format.
 
 ## container\_raw\_mount
-This adds support for passing in raw mount options for disk devices. 
\ No newline at end of file
+This adds support for passing in raw mount options for disk devices.
+
+## container\_syscall\_intercept\_mount\_fuse
+Adds the `security.syscalls.intercept.mount.fuse` key. It can be used to
+redirect filesystem mounts to their fuse implementation. To this end, set e.g.
+`security.syscalls.intercept.mount.fuse=ext4=fuse2fs`.
diff --git a/shared/version/api.go b/shared/version/api.go
index f6b0e345a1..fb1e6edd43 100644
--- a/shared/version/api.go
+++ b/shared/version/api.go
@@ -175,6 +175,7 @@ var APIExtensions = []string{
 	"container_syscall_intercept_mount",
 	"compression_squashfs",
 	"container_raw_mount",
+	"container_syscall_intercept_mount_fuse",
 }
 
 // APIExtensionsCount returns the number of available API extensions.

From 9f64917fb6e48c39b3d8ed5516999f2a5a59957d Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner at ubuntu.com>
Date: Tue, 12 Nov 2019 23:22:17 +0100
Subject: [PATCH 3/4] doc: add security.syscalls.intercept.mount.fuse

Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
---
 doc/containers.md | 109 +++++++++++++++++++++++-----------------------
 1 file changed, 55 insertions(+), 54 deletions(-)

diff --git a/doc/containers.md b/doc/containers.md
index 5441367e3b..e5c46602c0 100644
--- a/doc/containers.md
+++ b/doc/containers.md
@@ -34,60 +34,61 @@ currently supported:
 
 The currently supported keys are:
 
-Key                                             | Type      | Default           | Live update   | API extension                        | Description
-:--                                             | :---      | :------           | :----------   | :------------                        | :----------
-boot.autostart                                  | boolean   | -                 | n/a           | -                                    | Always start the container when LXD starts (if not set, restore last state)
-boot.autostart.delay                            | integer   | 0                 | n/a           | -                                    | Number of seconds to wait after the container started before starting the next one
-boot.autostart.priority                         | integer   | 0                 | n/a           | -                                    | What order to start the containers in (starting with highest)
-boot.host\_shutdown\_timeout                    | integer   | 30                | yes           | container\_host\_shutdown\_timeout   | Seconds to wait for container to shutdown before it is force stopped
-boot.stop.priority                              | integer   | 0                 | n/a           | container\_stop\_priority            | What order to shutdown the containers (starting with highest)
-environment.\*                                  | string    | -                 | yes (exec)    | -                                    | key/value environment variables to export to the container and set on exec
-limits.cpu                                      | string    | - (all)           | yes           | -                                    | Number or range of CPUs to expose to the container
-limits.cpu.allowance                            | string    | 100%              | yes           | -                                    | How much of the CPU can be used. Can be a percentage (e.g. 50%) for a soft limit or hard a chunk of time (25ms/100ms)
-limits.cpu.priority                             | integer   | 10 (maximum)      | yes           | -                                    | CPU scheduling priority compared to other containers sharing the same CPUs (overcommit) (integer between 0 and 10)
-limits.disk.priority                            | integer   | 5 (medium)        | yes           | -                                    | When under load, how much priority to give to the container's I/O requests (integer between 0 and 10)
-limits.kernel.\*                                | string    | -                 | no            | kernel\_limits                       | This limits kernel resources per container (e.g. number of open files)
-limits.memory                                   | string    | - (all)           | yes           | -                                    | Percentage of the host's memory or fixed value in bytes (various suffixes supported, see below)
-limits.memory.enforce                           | string    | hard              | yes           | -                                    | If hard, container can't exceed its memory limit. If soft, the container can exceed its memory limit when extra host memory is available.
-limits.memory.swap                              | boolean   | true              | yes           | -                                    | Whether to allow some of the container's memory to be swapped out to disk
-limits.memory.swap.priority                     | integer   | 10 (maximum)      | yes           | -                                    | The higher this is set, the least likely the container is to be swapped to disk (integer between 0 and 10)
-limits.network.priority                         | integer   | 0 (minimum)       | yes           | -                                    | When under load, how much priority to give to the container's network requests (integer between 0 and 10)
-limits.processes                                | integer   | - (max)           | yes           | -                                    | Maximum number of processes that can run in the container
-linux.kernel\_modules                           | string    | -                 | yes           | -                                    | Comma separated list of kernel modules to load before starting the container
-migration.incremental.memory                    | boolean   | false             | yes           | migration\_pre\_copy                 | Incremental memory transfer of the container's memory to reduce downtime.
-migration.incremental.memory.goal               | integer   | 70                | yes           | migration\_pre\_copy                 | Percentage of memory to have in sync before stopping the container.
-migration.incremental.memory.iterations         | integer   | 10                | yes           | migration\_pre\_copy                 | Maximum number of transfer operations to go through before stopping the container.
-nvidia.driver.capabilities                      | string    | compute,utility   | no            | nvidia\_runtime\_config              | What driver capabilities the container needs (sets libnvidia-container NVIDIA\_DRIVER\_CAPABILITIES)
-nvidia.runtime                                  | boolean   | false             | no            | nvidia\_runtime                      | Pass the host NVIDIA and CUDA runtime libraries into the container
-nvidia.require.cuda                             | string    | -                 | no            | nvidia\_runtime\_config              | Version expression for the required CUDA version (sets libnvidia-container NVIDIA\_REQUIRE\_CUDA)
-nvidia.require.driver                           | string    | -                 | no            | nvidia\_runtime\_config              | Version expression for the required driver version (sets libnvidia-container NVIDIA\_REQUIRE\_DRIVER)
-raw.apparmor                                    | blob      | -                 | yes           | -                                    | Apparmor profile entries to be appended to the generated profile
-raw.idmap                                       | blob      | -                 | no            | id\_map                              | Raw idmap configuration (e.g. "both 1000 1000")
-raw.lxc                                         | blob      | -                 | no            | -                                    | Raw LXC configuration to be appended to the generated one
-raw.seccomp                                     | blob      | -                 | no            | container\_syscall\_filtering        | Raw Seccomp configuration
-security.devlxd                                 | boolean   | true              | no            | restrict\_devlxd                     | Controls the presence of /dev/lxd in the container
-security.devlxd.images                          | boolean   | false             | no            | devlxd\_images                       | Controls the availability of the /1.0/images API over devlxd
-security.idmap.base                             | integer   | -                 | no            | id\_map\_base                        | The base host ID to use for the allocation (overrides auto-detection)
-security.idmap.isolated                         | boolean   | false             | no            | id\_map                              | Use an idmap for this container that is unique among containers with isolated set.
-security.idmap.size                             | integer   | -                 | no            | id\_map                              | The size of the idmap to use
-security.nesting                                | boolean   | false             | yes           | -                                    | Support running lxd (nested) inside the container
-security.privileged                             | boolean   | false             | no            | -                                    | Runs the container in privileged mode
-security.protection.delete                      | boolean   | false             | yes           | container\_protection\_delete        | Prevents the container from being deleted
-security.protection.shift                       | boolean   | false             | yes           | container\_protection\_shift         | Prevents the container's filesystem from being uid/gid shifted on startup
-security.syscalls.blacklist                     | string    | -                 | no            | container\_syscall\_filtering        | A '\n' separated list of syscalls to blacklist
-security.syscalls.blacklist\_compat             | boolean   | false             | no            | container\_syscall\_filtering        | On x86\_64 this enables blocking of compat\_\* syscalls, it is a no-op on other arches
-security.syscalls.blacklist\_default            | boolean   | true              | no            | container\_syscall\_filtering        | Enables the default syscall blacklist
-security.syscalls.intercept.mknod               | boolean   | false             | no            | container\_syscall\_intercept        | Handles the `mknod` and `mknodat` system calls (allows creation of a limited subset of char/block devices)
-security.syscalls.intercept.mount               | boolean   | false             | no            | container\_syscall\_intercept\_mount | Handles the `mount` system call
-security.syscalls.intercept.mount.allowed       | string    | -                 | yes           | container\_syscall\_intercept\_mount | Specify a comma-separated list of filesystems that are safe to mount for processes inside the container.
-security.syscalls.intercept.mount.shift         | boolean   | false             | yes           | container\_syscall\_intercept\_mount | Whether to mount shiftfs on top of filesystems handled through mount syscall interception.
-security.syscalls.intercept.setxattr            | boolean   | false             | no            | container\_syscall\_intercept        | Handles the `setxattr` system call (allows setting a limited subset of restricted extended attributes)
-security.syscalls.whitelist                     | string    | -                 | no            | container\_syscall\_filtering        | A '\n' separated list of syscalls to whitelist (mutually exclusive with security.syscalls.blacklist\*)
-snapshots.schedule                              | string    | -                 | no            | snapshot\_scheduling                 | Cron expression (`<minute> <hour> <dom> <month> <dow>`)
-snapshots.schedule.stopped                      | bool      | false             | no            | snapshot\_scheduling                 | Controls whether or not stopped containers are to be snapshoted automatically
-snapshots.pattern                               | string    | snap%d            | no            | snapshot\_scheduling                 | Pongo2 template string which represents the snapshot name (used for scheduled snapshots and unnamed snapshots)
-snapshots.expiry                                | string    | -                 | no            | snapshot\_expiry                     | Controls when snapshots are to be deleted (expects expression like `1M 2H 3d 4w 5m 6y`)
-user.\*                                         | string    | -                 | n/a           | -                                    | Free form user key/value storage (can be used in search)
+Key                                             | Type      | Default           | Live update   | API extension                              | Description
+:--                                             | :---      | :------           | :----------   | :------------                              | :----------
+boot.autostart                                  | boolean   | -                 | n/a           | -                                          | Always start the container when LXD starts (if not set, restore last state)
+boot.autostart.delay                            | integer   | 0                 | n/a           | -                                          | Number of seconds to wait after the container started before starting the next one
+boot.autostart.priority                         | integer   | 0                 | n/a           | -                                          | What order to start the containers in (starting with highest)
+boot.host\_shutdown\_timeout                    | integer   | 30                | yes           | container\_host\_shutdown\_timeout         | Seconds to wait for container to shutdown before it is force stopped
+boot.stop.priority                              | integer   | 0                 | n/a           | container\_stop\_priority                  | What order to shutdown the containers (starting with highest)
+environment.\*                                  | string    | -                 | yes (exec)    | -                                          | key/value environment variables to export to the container and set on exec
+limits.cpu                                      | string    | - (all)           | yes           | -                                          | Number or range of CPUs to expose to the container
+limits.cpu.allowance                            | string    | 100%              | yes           | -                                          | How much of the CPU can be used. Can be a percentage (e.g. 50%) for a soft limit or hard a chunk of time (25ms/100ms)
+limits.cpu.priority                             | integer   | 10 (maximum)      | yes           | -                                          | CPU scheduling priority compared to other containers sharing the same CPUs (overcommit) (integer between 0 and 10)
+limits.disk.priority                            | integer   | 5 (medium)        | yes           | -                                          | When under load, how much priority to give to the container's I/O requests (integer between 0 and 10)
+limits.kernel.\*                                | string    | -                 | no            | kernel\_limits                             | This limits kernel resources per container (e.g. number of open files)
+limits.memory                                   | string    | - (all)           | yes           | -                                          | Percentage of the host's memory or fixed value in bytes (various suffixes supported, see below)
+limits.memory.enforce                           | string    | hard              | yes           | -                                          | If hard, container can't exceed its memory limit. If soft, the container can exceed its memory limit when extra host memory is available.
+limits.memory.swap                              | boolean   | true              | yes           | -                                          | Whether to allow some of the container's memory to be swapped out to disk
+limits.memory.swap.priority                     | integer   | 10 (maximum)      | yes           | -                                          | The higher this is set, the least likely the container is to be swapped to disk (integer between 0 and 10)
+limits.network.priority                         | integer   | 0 (minimum)       | yes           | -                                          | When under load, how much priority to give to the container's network requests (integer between 0 and 10)
+limits.processes                                | integer   | - (max)           | yes           | -                                          | Maximum number of processes that can run in the container
+linux.kernel\_modules                           | string    | -                 | yes           | -                                          | Comma separated list of kernel modules to load before starting the container
+migration.incremental.memory                    | boolean   | false             | yes           | migration\_pre\_copy                       | Incremental memory transfer of the container's memory to reduce downtime.
+migration.incremental.memory.goal               | integer   | 70                | yes           | migration\_pre\_copy                       | Percentage of memory to have in sync before stopping the container.
+migration.incremental.memory.iterations         | integer   | 10                | yes           | migration\_pre\_copy                       | Maximum number of transfer operations to go through before stopping the container.
+nvidia.driver.capabilities                      | string    | compute,utility   | no            | nvidia\_runtime\_config                    | What driver capabilities the container needs (sets libnvidia-container NVIDIA\_DRIVER\_CAPABILITIES)
+nvidia.runtime                                  | boolean   | false             | no            | nvidia\_runtime                            | Pass the host NVIDIA and CUDA runtime libraries into the container
+nvidia.require.cuda                             | string    | -                 | no            | nvidia\_runtime\_config                    | Version expression for the required CUDA version (sets libnvidia-container NVIDIA\_REQUIRE\_CUDA)
+nvidia.require.driver                           | string    | -                 | no            | nvidia\_runtime\_config                    | Version expression for the required driver version (sets libnvidia-container NVIDIA\_REQUIRE\_DRIVER)
+raw.apparmor                                    | blob      | -                 | yes           | -                                          | Apparmor profile entries to be appended to the generated profile
+raw.idmap                                       | blob      | -                 | no            | id\_map                                    | Raw idmap configuration (e.g. "both 1000 1000")
+raw.lxc                                         | blob      | -                 | no            | -                                          | Raw LXC configuration to be appended to the generated one
+raw.seccomp                                     | blob      | -                 | no            | container\_syscall\_filtering              | Raw Seccomp configuration
+security.devlxd                                 | boolean   | true              | no            | restrict\_devlxd                           | Controls the presence of /dev/lxd in the container
+security.devlxd.images                          | boolean   | false             | no            | devlxd\_images                             | Controls the availability of the /1.0/images API over devlxd
+security.idmap.base                             | integer   | -                 | no            | id\_map\_base                              | The base host ID to use for the allocation (overrides auto-detection)
+security.idmap.isolated                         | boolean   | false             | no            | id\_map                                    | Use an idmap for this container that is unique among containers with isolated set.
+security.idmap.size                             | integer   | -                 | no            | id\_map                                    | The size of the idmap to use
+security.nesting                                | boolean   | false             | yes           | -                                          | Support running lxd (nested) inside the container
+security.privileged                             | boolean   | false             | no            | -                                          | Runs the container in privileged mode
+security.protection.delete                      | boolean   | false             | yes           | container\_protection\_delete              | Prevents the container from being deleted
+security.protection.shift                       | boolean   | false             | yes           | container\_protection\_shift               | Prevents the container's filesystem from being uid/gid shifted on startup
+security.syscalls.blacklist                     | string    | -                 | no            | container\_syscall\_filtering              | A '\n' separated list of syscalls to blacklist
+security.syscalls.blacklist\_compat             | boolean   | false             | no            | container\_syscall\_filtering              | On x86\_64 this enables blocking of compat\_\* syscalls, it is a no-op on other arches
+security.syscalls.blacklist\_default            | boolean   | true              | no            | container\_syscall\_filtering              | Enables the default syscall blacklist
+security.syscalls.intercept.mknod               | boolean   | false             | no            | container\_syscall\_intercept              | Handles the `mknod` and `mknodat` system calls (allows creation of a limited subset of char/block devices)
+security.syscalls.intercept.mount               | boolean   | false             | no            | container\_syscall\_intercept\_mount       | Handles the `mount` system call
+security.syscalls.intercept.mount.allowed       | string    | -                 | yes           | container\_syscall\_intercept\_mount       | Specify a comma-separated list of filesystems that are safe to mount for processes inside the container.
+security.syscalls.intercept.mount.fuse          | string    | -                 | yes           | container\_syscall\_intercept\_mount\_fuse | Whether to redirect mounts of a given filesystem to their fuse implementation (e.g. ext4=fuse2fs)
+security.syscalls.intercept.mount.shift         | boolean   | false             | yes           | container\_syscall\_intercept\_mount       | Whether to mount shiftfs on top of filesystems handled through mount syscall interception.
+security.syscalls.intercept.setxattr            | boolean   | false             | no            | container\_syscall\_intercept              | Handles the `setxattr` system call (allows setting a limited subset of restricted extended attributes)
+security.syscalls.whitelist                     | string    | -                 | no            | container\_syscall\_filtering              | A '\n' separated list of syscalls to whitelist (mutually exclusive with security.syscalls.blacklist\*)
+snapshots.schedule                              | string    | -                 | no            | snapshot\_scheduling                       | Cron expression (`<minute> <hour> <dom> <month> <dow>`)
+snapshots.schedule.stopped                      | bool      | false             | no            | snapshot\_scheduling                       | Controls whether or not stopped containers are to be snapshotted automatically
+snapshots.pattern                               | string    | snap%d            | no            | snapshot\_scheduling                       | Pongo2 template string which represents the snapshot name (used for scheduled snapshots and unnamed snapshots)
+snapshots.expiry                                | string    | -                 | no            | snapshot\_expiry                           | Controls when snapshots are to be deleted (expects expression like `1M 2H 3d 4w 5m 6y`)
+user.\*                                         | string    | -                 | n/a           | -                                          | Free form user key/value storage (can be used in search)
 
 The following volatile keys are currently internally used by LXD:
 

From 9cb23c07a7f17cefc235afe8436d184b68e1a832 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner at ubuntu.com>
Date: Tue, 12 Nov 2019 23:22:52 +0100
Subject: [PATCH 4/4] scripts: add security.syscalls.intercept.mount.fuse

Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
---
 scripts/bash/lxd-client | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scripts/bash/lxd-client b/scripts/bash/lxd-client
index 1fae67dea7..19d4173bd7 100644
--- a/scripts/bash/lxd-client
+++ b/scripts/bash/lxd-client
@@ -96,6 +96,7 @@ _have lxc && {
       security.syscalls.blacklist_compat security.syscalls.blacklist_default \
       security.syscalls.intercept.mknod security.syscalls.intercept.mount \
       security.syscalls.intercept.mount.allowed \
+      security.syscalls.intercept.mount.fuse \
       security.syscalls.intercept.setxattr \
       security.syscall.intercept.mount.shift \
       snapshots.schedule snapshots.schedule.stopped snapshots.pattern \


More information about the lxc-devel mailing list