[lxc-devel] [lxd/master] seccomp: implement redirection to fuse
brauner on Github
lxc-bot at linuxcontainers.org
Tue Nov 12 22:25:11 UTC 2019
A non-text attachment was scrubbed...
Name: not available
Type: text/x-mailbox
Size: 710 bytes
Desc: not available
URL: <http://lists.linuxcontainers.org/pipermail/lxc-devel/attachments/20191112/8a1faf10/attachment-0001.bin>
-------------- next part --------------
From 00811cbb3b68840f9796e2fa939f411601ebb970 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner at ubuntu.com>
Date: Tue, 12 Nov 2019 23:14:13 +0100
Subject: [PATCH 1/4] seccomp: implement redirection to fuse
This allows intercepting mount syscalls for filesystems and redirecting them
to their corresponding fuse implementation.
A new key
security.syscalls.intercept.mount.fuse=<fstype>=<fuse-binary>
is added.
A filesystem cannot appear in both security.syscalls.intercept.mount.fuse and
security.syscalls.intercept.mount.allowed.
Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
---
lxd/container.go | 6 ++
lxd/main_forksyscall.go | 31 +++++--
lxd/seccomp/seccomp.go | 190 +++++++++++++++++++++++++++++++++++-----
shared/container.go | 1 +
shared/util.go | 2 +
5 files changed, 199 insertions(+), 31 deletions(-)
diff --git a/lxd/container.go b/lxd/container.go
index 8d5d3c457b..f61f286d01 100644
--- a/lxd/container.go
+++ b/lxd/container.go
@@ -24,6 +24,7 @@ import (
deviceConfig "github.com/lxc/lxd/lxd/device/config"
"github.com/lxc/lxd/lxd/instance/instancetype"
"github.com/lxc/lxd/lxd/operations"
+ "github.com/lxc/lxd/lxd/seccomp"
"github.com/lxc/lxd/lxd/state"
storagePools "github.com/lxc/lxd/lxd/storage"
storageDrivers "github.com/lxc/lxd/lxd/storage/drivers"
@@ -168,6 +169,11 @@ func containerValidConfig(sysOS *sys.OS, config map[string]string, profile bool,
return fmt.Errorf("security.syscalls.whitelist is mutually exclusive with security.syscalls.blacklist*")
}
+ err, _ := seccomp.SeccompSyscallInterceptMountFilter(config)
+ if err != nil {
+ return err
+ }
+
if expanded && (config["security.privileged"] == "" || !shared.IsTrue(config["security.privileged"])) && sysOS.IdmapSet == nil {
return fmt.Errorf("LXD doesn't have a uid/gid allocation. In this mode, only privileged containers are supported")
}
diff --git a/lxd/main_forksyscall.go b/lxd/main_forksyscall.go
index 04738aa6d9..6b2e1ac5d1 100644
--- a/lxd/main_forksyscall.go
+++ b/lxd/main_forksyscall.go
@@ -381,6 +381,7 @@ static void mount_emulate(void)
{
__do_close_prot_errno int mnt_fd = -EBADF;
char *source = NULL, *shiftfs = NULL, *target = NULL, *fstype = NULL;
+ bool use_fuse;
uid_t uid = -1, fsuid = -1;
gid_t gid = -1, fsgid = -1;
int ret;
@@ -389,28 +390,40 @@ static void mount_emulate(void)
const void *data;
pid = atoi(advance_arg(true));
- source = advance_arg(true);
- target = advance_arg(true);
- fstype = advance_arg(true);
- flags = atoi(advance_arg(true));
- shiftfs = advance_arg(true);
+ use_fuse = (atoi(advance_arg(true)) == 1);
+ if (!use_fuse) {
+ source = advance_arg(true);
+ target = advance_arg(true);
+ fstype = advance_arg(true);
+ flags = atoi(advance_arg(true));
+ shiftfs = advance_arg(true);
+ }
uid = atoi(advance_arg(true));
gid = atoi(advance_arg(true));
fsuid = atoi(advance_arg(true));
fsgid = atoi(advance_arg(true));
- data = advance_arg(false);
+ if (!use_fuse)
+ data = advance_arg(false);
mnt_fd = preserve_ns(getpid(), "mnt");
if (mnt_fd < 0)
_exit(EXIT_FAILURE);
+ if (use_fuse)
+ attach_userns(pid);
+
if (!acquire_basic_creds(pid))
_exit(EXIT_FAILURE);
if (!acquire_final_creds(pid, uid, gid, fsuid, fsgid))
_exit(EXIT_FAILURE);
- if (strcmp(shiftfs, "true") == 0) {
+ if (use_fuse) {
+ const char *cmd = advance_arg(true);
+ ret = system(cmd);
+ if (ret)
+ _exit(EXIT_FAILURE);
+ } else if (strcmp(shiftfs, "true") == 0) {
char template[] = P_tmpdir "/.lxd_tmp_mount_XXXXXX";
// Create basic mount in container's mount namespace.
@@ -525,12 +538,12 @@ type cmdForksyscall struct {
func (c *cmdForksyscall) Command() *cobra.Command {
// Main subcommand
cmd := &cobra.Command{}
- cmd.Use = "forksyscall <syscall> <PID> <path> <mode> <dev>"
+ cmd.Use = "forksyscall <syscall> <PID> [...]"
cmd.Short = "Perform syscall operations"
cmd.Long = `Description:
Perform syscall operations
- This set of internal commands are used for all seccom-based container syscall
+ This set of internal commands is used for all seccomp-based container syscall
operations.
`
cmd.RunE = c.Run
diff --git a/lxd/seccomp/seccomp.go b/lxd/seccomp/seccomp.go
index 65ff54b70f..eef84aeb68 100644
--- a/lxd/seccomp/seccomp.go
+++ b/lxd/seccomp/seccomp.go
@@ -45,6 +45,7 @@ import (
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
+#include <sys/mount.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/syscall.h>
@@ -1176,6 +1177,77 @@ type MountArgs struct {
shift bool
}
+// mountFlagsToOptMap maps MS_* mount flag values (optionally OR-ed with MS_REC) to the option string ("ro", "bind", ...) or "--make-*" argument that mountFlagsToOpts emits for them.
+var mountFlagsToOptMap = map[C.ulong]string{
+ C.MS_BIND: "bind",
+ C.ulong(0): "defaults",
+ C.MS_LAZYTIME: "lazytime",
+ C.MS_MANDLOCK: "mand",
+ C.MS_NOATIME: "noatime",
+ C.MS_NODEV: "nodev",
+ C.MS_NODIRATIME: "nodiratime",
+ C.MS_NOEXEC: "noexec",
+ C.MS_NOSUID: "nosuid",
+ C.MS_RELATIME: "relatime",
+ C.MS_REMOUNT: "remount",
+ C.MS_RDONLY: "ro",
+ C.MS_STRICTATIME: "strictatime",
+ C.MS_SYNCHRONOUS: "sync",
+ C.MS_PRIVATE: "--make-private",
+ C.MS_SHARED: "--make-shared",
+ C.MS_SLAVE: "--make-slave",
+ C.MS_UNBINDABLE: "--make-unbindable",
+
+ C.MS_REC | C.MS_BIND: "rbind",
+ C.MS_REC | C.MS_PRIVATE: "--make-rprivate",
+ C.MS_REC | C.MS_SHARED: "--make-rshared",
+ C.MS_REC | C.MS_SLAVE: "--make-rslave",
+ C.MS_REC | C.MS_UNBINDABLE: "--make-runbindable",
+}
+
+func mountFlagsToOpts(flags C.ulong) (string, string) {
+ var bit C.ulong = 0
+ opts := ""
+ args := ""
+ var msRec C.ulong = (flags & C.MS_REC)
+
+ flags = (flags &^ C.MS_REC)
+ for bit < (4*8 - 1) {
+ if (flags & (1 << bit)) > 0 {
+ var flagKey C.ulong = (1 << bit)
+
+ switch flagKey {
+ case C.MS_BIND:
+ fallthrough
+ case C.MS_PRIVATE:
+ fallthrough
+ case C.MS_SHARED:
+ fallthrough
+ case C.MS_SLAVE:
+ fallthrough
+ case C.MS_UNBINDABLE:
+ flagKey |= msRec
+ }
+ optOrArg := mountFlagsToOptMap[flagKey]
+
+ if optOrArg == "" {
+ continue
+ }
+
+ if strings.HasPrefix(optOrArg, "--") {
+ args = fmt.Sprintf("%s %s", args, optOrArg)
+ } else if opts == "" {
+ opts = fmt.Sprintf("%s", optOrArg)
+ } else {
+ opts = fmt.Sprintf("%s,%s", opts, optOrArg)
+ }
+ }
+ bit++
+ }
+
+ return opts, args
+}
+
// HandleMountSyscall handles mount syscalls.
func (s *Server) HandleMountSyscall(c Instance, siov *Iovec) int {
ctx := log.Ctx{"container": c.Name(),
@@ -1252,7 +1324,8 @@ func (s *Server) HandleMountSyscall(c Instance, siov *Iovec) int {
args.data = C.GoString(&cBuf[0])
}
- if !s.MountSyscallValid(c, &args) {
+ ok, fuseBinary := s.MountSyscallValid(c, &args)
+ if !ok {
ctx["syscall_continue"] = "true"
C.seccomp_notify_update_response(siov.resp, 0, C.uint32_t(seccompUserNotifFlagContinue))
return 0
@@ -1265,20 +1338,55 @@ func (s *Server) HandleMountSyscall(c Instance, siov *Iovec) int {
return 0
}
- _, _, err = shared.RunCommandSplit(nil, util.GetExecPath(),
- "forksyscall",
- "mount",
- fmt.Sprintf("%d", args.pid),
- fmt.Sprintf("%s", args.source),
- fmt.Sprintf("%s", args.target),
- fmt.Sprintf("%s", args.fstype),
- fmt.Sprintf("%d", args.flags),
- fmt.Sprintf("%t", args.shift),
- fmt.Sprintf("%d", nsuid),
- fmt.Sprintf("%d", nsgid),
- fmt.Sprintf("%d", nsfsuid),
- fmt.Sprintf("%d", nsfsgid),
- fmt.Sprintf("%s", args.data))
+ if fuseBinary != "" {
+ addOpts, addArgs := mountFlagsToOpts(C.ulong(args.flags))
+
+ fuseCmd := fmt.Sprintf("mount.fuse %s#%s %s", fuseBinary, args.source, args.target)
+
+ if addArgs != "" {
+ fuseCmd = fmt.Sprintf("%s %s", fuseCmd, addArgs)
+ }
+
+ if args.data != "" || addOpts != "" {
+ fuseCmd = fmt.Sprintf("%s -o", fuseCmd)
+ if args.data != "" && addOpts != "" {
+ fuseCmd = fmt.Sprintf("%s %s,%s", fuseCmd, args.data, addOpts)
+ } else if args.data != "" {
+ fuseCmd = fmt.Sprintf("%s %s", fuseCmd, args.data)
+ } else {
+ fuseCmd = fmt.Sprintf("%s %s", fuseCmd, addOpts)
+ }
+ }
+
+ logger.Errorf("AAAA: %s", fuseCmd)
+ ctx["fuse_cmd"] = fuseCmd
+ _, _, err = shared.RunCommandSplit(nil, util.GetExecPath(),
+ "forksyscall",
+ "mount",
+ fmt.Sprintf("%d", args.pid),
+ fmt.Sprintf("%d", 1),
+ fmt.Sprintf("%d", nsuid),
+ fmt.Sprintf("%d", nsgid),
+ fmt.Sprintf("%d", nsfsuid),
+ fmt.Sprintf("%d", nsfsgid),
+ fmt.Sprintf("%s", fuseCmd))
+ } else {
+ _, _, err = shared.RunCommandSplit(nil, util.GetExecPath(),
+ "forksyscall",
+ "mount",
+ fmt.Sprintf("%d", args.pid),
+ fmt.Sprintf("%d", 0),
+ fmt.Sprintf("%s", args.source),
+ fmt.Sprintf("%s", args.target),
+ fmt.Sprintf("%s", args.fstype),
+ fmt.Sprintf("%d", args.flags),
+ fmt.Sprintf("%t", args.shift),
+ fmt.Sprintf("%d", nsuid),
+ fmt.Sprintf("%d", nsgid),
+ fmt.Sprintf("%d", nsfsuid),
+ fmt.Sprintf("%d", nsfsgid),
+ fmt.Sprintf("%s", args.data))
+ }
if err != nil {
ctx["syscall_continue"] = "true"
C.seccomp_notify_update_response(siov.resp, 0, C.uint32_t(seccompUserNotifFlagContinue))
@@ -1390,16 +1498,54 @@ func MountSyscallFilter(config map[string]string) []string {
return fs
}
-// MountSyscallValid checks whether this is a mount syscall we intercept.
-func (s *Server) MountSyscallValid(c Instance, args *MountArgs) bool {
- fsList := MountSyscallFilter(c.ExpandedConfig())
- for _, fs := range fsList {
- if fs == args.fstype {
- return true
+// SeccompSyscallInterceptMountFilter builds a map from filesystem type to fuse binary ("" for filesystems from the allowed list) and fails on malformed fuse entries or a filesystem present in both lists. The map is empty when security.syscalls.intercept.mount is not true.
+func SeccompSyscallInterceptMountFilter(config map[string]string) (error, map[string]string) {
+ if !shared.IsTrue(config["security.syscalls.intercept.mount"]) {
+ return nil, map[string]string{}
+
+ }
+
+ fsMap := map[string]string{}
+ fsFused := strings.Split(config["security.syscalls.intercept.mount.fuse"], ",")
+ if len(fsFused) > 0 && fsFused[0] != "" {
+ for _, ent := range fsFused {
+ fsfuse := strings.Split(ent, "=")
+ if len(fsfuse) != 2 {
+ return fmt.Errorf("security.syscalls.intercept.mount.fuse is not of the form 'filesystem=fuse-binary': %s", ent), map[string]string{}
+ }
+
+ // fsfuse[0] == filesystem type that is ok to mount
+ // fsfuse[1] == fuse binary used to mount that filesystem type
+ fsMap[fsfuse[0]] = fsfuse[1]
}
}
- return false
+ fsAllowed := strings.Split(config["security.syscalls.intercept.mount.allowed"], ",")
+ if len(fsAllowed) > 0 && fsAllowed[0] != "" {
+ for _, allowedfs := range fsAllowed {
+ if fsMap[allowedfs] != "" {
+ return fmt.Errorf("Filesystem %s cannot appear in security.syscalls.intercept.mount.allowed and security.syscalls.intercept.mount.fuse", allowedfs), map[string]string{}
+ }
+
+ fsMap[allowedfs] = ""
+ }
+ }
+
+ return nil, fsMap
+}
+
+// MountSyscallValid reports whether the mount's filesystem type is one we intercept and, if so, the fuse binary to redirect it to ("" when none is configured). A config that fails to parse is treated as not intercepted.
+func (s *Server) MountSyscallValid(c Instance, args *MountArgs) (bool, string) {
+ err, fsMap := SeccompSyscallInterceptMountFilter(c.ExpandedConfig())
+ if err != nil {
+ return false, ""
+ }
+
+ if fuse, ok := fsMap[args.fstype]; ok {
+ return true, fuse
+ }
+
+ return false, ""
}
// MountSyscallShift checks whether this mount syscall needs shiftfs.
diff --git a/shared/container.go b/shared/container.go
index cb04e09141..719aac857f 100644
--- a/shared/container.go
+++ b/shared/container.go
@@ -300,6 +300,7 @@ var KnownContainerConfigKeys = map[string]func(value string) error{
"security.syscalls.intercept.mknod": IsBool,
"security.syscalls.intercept.mount": IsBool,
"security.syscalls.intercept.mount.allowed": IsAny,
+ "security.syscalls.intercept.mount.fuse": IsAny,
"security.syscalls.intercept.mount.shift": IsBool,
"security.syscalls.intercept.setxattr": IsBool,
"security.syscalls.whitelist": IsAny,
diff --git a/shared/util.go b/shared/util.go
index e307f985fe..8bb03ab470 100644
--- a/shared/util.go
+++ b/shared/util.go
@@ -29,6 +29,7 @@ import (
"github.com/lxc/lxd/shared/cancel"
"github.com/lxc/lxd/shared/ioprogress"
+ "github.com/lxc/lxd/shared/logger"
"github.com/lxc/lxd/shared/units"
)
@@ -991,6 +992,7 @@ func DownloadFileHash(httpClient *http.Client, useragent string, progress func(p
}
result := fmt.Sprintf("%x", hashFunc.Sum(nil))
+ logger.Errorf("Hashing for download from url %s. Got hash: %s. Expected hash: %s", url, result, hash)
if result != hash {
return -1, fmt.Errorf("Hash mismatch for %s: %s != %s", url, result, hash)
}
From dab3ee3cfe2401081d9954ce3ada2d40632f6dd2 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner at ubuntu.com>
Date: Tue, 12 Nov 2019 23:18:37 +0100
Subject: [PATCH 2/4] api: add container_syscall_intercept_mount_fuse extension
Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
---
doc/api-extensions.md | 7 ++++++-
shared/version/api.go | 1 +
2 files changed, 7 insertions(+), 1 deletion(-)
diff --git a/doc/api-extensions.md b/doc/api-extensions.md
index ca09e50e41..b78ffb32c8 100644
--- a/doc/api-extensions.md
+++ b/doc/api-extensions.md
@@ -871,4 +871,9 @@ elevated permissions.
Adds support for importing/exporting of images/backups using SquashFS file system format.
## container\_raw\_mount
-This adds support for passing in raw mount options for disk devices.
\ No newline at end of file
+This adds support for passing in raw mount options for disk devices.
+
+## container\_syscall\_intercept\_mount\_fuse
+Adds the `security.syscalls.intercept.mount.fuse` key. It can be used to
+redirect filesystem mounts to their fuse implementation. To this end, set e.g.
+`security.syscalls.intercept.mount.fuse=ext4=fuse2fs`.
diff --git a/shared/version/api.go b/shared/version/api.go
index f6b0e345a1..fb1e6edd43 100644
--- a/shared/version/api.go
+++ b/shared/version/api.go
@@ -175,6 +175,7 @@ var APIExtensions = []string{
"container_syscall_intercept_mount",
"compression_squashfs",
"container_raw_mount",
+ "container_syscall_intercept_mount_fuse",
}
// APIExtensionsCount returns the number of available API extensions.
From 9f64917fb6e48c39b3d8ed5516999f2a5a59957d Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner at ubuntu.com>
Date: Tue, 12 Nov 2019 23:22:17 +0100
Subject: [PATCH 3/4] doc: add security.syscalls.intercept.mount.fuse
Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
---
doc/containers.md | 109 +++++++++++++++++++++++-----------------------
1 file changed, 55 insertions(+), 54 deletions(-)
diff --git a/doc/containers.md b/doc/containers.md
index 5441367e3b..e5c46602c0 100644
--- a/doc/containers.md
+++ b/doc/containers.md
@@ -34,60 +34,61 @@ currently supported:
The currently supported keys are:
-Key | Type | Default | Live update | API extension | Description
-:-- | :--- | :------ | :---------- | :------------ | :----------
-boot.autostart | boolean | - | n/a | - | Always start the container when LXD starts (if not set, restore last state)
-boot.autostart.delay | integer | 0 | n/a | - | Number of seconds to wait after the container started before starting the next one
-boot.autostart.priority | integer | 0 | n/a | - | What order to start the containers in (starting with highest)
-boot.host\_shutdown\_timeout | integer | 30 | yes | container\_host\_shutdown\_timeout | Seconds to wait for container to shutdown before it is force stopped
-boot.stop.priority | integer | 0 | n/a | container\_stop\_priority | What order to shutdown the containers (starting with highest)
-environment.\* | string | - | yes (exec) | - | key/value environment variables to export to the container and set on exec
-limits.cpu | string | - (all) | yes | - | Number or range of CPUs to expose to the container
-limits.cpu.allowance | string | 100% | yes | - | How much of the CPU can be used. Can be a percentage (e.g. 50%) for a soft limit or hard a chunk of time (25ms/100ms)
-limits.cpu.priority | integer | 10 (maximum) | yes | - | CPU scheduling priority compared to other containers sharing the same CPUs (overcommit) (integer between 0 and 10)
-limits.disk.priority | integer | 5 (medium) | yes | - | When under load, how much priority to give to the container's I/O requests (integer between 0 and 10)
-limits.kernel.\* | string | - | no | kernel\_limits | This limits kernel resources per container (e.g. number of open files)
-limits.memory | string | - (all) | yes | - | Percentage of the host's memory or fixed value in bytes (various suffixes supported, see below)
-limits.memory.enforce | string | hard | yes | - | If hard, container can't exceed its memory limit. If soft, the container can exceed its memory limit when extra host memory is available.
-limits.memory.swap | boolean | true | yes | - | Whether to allow some of the container's memory to be swapped out to disk
-limits.memory.swap.priority | integer | 10 (maximum) | yes | - | The higher this is set, the least likely the container is to be swapped to disk (integer between 0 and 10)
-limits.network.priority | integer | 0 (minimum) | yes | - | When under load, how much priority to give to the container's network requests (integer between 0 and 10)
-limits.processes | integer | - (max) | yes | - | Maximum number of processes that can run in the container
-linux.kernel\_modules | string | - | yes | - | Comma separated list of kernel modules to load before starting the container
-migration.incremental.memory | boolean | false | yes | migration\_pre\_copy | Incremental memory transfer of the container's memory to reduce downtime.
-migration.incremental.memory.goal | integer | 70 | yes | migration\_pre\_copy | Percentage of memory to have in sync before stopping the container.
-migration.incremental.memory.iterations | integer | 10 | yes | migration\_pre\_copy | Maximum number of transfer operations to go through before stopping the container.
-nvidia.driver.capabilities | string | compute,utility | no | nvidia\_runtime\_config | What driver capabilities the container needs (sets libnvidia-container NVIDIA\_DRIVER\_CAPABILITIES)
-nvidia.runtime | boolean | false | no | nvidia\_runtime | Pass the host NVIDIA and CUDA runtime libraries into the container
-nvidia.require.cuda | string | - | no | nvidia\_runtime\_config | Version expression for the required CUDA version (sets libnvidia-container NVIDIA\_REQUIRE\_CUDA)
-nvidia.require.driver | string | - | no | nvidia\_runtime\_config | Version expression for the required driver version (sets libnvidia-container NVIDIA\_REQUIRE\_DRIVER)
-raw.apparmor | blob | - | yes | - | Apparmor profile entries to be appended to the generated profile
-raw.idmap | blob | - | no | id\_map | Raw idmap configuration (e.g. "both 1000 1000")
-raw.lxc | blob | - | no | - | Raw LXC configuration to be appended to the generated one
-raw.seccomp | blob | - | no | container\_syscall\_filtering | Raw Seccomp configuration
-security.devlxd | boolean | true | no | restrict\_devlxd | Controls the presence of /dev/lxd in the container
-security.devlxd.images | boolean | false | no | devlxd\_images | Controls the availability of the /1.0/images API over devlxd
-security.idmap.base | integer | - | no | id\_map\_base | The base host ID to use for the allocation (overrides auto-detection)
-security.idmap.isolated | boolean | false | no | id\_map | Use an idmap for this container that is unique among containers with isolated set.
-security.idmap.size | integer | - | no | id\_map | The size of the idmap to use
-security.nesting | boolean | false | yes | - | Support running lxd (nested) inside the container
-security.privileged | boolean | false | no | - | Runs the container in privileged mode
-security.protection.delete | boolean | false | yes | container\_protection\_delete | Prevents the container from being deleted
-security.protection.shift | boolean | false | yes | container\_protection\_shift | Prevents the container's filesystem from being uid/gid shifted on startup
-security.syscalls.blacklist | string | - | no | container\_syscall\_filtering | A '\n' separated list of syscalls to blacklist
-security.syscalls.blacklist\_compat | boolean | false | no | container\_syscall\_filtering | On x86\_64 this enables blocking of compat\_\* syscalls, it is a no-op on other arches
-security.syscalls.blacklist\_default | boolean | true | no | container\_syscall\_filtering | Enables the default syscall blacklist
-security.syscalls.intercept.mknod | boolean | false | no | container\_syscall\_intercept | Handles the `mknod` and `mknodat` system calls (allows creation of a limited subset of char/block devices)
-security.syscalls.intercept.mount | boolean | false | no | container\_syscall\_intercept\_mount | Handles the `mount` system call
-security.syscalls.intercept.mount.allowed | string | - | yes | container\_syscall\_intercept\_mount | Specify a comma-separated list of filesystems that are safe to mount for processes inside the container.
-security.syscalls.intercept.mount.shift | boolean | false | yes | container\_syscall\_intercept\_mount | Whether to mount shiftfs on top of filesystems handled through mount syscall interception.
-security.syscalls.intercept.setxattr | boolean | false | no | container\_syscall\_intercept | Handles the `setxattr` system call (allows setting a limited subset of restricted extended attributes)
-security.syscalls.whitelist | string | - | no | container\_syscall\_filtering | A '\n' separated list of syscalls to whitelist (mutually exclusive with security.syscalls.blacklist\*)
-snapshots.schedule | string | - | no | snapshot\_scheduling | Cron expression (`<minute> <hour> <dom> <month> <dow>`)
-snapshots.schedule.stopped | bool | false | no | snapshot\_scheduling | Controls whether or not stopped containers are to be snapshoted automatically
-snapshots.pattern | string | snap%d | no | snapshot\_scheduling | Pongo2 template string which represents the snapshot name (used for scheduled snapshots and unnamed snapshots)
-snapshots.expiry | string | - | no | snapshot\_expiry | Controls when snapshots are to be deleted (expects expression like `1M 2H 3d 4w 5m 6y`)
-user.\* | string | - | n/a | - | Free form user key/value storage (can be used in search)
+Key | Type | Default | Live update | API extension | Description
+:-- | :--- | :------ | :---------- | :------------ | :----------
+boot.autostart | boolean | - | n/a | - | Always start the container when LXD starts (if not set, restore last state)
+boot.autostart.delay | integer | 0 | n/a | - | Number of seconds to wait after the container started before starting the next one
+boot.autostart.priority | integer | 0 | n/a | - | What order to start the containers in (starting with highest)
+boot.host\_shutdown\_timeout | integer | 30 | yes | container\_host\_shutdown\_timeout | Seconds to wait for container to shutdown before it is force stopped
+boot.stop.priority | integer | 0 | n/a | container\_stop\_priority | What order to shutdown the containers (starting with highest)
+environment.\* | string | - | yes (exec) | - | key/value environment variables to export to the container and set on exec
+limits.cpu | string | - (all) | yes | - | Number or range of CPUs to expose to the container
+limits.cpu.allowance | string | 100% | yes | - | How much of the CPU can be used. Can be a percentage (e.g. 50%) for a soft limit or hard a chunk of time (25ms/100ms)
+limits.cpu.priority | integer | 10 (maximum) | yes | - | CPU scheduling priority compared to other containers sharing the same CPUs (overcommit) (integer between 0 and 10)
+limits.disk.priority | integer | 5 (medium) | yes | - | When under load, how much priority to give to the container's I/O requests (integer between 0 and 10)
+limits.kernel.\* | string | - | no | kernel\_limits | This limits kernel resources per container (e.g. number of open files)
+limits.memory | string | - (all) | yes | - | Percentage of the host's memory or fixed value in bytes (various suffixes supported, see below)
+limits.memory.enforce | string | hard | yes | - | If hard, container can't exceed its memory limit. If soft, the container can exceed its memory limit when extra host memory is available.
+limits.memory.swap | boolean | true | yes | - | Whether to allow some of the container's memory to be swapped out to disk
+limits.memory.swap.priority | integer | 10 (maximum) | yes | - | The higher this is set, the least likely the container is to be swapped to disk (integer between 0 and 10)
+limits.network.priority | integer | 0 (minimum) | yes | - | When under load, how much priority to give to the container's network requests (integer between 0 and 10)
+limits.processes | integer | - (max) | yes | - | Maximum number of processes that can run in the container
+linux.kernel\_modules | string | - | yes | - | Comma separated list of kernel modules to load before starting the container
+migration.incremental.memory | boolean | false | yes | migration\_pre\_copy | Incremental memory transfer of the container's memory to reduce downtime.
+migration.incremental.memory.goal | integer | 70 | yes | migration\_pre\_copy | Percentage of memory to have in sync before stopping the container.
+migration.incremental.memory.iterations | integer | 10 | yes | migration\_pre\_copy | Maximum number of transfer operations to go through before stopping the container.
+nvidia.driver.capabilities | string | compute,utility | no | nvidia\_runtime\_config | What driver capabilities the container needs (sets libnvidia-container NVIDIA\_DRIVER\_CAPABILITIES)
+nvidia.runtime | boolean | false | no | nvidia\_runtime | Pass the host NVIDIA and CUDA runtime libraries into the container
+nvidia.require.cuda | string | - | no | nvidia\_runtime\_config | Version expression for the required CUDA version (sets libnvidia-container NVIDIA\_REQUIRE\_CUDA)
+nvidia.require.driver | string | - | no | nvidia\_runtime\_config | Version expression for the required driver version (sets libnvidia-container NVIDIA\_REQUIRE\_DRIVER)
+raw.apparmor | blob | - | yes | - | Apparmor profile entries to be appended to the generated profile
+raw.idmap | blob | - | no | id\_map | Raw idmap configuration (e.g. "both 1000 1000")
+raw.lxc | blob | - | no | - | Raw LXC configuration to be appended to the generated one
+raw.seccomp | blob | - | no | container\_syscall\_filtering | Raw Seccomp configuration
+security.devlxd | boolean | true | no | restrict\_devlxd | Controls the presence of /dev/lxd in the container
+security.devlxd.images | boolean | false | no | devlxd\_images | Controls the availability of the /1.0/images API over devlxd
+security.idmap.base | integer | - | no | id\_map\_base | The base host ID to use for the allocation (overrides auto-detection)
+security.idmap.isolated | boolean | false | no | id\_map | Use an idmap for this container that is unique among containers with isolated set.
+security.idmap.size | integer | - | no | id\_map | The size of the idmap to use
+security.nesting | boolean | false | yes | - | Support running lxd (nested) inside the container
+security.privileged | boolean | false | no | - | Runs the container in privileged mode
+security.protection.delete | boolean | false | yes | container\_protection\_delete | Prevents the container from being deleted
+security.protection.shift | boolean | false | yes | container\_protection\_shift | Prevents the container's filesystem from being uid/gid shifted on startup
+security.syscalls.blacklist | string | - | no | container\_syscall\_filtering | A '\n' separated list of syscalls to blacklist
+security.syscalls.blacklist\_compat | boolean | false | no | container\_syscall\_filtering | On x86\_64 this enables blocking of compat\_\* syscalls, it is a no-op on other arches
+security.syscalls.blacklist\_default | boolean | true | no | container\_syscall\_filtering | Enables the default syscall blacklist
+security.syscalls.intercept.mknod | boolean | false | no | container\_syscall\_intercept | Handles the `mknod` and `mknodat` system calls (allows creation of a limited subset of char/block devices)
+security.syscalls.intercept.mount | boolean | false | no | container\_syscall\_intercept\_mount | Handles the `mount` system call
+security.syscalls.intercept.mount.allowed | string | - | yes | container\_syscall\_intercept\_mount | Specify a comma-separated list of filesystems that are safe to mount for processes inside the container.
+security.syscalls.intercept.mount.fuse | string | - | yes | container\_syscall\_intercept\_mount\_fuse | Whether to redirect mounts of a given filesystem to their fuse implementation (e.g. ext4=fuse2fs)
+security.syscalls.intercept.mount.shift | boolean | false | yes | container\_syscall\_intercept\_mount | Whether to mount shiftfs on top of filesystems handled through mount syscall interception.
+security.syscalls.intercept.setxattr | boolean | false | no | container\_syscall\_intercept | Handles the `setxattr` system call (allows setting a limited subset of restricted extended attributes)
+security.syscalls.whitelist | string | - | no | container\_syscall\_filtering | A '\n' separated list of syscalls to whitelist (mutually exclusive with security.syscalls.blacklist\*)
+snapshots.schedule | string | - | no | snapshot\_scheduling | Cron expression (`<minute> <hour> <dom> <month> <dow>`)
+snapshots.schedule.stopped | bool | false | no | snapshot\_scheduling | Controls whether or not stopped containers are to be snapshoted automatically
+snapshots.pattern | string | snap%d | no | snapshot\_scheduling | Pongo2 template string which represents the snapshot name (used for scheduled snapshots and unnamed snapshots)
+snapshots.expiry | string | - | no | snapshot\_expiry | Controls when snapshots are to be deleted (expects expression like `1M 2H 3d 4w 5m 6y`)
+user.\* | string | - | n/a | - | Free form user key/value storage (can be used in search)
The following volatile keys are currently internally used by LXD:
From 9cb23c07a7f17cefc235afe8436d184b68e1a832 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner at ubuntu.com>
Date: Tue, 12 Nov 2019 23:22:52 +0100
Subject: [PATCH 4/4] scripts: add security.syscalls.intercept.mount.fuse
Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
---
scripts/bash/lxd-client | 1 +
1 file changed, 1 insertion(+)
diff --git a/scripts/bash/lxd-client b/scripts/bash/lxd-client
index 1fae67dea7..19d4173bd7 100644
--- a/scripts/bash/lxd-client
+++ b/scripts/bash/lxd-client
@@ -96,6 +96,7 @@ _have lxc && {
security.syscalls.blacklist_compat security.syscalls.blacklist_default \
security.syscalls.intercept.mknod security.syscalls.intercept.mount \
security.syscalls.intercept.mount.allowed \
+ security.syscalls.intercept.mount.fuse \
security.syscalls.intercept.setxattr \
security.syscall.intercept.mount.shift \
snapshots.schedule snapshots.schedule.stopped snapshots.pattern \
More information about the lxc-devel
mailing list