[lxc-devel] [lxd/master] add hugetlbfs and hugepages handling for containers

brauner on Github lxc-bot at linuxcontainers.org
Mon Mar 2 16:38:44 UTC 2020


A non-text attachment was scrubbed...
Name: not available
Type: text/x-mailbox
Size: 364 bytes
Desc: not available
URL: <http://lists.linuxcontainers.org/pipermail/lxc-devel/attachments/20200302/df4755fe/attachment-0001.bin>
-------------- next part --------------
From bd3292244c2a35fd6ae12c18452aa2c2be96ffe4 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner at ubuntu.com>
Date: Mon, 2 Mar 2020 15:27:05 +0100
Subject: [PATCH 1/4] seccomp: handle hugetlbfs mount syscall interception

hugetlbfs already allocates new instances on every mount.
We can deal with the filesystem through existing mount interception.

We just need to add automatic handling of uid and gid mount options when
detecting a hugetlbfs mount. We also need to have hugetlbfs bypass the shifted
option as we don't want to put shiftfs on top of it when we already set the
correct uid/gid.

Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
---
 lxd/seccomp/seccomp.go | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/lxd/seccomp/seccomp.go b/lxd/seccomp/seccomp.go
index f996bb19d9..97b11c6c14 100644
--- a/lxd/seccomp/seccomp.go
+++ b/lxd/seccomp/seccomp.go
@@ -1249,6 +1249,15 @@ func mountFlagsToOpts(flags C.ulong) string {
 	return opts
 }
 
+// mountHandleHugetlbfsArgs adds the user namespace root uid and gid (nsuid, nsgid) to the
+// hugetlbfs mount options and disables shifting, making it usable in unprivileged containers.
+func (s *Server) mountHandleHugetlbfsArgs(args *MountArgs, nsuid int64, nsgid int64) {
+	if args.fstype == "hugetlbfs" && args.data == "" { // mounts that pass their own data are left as-is
+		args.data = fmt.Sprintf("uid=%d,gid=%d", nsuid, nsgid)
+		args.shift = false // ownership comes from uid/gid, so shiftfs must not go on top
+	}
+}
+
 // HandleMountSyscall handles mount syscalls.
 func (s *Server) HandleMountSyscall(c Instance, siov *Iovec) int {
 	ctx := log.Ctx{"container": c.Name(),
@@ -1339,6 +1348,8 @@ func (s *Server) HandleMountSyscall(c Instance, siov *Iovec) int {
 		return 0
 	}
 
+	s.mountHandleHugetlbfsArgs(&args, nsuid, nsgid)
+
 	if fuseBinary != "" {
 		// Record ignored flags for debugging purposes
 		flags := C.ulong(args.flags)

From 8174951d490bcbcec07211f7e7728e86a233ab24 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner at ubuntu.com>
Date: Mon, 2 Mar 2020 17:34:28 +0100
Subject: [PATCH 2/4] doc: add container_syscall_intercept_hugetlbfs

Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
---
 doc/api-extensions.md | 6 ++++++
 shared/version/api.go | 1 +
 2 files changed, 7 insertions(+)

diff --git a/doc/api-extensions.md b/doc/api-extensions.md
index 5d9fd59a8a..2bedc3a5ab 100644
--- a/doc/api-extensions.md
+++ b/doc/api-extensions.md
@@ -926,3 +926,9 @@ Introduces the ability to create a storage pool from an existing non-empty volum
 This option should be used with care, as LXD can then not guarantee that volume name conflicts won't occur
 with non-LXD created volumes in the same volume group.
 This could also potentially lead to LXD deleting a non-LXD volume should name conflicts occur.
+
+## container\_syscall\_intercept\_hugetlbfs
+When mount syscall interception is enabled and hugetlbfs is specified as an
+allowed filesystem type LXD will mount a separate hugetlbfs instance for the
+container with the uid and gid mount options set to the container's root uid
+and gid. This ensures that processes in the container can use hugepages.
diff --git a/shared/version/api.go b/shared/version/api.go
index 503c7d4138..fbc50b997c 100644
--- a/shared/version/api.go
+++ b/shared/version/api.go
@@ -190,6 +190,7 @@ var APIExtensions = []string{
 	"clustering_sizing",
 	"firewall_driver",
 	"projects_limits",
+	"container_syscall_intercept_hugetlbfs",
 }
 
 // APIExtensionsCount returns the number of available API extensions.

From 81615cd980672594c564d5b6c9baa542650f2786 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner at ubuntu.com>
Date: Mon, 2 Mar 2020 17:10:18 +0100
Subject: [PATCH 3/4] limits: add limits.hugepages.* keys

limits.hugepages.64KB
limits.hugepages.1MB
limits.hugepages.2MB
limits.hugepages.1GB

Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
---
 lxd/cgroup/abstraction.go          | 18 ++++++++++++++++++
 lxd/cgroup/init.go                 | 10 ++++++++++
 lxd/instance/drivers/driver_lxc.go | 26 ++++++++++++++++++++++++++
 shared/instance.go                 |  5 +++++
 4 files changed, 59 insertions(+)

diff --git a/lxd/cgroup/abstraction.go b/lxd/cgroup/abstraction.go
index ab8a910cf5..61b7ccf664 100644
--- a/lxd/cgroup/abstraction.go
+++ b/lxd/cgroup/abstraction.go
@@ -305,3 +305,21 @@ func (cg *CGroup) SetNetIfPrio(value string) error {
 	}
 	return ErrUnknownVersion
 }
+
+// SetMaxHugepages applies a byte limit to hugepages of the given page type (e.g. "2MB").
+func (cg *CGroup) SetMaxHugepages(pageType string, value string) error {
+	// Confirm we have the controller
+	version := cgControllers["hugetlb"]
+	switch version {
+	case Unavailable:
+		return ErrControllerMissing
+	case V1: // NOTE(review): an empty value is written unchanged here, unlike V2 - confirm intended
+		return cg.rw.Set(version, "hugetlb", fmt.Sprintf("hugetlb.%s.limit_in_bytes", pageType), value)
+	case V2:
+		if value == "" { // no value given: write "max" instead
+			return cg.rw.Set(version, "hugetlb", fmt.Sprintf("hugetlb.%s.max", pageType), "max")
+		}
+		return cg.rw.Set(version, "hugetlb", fmt.Sprintf("hugetlb.%s.max", pageType), value)
+	}
+	return ErrUnknownVersion
+}
diff --git a/lxd/cgroup/init.go b/lxd/cgroup/init.go
index 0bacbb45b2..de6ee08678 100644
--- a/lxd/cgroup/init.go
+++ b/lxd/cgroup/init.go
@@ -91,6 +91,9 @@ const (
 	// Freezer resource control
 	Freezer
 
+	// Hugetlb resource control
+	Hugetlb
+
 	// Memory resource control
 	Memory
 
@@ -161,6 +164,9 @@ func (info *Info) SupportsVersion(resource Resource) (Backend, bool) {
 	case Freezer:
 		val, ok := cgControllers["freezer"]
 		return val, ok
+	case Hugetlb:
+		val, ok := cgControllers["hugetlb"]
+		return val, ok
 	case Memory:
 		val, ok := cgControllers["memory"]
 		return val, ok
@@ -266,6 +272,10 @@ func (info *Info) Log() {
 		logger.Warnf(" - Couldn't find the CGroup freezer controller, pausing/resuming containers won't work")
 	}
 
+	if !info.Supports(Hugetlb, nil) {
+		logger.Warnf(" - Couldn't find the CGroup hugetlb controller, pausing/resuming containers won't work")
+	}
+
 	if !info.Supports(Memory, nil) {
 		logger.Warnf(" - Couldn't find the CGroup memory controller, memory limits will be ignored")
 	}
diff --git a/lxd/instance/drivers/driver_lxc.go b/lxd/instance/drivers/driver_lxc.go
index fc19ec5022..86ea10f1d2 100644
--- a/lxd/instance/drivers/driver_lxc.go
+++ b/lxd/instance/drivers/driver_lxc.go
@@ -4291,6 +4291,32 @@ func (c *lxc) Update(args db.InstanceArgs, userRequested bool) error {
 						return err
 					}
 				}
+			} else if strings.HasPrefix(key, "limits.hugepages.") {
+				pageType := ""
+
+				switch key {
+				case "limits.hugepages.64KB":
+					pageType = "64KB"
+				case "limits.hugepages.1MB":
+					pageType = "1MB"
+				case "limits.hugepages.2MB":
+					pageType = "2MB"
+				case "limits.hugepages.1GB":
+					pageType = "1GB"
+				}
+
+				if value != "" {
+					valueInt, err := units.ParseByteSizeString(value)
+					if err != nil {
+						return err
+					}
+					value = fmt.Sprintf("%d", valueInt)
+				}
+
+				err = cg.SetMaxHugepages(pageType, value)
+				if err != nil {
+					return err
+				}
 			}
 		}
 	}
diff --git a/shared/instance.go b/shared/instance.go
index a9dc1ab973..a616a45bf4 100644
--- a/shared/instance.go
+++ b/shared/instance.go
@@ -254,6 +254,11 @@ var KnownInstanceConfigKeys = map[string]func(value string) error{
 
 	"limits.disk.priority": IsPriority,
 
+	"limits.hugepages.64KB": IsSize,
+	"limits.hugepages.1MB":  IsSize,
+	"limits.hugepages.2MB":  IsSize,
+	"limits.hugepages.1GB":  IsSize,
+
 	"limits.memory": func(value string) error {
 		if value == "" {
 			return nil

From c68c311bd8d0570157e32e6c8137f99db37fa9b9 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner at ubuntu.com>
Date: Mon, 2 Mar 2020 17:35:05 +0100
Subject: [PATCH 4/4] doc: add limits_hugepages api extension

Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
---
 doc/api-extensions.md | 7 +++++++
 doc/instances.md      | 4 ++++
 shared/version/api.go | 1 +
 3 files changed, 12 insertions(+)

diff --git a/doc/api-extensions.md b/doc/api-extensions.md
index 2bedc3a5ab..ee3bc99546 100644
--- a/doc/api-extensions.md
+++ b/doc/api-extensions.md
@@ -932,3 +932,10 @@ When mount syscall interception is enabled and hugetlbfs is specified as an
 allowed filesystem type LXD will mount a separate hugetlbfs instance for the
 container with the uid and gid mount options set to the container's root uid
 and gid. This ensures that processes in the container can use hugepages.
+
+## limits\_hugepages
+This allows limiting the number of hugepages a container can use through the
+hugetlb cgroup. This means the hugetlb cgroup needs to be available. Note that
+limiting hugepages is recommended when intercepting the mount syscall for the
+hugetlbfs filesystem to avoid allowing the container to exhaust the host's
+hugepages resources.
diff --git a/doc/instances.md b/doc/instances.md
index 15f572b773..d3f97d8c3c 100644
--- a/doc/instances.md
+++ b/doc/instances.md
@@ -46,6 +46,10 @@ limits.cpu                                  | string    | - (all)           | ye
 limits.cpu.allowance                        | string    | 100%              | yes           | -                 | How much of the CPU can be used. Can be a percentage (e.g. 50%) for a soft limit or hard a chunk of time (25ms/100ms)
 limits.cpu.priority                         | integer   | 10 (maximum)      | yes           | -                 | CPU scheduling priority compared to other instances sharing the same CPUs (overcommit) (integer between 0 and 10)
 limits.disk.priority                        | integer   | 5 (medium)        | yes           | -                 | When under load, how much priority to give to the instance's I/O requests (integer between 0 and 10)
+limits.hugepages.64KB                       | string    | -                 | yes           | container         | Fixed value in bytes (various suffixes supported, see below) to limit number of 64 KB hugepages
+limits.hugepages.1MB                        | string    | -                 | yes           | container         | Fixed value in bytes (various suffixes supported, see below) to limit number of 1 MB hugepages
+limits.hugepages.2MB                        | string    | -                 | yes           | container         | Fixed value in bytes (various suffixes supported, see below) to limit number of 2 MB hugepages
+limits.hugepages.1GB                        | string    | -                 | yes           | container         | Fixed value in bytes (various suffixes supported, see below) to limit number of 1 GB hugepages
 limits.kernel.\*                            | string    | -                 | no            | container         | This limits kernel resources per instance (e.g. number of open files)
 limits.memory                               | string    | - (all)           | yes           | -                 | Percentage of the host's memory or fixed value in bytes (various suffixes supported, see below)
 limits.memory.enforce                       | string    | hard              | yes           | container         | If hard, instance can't exceed its memory limit. If soft, the instance can exceed its memory limit when extra host memory is available
diff --git a/shared/version/api.go b/shared/version/api.go
index fbc50b997c..83df5eab7c 100644
--- a/shared/version/api.go
+++ b/shared/version/api.go
@@ -191,6 +191,7 @@ var APIExtensions = []string{
 	"firewall_driver",
 	"projects_limits",
 	"container_syscall_intercept_hugetlbfs",
+	"limits_hugepages",
 }
 
 // APIExtensionsCount returns the number of available API extensions.


More information about the lxc-devel mailing list