[lxc-devel] [lxc/master] cgroups: add unified hierarchy support

brauner on Github lxc-bot at linuxcontainers.org
Wed Jan 31 15:51:25 UTC 2018


A non-text attachment was scrubbed...
Name: not available
Type: text/x-mailbox
Size: 364 bytes
Desc: not available
URL: <http://lists.linuxcontainers.org/pipermail/lxc-devel/attachments/20180131/d20223ab/attachment.bin>
-------------- next part --------------
From d6337a5f9dc7311af168aa3d586fdf239f5a10d3 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner at ubuntu.com>
Date: Wed, 31 Jan 2018 16:25:11 +0100
Subject: [PATCH 1/7] cgroups: get controllers on the unified hierarchy

Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
---
 src/lxc/cgroups/cgfsng.c       | 403 +++++++++++++++++++++++++++++++----------
 src/lxc/cgroups/cgroup.h       |   7 +
 src/lxc/cgroups/cgroup_utils.c |   6 +-
 src/lxc/cgroups/cgroup_utils.h |   4 -
 4 files changed, 322 insertions(+), 98 deletions(-)

diff --git a/src/lxc/cgroups/cgfsng.c b/src/lxc/cgroups/cgfsng.c
index 86b39574d..4724fb5bc 100644
--- a/src/lxc/cgroups/cgfsng.c
+++ b/src/lxc/cgroups/cgfsng.c
@@ -78,7 +78,7 @@ struct hierarchy {
 	char *mountpoint;
 	char *base_cgroup;
 	char *fullcgpath;
-	bool is_cgroup_v2;
+	int version;
 };
 
 /*
@@ -98,14 +98,17 @@ struct cgfsng_handler_data {
 	char *name; /* container name */
 	/* per-container cgroup information */
 	struct lxc_cgroup cgroup_meta;
+	cgroup_layout_t cgroup_layout;
 };
 
 /*
  * @hierarchies - a NULL-terminated array of struct hierarchy, one per
- *   hierarchy.  No duplicates.  First sufficient, writeable mounted
- *   hierarchy wins
+ *                legacy hierarchy. No duplicates. First sufficient, writeable
+ *                mounted hierarchy wins
  */
 struct hierarchy **hierarchies;
+struct hierarchy *unified;
+cgroup_layout_t cgroup_layout;
 
 /*
  * @cgroup_use - a copy of the lxc.cgroup.use
@@ -183,6 +186,7 @@ static bool string_in_list(char **list, const char *entry)
 
 	if (!list)
 		return false;
+
 	for (i = 0; list[i]; i++)
 		if (strcmp(list[i], entry) == 0)
 			return true;
@@ -220,8 +224,6 @@ static void must_append_controller(char **klist, char **nlist, char ***clist, ch
 		copy = must_copy_string(entry);
 	else if (string_in_list(klist, entry))
 		copy = must_copy_string(entry);
-	else if (!strcmp(entry, "cgroup2"))
-		copy = must_copy_string(entry);
 	else
 		copy = must_prefix_named(entry);
 
@@ -250,10 +252,21 @@ struct hierarchy *get_hierarchy(const char *c)
 
 	if (!hierarchies)
 		return NULL;
+
 	for (i = 0; hierarchies[i]; i++) {
+		if (!c) {
+			/* This is the empty unified hierarchy. */
+			if (hierarchies[i]->controllers &&
+			    !hierarchies[i]->controllers[0])
+				return hierarchies[i];
+
+			return NULL;
+		}
+
 		if (string_in_list(hierarchies[i]->controllers, c))
 			return hierarchies[i];
 	}
+
 	return NULL;
 }
 
@@ -278,7 +291,7 @@ static void append_line(char **dest, size_t oldlen, char *new, size_t newlen)
 }
 
 /* Slurp in a whole file */
-static char *read_file(char *fnam)
+static char *read_file(const char *fnam)
 {
 	FILE *f;
 	char *line = NULL, *buf = NULL;
@@ -713,12 +726,14 @@ static bool controller_list_is_dup(struct hierarchy **hlist, char **clist)
 static bool controller_found(struct hierarchy **hlist, char *entry)
 {
 	int i;
+
 	if (!hlist)
 		return false;
 
 	for (i = 0; hlist[i]; i++)
 		if (string_in_list(hlist[i]->controllers, entry))
 			return true;
+
 	return false;
 }
 
@@ -757,12 +772,13 @@ static bool all_controllers_found(void)
  * options.  But we simply assume that the mountpoint must be
  * /sys/fs/cgroup/controller-list
  */
-static char **get_controllers(char **klist, char **nlist, char *line, int type)
+static char **get_controllers_on_hybrid_layout(char **klist, char **nlist,
+					       char *line, int type)
 {
 	/* the fourth field is /sys/fs/cgroup/comma-delimited-controller-list */
 	int i;
 	char *dup, *p2, *tok;
-	char *p = line, *saveptr = NULL;
+	char *p = line, *saveptr = NULL, *sep = ",";
 	char **aret = NULL;
 
 	for (i = 0; i < 4; i++) {
@@ -778,6 +794,7 @@ static char **get_controllers(char **klist, char **nlist, char *line, int type)
 		CGFSNG_DEBUG("Found hierarchy not under /sys/fs/cgroup: \"%s\"\n", p);
 		return NULL;
 	}
+
 	p += 15;
 	p2 = strchr(p, ' ');
 	if (!p2) {
@@ -786,30 +803,60 @@ static char **get_controllers(char **klist, char **nlist, char *line, int type)
 	}
 	*p2 = '\0';
 
-	/* cgroup v2 does not have separate mountpoints for controllers */
-	if (type == CGROUP_V2) {
-		must_append_controller(klist, nlist, &aret, "cgroup2");
-		return aret;
+	if (type == CGROUP_SUPER_MAGIC) {
+		/* strdup() here for v1 hierarchies. Otherwise strtok_r() will
+		 * destroy mountpoints such as "/sys/fs/cgroup/cpu,cpuacct".
+		 */
+		dup = strdup(p);
+		if (!dup)
+			return NULL;
+
+		for (tok = strtok_r(dup, sep, &saveptr); tok;
+		     tok = strtok_r(NULL, sep, &saveptr))
+			must_append_controller(klist, nlist, &aret, tok);
+
+		free(dup);
 	}
+	*p2 = ' ';
+	return aret;
+}
 
-	/* strdup() here for v1 hierarchies. Otherwise strtok_r() will destroy
-	 * mountpoints such as "/sys/fs/cgroup/cpu,cpuacct".
-	 */
-	dup = strdup(p);
-	if (!dup)
+static char **cg_unified_make_empty_controller(void)
+{
+	int newentry;
+	char **aret = NULL;
+
+	newentry = append_null_to_list((void ***)&aret);
+	aret[newentry] = NULL;
+	return aret;
+}
+
+static char **cg_unified_get_controllers(const char *file)
+{
+	char *buf, *tok;
+	char *saveptr = NULL, *sep = " \t\n";
+	char **aret = NULL;
+
+	buf = read_file(file);
+	if (!buf)
 		return NULL;
 
-	for (tok = strtok_r(dup, ",", &saveptr); tok;
-			tok = strtok_r(NULL, ",", &saveptr)) {
-		must_append_controller(klist, nlist, &aret, tok);
+	for (tok = strtok_r(buf, sep, &saveptr); tok;
+	     tok = strtok_r(NULL, sep, &saveptr)) {
+		int newentry;
+		char *copy;
+
+		newentry = append_null_to_list((void ***)&aret);
+		copy = must_copy_string(tok);
+		aret[newentry] = copy;
 	}
 
-	free(dup);
+	free(buf);
 	return aret;
 }
 
-/* Add a controller to our list of hierarchies */
-static void add_controller(char **clist, char *mountpoint, char *base_cgroup)
+static struct hierarchy *add_hierarchy(char **clist, char *mountpoint,
+				       char *base_cgroup, int type)
 {
 	struct hierarchy *new;
 	int newentry;
@@ -819,26 +866,24 @@ static void add_controller(char **clist, char *mountpoint, char *base_cgroup)
 	new->mountpoint = mountpoint;
 	new->base_cgroup = base_cgroup;
 	new->fullcgpath = NULL;
-
-	/* record if this is the cgroup v2 hierarchy */
-	if (clist && !strcmp(*clist, "cgroup2"))
-		new->is_cgroup_v2 = true;
-	else
-		new->is_cgroup_v2 = false;
+	new->version = type;
 
 	newentry = append_null_to_list((void ***)&hierarchies);
 	hierarchies[newentry] = new;
+	return new;
 }
 
 /*
  * Get a copy of the mountpoint from @line, which is a line from
  * /proc/self/mountinfo
  */
-static char *get_mountpoint(char *line)
+static char *get_mountpoint_on_hybrid_layout(char *line)
 {
 	int i;
-	char *p = line, *sret;
+	char *p2;
 	size_t len;
+	char *p = line;
+	char *sret = NULL;
 
 	for (i = 0; i < 4; i++) {
 		p = strchr(p, ' ');
@@ -846,7 +891,15 @@ static char *get_mountpoint(char *line)
 			return NULL;
 		p++;
 	}
-	/* we've already stuck a \0 after the mountpoint */
+
+	if (strncmp(p, "/sys/fs/cgroup/", 15))
+		return NULL;
+
+	p2 = strchr(p + 15, ' ');
+	if (!p2)
+		return NULL;
+	*p2 = '\0';
+
 	len = strlen(p);
 	sret = must_alloc(len + 1);
 	memcpy(sret, p, len);
@@ -893,10 +946,11 @@ static bool controller_in_clist(char *cgline, char *c)
 	tmp[len] = '\0';
 
 	for (tok = strtok_r(tmp, ",", &saveptr); tok;
-			tok = strtok_r(NULL, ",", &saveptr)) {
+	     tok = strtok_r(NULL, ",", &saveptr)) {
 		if (strcmp(tok, c) == 0)
 			return true;
 	}
+
 	return false;
 }
 
@@ -904,24 +958,23 @@ static bool controller_in_clist(char *cgline, char *c)
  * @basecginfo is a copy of /proc/$$/cgroup.  Return the current
  * cgroup for @controller
  */
-static char *get_current_cgroup(char *basecginfo, char *controller)
+static char *get_current_cgroup(char *basecginfo, char *controller, int type)
 {
 	char *p = basecginfo;
-	bool is_cgroup_v2;
-	bool is_cgroup_v2_base_cgroup;
 
-	is_cgroup_v2 = !strcmp(controller, "cgroup2");
-	while (true) {
-		is_cgroup_v2_base_cgroup = false;
+	for (;;) {
+		bool is_cgv2_base_cgroup = false;
+
 		/* cgroup v2 entry in "/proc/<pid>/cgroup": "0::/some/path" */
-		if (is_cgroup_v2 && (*p == '0'))
-			is_cgroup_v2_base_cgroup = true;
+		if ((type == CGROUP2_SUPER_MAGIC) && (*p == '0'))
+			is_cgv2_base_cgroup = true;
 
 		p = strchr(p, ':');
 		if (!p)
 			return NULL;
 		p++;
-		if (is_cgroup_v2_base_cgroup || controller_in_clist(p, controller)) {
+
+		if (is_cgv2_base_cgroup || (controller && controller_in_clist(p, controller))) {
 			p = strchr(p, ':');
 			if (!p)
 				return NULL;
@@ -945,14 +998,16 @@ static void must_append_string(char ***list, char *entry)
 	(*list)[newentry] = copy;
 }
 
-static void get_existing_subsystems(char ***klist, char ***nlist)
+static int get_existing_subsystems(char ***klist, char ***nlist)
 {
 	FILE *f;
 	char *line = NULL;
 	size_t len = 0;
 
-	if ((f = fopen("/proc/self/cgroup", "r")) == NULL)
-		return;
+	f = fopen("/proc/self/cgroup", "r");
+	if (!f)
+		return -1;
+
 	while (getline(&line, &len, f) != -1) {
 		char *p, *p2, *tok, *saveptr = NULL;
 		p = strchr(line, ':');
@@ -977,7 +1032,7 @@ static void get_existing_subsystems(char ***klist, char ***nlist)
 		}
 
 		for (tok = strtok_r(p, ",", &saveptr); tok;
-				tok = strtok_r(NULL, ",", &saveptr)) {
+		     tok = strtok_r(NULL, ",", &saveptr)) {
 			if (strncmp(tok, "name=", 5) == 0)
 				must_append_string(nlist, tok);
 			else
@@ -987,6 +1042,7 @@ static void get_existing_subsystems(char ***klist, char ***nlist)
 
 	free(line);
 	fclose(f);
+	return 0;
 }
 
 static void trim(char *s)
@@ -1054,82 +1110,125 @@ static void lxc_cgfsng_print_debuginfo(const struct cgfsng_handler_data *d)
  * At startup, parse_hierarchies finds all the info we need about
  * cgroup mountpoints and current cgroups, and stores it in @d.
  */
-static bool parse_hierarchies(void)
+static bool cg_init_hybrid(void)
 {
+	int ret;
+	char *basecginfo;
+	bool will_escape;
 	FILE *f;
-	char * line = NULL, *basecginfo;
-	char **klist = NULL, **nlist = NULL;
 	size_t len = 0;
+	char *line = NULL;
+	char **klist = NULL, **nlist = NULL;
 
 	/*
 	 * Root spawned containers escape the current cgroup, so use init's
 	 * cgroups as our base in that case.
 	 */
-	if (geteuid())
-		basecginfo = read_file("/proc/self/cgroup");
-	else
+	will_escape = (geteuid() == 0);
+	if (will_escape)
 		basecginfo = read_file("/proc/1/cgroup");
+	else
+		basecginfo = read_file("/proc/self/cgroup");
 	if (!basecginfo)
 		return false;
 
-	if ((f = fopen("/proc/self/mountinfo", "r")) == NULL) {
-		CGFSNG_DEBUG("Failed to open \"/proc/self/mountinfo\"\n");
+	ret = get_existing_subsystems(&klist, &nlist);
+	if (ret < 0) {
+		CGFSNG_DEBUG("Failed to retrieve available cgroup v1 controllers\n");
+		free(basecginfo);
 		return false;
 	}
 
-	get_existing_subsystems(&klist, &nlist);
+	f = fopen("/proc/self/mountinfo", "r");
+	if (!f) {
+		CGFSNG_DEBUG("Failed to open \"/proc/self/mountinfo\"\n");
+		return false;
+	}
 
 	if (lxc_cgfsng_debug)
 		lxc_cgfsng_print_basecg_debuginfo(basecginfo, klist, nlist);
 
-	/* we support simple cgroup mounts and lxcfs mounts */
 	while (getline(&line, &len, f) != -1) {
-		char **controller_list = NULL;
-		char *mountpoint, *base_cgroup;
-		bool writeable;
 		int type;
+		bool writeable;
+		struct hierarchy *new;
+		char *mountpoint = NULL, *base_cgroup = NULL;
+		char **controller_list = NULL;
 
 		type = get_cgroup_version(line);
-		if (type < 0)
+		if (type == 0)
 			continue;
 
-		controller_list = get_controllers(klist, nlist, line, type);
-		if (!controller_list)
+		if (type == CGROUP2_SUPER_MAGIC && unified)
 			continue;
 
-		if (controller_list_is_dup(hierarchies, controller_list)) {
-			free(controller_list);
-			continue;
+		if (cgroup_layout == CGROUP_LAYOUT_UNKNOWN) {
+			if (type == CGROUP2_SUPER_MAGIC)
+				cgroup_layout = CGROUP_LAYOUT_UNIFIED;
+			else if (type == CGROUP_SUPER_MAGIC)
+				cgroup_layout = CGROUP_LAYOUT_LEGACY;
+		} else if (cgroup_layout == CGROUP_LAYOUT_UNIFIED) {
+			if (type == CGROUP_SUPER_MAGIC)
+				cgroup_layout = CGROUP_LAYOUT_HYBRID;
+		} else if (cgroup_layout == CGROUP_LAYOUT_LEGACY) {
+			if (type == CGROUP2_SUPER_MAGIC)
+				cgroup_layout = CGROUP_LAYOUT_HYBRID;
 		}
 
-		mountpoint = get_mountpoint(line);
+		controller_list = get_controllers_on_hybrid_layout(klist, nlist, line, type);
+		if (!controller_list && type == CGROUP_SUPER_MAGIC)
+			continue;
+
+		if (type == CGROUP_SUPER_MAGIC)
+			if (controller_list_is_dup(hierarchies, controller_list))
+				goto next;
+
+		mountpoint = get_mountpoint_on_hybrid_layout(line);
 		if (!mountpoint) {
 			CGFSNG_DEBUG("Failed parsing mountpoint from \"%s\"\n", line);
-			free_string_list(controller_list);
-			continue;
+			goto next;
 		}
 
-		base_cgroup = get_current_cgroup(basecginfo, controller_list[0]);
+		if (type == CGROUP_SUPER_MAGIC)
+			base_cgroup = get_current_cgroup(basecginfo, controller_list[0], CGROUP_SUPER_MAGIC);
+		else
+			base_cgroup = get_current_cgroup(basecginfo, NULL, CGROUP2_SUPER_MAGIC);
 		if (!base_cgroup) {
-			CGFSNG_DEBUG("Failed to find current cgroup for controller \"%s\"\n", controller_list[0]);
-			free_string_list(controller_list);
-			free(mountpoint);
-			continue;
+			CGFSNG_DEBUG("Failed to find current cgroup\n");
+			goto next;
 		}
 
 		trim(base_cgroup);
 		prune_init_scope(base_cgroup);
-		if (type == CGROUP_V2)
+		if (type == CGROUP2_SUPER_MAGIC)
 			writeable = test_writeable_v2(mountpoint, base_cgroup);
 		else
 			writeable = test_writeable_v1(mountpoint, base_cgroup);
-		if (!writeable) {
-			free_string_list(controller_list);
-			free(mountpoint);
-			free(base_cgroup);
-			continue;
+		if (!writeable)
+			goto next;
+
+		if (type == CGROUP2_SUPER_MAGIC) {
+			char *cgv2_ctrl_path;
+
+			cgv2_ctrl_path = must_make_path(mountpoint, base_cgroup,
+							"cgroup.controllers",
+							NULL);
+
+			controller_list = cg_unified_get_controllers(cgv2_ctrl_path);
+			free(cgv2_ctrl_path);
+			if (!controller_list)
+				controller_list = cg_unified_make_empty_controller();
 		}
-		add_controller(controller_list, mountpoint, base_cgroup);
+		new = add_hierarchy(controller_list, mountpoint, base_cgroup, type);
+		if (type == CGROUP2_SUPER_MAGIC && !unified)
+			unified = new;
+
+		continue;
+
+	next:
+		free_string_list(controller_list);
+		free(mountpoint);
+		free(base_cgroup);
 	}
 
 	free_string_list(klist);
@@ -1154,9 +1253,106 @@ static bool parse_hierarchies(void)
 	return true;
 }
 
-static bool collect_hierarchy_info(void)
+static int cg_is_pure_unified(void) {
+
+	int ret;
+        struct statfs fs;
+
+        ret = statfs("/sys/fs/cgroup", &fs);
+        if (ret < 0)
+                return -ENOMEDIUM;
+
+        if (is_fs_type(&fs, CGROUP2_SUPER_MAGIC))
+		return CGROUP2_SUPER_MAGIC;
+
+        return 0;
+}
+
+/* Get current cgroup from /proc/self/cgroup for the cgroupfs v2 hierarchy. */
+static char *cg_get_current_cgroup_unified(void)
 {
+	char *basecginfo;
+	char *base_cgroup;
+	bool will_escape;
+	char *copy = NULL;
+
+	will_escape = (geteuid() == 0);
+	if (will_escape)
+		basecginfo = read_file("/proc/1/cgroup");
+	else
+		basecginfo = read_file("/proc/self/cgroup");
+	if (!basecginfo)
+		return NULL;
+
+	base_cgroup = strstr(basecginfo, "0::/");
+	if (!base_cgroup)
+		goto cleanup_on_err;
+
+	base_cgroup = base_cgroup + 3;
+	copy = copy_to_eol(base_cgroup);
+	if (!copy)
+		goto cleanup_on_err;
+
+cleanup_on_err:
+	free(basecginfo);
+	if (copy)
+		trim(copy);
+
+	return copy;
+}
+
+static int cg_init_unified(void)
+{
+	int ret;
+	char *mountpoint, *subtree_path;
+	char **delegatable;
+	char *base_cgroup = NULL;
+
+	ret = cg_is_pure_unified();
+	if (ret == -ENOMEDIUM)
+		return -ENOMEDIUM;
+
+	if (ret != CGROUP2_SUPER_MAGIC)
+		return 0;
+
+	base_cgroup = cg_get_current_cgroup_unified();
+	if (!base_cgroup)
+		return -EINVAL;
+	prune_init_scope(base_cgroup);
+
+	/* We assume that we have already been given controllers to delegate
+	 * further down the hierarchy. If not it is up to the user to delegate
+	 * them to us.
+	 */
+	mountpoint = must_copy_string("/sys/fs/cgroup");
+	subtree_path = must_make_path(mountpoint, base_cgroup,
+				      "cgroup.subtree_control", NULL);
+	delegatable = cg_unified_get_controllers(subtree_path);
+	free(subtree_path);
+	if (!delegatable)
+		delegatable = cg_unified_make_empty_controller();
+	if (!delegatable[0])
+		CGFSNG_DEBUG("No controllers are enabled for delegation\n");
+
+	/* TODO: If the user requested specific controllers via lxc.cgroup.use
+	 * we should verify them here. The reason I'm not doing it right now is
+	 * that I'm not convinced that lxc.cgroup.use will be the future since
+	 * it is a global property. I'd much rather have an option that lets you
+	 * request controllers per container.
+	 */
+
+	add_hierarchy(delegatable, mountpoint, base_cgroup, CGROUP2_SUPER_MAGIC);
+	unified = hierarchies[0];
+
+	cgroup_layout = CGROUP_LAYOUT_UNIFIED;
+	return CGROUP2_SUPER_MAGIC;
+}
+
+static bool cg_init(void)
+{
+	int ret;
 	const char *tmp;
+
 	errno = 0;
 	tmp = lxc_global_config_value("lxc.cgroup.use");
 	if (!cgroup_use && errno != 0) { /* lxc.cgroup.use can be NULL */
@@ -1165,7 +1361,14 @@ static bool collect_hierarchy_info(void)
 	}
 	cgroup_use = must_copy_string(tmp);
 
-	return parse_hierarchies();
+	ret = cg_init_unified();
+	if (ret < 0)
+		return false;
+
+	if (ret == CGROUP2_SUPER_MAGIC)
+		return true;
+
+	return cg_init_hybrid();
 }
 
 static void *cgfsng_init(struct lxc_handler *handler)
@@ -1196,6 +1399,16 @@ static void *cgfsng_init(struct lxc_handler *handler)
 	}
 	d->cgroup_pattern = must_copy_string(cgroup_pattern);
 
+	d->cgroup_layout = cgroup_layout;
+	if (d->cgroup_layout == CGROUP_LAYOUT_LEGACY)
+		TRACE("Running with legacy cgroup layout");
+	else if (d->cgroup_layout == CGROUP_LAYOUT_HYBRID)
+		TRACE("Running with hybrid cgroup layout");
+	else if (d->cgroup_layout == CGROUP_LAYOUT_UNIFIED)
+		TRACE("Running with unified cgroup layout");
+	else
+		WARN("Running with unknown cgroup layout");
+
 	if (lxc_cgfsng_debug)
 		lxc_cgfsng_print_debuginfo(d);
 
@@ -1343,7 +1556,7 @@ struct cgroup_ops *cgfsng_ops_init(void)
 	if (getenv("LXC_DEBUG_CGFSNG"))
 		lxc_cgfsng_debug = true;
 
-	if (!collect_hierarchy_info())
+	if (!cg_init())
 		return NULL;
 
 	return &cgfsng_ops;
@@ -1529,7 +1742,7 @@ static int chown_cgroup_wrapper(void *data)
 			WARN("Error chmoding %s: %s", path, strerror(errno));
 		free(fullpath);
 
-		if (!hierarchies[i]->is_cgroup_v2)
+		if (hierarchies[i]->version != CGROUP2_SUPER_MAGIC)
 			continue;
 
 		fullpath = must_make_path(path, "cgroup.subtree_control", NULL);
@@ -1679,7 +1892,7 @@ static int mount_cgroup_cgns_supported(int type, struct hierarchy *h, const char
 	 if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_FULL_RO)
 		 flags |= MS_RDONLY;
 
-	 if (!h->is_cgroup_v2) {
+	 if (h->version != CGROUP2_SUPER_MAGIC) {
 		 controllers = lxc_string_join(",", (const char **)h->controllers, false);
 		 if (!controllers)
 			 return -ENOMEM;
@@ -1902,25 +2115,33 @@ static bool cgfsng_get_hierarchies(int n, char ***out)
 #define THAWED "THAWED"
 #define THAWED_LEN (strlen(THAWED))
 
+/* TODO: If the unified cgroup hierarchy grows a freezer controller this needs
+ * to be adapted.
+ */
 static bool cgfsng_unfreeze(void *hdata)
 {
+	int ret;
 	char *fullpath;
-	struct hierarchy *h = get_hierarchy("freezer");
+	struct hierarchy *h;
 
+	h = get_hierarchy("freezer");
 	if (!h)
 		return false;
+
 	fullpath = must_make_path(h->fullcgpath, "freezer.state", NULL);
-	if (lxc_write_to_file(fullpath, THAWED, THAWED_LEN, false) != 0) {
-		free(fullpath);
-		return false;
-	}
+	ret = lxc_write_to_file(fullpath, THAWED, THAWED_LEN, false);
 	free(fullpath);
+	if (ret < 0)
+		return false;
+
 	return true;
 }
 
 static const char *cgfsng_get_cgroup(void *hdata, const char *subsystem)
 {
-	struct hierarchy *h = get_hierarchy(subsystem);
+	struct hierarchy *h;
+
+	h = get_hierarchy(subsystem);
 	if (!h)
 		return NULL;
 
diff --git a/src/lxc/cgroups/cgroup.h b/src/lxc/cgroups/cgroup.h
index f17a6abe0..f409eee7c 100644
--- a/src/lxc/cgroups/cgroup.h
+++ b/src/lxc/cgroups/cgroup.h
@@ -32,6 +32,13 @@ struct lxc_handler;
 struct lxc_conf;
 struct lxc_list;
 
+typedef enum {
+        CGROUP_LAYOUT_UNKNOWN = -1,
+        CGROUP_LAYOUT_LEGACY  =  0,
+        CGROUP_LAYOUT_HYBRID  =  1,
+        CGROUP_LAYOUT_UNIFIED =  2,
+} cgroup_layout_t;
+
 typedef enum {
 	CGFS,
 	CGMANAGER,
diff --git a/src/lxc/cgroups/cgroup_utils.c b/src/lxc/cgroups/cgroup_utils.c
index 6dda1a617..8e2a40eda 100644
--- a/src/lxc/cgroups/cgroup_utils.c
+++ b/src/lxc/cgroups/cgroup_utils.c
@@ -35,12 +35,12 @@
 int get_cgroup_version(char *line)
 {
 	if (is_cgroupfs_v1(line))
-		return CGROUP_V1;
+		return CGROUP_SUPER_MAGIC;
 
 	if (is_cgroupfs_v2(line))
-		return CGROUP_V2;
+		return CGROUP2_SUPER_MAGIC;
 
-	return -1;
+	return 0;
 }
 
 bool is_cgroupfs_v1(char *line)
diff --git a/src/lxc/cgroups/cgroup_utils.h b/src/lxc/cgroups/cgroup_utils.h
index e9e4448a6..3a4726e5b 100644
--- a/src/lxc/cgroups/cgroup_utils.h
+++ b/src/lxc/cgroups/cgroup_utils.h
@@ -28,10 +28,6 @@
 #include <stdbool.h>
 #include <stdio.h>
 
-#define CGROUP_V1 0
-#define CGROUP_V2 1
-#define LXCFS_CGROUP 2
-
 /* Retrieve the cgroup version of a given entry from /proc/<pid>/mountinfo. */
 extern int get_cgroup_version(char *line);
 

From 0c3deb94f438ccf0d622440d80e27a41db465d8b Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner at ubuntu.com>
Date: Wed, 31 Jan 2018 16:33:17 +0100
Subject: [PATCH 2/7] cgroups: cgfsng_create: handle unified hierarchy

Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
---
 src/lxc/cgroups/cgfsng.c | 96 ++++++++++++++++++++++++++++++++++++++++++------
 src/lxc/utils.c          | 27 ++++++++++++++
 src/lxc/utils.h          | 11 +++++-
 3 files changed, 122 insertions(+), 12 deletions(-)

diff --git a/src/lxc/cgroups/cgfsng.c b/src/lxc/cgroups/cgfsng.c
index 4724fb5bc..24a020ff3 100644
--- a/src/lxc/cgroups/cgfsng.c
+++ b/src/lxc/cgroups/cgfsng.c
@@ -1562,18 +1562,92 @@ struct cgroup_ops *cgfsng_ops_init(void)
 	return &cgfsng_ops;
 }
 
+static bool handle_unified_hierarchy(struct hierarchy *h, char *cgname)
+{
+	char **it;
+	size_t i, parts_len;
+	size_t full_len = 0;
+	char *add_controllers = NULL, *cgroup = NULL;
+	char **parts = NULL;
+	bool bret = false;
+
+	if (h->version != CGROUP2_SUPER_MAGIC)
+		return true;
+
+	if (!h->controllers)
+		return true;
+
+	/* For now we simply enable all controllers that we have detected by
+	 * creating a string like "+memory +pids +cpu +io".
+	 * TODO: In the near future we might want to support "-<controller>"
+	 * etc. but whether supporting semantics like this makes sense will
+	 * need some thinking.
+	 */
+	for (it = h->controllers; it && *it; it++) {
+                full_len += strlen(*it) + 2;
+                add_controllers = must_realloc(add_controllers, full_len + 1);
+                if (h->controllers[0] == *it)
+                        add_controllers[0] = '\0';
+                strcat(add_controllers, "+");
+                strcat(add_controllers, *it);
+                if ((it + 1) && *(it + 1))
+                        strcat(add_controllers, " ");
+	}
+
+	parts = lxc_string_split(cgname, '/');
+	if (!parts)
+		goto on_error;
+	parts_len = lxc_array_len((void **)parts);
+	if (parts_len > 0)
+		parts_len--;
+
+	cgroup = must_make_path(h->mountpoint, h->base_cgroup, NULL);
+	for (i = 0; i < parts_len; i++) {
+		int ret;
+		char *target;
+
+		cgroup = must_append_path(cgroup, parts[i], NULL);
+		target = must_make_path(cgroup, "cgroup.subtree_control", NULL);
+		ret = lxc_write_to_file(target, add_controllers, full_len, false);
+		free(target);
+		if (ret < 0) {
+			SYSERROR("Could not enable \"%s\" controllers in the "
+				 "unified cgroup \"%s\"", add_controllers, cgroup);
+			goto on_error;
+		}
+	}
+
+	bret = true;
+
+on_error:
+	lxc_free_array((void **)parts, free);
+	free(add_controllers);
+	free(cgroup);
+	return bret;
+}
+
 static bool create_path_for_hierarchy(struct hierarchy *h, char *cgname)
 {
+	int ret;
+
 	h->fullcgpath = must_make_path(h->mountpoint, h->base_cgroup, cgname, NULL);
 	if (dir_exists(h->fullcgpath)) { /* it must not already exist */
-		ERROR("Path \"%s\" already existed.", h->fullcgpath);
+		ERROR("cgroup \"%s\" already existed", h->fullcgpath);
 		return false;
 	}
+
 	if (!handle_cpuset_hierarchy(h, cgname)) {
-		ERROR("Failed to handle cgroupfs v1 cpuset controller.");
+		ERROR("Failed to handle cgroupfs v1 cpuset controller");
 		return false;
 	}
-	return mkdir_p(h->fullcgpath, 0755) == 0;
+
+	ret = mkdir_p(h->fullcgpath, 0755);
+	if (ret < 0) {
+		ERROR("Failed to create cgroup \"%s\"", h->fullcgpath);
+		return false;
+	}
+
+	return handle_unified_hierarchy(h, cgname);
 }
 
 static void remove_path_for_hierarchy(struct hierarchy *h, char *cgname)
@@ -1592,7 +1666,7 @@ static inline bool cgfsng_create(void *hdata)
 {
 	int i;
 	size_t len;
-	char *cgname, *offset, *tmp;
+	char *container_cgroup, *offset, *tmp;
 	int idx = 0;
 	struct cgfsng_handler_data *d = hdata;
 
@@ -1613,10 +1687,10 @@ static inline bool cgfsng_create(void *hdata)
 		return false;
 	}
 	len = strlen(tmp) + 5; /* leave room for -NNN\0 */
-	cgname = must_alloc(len);
-	strcpy(cgname, tmp);
+	container_cgroup = must_alloc(len);
+	strcpy(container_cgroup, tmp);
 	free(tmp);
-	offset = cgname + len - 5;
+	offset = container_cgroup + len - 5;
 
 again:
 	if (idx == 1000) {
@@ -1638,23 +1712,23 @@ static inline bool cgfsng_create(void *hdata)
 		}
 	}
 	for (i = 0; hierarchies[i]; i++) {
-		if (!create_path_for_hierarchy(hierarchies[i], cgname)) {
+		if (!create_path_for_hierarchy(hierarchies[i], container_cgroup)) {
 			int j;
 			ERROR("Failed to create \"%s\"", hierarchies[i]->fullcgpath);
 			free(hierarchies[i]->fullcgpath);
 			hierarchies[i]->fullcgpath = NULL;
 			for (j = 0; j < i; j++)
-				remove_path_for_hierarchy(hierarchies[j], cgname);
+				remove_path_for_hierarchy(hierarchies[j], container_cgroup);
 			idx++;
 			goto again;
 		}
 	}
 	/* Done */
-	d->container_cgroup = cgname;
+	d->container_cgroup = container_cgroup;
 	return true;
 
 out_free:
-	free(cgname);
+	free(container_cgroup);
 	return false;
 }
 
diff --git a/src/lxc/utils.c b/src/lxc/utils.c
index c7812fdac..a1fe7d4ec 100644
--- a/src/lxc/utils.c
+++ b/src/lxc/utils.c
@@ -2307,6 +2307,33 @@ char *must_make_path(const char *first, ...)
 	return dest;
 }
 
+char *must_append_path(char *first, ...)
+{
+	char *cur;
+	size_t full_len;
+	va_list args;
+	char *dest = first;
+
+	full_len = strlen(first);
+	va_start(args, first);
+	while ((cur = va_arg(args, char *)) != NULL) {
+		full_len += strlen(cur);
+
+		if (cur[0] != '/')
+			full_len++;
+
+		dest = must_realloc(dest, full_len + 1);
+
+		if (cur[0] != '/')
+			strcat(dest, "/");
+
+		strcat(dest, cur);
+	}
+	va_end(args);
+
+	return dest;
+}
+
 char *must_copy_string(const char *entry)
 {
 	char *ret;
diff --git a/src/lxc/utils.h b/src/lxc/utils.h
index 223580edc..4d129d137 100644
--- a/src/lxc/utils.h
+++ b/src/lxc/utils.h
@@ -86,6 +86,14 @@
 #define CAP_SYS_ADMIN 21
 #endif
 
+#ifndef CGROUP_SUPER_MAGIC
+#define CGROUP_SUPER_MAGIC 0x27e0eb
+#endif
+
+#ifndef CGROUP2_SUPER_MAGIC
+#define CGROUP2_SUPER_MAGIC 0x63677270
+#endif
+
 /* Useful macros */
 /* Maximum number for 64 bit integer is a string with 21 digits: 2^64 - 1 = 21 */
 #define LXC_NUMSTRLEN64 21
@@ -529,7 +537,8 @@ extern int run_command(char *buf, size_t buf_size, int (*child_fn)(void *),
 /* Concatenate all passed-in strings into one path. Do not fail. If any piece
  * is not prefixed with '/', add a '/'.
  */
-extern char *must_make_path(const char *first, ...) __attribute__((sentinel));
+__attribute__((sentinel)) extern char *must_make_path(const char *first, ...);
+__attribute__((sentinel)) extern char *must_append_path(char *first, ...);
 
 /* return copy of string @entry;  do not fail. */
 extern char *must_copy_string(const char *entry);

From c2aed66d0ff8440cae33b5c08ca055a234197c88 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner at ubuntu.com>
Date: Wed, 31 Jan 2018 16:38:55 +0100
Subject: [PATCH 3/7] cgroups: cgfsng_attach: handle unified hierarchy

Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
---
 src/lxc/cgroups/cgfsng.c | 94 +++++++++++++++++++++++++++++++++++++++++++++---
 src/lxc/commands.c       | 15 +++++---
 2 files changed, 99 insertions(+), 10 deletions(-)

diff --git a/src/lxc/cgroups/cgfsng.c b/src/lxc/cgroups/cgfsng.c
index 24a020ff3..e0ae55a01 100644
--- a/src/lxc/cgroups/cgfsng.c
+++ b/src/lxc/cgroups/cgfsng.c
@@ -2233,26 +2233,110 @@ static char *build_full_cgpath_from_monitorpath(struct hierarchy *h,
 	return must_make_path(h->mountpoint, inpath, filename, NULL);
 }
 
+/* Technically, we're always at a delegation boundary here. (This is especially
+ * true when cgroup namespaces are available.) The reasoning is that in order
+ * for us to have been able to start a container in the first place the root
+ * cgroup must have been a leaf node.  Now, either the container's init system
+ * has populated the cgroup and kept it as a leaf node or it has created
+ * subtrees. In the former case we will simply attach to the leaf node we
+ * created when we started the container; in the latter case we create our own
+ * cgroup for the attaching process.
+ */
+static int cg_attach_unified(const struct hierarchy *h, const char *name,
+			     const char *lxcpath, const char *pidstr,
+			     size_t pidstr_len, const char *controller)
+{
+	int ret;
+	size_t len;
+	int fret = -1, idx = 0;
+	char *base_path = NULL, *container_cgroup = NULL, *full_path = NULL;
+
+	container_cgroup = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
+	/* not running */
+	if (!container_cgroup)
+		return 0;
+
+	base_path = must_make_path(h->mountpoint, container_cgroup, NULL);
+	full_path = must_make_path(base_path, "cgroup.procs", NULL);
+	/* cgroup is populated */
+	ret = lxc_write_to_file(full_path, pidstr, pidstr_len, false);
+	if (ret < 0 && errno != EBUSY)
+		goto on_error;
+
+	if (ret == 0)
+		goto on_success;
+
+	free(full_path);
+
+	len = strlen(base_path) + sizeof("/lxc-1000") - 1 +
+	      sizeof("/cgroup-procs") - 1;
+	full_path = must_alloc(len + 1);
+	do {
+		if (idx)
+			ret = snprintf(full_path, len + 1, "%s/lxc-%d",
+				       base_path, idx);
+		else
+			ret = snprintf(full_path, len + 1, "%s/lxc", base_path);
+		if (ret < 0 || (size_t)ret >= len + 1)
+			goto on_error;
+
+		ret = mkdir_p(full_path, 0755);
+		if (ret < 0 && errno != EEXIST)
+			goto on_error;
+
+		strcat(full_path, "/cgroup.procs");
+		ret = lxc_write_to_file(full_path, pidstr, len, false);
+		if (ret == 0)
+			goto on_success;
+
+		/* this is a non-leaf node */
+		if (errno != EBUSY)
+			goto on_error;
+
+	} while (++idx > 0 && idx < 1000);
+
+on_success:
+	if (idx < 1000)
+		fret = 0;
+
+on_error:
+	free(base_path);
+	free(container_cgroup);
+	free(full_path);
+
+	return fret;
+}
+
 static bool cgfsng_attach(const char *name, const char *lxcpath, pid_t pid)
 {
+	int i, len, ret;
 	char pidstr[25];
-	int i, len;
 
 	len = snprintf(pidstr, 25, "%d", pid);
 	if (len < 0 || len > 25)
 		return false;
 
 	for (i = 0; hierarchies[i]; i++) {
-		char *path, *fullpath;
+		char *path;
+		char *fullpath = NULL;
 		struct hierarchy *h = hierarchies[i];
 
+		if (h->version == CGROUP2_SUPER_MAGIC) {
+			ret = cg_attach_unified(h, name, lxcpath, pidstr, len, h->controllers[0]);
+			if (ret < 0)
+				return false;
+
+			continue;
+		}
+
 		path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]);
-		if (!path) /* not running */
+		/* not running */
+		if (!path)
 			continue;
 
 		fullpath = build_full_cgpath_from_monitorpath(h, path, "cgroup.procs");
-		free(path);
-		if (lxc_write_to_file(fullpath, pidstr, len, false) != 0) {
+		ret = lxc_write_to_file(fullpath, pidstr, len, false);
+		if (ret < 0) {
 			SYSERROR("Failed to attach %d to %s", (int)pid, fullpath);
 			free(fullpath);
 			return false;
diff --git a/src/lxc/commands.c b/src/lxc/commands.c
index b4d0e3979..eae06d9be 100644
--- a/src/lxc/commands.c
+++ b/src/lxc/commands.c
@@ -445,11 +445,16 @@ char *lxc_cmd_get_cgroup_path(const char *name, const char *lxcpath,
 	struct lxc_cmd_rr cmd = {
 		.req = {
 			.cmd = LXC_CMD_GET_CGROUP,
-			.datalen = strlen(subsystem) + 1,
 			.data = subsystem,
+			.datalen = 0,
 		},
 	};
 
+	cmd.req.data = subsystem;
+	cmd.req.datalen = 0;
+	if (subsystem)
+		cmd.req.datalen = strlen(subsystem) + 1;
+
 	ret = lxc_cmd(name, &cmd, &stopped, lxcpath, NULL);
 	if (ret < 0)
 		return NULL;
@@ -469,10 +474,10 @@ static int lxc_cmd_get_cgroup_callback(int fd, struct lxc_cmd_req *req,
 	const char *path;
 	struct lxc_cmd_rsp rsp;
 
-	if (req->datalen < 1)
-		return -1;
-
-	path = cgroup_get_cgroup(handler, req->data);
+	if (req->datalen > 0)
+		path = cgroup_get_cgroup(handler, req->data);
+	else
+		path = cgroup_get_cgroup(handler, NULL);
 	if (!path)
 		return -1;
 

From 0069cc619ed2c9fc2405c1a500f6c9c211ee4bba Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner at ubuntu.com>
Date: Wed, 31 Jan 2018 16:41:53 +0100
Subject: [PATCH 4/7] cgroups: cgfsng_get: handle unified hierarchy

Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
---
 src/lxc/cgroups/cgfsng.c | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/src/lxc/cgroups/cgfsng.c b/src/lxc/cgroups/cgfsng.c
index e0ae55a01..961a25fee 100644
--- a/src/lxc/cgroups/cgfsng.c
+++ b/src/lxc/cgroups/cgfsng.c
@@ -2352,28 +2352,34 @@ static bool cgfsng_attach(const char *name, const char *lxcpath, pid_t pid)
  * Here we don't have a cgroup_data set up, so we ask the running
  * container through the commands API for the cgroup path
  */
-static int cgfsng_get(const char *filename, char *value, size_t len, const char *name, const char *lxcpath)
+static int cgfsng_get(const char *filename, char *value, size_t len,
+		      const char *name, const char *lxcpath)
 {
-	char *subsystem, *p, *path;
-	struct hierarchy *h;
 	int ret = -1;
+	size_t controller_len;
+	char *controller, *p, *path;
+	struct hierarchy *h;
 
-	subsystem = alloca(strlen(filename) + 1);
-	strcpy(subsystem, filename);
-	if ((p = strchr(subsystem, '.')) != NULL)
+	controller_len = strlen(filename);
+	controller = alloca(controller_len + 1);
+	strcpy(controller, filename);
+	p = strchr(controller, '.');
+	if (p)
 		*p = '\0';
 
-	path = lxc_cmd_get_cgroup_path(name, lxcpath, subsystem);
-	if (!path) /* not running */
+	path = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
+	/* not running */
+	if (!path)
 		return -1;
 
-	h = get_hierarchy(subsystem);
+	h = get_hierarchy(controller);
 	if (h) {
-		char *fullpath = build_full_cgpath_from_monitorpath(h, path, filename);
+		char *fullpath;
+
+		fullpath = build_full_cgpath_from_monitorpath(h, path, filename);
 		ret = lxc_read_from_file(fullpath, value, len);
 		free(fullpath);
 	}
-
 	free(path);
 
 	return ret;

From 8777796839ab754e22b088dbf4e681d903c68ce8 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner at ubuntu.com>
Date: Wed, 31 Jan 2018 16:42:19 +0100
Subject: [PATCH 5/7] cgroups: cgfsng_set: handle unified hierarchy

Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
---
 src/lxc/cgroups/cgfsng.c | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/src/lxc/cgroups/cgfsng.c b/src/lxc/cgroups/cgfsng.c
index 961a25fee..c94257c9f 100644
--- a/src/lxc/cgroups/cgfsng.c
+++ b/src/lxc/cgroups/cgfsng.c
@@ -2390,28 +2390,34 @@ static int cgfsng_get(const char *filename, char *value, size_t len,
  * Here we don't have a cgroup_data set up, so we ask the running
  * container through the commands API for the cgroup path
  */
-static int cgfsng_set(const char *filename, const char *value, const char *name, const char *lxcpath)
+static int cgfsng_set(const char *filename, const char *value, const char *name,
+		      const char *lxcpath)
 {
-	char *subsystem, *p, *path;
-	struct hierarchy *h;
 	int ret = -1;
+	size_t controller_len;
+	char *controller, *p, *path;
+	struct hierarchy *h;
 
-	subsystem = alloca(strlen(filename) + 1);
-	strcpy(subsystem, filename);
-	if ((p = strchr(subsystem, '.')) != NULL)
+	controller_len = strlen(filename);
+	controller = alloca(controller_len + 1);
+	strcpy(controller, filename);
+	p = strchr(controller, '.');
+	if (p)
 		*p = '\0';
 
-	path = lxc_cmd_get_cgroup_path(name, lxcpath, subsystem);
-	if (!path) /* not running */
+	path = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
+	/* not running */
+	if (!path)
 		return -1;
 
-	h = get_hierarchy(subsystem);
+	h = get_hierarchy(controller);
 	if (h) {
-		char *fullpath = build_full_cgpath_from_monitorpath(h, path, filename);
+		char *fullpath;
+
+		fullpath = build_full_cgpath_from_monitorpath(h, path, filename);
 		ret = lxc_write_to_file(fullpath, value, strlen(value), false);
 		free(fullpath);
 	}
-
 	free(path);
 
 	return ret;

From 54860ed02790c1c87bf1125c3b91b52ee1ee1c37 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner at ubuntu.com>
Date: Wed, 31 Jan 2018 16:44:30 +0100
Subject: [PATCH 6/7] confile: add lxc.cgroup2.[controller].[property]

Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
---
 doc/lxc.container.conf.sgml.in |  30 +++++++---
 src/lxc/conf.c                 |  33 ++++++++---
 src/lxc/conf.h                 |  10 +++-
 src/lxc/confile.c              | 123 +++++++++++++++++++++++++++++++----------
 4 files changed, 150 insertions(+), 46 deletions(-)

diff --git a/doc/lxc.container.conf.sgml.in b/doc/lxc.container.conf.sgml.in
index 3ae4bfd18..dfb7ba05d 100644
--- a/doc/lxc.container.conf.sgml.in
+++ b/doc/lxc.container.conf.sgml.in
@@ -1330,17 +1330,31 @@ dev/null proc/kcore none bind,relative 0 0
       <variablelist>
         <varlistentry>
           <term>
-            <option>lxc.cgroup.[subsystem name]</option>
+            <option>lxc.cgroup.[controller name]</option>
           </term>
           <listitem>
             <para>
-              specify the control group value to be set.  The
-              subsystem name is the literal name of the control group
-              subsystem.  The permitted names and the syntax of their
-              values is not dictated by LXC, instead it depends on the
-              features of the Linux kernel running at the time the
-              container is started,
-              eg. <option>lxc.cgroup.cpuset.cpus</option>
+              Specify the control group value to be set on a legacy cgroup
+              hierarchy. The controller name is the literal name of the control
+              group. The permitted names and the syntax of their values are not
+              dictated by LXC; instead, they depend on the features of the Linux
+              kernel running at the time the container is started, e.g.
+              <option>lxc.cgroup.cpuset.cpus</option>
+            </para>
+          </listitem>
+        </varlistentry>
+        <varlistentry>
+          <term>
+            <option>lxc.cgroup2.[controller name]</option>
+          </term>
+          <listitem>
+            <para>
+              Specify the control group value to be set on the unified cgroup
+              hierarchy. The controller name is the literal name of the control
+              group. The permitted names and the syntax of their values are not
+              dictated by LXC; instead, they depend on the features of the Linux
+              kernel running at the time the container is started, e.g.
+              <option>lxc.cgroup2.memory.high</option>
             </para>
           </listitem>
         </varlistentry>
diff --git a/src/lxc/conf.c b/src/lxc/conf.c
index a080bbd7e..9b6868940 100644
--- a/src/lxc/conf.c
+++ b/src/lxc/conf.c
@@ -2555,6 +2555,7 @@ struct lxc_conf *lxc_conf_init(void)
 	}
 	new->logfd = -1;
 	lxc_list_init(&new->cgroup);
+	lxc_list_init(&new->cgroup2);
 	lxc_list_init(&new->network);
 	lxc_list_init(&new->mount_list);
 	lxc_list_init(&new->caps);
@@ -3446,23 +3447,38 @@ int lxc_clear_config_keepcaps(struct lxc_conf *c)
 	return 0;
 }
 
-int lxc_clear_cgroups(struct lxc_conf *c, const char *key)
+int lxc_clear_cgroups(struct lxc_conf *c, const char *key, int version)
 {
-	struct lxc_list *it,*next;
-	bool all = false;
+	char *global_token, *namespaced_token;
+	struct lxc_list *it, *next, *list;
 	const char *k = NULL;
+	bool all = false;
 
-	if (strcmp(key, "lxc.cgroup") == 0)
+	if (version == CGROUP2_SUPER_MAGIC) {
+		global_token = "lxc.cgroup2";
+		namespaced_token = "lxc.cgroup2.";
+		list = &c->cgroup2;
+	} else if (version == CGROUP_SUPER_MAGIC) {
+		global_token = "lxc.cgroup";
+		namespaced_token = "lxc.cgroup.";
+		list = &c->cgroup;
+	} else {
+		return -1;
+	}
+
+	if (strcmp(key, global_token) == 0)
 		all = true;
-	else if (strncmp(key, "lxc.cgroup.", sizeof("lxc.cgroup.") - 1) == 0)
-		k = key + sizeof("lxc.cgroup.") - 1;
+	else if (strncmp(key, namespaced_token, sizeof(namespaced_token) - 1) == 0)
+		k = key + sizeof(namespaced_token) - 1;
 	else
 		return -1;
 
-	lxc_list_for_each_safe(it, &c->cgroup, next) {
+	lxc_list_for_each_safe(it, list, next) {
 		struct lxc_cgroup *cg = it->elem;
+
 		if (!all && strcmp(cg->subsystem, k) != 0)
 			continue;
+
 		lxc_list_del(it);
 		free(cg->subsystem);
 		free(cg->value);
@@ -3680,7 +3696,8 @@ void lxc_conf_free(struct lxc_conf *conf)
 	lxc_seccomp_free(conf);
 	lxc_clear_config_caps(conf);
 	lxc_clear_config_keepcaps(conf);
-	lxc_clear_cgroups(conf, "lxc.cgroup");
+	lxc_clear_cgroups(conf, "lxc.cgroup", CGROUP_SUPER_MAGIC);
+	lxc_clear_cgroups(conf, "lxc.cgroup2", CGROUP2_SUPER_MAGIC);
 	lxc_clear_hooks(conf, "lxc.hook");
 	lxc_clear_mount_entries(conf);
 	lxc_clear_idmaps(conf);
diff --git a/src/lxc/conf.h b/src/lxc/conf.h
index 1146a1d4f..2346b717f 100644
--- a/src/lxc/conf.h
+++ b/src/lxc/conf.h
@@ -52,6 +52,8 @@ typedef void * scmp_filter_ctx;
  * programmer to specify the right subsystem.
  * @subsystem : the targeted subsystem
  * @value     : the value to set
+ * @version   : The version of the cgroup filesystem on which the controller
+ *              resides.
  *
  * @controllers : The controllers to use for this container.
  * @dir         : The name of the directory containing the container's cgroup.
@@ -61,6 +63,7 @@ struct lxc_cgroup {
 	union {
 		/* information about a specific controller */
 		struct /* controller */ {
+			int version;
 			char *subsystem;
 			char *value;
 		};
@@ -282,7 +285,10 @@ struct lxc_conf {
 	int reboot;
 	signed long personality;
 	struct utsname *utsname;
-	struct lxc_list cgroup;
+	struct {
+		struct lxc_list cgroup;
+		struct lxc_list cgroup2;
+	};
 	struct {
 		struct lxc_list id_map;
 
@@ -433,7 +439,7 @@ extern int lxc_create_tty(const char *name, struct lxc_conf *conf);
 extern void lxc_delete_tty(struct lxc_tty_info *tty_info);
 extern int lxc_clear_config_caps(struct lxc_conf *c);
 extern int lxc_clear_config_keepcaps(struct lxc_conf *c);
-extern int lxc_clear_cgroups(struct lxc_conf *c, const char *key);
+extern int lxc_clear_cgroups(struct lxc_conf *c, const char *key, int version);
 extern int lxc_clear_mount_entries(struct lxc_conf *c);
 extern int lxc_clear_automounts(struct lxc_conf *c);
 extern int lxc_clear_hooks(struct lxc_conf *c, const char *key);
diff --git a/src/lxc/confile.c b/src/lxc/confile.c
index 3deec58bf..fa4f84da9 100644
--- a/src/lxc/confile.c
+++ b/src/lxc/confile.c
@@ -81,6 +81,7 @@ lxc_config_define(apparmor_profile);
 lxc_config_define(cap_drop);
 lxc_config_define(cap_keep);
 lxc_config_define(cgroup_controller);
+lxc_config_define(cgroup2_controller);
 lxc_config_define(cgroup_dir);
 lxc_config_define(console_logfile);
 lxc_config_define(console_rotate);
@@ -153,6 +154,7 @@ static struct lxc_config_t config[] = {
 	{ "lxc.autodev",                   false,                  set_config_autodev,                     get_config_autodev,                     clr_config_autodev,                   },
 	{ "lxc.cap.drop",                  false,                  set_config_cap_drop,                    get_config_cap_drop,                    clr_config_cap_drop,                  },
 	{ "lxc.cap.keep",                  false,                  set_config_cap_keep,                    get_config_cap_keep,                    clr_config_cap_keep,                  },
+	{ "lxc.cgroup2",                   false,                  set_config_cgroup2_controller,          get_config_cgroup2_controller,          clr_config_cgroup2_controller,        },
 	{ "lxc.cgroup.dir",                false,                  set_config_cgroup_dir,                  get_config_cgroup_dir,                  clr_config_cgroup_dir,                },
 	{ "lxc.cgroup",                    false,                  set_config_cgroup_controller,           get_config_cgroup_controller,           clr_config_cgroup_controller,         },
 	{ "lxc.console.buffer.logfile",    false,                  set_config_console_buffer_logfile,      get_config_console_buffer_logfile,      clr_config_console_buffer_logfile,    },
@@ -1374,28 +1376,33 @@ static int set_config_signal_stop(const char *key, const char *value,
 	return 0;
 }
 
-static int set_config_cgroup_controller(const char *key, const char *value,
-					struct lxc_conf *lxc_conf, void *data)
+static int __set_config_cgroup_controller(const char *key, const char *value,
+					  struct lxc_conf *lxc_conf, int version)
 {
-	char *subkey;
-	char *token = "lxc.cgroup.";
+	const char *subkey, *token;
+	size_t token_len;
 	struct lxc_list *cglist = NULL;
 	struct lxc_cgroup *cgelem = NULL;
 
 	if (lxc_config_value_empty(value))
-		return lxc_clear_cgroups(lxc_conf, key);
-
-	subkey = strstr(key, token);
-	if (!subkey)
-		return -1;
-
-	if (!strlen(subkey))
-		return -1;
+		return lxc_clear_cgroups(lxc_conf, key, version);
+
+	if (version == CGROUP2_SUPER_MAGIC) {
+		token = "lxc.cgroup2.";
+		token_len = 12;
+	} else if (version == CGROUP_SUPER_MAGIC) {
+		token = "lxc.cgroup.";
+		token_len = 11;
+	} else {
+		return -EINVAL;
+	}
 
-	if (strlen(subkey) == strlen(token))
-		return -1;
+	if (strncmp(key, token, token_len) != 0)
+		return -EINVAL;
 
-	subkey += strlen(token);
+	subkey = key + token_len;
+	if (*subkey == '\0')
+		return -EINVAL;
 
 	cglist = malloc(sizeof(*cglist));
 	if (!cglist)
@@ -1407,14 +1414,21 @@ static int set_config_cgroup_controller(const char *key, const char *value,
 	memset(cgelem, 0, sizeof(*cgelem));
 
 	cgelem->subsystem = strdup(subkey);
-	cgelem->value = strdup(value);
+	if (!cgelem->subsystem)
+		goto out;
 
-	if (!cgelem->subsystem || !cgelem->value)
+	cgelem->value = strdup(value);
+	if (!cgelem->value)
 		goto out;
 
-	cglist->elem = cgelem;
+	cgelem->version = version;
+
+	lxc_list_add_elem(cglist, cgelem);
 
-	lxc_list_add_tail(&lxc_conf->cgroup, cglist);
+	if (version == CGROUP2_SUPER_MAGIC)
+		lxc_list_add_tail(&lxc_conf->cgroup2, cglist);
+	else
+		lxc_list_add_tail(&lxc_conf->cgroup, cglist);
 
 	return 0;
 
@@ -1429,6 +1443,21 @@ static int set_config_cgroup_controller(const char *key, const char *value,
 	return -1;
 }
 
+static int set_config_cgroup_controller(const char *key, const char *value,
+					struct lxc_conf *lxc_conf, void *data)
+{
+	return __set_config_cgroup_controller(key, value, lxc_conf,
+					      CGROUP_SUPER_MAGIC);
+}
+
+static int set_config_cgroup2_controller(const char *key, const char *value,
+					 struct lxc_conf *lxc_conf, void *data)
+{
+	return __set_config_cgroup_controller(key, value, lxc_conf,
+					      CGROUP2_SUPER_MAGIC);
+}
+
+
 static int set_config_cgroup_dir(const char *key, const char *value,
 				 struct lxc_conf *lxc_conf, void *data)
 {
@@ -2910,11 +2939,14 @@ static int get_config_selinux_context(const char *key, char *retv, int inlen,
  * If you ask for 'lxc.cgroup", then all cgroup entries will be printed, in
  * 'lxc.cgroup.subsystem.key = value' format.
  */
-static int get_config_cgroup_controller(const char *key, char *retv, int inlen,
-					struct lxc_conf *c, void *data)
+static int __get_config_cgroup_controller(const char *key, char *retv,
+					  int inlen, struct lxc_conf *c,
+					  int version)
 {
-	struct lxc_list *it;
 	int len;
+	size_t namespaced_token_len;
+	char *global_token, *namespaced_token;
+	struct lxc_list *it;
 	int fulllen = 0;
 	bool get_all = false;
 
@@ -2923,10 +2955,22 @@ static int get_config_cgroup_controller(const char *key, char *retv, int inlen,
 	else
 		memset(retv, 0, inlen);
 
-	if (!strcmp(key, "lxc.cgroup"))
+	if (version == CGROUP2_SUPER_MAGIC) {
+		global_token = "lxc.cgroup2";
+		namespaced_token = "lxc.cgroup2.";
+		namespaced_token_len = sizeof("lxc.cgroup2.") - 1;;
+	} else if (version == CGROUP_SUPER_MAGIC) {
+		global_token = "lxc.cgroup";
+		namespaced_token = "lxc.cgroup.";
+		namespaced_token_len = sizeof("lxc.cgroup.") - 1;;
+	} else {
+		return -1;
+	}
+
+	if (strcmp(key, global_token) == 0)
 		get_all = true;
-	else if (!strncmp(key, "lxc.cgroup.", 11))
-		key += 11;
+	else if (strncmp(key, namespaced_token, namespaced_token_len) == 0)
+		key += namespaced_token_len;
 	else
 		return -1;
 
@@ -2934,8 +2978,11 @@ static int get_config_cgroup_controller(const char *key, char *retv, int inlen,
 		struct lxc_cgroup *cg = it->elem;
 
 		if (get_all) {
-			strprint(retv, inlen, "lxc.cgroup.%s = %s\n",
-				 cg->subsystem, cg->value);
+			if (version != cg->version)
+				continue;
+
+			strprint(retv, inlen, "%s.%s = %s\n",
+				 global_token, cg->subsystem, cg->value);
 		} else if (!strcmp(cg->subsystem, key)) {
 			strprint(retv, inlen, "%s\n", cg->value);
 		}
@@ -2944,6 +2991,20 @@ static int get_config_cgroup_controller(const char *key, char *retv, int inlen,
 	return fulllen;
 }
 
+static int get_config_cgroup_controller(const char *key, char *retv, int inlen,
+					struct lxc_conf *c, void *data)
+{
+	return __get_config_cgroup_controller(key, retv, inlen, c,
+					      CGROUP_SUPER_MAGIC);
+}
+
+static int get_config_cgroup2_controller(const char *key, char *retv, int inlen,
+					 struct lxc_conf *c, void *data)
+{
+	return __get_config_cgroup_controller(key, retv, inlen, c,
+					      CGROUP2_SUPER_MAGIC);
+}
+
 static int get_config_cgroup_dir(const char *key, char *retv, int inlen,
 				 struct lxc_conf *lxc_conf, void *data)
 {
@@ -3632,7 +3693,13 @@ static inline int clr_config_selinux_context(const char *key,
 static inline int clr_config_cgroup_controller(const char *key,
 					       struct lxc_conf *c, void *data)
 {
-	return lxc_clear_cgroups(c, key);
+	return lxc_clear_cgroups(c, key, CGROUP_SUPER_MAGIC);
+}
+
+static inline int clr_config_cgroup2_controller(const char *key,
+						struct lxc_conf *c, void *data)
+{
+	return lxc_clear_cgroups(c, key, CGROUP2_SUPER_MAGIC);
 }
 
 static int clr_config_cgroup_dir(const char *key, struct lxc_conf *lxc_conf,

From 6b38e644cb8d4942f16d9a82b72d56a72b9aa81d Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner at ubuntu.com>
Date: Wed, 31 Jan 2018 16:45:04 +0100
Subject: [PATCH 7/7] cgroups: handle limits on the unified hierarchy

Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
---
 src/lxc/cgroups/cgfs.c      |  4 ++--
 src/lxc/cgroups/cgfsng.c    | 58 +++++++++++++++++++++++++++++++++++++++------
 src/lxc/cgroups/cgmanager.c |  3 ++-
 src/lxc/cgroups/cgroup.c    |  2 +-
 src/lxc/cgroups/cgroup.h    |  5 ++--
 5 files changed, 59 insertions(+), 13 deletions(-)

diff --git a/src/lxc/cgroups/cgfs.c b/src/lxc/cgroups/cgfs.c
index fc25bc9b5..89aec91f7 100644
--- a/src/lxc/cgroups/cgfs.c
+++ b/src/lxc/cgroups/cgfs.c
@@ -2525,14 +2525,14 @@ static bool cgfs_unfreeze(void *hdata)
 	return ret == 0;
 }
 
-static bool cgroupfs_setup_limits(void *hdata, struct lxc_list *cgroup_conf,
+static bool cgroupfs_setup_limits(void *hdata, struct lxc_conf *conf,
 				  bool with_devices)
 {
 	struct cgfs_data *d = hdata;
 
 	if (!d)
 		return false;
-	return do_setup_cgroup_limits(d, cgroup_conf, with_devices) == 0;
+	return do_setup_cgroup_limits(d, &conf->cgroup, with_devices) == 0;
 }
 
 static bool lxc_cgroupfs_attach(const char *name, const char *lxcpath, pid_t pid)
diff --git a/src/lxc/cgroups/cgfsng.c b/src/lxc/cgroups/cgfsng.c
index c94257c9f..5ecc3f0d1 100644
--- a/src/lxc/cgroups/cgfsng.c
+++ b/src/lxc/cgroups/cgfsng.c
@@ -2161,6 +2161,7 @@ static bool cgfsng_escape()
 	return true;
 }
 
+/* TODO: handle the unified cgroup hierarchy */
 static int cgfsng_num_hierarchies(void)
 {
 	int i;
@@ -2171,15 +2172,15 @@ static int cgfsng_num_hierarchies(void)
 	return i;
 }
 
+/* TODO: handle the unified cgroup hierarchy */
 static bool cgfsng_get_hierarchies(int n, char ***out)
 {
 	int i;
 
 	/* sanity check n */
-	for (i = 0; i < n; i++) {
+	for (i = 0; i < n; i++)
 		if (!hierarchies[i])
 			return false;
-	}
 
 	*out = hierarchies[i]->controllers;
 
@@ -2541,8 +2542,9 @@ static int lxc_cgroup_set_data(const char *filename, const char *value, struct c
 	return ret;
 }
 
-static bool cgfsng_setup_limits(void *hdata, struct lxc_list *cgroup_settings,
-				  bool do_devices)
+static bool __cgfsng_setup_limits_legacy(void *hdata,
+					 struct lxc_list *cgroup_settings,
+					 bool do_devices)
 {
 	struct cgfsng_handler_data *d = hdata;
 	struct lxc_list *iterator, *sorted_cgroup_settings, *next;
@@ -2553,9 +2555,8 @@ static bool cgfsng_setup_limits(void *hdata, struct lxc_list *cgroup_settings,
 		return true;
 
 	sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings);
-	if (!sorted_cgroup_settings) {
+	if (!sorted_cgroup_settings)
 		return false;
-	}
 
 	lxc_list_for_each(iterator, sorted_cgroup_settings) {
 		cg = iterator->elem;
@@ -2576,7 +2577,7 @@ static bool cgfsng_setup_limits(void *hdata, struct lxc_list *cgroup_settings,
 	}
 
 	ret = true;
-	INFO("cgroup has been setup");
+	INFO("Limits for the legacy cgroup hierarchies have been setup");
 out:
 	lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) {
 		lxc_list_del(iterator);
@@ -2586,6 +2587,49 @@ static bool cgfsng_setup_limits(void *hdata, struct lxc_list *cgroup_settings,
 	return ret;
 }
 
+static bool __cgfsng_setup_limits_unified(void *hdata,
+					  struct lxc_list *cgroup_settings)
+{
+	struct lxc_list *iterator;
+	struct hierarchy *h = unified;
+
+	if (lxc_list_empty(cgroup_settings))
+		return true;
+
+	if (!h)
+		return false;
+
+	lxc_list_for_each(iterator, cgroup_settings) {
+		int ret;
+		char *fullpath;
+		struct lxc_cgroup *cg = iterator->elem;
+
+		fullpath = must_make_path(h->fullcgpath, cg->subsystem, NULL);
+		ret = lxc_write_to_file(fullpath, cg->value, strlen(cg->value), false);
+		free(fullpath);
+		if (ret < 0) {
+			SYSERROR("Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);
+			return false;
+		}
+		TRACE("Set \"%s\" to \"%s\"", cg->subsystem, cg->value);
+	}
+
+	INFO("Limits for the unified cgroup hierarchy have been setup");
+	return true;
+}
+
+static bool cgfsng_setup_limits(void *hdata, struct lxc_conf *conf,
+				bool do_devices)
+{
+	bool bret;
+
+	bret = __cgfsng_setup_limits_legacy(hdata, &conf->cgroup, do_devices);
+	if (!bret)
+		return false;
+
+	return __cgfsng_setup_limits_unified(hdata, &conf->cgroup2);
+}
+
 static struct cgroup_ops cgfsng_ops = {
 	.init = cgfsng_init,
 	.destroy = cgfsng_destroy,
diff --git a/src/lxc/cgroups/cgmanager.c b/src/lxc/cgroups/cgmanager.c
index dccc04c3c..c23443c9f 100644
--- a/src/lxc/cgroups/cgmanager.c
+++ b/src/lxc/cgroups/cgmanager.c
@@ -1479,11 +1479,12 @@ static bool cgm_unfreeze(void *hdata)
 	return ret;
 }
 
-static bool cgm_setup_limits(void *hdata, struct lxc_list *cgroup_settings, bool do_devices)
+static bool cgm_setup_limits(void *hdata, struct lxc_conf *conf, bool do_devices)
 {
 	struct cgm_data *d = hdata;
 	struct lxc_list *iterator, *sorted_cgroup_settings, *next;
 	struct lxc_cgroup *cg;
+	struct lxc_list *cgroup_settings = &conf->cgroup;
 	bool ret = false;
 
 	if (lxc_list_empty(cgroup_settings))
diff --git a/src/lxc/cgroups/cgroup.c b/src/lxc/cgroups/cgroup.c
index 36a665b1c..1f78a6317 100644
--- a/src/lxc/cgroups/cgroup.c
+++ b/src/lxc/cgroups/cgroup.c
@@ -150,7 +150,7 @@ bool cgroup_setup_limits(struct lxc_handler *handler, bool with_devices)
 {
 	if (ops)
 		return ops->setup_limits(handler->cgroup_data,
-					 &handler->conf->cgroup, with_devices);
+					 handler->conf, with_devices);
 
 	return false;
 }
diff --git a/src/lxc/cgroups/cgroup.h b/src/lxc/cgroups/cgroup.h
index f409eee7c..d288b4c72 100644
--- a/src/lxc/cgroups/cgroup.h
+++ b/src/lxc/cgroups/cgroup.h
@@ -60,7 +60,7 @@ struct cgroup_ops {
 	int (*set)(const char *filename, const char *value, const char *name, const char *lxcpath);
 	int (*get)(const char *filename, char *value, size_t len, const char *name, const char *lxcpath);
 	bool (*unfreeze)(void *hdata);
-	bool (*setup_limits)(void *hdata, struct lxc_list *cgroup_conf, bool with_devices);
+	bool (*setup_limits)(void *hdata, struct lxc_conf *conf, bool with_devices);
 	bool (*chown)(void *hdata, struct lxc_conf *conf);
 	bool (*attach)(const char *name, const char *lxcpath, pid_t pid);
 	bool (*mount_cgroup)(void *hdata, const char *root, int type);
@@ -80,7 +80,8 @@ extern bool cgroup_enter(struct lxc_handler *handler);
 extern void cgroup_cleanup(struct lxc_handler *handler);
 extern bool cgroup_create_legacy(struct lxc_handler *handler);
 extern int cgroup_nrtasks(struct lxc_handler *handler);
-extern const char *cgroup_get_cgroup(struct lxc_handler *handler, const char *subsystem);
+extern const char *cgroup_get_cgroup(struct lxc_handler *handler,
+				     const char *subsystem);
 extern bool cgroup_escape();
 extern int cgroup_num_hierarchies();
 extern bool cgroup_get_hierarchies(int i, char ***out);


More information about the lxc-devel mailing list