[lxc-devel] [lxc/master] cgroup2: rework controller delegation

Fri Dec 6 14:41:01 UTC 2019

A non-text attachment was scrubbed...
Name: not available
Type: text/x-mailbox
Size: 364 bytes
Desc: not available
URL: <http://lists.linuxcontainers.org/pipermail/lxc-devel/attachments/20191206/30b81e53/attachment.bin>
-------------- next part --------------
From c581d2a6732fa91f57731d9217004f871e80a2de Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner at ubuntu.com>
Date: Fri, 6 Dec 2019 09:42:47 +0100
Subject: [PATCH] cgroup2: rework controller delegation

Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
---
 src/lxc/cgroups/cgfsng.c | 239 +++++++++++++++++++++------------------
 src/lxc/cgroups/cgroup.c |   1 +
 src/lxc/cgroups/cgroup.h |  12 +-
 src/lxc/lxccontainer.c   |  34 +++---
 src/lxc/macro.h          |   6 +
 src/lxc/start.c          |  40 +++++--
 src/lxc/start.h          |   3 +
 7 files changed, 194 insertions(+), 141 deletions(-)

diff --git a/src/lxc/cgroups/cgfsng.c b/src/lxc/cgroups/cgfsng.c
index 9541031828..d5ddc8388d 100644
--- a/src/lxc/cgroups/cgfsng.c
+++ b/src/lxc/cgroups/cgfsng.c
@@ -1184,71 +1184,6 @@ __cgfsng_ops static void cgfsng_monitor_destroy(struct cgroup_ops *ops,
 	}
 }
 
-static bool cg_unified_create_cgroup(struct hierarchy *h, char *cgname)
-{
-	__do_free char *add_controllers = NULL, *cgroup = NULL;
-	size_t i, parts_len;
-	char **it;
-	size_t full_len = 0;
-	char **parts = NULL;
-	bool bret = false;
-
-	if (h->version != CGROUP2_SUPER_MAGIC)
-		return true;
-
-	if (!h->controllers)
-		return true;
-
-	/* For now we simply enable all controllers that we have detected by
-	 * creating a string like "+memory +pids +cpu +io".
-	 * TODO: In the near future we might want to support "-<controller>"
-	 * etc. but whether supporting semantics like this make sense will need
-	 * some thinking.
-	 */
-	for (it = h->controllers; it && *it; it++) {
-		full_len += strlen(*it) + 2;
-		add_controllers = must_realloc(add_controllers, full_len + 1);
-
-		if (h->controllers[0] == *it)
-			add_controllers[0] = '\0';
-
-		(void)strlcat(add_controllers, "+", full_len + 1);
-		(void)strlcat(add_controllers, *it, full_len + 1);
-
-		if ((it + 1) && *(it + 1))
-			(void)strlcat(add_controllers, " ", full_len + 1);
-	}
-
-	parts = lxc_string_split(cgname, '/');
-	if (!parts)
-		goto on_error;
-
-	parts_len = lxc_array_len((void **)parts);
-	if (parts_len > 0)
-		parts_len--;
-
-	cgroup = must_make_path(h->mountpoint, h->container_base_path, NULL);
-	for (i = 0; i < parts_len; i++) {
-		int ret;
-		__do_free char *target = NULL;
-
-		cgroup = must_append_path(cgroup, parts[i], NULL);
-		target = must_make_path(cgroup, "cgroup.subtree_control", NULL);
-		ret = lxc_write_to_file(target, add_controllers, full_len, false, 0666);
-		if (ret < 0) {
-			SYSERROR("Could not enable \"%s\" controllers in the "
-				 "unified cgroup \"%s\"", add_controllers, cgroup);
-			goto on_error;
-		}
-	}
-
-	bret = true;
-
-on_error:
-	lxc_free_array((void **)parts, free);
-	return bret;
-}
-
 static int mkdir_eexist_on_last(const char *dir, mode_t mode)
 {
 	const char *tmp = dir;
@@ -1298,7 +1233,7 @@ static bool monitor_create_path_for_hierarchy(struct hierarchy *h, char *cgname)
 		return false;
 	}
 
-	return cg_unified_create_cgroup(h, cgname);
+	return true;
 }
 
 static bool container_create_path_for_hierarchy(struct hierarchy *h, char *cgname)
@@ -1317,7 +1252,7 @@ static bool container_create_path_for_hierarchy(struct hierarchy *h, char *cgnam
 		return false;
 	}
 
-	return cg_unified_create_cgroup(h, cgname);
+	return true;
 }
 
 static void remove_path_for_hierarchy(struct hierarchy *h, char *cgname, bool monitor)
@@ -1400,6 +1335,7 @@ __cgfsng_ops static inline bool cgfsng_monitor_create(struct cgroup_ops *ops,
 		return false;
 
 	INFO("The monitor process uses \"%s\" as cgroup", monitor_cgroup);
+	ops->monitor_cgroup = move_ptr(monitor_cgroup);
 	return true;
 }
 
@@ -1479,47 +1415,66 @@ __cgfsng_ops static inline bool cgfsng_payload_create(struct cgroup_ops *ops,
 	return true;
 }
 
-__cgfsng_ops static bool __do_cgroup_enter(struct cgroup_ops *ops, pid_t pid,
-					     bool monitor)
+__cgfsng_ops static bool cgfsng_monitor_enter(struct cgroup_ops *ops,
+					      struct lxc_handler *handler)
 {
-	int len;
-	char pidstr[INTTYPE_TO_STRLEN(pid_t)];
+	int monitor_len, transient_len;
+	char monitor[INTTYPE_TO_STRLEN(pid_t)],
+	    transient[INTTYPE_TO_STRLEN(pid_t)];
 
 	if (!ops->hierarchies)
 		return true;
 
-	len = snprintf(pidstr, sizeof(pidstr), "%d", pid);
-	if (len < 0 || (size_t)len >= sizeof(pidstr))
-		return false;
+	monitor_len = snprintf(monitor, sizeof(monitor), "%d", handler->monitor_pid);
+	if (handler->transient_pid > 0)
+		transient_len = snprintf(transient, sizeof(transient), "%d",
+					 handler->transient_pid);
 
 	for (int i = 0; ops->hierarchies[i]; i++) {
-		int ret;
 		__do_free char *path = NULL;
+		int ret;
 
-		if (monitor)
-			path = must_make_path(ops->hierarchies[i]->monitor_full_path,
-					      "cgroup.procs", NULL);
-		else
-			path = must_make_path(ops->hierarchies[i]->container_full_path,
-					      "cgroup.procs", NULL);
-		ret = lxc_write_to_file(path, pidstr, len, false, 0666);
-		if (ret != 0) {
-			SYSERROR("Failed to enter cgroup \"%s\"", path);
-			return false;
-		}
+		path = must_make_path(ops->hierarchies[i]->monitor_full_path,
+				      "cgroup.procs", NULL);
+		ret = lxc_writeat(-1, path, monitor, monitor_len);
+		if (ret != 0)
+			return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", path);
+
+                if (handler->transient_pid < 0)
+			return true;
+
+		ret = lxc_writeat(-1, path, transient, transient_len);
+		if (ret != 0)
+			return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", path);
 	}
+	handler->transient_pid = -1;
 
 	return true;
 }
 
-__cgfsng_ops static bool cgfsng_monitor_enter(struct cgroup_ops *ops, pid_t pid)
+__cgfsng_ops static bool cgfsng_payload_enter(struct cgroup_ops *ops,
+					      struct lxc_handler *handler)
 {
-	return __do_cgroup_enter(ops, pid, true);
-}
+	int len;
+	char pidstr[INTTYPE_TO_STRLEN(pid_t)];
 
-static bool cgfsng_payload_enter(struct cgroup_ops *ops, pid_t pid)
-{
-	return __do_cgroup_enter(ops, pid, false);
+	if (!ops->hierarchies)
+		return true;
+
+	len = snprintf(pidstr, sizeof(pidstr), "%d", handler->pid);
+
+	for (int i = 0; ops->hierarchies[i]; i++) {
+		__do_free char *path = NULL;
+		int ret;
+
+		path = must_make_path(ops->hierarchies[i]->container_full_path,
+				      "cgroup.procs", NULL);
+		ret = lxc_writeat(-1, path, pidstr, len);
+		if (ret != 0)
+			return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", path);
+	}
+
+	return true;
 }
 
 static int chowmod(char *path, uid_t chown_uid, gid_t chown_gid,
@@ -2625,11 +2580,12 @@ static int cg_legacy_set_data(struct cgroup_ops *ops, const char *filename,
 	return ret;
 }
 
-static bool __cg_legacy_setup_limits(struct cgroup_ops *ops,
-				     struct lxc_list *cgroup_settings,
-				     bool do_devices)
+__cgfsng_ops static bool cgfsng_setup_limits_legacy(struct cgroup_ops *ops,
+						    struct lxc_conf *conf,
+						    bool do_devices)
 {
 	__do_free struct lxc_list *sorted_cgroup_settings = NULL;
+	struct lxc_list *cgroup_settings = &conf->cgroup;
 	struct lxc_list *iterator, *next;
 	struct lxc_cgroup *cg;
 	bool ret = false;
@@ -2699,12 +2655,13 @@ static int bpf_device_cgroup_prepare(struct cgroup_ops *ops,
 	return 0;
 }
 
-static bool __cg_unified_setup_limits(struct cgroup_ops *ops,
-				      struct lxc_list *cgroup_settings,
-				      struct lxc_conf *conf)
+__cgfsng_ops static bool cgfsng_setup_limits(struct cgroup_ops *ops,
+					     struct lxc_handler *handler)
 {
 	struct lxc_list *iterator;
 	struct hierarchy *h = ops->unified;
+	struct lxc_conf *conf = handler->conf;
+	struct lxc_list *cgroup_settings = &conf->cgroup2;
 
 	if (lxc_list_empty(cgroup_settings))
 		return true;
@@ -2798,18 +2755,79 @@ __cgfsng_ops bool cgfsng_devices_activate(struct cgroup_ops *ops,
 	return true;
 }
 
-__cgfsng_ops static bool cgfsng_setup_limits(struct cgroup_ops *ops,
-					     struct lxc_conf *conf,
-					     bool do_devices)
+bool __cgfsng_delegate_controllers(struct cgroup_ops *ops, const char *cgroup)
 {
-	if (!__cg_legacy_setup_limits(ops, &conf->cgroup, do_devices))
-		return false;
+	__do_free char *add_controllers = NULL, *base_path = NULL;
+	struct hierarchy *unified = ops->unified;
+	ssize_t parts_len;
+	char **it;
+	size_t full_len = 0;
+	char **parts = NULL;
+	bool bret = false;
 
-	/* for v2 we will have already set up devices */
-	if (do_devices)
+	if (!ops->hierarchies || !pure_unified_layout(ops) ||
+	    !unified->controllers[0])
 		return true;
 
-	return __cg_unified_setup_limits(ops, &conf->cgroup2, conf);
+	/* For now we simply enable all controllers that we have detected by
+	 * creating a string like "+memory +pids +cpu +io".
+	 * TODO: In the near future we might want to support "-<controller>"
+	 * etc. but whether supporting semantics like this makes sense will need
+	 * some thinking.
+	 */
+	for (it = unified->controllers; it && *it; it++) {
+		full_len += strlen(*it) + 2;
+		add_controllers = must_realloc(add_controllers, full_len + 1);
+
+		if (unified->controllers[0] == *it)
+			add_controllers[0] = '\0';
+
+		(void)strlcat(add_controllers, "+", full_len + 1);
+		(void)strlcat(add_controllers, *it, full_len + 1);
+
+		if ((it + 1) && *(it + 1))
+			(void)strlcat(add_controllers, " ", full_len + 1);
+	}
+
+	parts = lxc_string_split(cgroup, '/');
+	if (!parts)
+		goto on_error;
+
+	parts_len = lxc_array_len((void **)parts);
+	if (parts_len > 0)
+		parts_len--;
+
+	base_path = must_make_path(unified->mountpoint, unified->container_base_path, NULL);
+	for (ssize_t i = -1; i < parts_len; i++) {
+		int ret;
+		__do_free char *target = NULL;
+
+		if (i >= 0)
+			base_path = must_append_path(base_path, parts[i], NULL);
+		target = must_make_path(base_path, "cgroup.subtree_control", NULL);
+		ret = lxc_writeat(-1, target, add_controllers, full_len);
+		if (ret < 0) {
+			SYSERROR("Could not enable \"%s\" controllers in the unified cgroup \"%s\"", add_controllers, target);
+			goto on_error;
+		}
+		TRACE("Enable \"%s\" controllers in the unified cgroup \"%s\"", add_controllers, target);
+	}
+
+	bret = true;
+
+on_error:
+	lxc_free_array((void **)parts, free);
+	return bret;
+}
+
+__cgfsng_ops bool cgfsng_monitor_delegate_controllers(struct cgroup_ops *ops)
+{
+	return __cgfsng_delegate_controllers(ops, ops->monitor_cgroup);
+}
+
+__cgfsng_ops bool cgfsng_payload_delegate_controllers(struct cgroup_ops *ops)
+{
+	return __cgfsng_delegate_controllers(ops, ops->container_cgroup);
 }
 
 static bool cgroup_use_wants_controllers(const struct cgroup_ops *ops,
@@ -3062,15 +3080,15 @@ static int cg_unified_init(struct cgroup_ops *ops, bool relative,
 	base_cgroup = cg_unified_get_current_cgroup(relative);
 	if (!base_cgroup)
 		return -EINVAL;
-	prune_init_scope(base_cgroup);
+	if (!relative)
+		prune_init_scope(base_cgroup);
 
 	/* We assume that we have already been given controllers to delegate
 	 * further down the hierarchy. If not it is up to the user to delegate
 	 * them to us.
 	 */
 	mountpoint = must_copy_string(DEFAULT_CGROUP_MOUNTPOINT);
-	subtree_path = must_make_path(mountpoint, base_cgroup,
-				      "cgroup.subtree_control", NULL);
+	subtree_path = must_make_path(mountpoint, base_cgroup, "cgroup.controllers", NULL);
 	delegatable = cg_unified_get_controllers(subtree_path);
 	if (!delegatable)
 		delegatable = cg_unified_make_empty_controller();
@@ -3162,6 +3180,8 @@ struct cgroup_ops *cgfsng_ops_init(struct lxc_conf *conf)
 	cgfsng_ops->monitor_destroy = cgfsng_monitor_destroy;
 	cgfsng_ops->monitor_create = cgfsng_monitor_create;
 	cgfsng_ops->monitor_enter = cgfsng_monitor_enter;
+	cgfsng_ops->monitor_delegate_controllers = cgfsng_monitor_delegate_controllers;
+	cgfsng_ops->payload_delegate_controllers = cgfsng_payload_delegate_controllers;
 	cgfsng_ops->payload_create = cgfsng_payload_create;
 	cgfsng_ops->payload_enter = cgfsng_payload_enter;
 	cgfsng_ops->escape = cgfsng_escape;
@@ -3172,6 +3192,7 @@ struct cgroup_ops *cgfsng_ops_init(struct lxc_conf *conf)
 	cgfsng_ops->set = cgfsng_set;
 	cgfsng_ops->freeze = cgfsng_freeze;
 	cgfsng_ops->unfreeze = cgfsng_unfreeze;
+	cgfsng_ops->setup_limits_legacy = cgfsng_setup_limits_legacy;
 	cgfsng_ops->setup_limits = cgfsng_setup_limits;
 	cgfsng_ops->driver = "cgfsng";
 	cgfsng_ops->version = "1.0.0";
diff --git a/src/lxc/cgroups/cgroup.c b/src/lxc/cgroups/cgroup.c
index 35e4b5ae4e..8804d59ac3 100644
--- a/src/lxc/cgroups/cgroup.c
+++ b/src/lxc/cgroups/cgroup.c
@@ -65,6 +65,7 @@ void cgroup_exit(struct cgroup_ops *ops)
 
 	free(ops->cgroup_pattern);
 	free(ops->container_cgroup);
+	free(ops->monitor_cgroup);
 
 	if (ops->cgroup2_devices)
 		bpf_program_free(ops->cgroup2_devices);
diff --git a/src/lxc/cgroups/cgroup.h b/src/lxc/cgroups/cgroup.h
index 81320e4876..80d2c315a3 100644
--- a/src/lxc/cgroups/cgroup.h
+++ b/src/lxc/cgroups/cgroup.h
@@ -88,6 +88,7 @@ struct cgroup_ops {
 	char **cgroup_use;
 	char *cgroup_pattern;
 	char *container_cgroup;
+	char *monitor_cgroup;
 
 	/* Static memory, do not free.*/
 	const char *monitor_pattern;
@@ -135,9 +136,9 @@ struct cgroup_ops {
 	void (*payload_destroy)(struct cgroup_ops *ops, struct lxc_handler *handler);
 	void (*monitor_destroy)(struct cgroup_ops *ops, struct lxc_handler *handler);
 	bool (*monitor_create)(struct cgroup_ops *ops, struct lxc_handler *handler);
-	bool (*monitor_enter)(struct cgroup_ops *ops, pid_t pid);
+	bool (*monitor_enter)(struct cgroup_ops *ops, struct lxc_handler *handler);
 	bool (*payload_create)(struct cgroup_ops *ops, struct lxc_handler *handler);
-	bool (*payload_enter)(struct cgroup_ops *ops, pid_t pid);
+	bool (*payload_enter)(struct cgroup_ops *ops, struct lxc_handler *handler);
 	const char *(*get_cgroup)(struct cgroup_ops *ops, const char *controller);
 	bool (*escape)(const struct cgroup_ops *ops, struct lxc_conf *conf);
 	int (*num_hierarchies)(struct cgroup_ops *ops);
@@ -148,8 +149,9 @@ struct cgroup_ops {
 		   size_t len, const char *name, const char *lxcpath);
 	int (*freeze)(struct cgroup_ops *ops, int timeout);
 	int (*unfreeze)(struct cgroup_ops *ops, int timeout);
-	bool (*setup_limits)(struct cgroup_ops *ops, struct lxc_conf *conf,
-			     bool with_devices);
+	bool (*setup_limits_legacy)(struct cgroup_ops *ops,
+				    struct lxc_conf *conf, bool with_devices);
+	bool (*setup_limits)(struct cgroup_ops *ops, struct lxc_handler *handler);
 	bool (*chown)(struct cgroup_ops *ops, struct lxc_conf *conf);
 	bool (*attach)(struct cgroup_ops *ops, const char *name,
 		       const char *lxcpath, pid_t pid);
@@ -158,6 +160,8 @@ struct cgroup_ops {
 	int (*nrtasks)(struct cgroup_ops *ops);
 	bool (*devices_activate)(struct cgroup_ops *ops,
 				 struct lxc_handler *handler);
+	bool (*monitor_delegate_controllers)(struct cgroup_ops *ops);
+	bool (*payload_delegate_controllers)(struct cgroup_ops *ops);
 };
 
 extern struct cgroup_ops *cgroup_init(struct lxc_conf *conf);
diff --git a/src/lxc/lxccontainer.c b/src/lxc/lxccontainer.c
index e89caf4e15..b97b58ec6a 100644
--- a/src/lxc/lxccontainer.c
+++ b/src/lxc/lxccontainer.c
@@ -824,6 +824,15 @@ static bool wait_on_daemonized_start(struct lxc_handler *handler, int pid)
 {
 	int ret, state;
 
+	/* The first child is going to fork() again and then exit. So we reap
+	 * the first child here.
+	 */
+	ret = wait_for_pid(pid);
+	if (ret < 0)
+		DEBUG("Failed waiting on first child %d", pid);
+	else
+		DEBUG("First child %d exited", pid);
+
 	/* Close write end of the socket pair. */
 	close(handler->state_socket_pair[1]);
 	handler->state_socket_pair[1] = -1;
@@ -834,15 +843,6 @@ static bool wait_on_daemonized_start(struct lxc_handler *handler, int pid)
 	close(handler->state_socket_pair[0]);
 	handler->state_socket_pair[0] = -1;
 
-	/* The first child is going to fork() again and then exits. So we reap
-	 * the first child here.
-	 */
-	ret = wait_for_pid(pid);
-	if (ret < 0)
-		DEBUG("Failed waiting on first child %d", pid);
-	else
-		DEBUG("First child %d exited", pid);
-
 	if (state < 0) {
 		SYSERROR("Failed to receive the container state");
 		return false;
@@ -935,17 +935,17 @@ static bool do_lxcapi_start(struct lxc_container *c, int useinit, char * const a
 	if (c->daemonize) {
 		bool started;
 		char title[2048];
-		pid_t pid;
+		pid_t pid_first, pid_second;
 
-		pid = fork();
-		if (pid < 0) {
+		pid_first = fork();
+		if (pid_first < 0) {
 			free_init_cmd(init_cmd);
 			lxc_free_handler(handler);
 			return false;
 		}
 
 		/* first parent */
-		if (pid != 0) {
+		if (pid_first != 0) {
 			/* Set to NULL because we don't want father unlink
 			 * the PID file, child will do the free and unlink.
 			 */
@@ -954,7 +954,7 @@ static bool do_lxcapi_start(struct lxc_container *c, int useinit, char * const a
 			/* Wait for container to tell us whether it started
 			 * successfully.
 			 */
-			started = wait_on_daemonized_start(handler, pid);
+			started = wait_on_daemonized_start(handler, pid_first);
 
 			free_init_cmd(init_cmd);
 			lxc_free_handler(handler);
@@ -980,14 +980,14 @@ static bool do_lxcapi_start(struct lxc_container *c, int useinit, char * const a
 		 * POSIX's daemon() function we change to "/" and redirect
 		 * std{in,out,err} to /dev/null.
 		 */
-		pid = fork();
-		if (pid < 0) {
+		pid_second = fork();
+		if (pid_second < 0) {
 			SYSERROR("Failed to fork first child process");
 			_exit(EXIT_FAILURE);
 		}
 
 		/* second parent */
-		if (pid != 0) {
+		if (pid_second != 0) {
 			free_init_cmd(init_cmd);
 			lxc_free_handler(handler);
 			_exit(EXIT_SUCCESS);
diff --git a/src/lxc/macro.h b/src/lxc/macro.h
index 2aeda4e3da..e011596d21 100644
--- a/src/lxc/macro.h
+++ b/src/lxc/macro.h
@@ -448,6 +448,12 @@ enum {
 		-1;                    \
 	})
 
+#define ret_set_errno(__ret__, __errno__) \
+	({                                \
+		errno = __errno__;        \
+		__ret__;                  \
+	})
+
 #define free_replace_move_ptr(a, b) \
 	({                          \
 		free(a);            \
diff --git a/src/lxc/start.c b/src/lxc/start.c
index 6e2f0ab046..aa4939945d 100644
--- a/src/lxc/start.c
+++ b/src/lxc/start.c
@@ -737,6 +737,10 @@ struct lxc_handler *lxc_init_handler(const char *name, struct lxc_conf *conf,
 		handler->nsfd[i] = -1;
 
 	handler->name = name;
+	if (daemonize)
+		handler->transient_pid = lxc_raw_getpid();
+	else
+		handler->transient_pid = -1;
 
 	if (daemonize && handler->conf->reboot == REBOOT_NONE) {
 		/* Create socketpair() to synchronize on daemonized startup.
@@ -912,7 +916,7 @@ int lxc_init(const char *name, struct lxc_handler *handler)
 	ret = lsm_process_prepare(conf, handler->lxcpath);
 	if (ret < 0) {
 		ERROR("Failed to initialize LSM");
-		goto out_destroy_cgroups;
+		goto out_delete_terminal;
 	}
 	TRACE("Initialized LSM");
 
@@ -920,10 +924,6 @@ int lxc_init(const char *name, struct lxc_handler *handler)
 	handler->monitor_status_fd = move_fd(status_fd);
 	return 0;
 
-out_destroy_cgroups:
-	handler->cgroup_ops->payload_destroy(handler->cgroup_ops, handler);
-	handler->cgroup_ops->monitor_destroy(handler->cgroup_ops, handler);
-
 out_delete_terminal:
 	lxc_terminal_delete(&handler->conf->console);
 
@@ -1016,8 +1016,10 @@ void lxc_fini(const char *name, struct lxc_handler *handler)
 
 	lsm_process_cleanup(handler->conf, handler->lxcpath);
 
-	cgroup_ops->payload_destroy(cgroup_ops, handler);
-	cgroup_ops->monitor_destroy(cgroup_ops, handler);
+	if (cgroup_ops) {
+		cgroup_ops->payload_destroy(cgroup_ops, handler);
+		cgroup_ops->monitor_destroy(cgroup_ops, handler);
+	}
 
 	if (handler->conf->reboot == REBOOT_NONE) {
 		/* For all new state clients simply close the command socket.
@@ -1813,14 +1815,24 @@ static int lxc_spawn(struct lxc_handler *handler)
 	if (ret < 0)
 		goto out_delete_net;
 
-	if (!cgroup_ops->setup_limits(cgroup_ops, handler->conf, false)) {
+	if (!cgroup_ops->setup_limits_legacy(cgroup_ops, handler->conf, false)) {
 		ERROR("Failed to setup cgroup limits for container \"%s\"", name);
 		goto out_delete_net;
 	}
 
-	if (!cgroup_ops->payload_enter(cgroup_ops, handler->pid))
+	if (!cgroup_ops->payload_enter(cgroup_ops, handler))
 		goto out_delete_net;
 
+	if (!cgroup_ops->payload_delegate_controllers(cgroup_ops)) {
+		ERROR("Failed to delegate controllers to payload cgroup");
+		goto out_delete_net;
+	}
+
+	if (!cgroup_ops->setup_limits(cgroup_ops, handler)) {
+		ERROR("Failed to setup cgroup limits for container \"%s\"", name);
+		goto out_delete_net;
+	}
+
 	if (!cgroup_ops->chown(cgroup_ops, handler->conf))
 		goto out_delete_net;
 
@@ -1883,7 +1895,7 @@ static int lxc_spawn(struct lxc_handler *handler)
 	if (ret < 0)
 		goto out_delete_net;
 
-	if (!cgroup_ops->setup_limits(cgroup_ops, handler->conf, true)) {
+	if (!cgroup_ops->setup_limits_legacy(cgroup_ops, handler->conf, true)) {
 		ERROR("Failed to setup legacy device cgroup controller limits");
 		goto out_delete_net;
 	}
@@ -2015,12 +2027,18 @@ int __lxc_start(const char *name, struct lxc_handler *handler,
 		goto out_fini_nonet;
 	}
 
-	if (!cgroup_ops->monitor_enter(cgroup_ops, handler->monitor_pid)) {
+	if (!cgroup_ops->monitor_enter(cgroup_ops, handler)) {
 		ERROR("Failed to enter monitor cgroup");
 		ret = -1;
 		goto out_fini_nonet;
 	}
 
+	if (!cgroup_ops->monitor_delegate_controllers(cgroup_ops)) {
+		ERROR("Failed to delegate controllers to monitor cgroup");
+		ret = -1;
+		goto out_fini_nonet;
+	}
+
 	if (geteuid() == 0 && !lxc_list_empty(&conf->id_map)) {
 		/* If the backing store is a device, mount it here and now. */
 		if (rootfs_is_blockdev(conf)) {
diff --git a/src/lxc/start.h b/src/lxc/start.h
index dc40f29eeb..662ac55704 100644
--- a/src/lxc/start.h
+++ b/src/lxc/start.h
@@ -89,6 +89,9 @@ struct lxc_handler {
 	 */
 	int proc_pidfd;
 
+	/* The grandfather's pid when double-forking; -1 when not daemonizing. */
+	pid_t transient_pid;
+
 	/* The monitor's pid. */
 	pid_t monitor_pid;