[lxc-devel] [lxc/master] introduce lxc.cgroup.dir.{payload, monitor, limit_prefix}

Blub on Github lxc-bot at linuxcontainers.org
Fri Apr 3 08:43:24 UTC 2020


A non-text attachment was scrubbed...
Name: not available
Type: text/x-mailbox
Size: 1256 bytes
Desc: not available
URL: <http://lists.linuxcontainers.org/pipermail/lxc-devel/attachments/20200403/3f85658b/attachment-0001.bin>
-------------- next part --------------
From 8f1e5612cf33b41cc15ebb3712860409bfe64b69 Mon Sep 17 00:00:00 2001
From: Wolfgang Bumiller <w.bumiller at proxmox.com>
Date: Thu, 2 Apr 2020 10:01:37 +0200
Subject: [PATCH] introduce lxc.cgroup.dir.{payload,monitor,limit_prefix}

This is a new approach to #1302 with a container-side
configuration instead of a global boolean flag.

Contrary to the previous PR using an optional additional
parameter for the get-cgroup command, this introduces two
new commands to get the limiting cgroup path and
cgroup2 file descriptor. If the limiting option is not in
use, these behave identically to their full-path counterparts.

The main use case is to be able to add an additional
subdirectory to the payload while having the limits applied
a directory further up, and with that prevent privileged
containers from overriding their initial cgroup limits.

Signed-off-by: Wolfgang Bumiller <w.bumiller at proxmox.com>
---
 doc/lxc.container.conf.sgml.in |  41 +++++++++
 src/lxc/cgroups/cgfsng.c       | 161 +++++++++++++++++++++++++++------
 src/lxc/cgroups/cgroup.h       |  15 ++-
 src/lxc/commands.c             | 126 ++++++++++++++++++++------
 src/lxc/commands.h             |   6 ++
 src/lxc/conf.c                 |   3 +
 src/lxc/conf.h                 |   3 +
 src/lxc/confile.c              | 124 +++++++++++++++++++++++++
 src/lxc/criu.c                 |  10 +-
 9 files changed, 423 insertions(+), 66 deletions(-)

diff --git a/doc/lxc.container.conf.sgml.in b/doc/lxc.container.conf.sgml.in
index ae04e3af36..5288aff20f 100644
--- a/doc/lxc.container.conf.sgml.in
+++ b/doc/lxc.container.conf.sgml.in
@@ -1571,6 +1571,47 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
             </para>
           </listitem>
         </varlistentry>
+        <varlistentry>
+          <term>
+            <option>lxc.cgroup.dir.payload</option>
+          </term>
+          <listitem>
+            <para>
+              This is similar to <option>lxc.cgroup.dir</option>, but must be
+              used together with <option>lxc.cgroup.dir.monitor</option> and
+              affects only the container's final path. It is mutually exclusive
+              with <option>lxc.cgroup.dir</option>: a configuration that sets
+              both is rejected when the container's cgroups are created.
+            </para>
+          </listitem>
+        </varlistentry>
+        <varlistentry>
+          <term>
+            <option>lxc.cgroup.dir.monitor</option>
+          </term>
+          <listitem>
+            <para>
+              This is the monitor process counterpart to
+              <option>lxc.cgroup.dir.payload</option>.
+            </para>
+          </listitem>
+        </varlistentry>
+        <varlistentry>
+          <term>
+            <option>lxc.cgroup.dir.limit_prefix</option>
+          </term>
+          <listitem>
+            <para>
+              Specify an alternative path where cgroup limits should be
+              applied. This allows enforcing limits for privileged containers
+              in a way that they cannot override. This requires both the
+              <option>lxc.cgroup.dir.payload</option> and
+              <option>lxc.cgroup.dir.monitor</option> options to be set;
+              otherwise the configuration is rejected. It must name a prefix
+              directory of <option>lxc.cgroup.dir.payload</option>.
+            </para>
+          </listitem>
+        </varlistentry>
         <varlistentry>
           <term>
             <option>lxc.cgroup.relative</option>
diff --git a/src/lxc/cgroups/cgfsng.c b/src/lxc/cgroups/cgfsng.c
index d3595bcdf9..50625472c5 100644
--- a/src/lxc/cgroups/cgfsng.c
+++ b/src/lxc/cgroups/cgfsng.c
@@ -725,6 +725,7 @@ static struct hierarchy *add_hierarchy(struct hierarchy ***h, char **clist, char
 	new->container_base_path = container_base_path;
 	new->version = type;
 	new->cgfd_con = -EBADF;
+	new->cgfd_limit = -EBADF;
 	new->cgfd_mon = -EBADF;
 
 	newentry = append_null_to_list((void ***)h);
@@ -956,13 +957,15 @@ static int cgroup_tree_remove(struct hierarchy **hierarchies,
 		struct hierarchy *h = hierarchies[i];
 		int ret;
 
-		if (!h->container_full_path)
+		if (!h->container_limit_path)
 			continue;
 
-		ret = lxc_rm_rf(h->container_full_path);
+		ret = lxc_rm_rf(h->container_limit_path);
 		if (ret < 0)
-			WARN("Failed to destroy \"%s\"", h->container_full_path);
+			WARN("Failed to destroy \"%s\"", h->container_limit_path);
 
+		if (h->container_limit_path != h->container_full_path)
+			free_disarm(h->container_limit_path);
 		free_disarm(h->container_full_path);
 	}
 
@@ -1089,7 +1092,12 @@ __cgfsng_ops static void cgfsng_monitor_destroy(struct cgroup_ops *ops,
 			goto try_lxc_rm_rf;
 		}
 
-		if (conf && conf->cgroup_meta.dir)
+		if (conf && conf->cgroup_meta.monitor_dir)
+			pivot_path = must_make_path(h->mountpoint,
+						    h->container_base_path,
+						    conf->cgroup_meta.monitor_dir,
+						    CGROUP_PIVOT, NULL);
+		else if (conf && conf->cgroup_meta.dir)
 			pivot_path = must_make_path(h->mountpoint,
 						    h->container_base_path,
 						    conf->cgroup_meta.dir,
@@ -1147,7 +1155,8 @@ static int mkdir_eexist_on_last(const char *dir, mode_t mode)
 }
 
 static bool cgroup_tree_create(struct hierarchy *h, const char *cgroup_tree,
-			       const char *cgroup_leaf, bool payload)
+			       const char *cgroup_leaf, bool payload,
+			       const char *cgroup_limit_dir)
 {
 	__do_free char *path = NULL;
 	int ret, ret_cpuset;
@@ -1176,6 +1185,16 @@ static bool cgroup_tree_create(struct hierarchy *h, const char *cgroup_tree,
 		if (h->cgfd_con < 0)
 			return log_error_errno(false, errno, "Failed to open %s", path);
 		h->container_full_path = move_ptr(path);
+		if (cgroup_limit_dir) {
+			path = must_make_path(h->mountpoint, h->container_base_path, cgroup_limit_dir, NULL);
+			h->cgfd_limit = lxc_open_dirfd(path);
+			if (h->cgfd_limit < 0)
+				return log_error_errno(false, errno, "Failed to open %s", path);
+			h->container_limit_path = move_ptr(path);
+		} else {
+			h->container_limit_path = h->container_full_path;
+			h->cgfd_limit = h->cgfd_con;
+		}
 	} else {
 		h->cgfd_mon = lxc_open_dirfd(path);
 		if (h->cgfd_mon < 0)
@@ -1188,11 +1207,15 @@ static bool cgroup_tree_create(struct hierarchy *h, const char *cgroup_tree,
 
 static void cgroup_tree_leaf_remove(struct hierarchy *h, bool payload)
 {
-	__do_free char *full_path = NULL;
+	__do_free char *full_path = NULL, *__limit_path = NULL;
+	char *limit_path = NULL;
 
 	if (payload) {
 		__lxc_unused __do_close int fd = move_fd(h->cgfd_con);
 		full_path = move_ptr(h->container_full_path);
+		limit_path = move_ptr(h->container_limit_path);
+		if (limit_path != full_path)
+			__limit_path = limit_path;
 	} else {
 		__lxc_unused __do_close int fd = move_fd(h->cgfd_mon);
 		full_path = move_ptr(h->monitor_full_path);
@@ -1200,6 +1223,55 @@ static void cgroup_tree_leaf_remove(struct hierarchy *h, bool payload)
 
 	if (full_path && rmdir(full_path))
 		SYSWARN("Failed to rmdir(\"%s\") cgroup", full_path);
+	if (limit_path && rmdir(limit_path))
+		SYSWARN("Failed to rmdir(\"%s\") cgroup", limit_path);
+}
+
+/*
+ * Validate the lxc.cgroup.dir.{monitor,payload,limit_prefix} settings: they
+ * exclude lxc.cgroup.dir, monitor and payload must be set together, and a
+ * limit prefix must be a proper prefix directory of lxc.cgroup.dir.payload.
+ * Returns true if the configuration is consistent, false otherwise.
+ */
+static bool check_cgroup_dir_config(struct lxc_conf *conf)
+{
+	const char *payload_dir, *limit_prefix, *monitor_dir;
+	size_t prefix_len;
+
+	monitor_dir = conf->cgroup_meta.monitor_dir;
+	payload_dir = conf->cgroup_meta.payload_dir;
+	limit_prefix = conf->cgroup_meta.limit_prefix_dir;
+
+	/* none of the new options are set, all is fine */
+	if (!monitor_dir && !payload_dir && !limit_prefix)
+		return true;
+
+	/* some are set, make sure lxc.cgroup.dir is not also set */
+	if (conf->cgroup_meta.dir)
+		return log_error_errno(false, EINVAL,
+				       "lxc.cgroup.dir conflicts with"
+				       " lxc.cgroup.dir.payload/monitor");
+
+	/* make sure both monitor and payload are set */
+	if (!monitor_dir || !payload_dir)
+		return log_error_errno(false, EINVAL,
+				       "lxc.cgroup.dir.payload and"
+				       " lxc.cgroup.dir.monitor must both be"
+				       " set");
+
+	if (!limit_prefix)
+		return true;
+
+	/* we have a limit prefix directory, check it: it must match the start
+	 * of the payload path and end exactly at a '/' component boundary */
+	prefix_len = strlen(limit_prefix);
+	if (strncmp(payload_dir, limit_prefix, prefix_len) == 0 &&
+	    payload_dir[prefix_len] == '/')
+		return true;
+
+	return log_error_errno(false, EINVAL,
+			       "lxc.cgroup.dir.limit_prefix must be a prefix"
+			       " directory of lxc.cgroup.dir.payload");
 }
 
 __cgfsng_ops static inline bool cgfsng_monitor_create(struct cgroup_ops *ops,
@@ -1210,7 +1282,7 @@ __cgfsng_ops static inline bool cgfsng_monitor_create(struct cgroup_ops *ops,
 	int idx = 0;
 	int i;
 	size_t len;
-	char *suffix;
+	char *suffix = NULL;
 	struct lxc_conf *conf;
 
 	if (!ops)
@@ -1227,7 +1299,13 @@ __cgfsng_ops static inline bool cgfsng_monitor_create(struct cgroup_ops *ops,
 
 	conf = handler->conf;
 
-	if (conf->cgroup_meta.dir) {
+	if (!check_cgroup_dir_config(conf))
+		return false;
+
+	if (conf->cgroup_meta.monitor_dir) {
+		cgroup_tree = NULL;
+		monitor_cgroup = strdup(conf->cgroup_meta.monitor_dir);
+	} else if (conf->cgroup_meta.dir) {
 		cgroup_tree = conf->cgroup_meta.dir;
 		monitor_cgroup = must_concat(&len, conf->cgroup_meta.dir, "/",
 					     DEFAULT_MONITOR_CGROUP_PREFIX,
@@ -1251,14 +1329,16 @@ __cgfsng_ops static inline bool cgfsng_monitor_create(struct cgroup_ops *ops,
 	if (!monitor_cgroup)
 		return ret_set_errno(false, ENOMEM);
 
-	suffix = monitor_cgroup + len - CGROUP_CREATE_RETRY_LEN;
-	*suffix = '\0';
+	if (!conf->cgroup_meta.monitor_dir) {
+		suffix = monitor_cgroup + len - CGROUP_CREATE_RETRY_LEN;
+		*suffix = '\0';
+	}
 	do {
-		if (idx)
+		if (idx && suffix)
 			sprintf(suffix, "-%d", idx);
 
 		for (i = 0; ops->hierarchies[i]; i++) {
-			if (cgroup_tree_create(ops->hierarchies[i], cgroup_tree, monitor_cgroup, false))
+			if (cgroup_tree_create(ops->hierarchies[i], cgroup_tree, monitor_cgroup, false, NULL))
 				continue;
 
 			ERROR("Failed to create cgroup \"%s\"", ops->hierarchies[i]->monitor_full_path ?: "(null)");
@@ -1268,9 +1348,9 @@ __cgfsng_ops static inline bool cgfsng_monitor_create(struct cgroup_ops *ops,
 			idx++;
 			break;
 		}
-	} while (ops->hierarchies[i] && idx > 0 && idx < 1000);
+	} while (ops->hierarchies[i] && idx > 0 && idx < 1000 && suffix);
 
-	if (idx == 1000)
+	if (idx == 1000 || (!suffix && idx != 0))
 		return ret_set_errno(false, ERANGE);
 
 	ops->monitor_cgroup = move_ptr(monitor_cgroup);
@@ -1289,7 +1369,7 @@ __cgfsng_ops static inline bool cgfsng_payload_create(struct cgroup_ops *ops,
 	int idx = 0;
 	int i;
 	size_t len;
-	char *suffix;
+	char *suffix = NULL;
 	struct lxc_conf *conf;
 
 	if (!ops)
@@ -1306,7 +1386,13 @@ __cgfsng_ops static inline bool cgfsng_payload_create(struct cgroup_ops *ops,
 
 	conf = handler->conf;
 
-	if (conf->cgroup_meta.dir) {
+	if (!check_cgroup_dir_config(conf))
+		return false;
+
+	if (conf->cgroup_meta.payload_dir) {
+		cgroup_tree = NULL;
+		container_cgroup = strdup(conf->cgroup_meta.payload_dir);
+	} else if (conf->cgroup_meta.dir) {
 		cgroup_tree = conf->cgroup_meta.dir;
 		container_cgroup = must_concat(&len, cgroup_tree, "/",
 					     DEFAULT_PAYLOAD_CGROUP_PREFIX,
@@ -1330,14 +1416,18 @@ __cgfsng_ops static inline bool cgfsng_payload_create(struct cgroup_ops *ops,
 	if (!container_cgroup)
 		return ret_set_errno(false, ENOMEM);
 
-	suffix = container_cgroup + len - CGROUP_CREATE_RETRY_LEN;
-	*suffix = '\0';
+	if (!conf->cgroup_meta.payload_dir) {
+		suffix = container_cgroup + len - CGROUP_CREATE_RETRY_LEN;
+		*suffix = '\0';
+	}
 	do {
-		if (idx)
+		if (idx && suffix)
 			sprintf(suffix, "-%d", idx);
 
 		for (i = 0; ops->hierarchies[i]; i++) {
-			if (cgroup_tree_create(ops->hierarchies[i], cgroup_tree, container_cgroup, true))
+			if (cgroup_tree_create(ops->hierarchies[i], cgroup_tree,
+					       container_cgroup, true,
+					       conf->cgroup_meta.limit_prefix_dir))
 				continue;
 
 			ERROR("Failed to create cgroup \"%s\"", ops->hierarchies[i]->container_full_path ?: "(null)");
@@ -1347,9 +1437,9 @@ __cgfsng_ops static inline bool cgfsng_payload_create(struct cgroup_ops *ops,
 			idx++;
 			break;
 		}
-	} while (ops->hierarchies[i] && idx > 0 && idx < 1000);
+	} while (ops->hierarchies[i] && idx > 0 && idx < 1000 && suffix);
 
-	if (idx == 1000)
+	if (idx == 1000 || (!suffix && idx != 0))
 		return ret_set_errno(false, ERANGE);
 
 	ops->container_cgroup = move_ptr(container_cgroup);
@@ -2040,7 +2130,8 @@ __cgfsng_ops static int cgfsng_unfreeze(struct cgroup_ops *ops, int timeout)
 }
 
 __cgfsng_ops static const char *cgfsng_get_cgroup(struct cgroup_ops *ops,
-						  const char *controller)
+						  const char *controller,
+						  bool limiting)
 {
 	struct hierarchy *h;
 
@@ -2049,6 +2140,11 @@ __cgfsng_ops static const char *cgfsng_get_cgroup(struct cgroup_ops *ops,
 		return log_warn_errno(NULL, ENOENT, "Failed to find hierarchy for controller \"%s\"",
 				      controller ? controller : "(null)");
 
+	if (limiting)
+		return h->container_limit_path
+			   ? h->container_limit_path + strlen(h->mountpoint)
+			   : NULL;
+
 	return h->container_full_path
 		   ? h->container_full_path + strlen(h->mountpoint)
 		   : NULL;
@@ -2382,7 +2478,7 @@ __cgfsng_ops static int cgfsng_get(struct cgroup_ops *ops, const char *filename,
 	if (p)
 		*p = '\0';
 
-	path = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
+	path = lxc_cmd_get_limiting_cgroup_path(name, lxcpath, controller);
 	/* not running */
 	if (!path)
 		return -1;
@@ -2547,7 +2643,7 @@ __cgfsng_ops static int cgfsng_set(struct cgroup_ops *ops,
 		return 0;
 	}
 
-	path = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
+	path = lxc_cmd_get_limiting_cgroup_path(name, lxcpath, controller);
 	/* not running */
 	if (!path)
 		return -1;
@@ -2657,7 +2753,7 @@ static int convert_devpath(const char *invalue, char *dest)
  * we created the cgroups.
  */
 static int cg_legacy_set_data(struct cgroup_ops *ops, const char *filename,
-			      const char *value)
+			      const char *value, bool is_cpuset)
 {
 	__do_free char *controller = NULL;
 	char *p;
@@ -2683,7 +2779,12 @@ static int cg_legacy_set_data(struct cgroup_ops *ops, const char *filename,
 	if (!h)
 		return log_error_errno(-ENOENT, ENOENT, "Failed to setup limits for the \"%s\" controller. The controller seems to be unused by \"cgfsng\" cgroup driver or not enabled on the cgroup hierarchy", controller);
 
-	return lxc_write_openat(h->container_full_path, filename, value, strlen(value));
+	if (is_cpuset) {
+		int ret = lxc_write_openat(h->container_full_path, filename, value, strlen(value));
+		if (ret)
+			return ret;
+	}
+	return lxc_write_openat(h->container_limit_path, filename, value, strlen(value));
 }
 
 __cgfsng_ops static bool cgfsng_setup_limits_legacy(struct cgroup_ops *ops,
@@ -2717,7 +2818,7 @@ __cgfsng_ops static bool cgfsng_setup_limits_legacy(struct cgroup_ops *ops,
 		cg = iterator->elem;
 
 		if (do_devices == !strncmp("devices", cg->subsystem, 7)) {
-			if (cg_legacy_set_data(ops, cg->subsystem, cg->value)) {
+			if (cg_legacy_set_data(ops, cg->subsystem, cg->value, strncmp("cpuset", cg->subsystem, 6) == 0)) {
 				if (do_devices && (errno == EACCES || errno == EPERM)) {
 					SYSWARN("Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);
 					continue;
@@ -2802,7 +2903,7 @@ __cgfsng_ops static bool cgfsng_setup_limits(struct cgroup_ops *ops,
 			ret = bpf_device_cgroup_prepare(ops, conf, cg->subsystem,
 							cg->value);
 		} else {
-			ret = lxc_write_openat(h->container_full_path,
+			ret = lxc_write_openat(h->container_limit_path,
 					       cg->subsystem, cg->value,
 					       strlen(cg->value));
 			if (ret < 0)
@@ -2878,7 +2979,7 @@ __cgfsng_ops bool cgfsng_devices_activate(struct cgroup_ops *ops,
 		return log_error_errno(false, ENOMEM, "Failed to finalize bpf program");
 
 	ret = bpf_program_cgroup_attach(devices, BPF_CGROUP_DEVICE,
-					unified->container_full_path,
+					unified->container_limit_path,
 					BPF_F_ALLOW_MULTI);
 	if (ret)
 		return log_error_errno(false, ENOMEM, "Failed to attach bpf program");
diff --git a/src/lxc/cgroups/cgroup.h b/src/lxc/cgroups/cgroup.h
index 1e08a017a9..6e5dea853f 100644
--- a/src/lxc/cgroups/cgroup.h
+++ b/src/lxc/cgroups/cgroup.h
@@ -54,7 +54,11 @@ typedef enum {
  *   init's cgroup (if root).
  *
  * @container_full_path
- * - The full path to the containers cgroup.
+ * - The full path to the container's cgroup.
+ *
+ * @container_limit_path
+ * - The full path to the container's limiting cgroup. May simply point to
+ *   container_full_path.
  *
  * @monitor_full_path
  * - The full path to the monitor's cgroup.
@@ -77,15 +81,18 @@ struct hierarchy {
 	char *mountpoint;
 	char *container_base_path;
 	char *container_full_path;
+	char *container_limit_path;
 	char *monitor_full_path;
 	int version;
 
 	/* cgroup2 only */
 	unsigned int bpf_device_controller:1;
 
-	/* monitor cgroup fd */
-	int cgfd_con;
 	/* container cgroup fd */
+	int cgfd_con;
+	/* limiting cgroup fd (may be equal to cgfd_con if not separated) */
+	int cgfd_limit;
+	/* monitor cgroup fd */
 	int cgfd_mon;
 };
 
@@ -146,7 +153,7 @@ struct cgroup_ops {
 	bool (*monitor_enter)(struct cgroup_ops *ops, struct lxc_handler *handler);
 	bool (*payload_create)(struct cgroup_ops *ops, struct lxc_handler *handler);
 	bool (*payload_enter)(struct cgroup_ops *ops, struct lxc_handler *handler);
-	const char *(*get_cgroup)(struct cgroup_ops *ops, const char *controller);
+	const char *(*get_cgroup)(struct cgroup_ops *ops, const char *controller, bool limiting);
 	bool (*escape)(const struct cgroup_ops *ops, struct lxc_conf *conf);
 	int (*num_hierarchies)(struct cgroup_ops *ops);
 	bool (*get_hierarchies)(struct cgroup_ops *ops, int n, char ***out);
diff --git a/src/lxc/commands.c b/src/lxc/commands.c
index 991bca290e..5d30b85a57 100644
--- a/src/lxc/commands.c
+++ b/src/lxc/commands.c
@@ -84,6 +84,8 @@ static const char *lxc_cmd_str(lxc_cmd_t cmd)
 		[LXC_CMD_UNFREEZE]			= "unfreeze",
 		[LXC_CMD_GET_CGROUP2_FD]		= "get_cgroup2_fd",
 		[LXC_CMD_GET_INIT_PIDFD]        	= "get_init_pidfd",
+		[LXC_CMD_GET_LIMITING_CGROUP]		= "get_limiting_cgroup",
+		[LXC_CMD_GET_LIMITING_CGROUP2_FD]	= "get_limiting_cgroup2_fd",
 	};
 
 	if (cmd >= LXC_CMD_MAX)
@@ -142,7 +144,9 @@ static int lxc_cmd_rsp_recv(int sock, struct lxc_cmd_rr *cmd)
 		rsp->data = rspdata;
 	}
 
-	if (cmd->req.cmd == LXC_CMD_GET_CGROUP2_FD) {
+	if (cmd->req.cmd == LXC_CMD_GET_CGROUP2_FD ||
+	    cmd->req.cmd == LXC_CMD_GET_LIMITING_CGROUP2_FD)
+	{
 		int cgroup2_fd = move_fd(fd_rsp);
 		rsp->data = INT_TO_PTR(cgroup2_fd);
 	}
@@ -483,25 +487,14 @@ static int lxc_cmd_get_clone_flags_callback(int fd, struct lxc_cmd_req *req,
 	return 0;
 }
 
-/*
- * lxc_cmd_get_cgroup_path: Calculate a container's cgroup path for a
- * particular subsystem. This is the cgroup path relative to the root
- * of the cgroup filesystem.
- *
- * @name      : name of container to connect to
- * @lxcpath   : the lxcpath in which the container is running
- * @subsystem : the subsystem being asked about
- *
- * Returns the path on success, NULL on failure. The caller must free() the
- * returned path.
- */
-char *lxc_cmd_get_cgroup_path(const char *name, const char *lxcpath,
-			      const char *subsystem)
+static char *lxc_cmd_get_cgroup_path_do(const char *name, const char *lxcpath,
+					const char *subsystem,
+					lxc_cmd_t command)
 {
 	int ret, stopped;
 	struct lxc_cmd_rr cmd = {
 		.req = {
-			.cmd = LXC_CMD_GET_CGROUP,
+			.cmd = command,
 			.data = subsystem,
 			.datalen = 0,
 		},
@@ -525,9 +518,50 @@ char *lxc_cmd_get_cgroup_path(const char *name, const char *lxcpath,
 	return cmd.rsp.data;
 }
 
-static int lxc_cmd_get_cgroup_callback(int fd, struct lxc_cmd_req *req,
-				       struct lxc_handler *handler,
-				       struct lxc_epoll_descr *descr)
+/*
+ * lxc_cmd_get_cgroup_path: Calculate a container's cgroup path for a
+ * particular subsystem. This is the cgroup path relative to the root
+ * of the cgroup filesystem.
+ *
+ * @name      : name of container to connect to
+ * @lxcpath   : the lxcpath in which the container is running
+ * @subsystem : the subsystem being asked about
+ *
+ * Returns the path on success, NULL on failure. The caller must free() the
+ * returned path.
+ */
+char *lxc_cmd_get_cgroup_path(const char *name, const char *lxcpath,
+			      const char *subsystem)
+{
+	return lxc_cmd_get_cgroup_path_do(name, lxcpath, subsystem,
+					  LXC_CMD_GET_CGROUP);
+}
+
+/*
+ * lxc_cmd_get_limiting_cgroup_path: Calculate a container's limiting cgroup
+ * path for a particular subsystem. This is the cgroup path relative to the
+ * root of the cgroup filesystem. This may be the same as the path returned by
+ * lxc_cmd_get_cgroup_path if the container doesn't have a limiting path prefix
+ * set.
+ *
+ * @name      : name of container to connect to
+ * @lxcpath   : the lxcpath in which the container is running
+ * @subsystem : the subsystem being asked about
+ *
+ * Returns the path on success, NULL on failure. The caller must free() the
+ * returned path.
+ */
+char *lxc_cmd_get_limiting_cgroup_path(const char *name, const char *lxcpath,
+				       const char *subsystem)
+{
+	return lxc_cmd_get_cgroup_path_do(name, lxcpath, subsystem,
+					  LXC_CMD_GET_LIMITING_CGROUP);
+}
+
+static int lxc_cmd_get_cgroup_callback_do(int fd, struct lxc_cmd_req *req,
+					  struct lxc_handler *handler,
+					  struct lxc_epoll_descr *descr,
+					  bool limiting_cgroup)
 {
 	int ret;
 	const char *path;
@@ -539,9 +573,11 @@ static int lxc_cmd_get_cgroup_callback(int fd, struct lxc_cmd_req *req,
 		if (ret != 0)
 			return ret;
 
-		path = cgroup_ops->get_cgroup(cgroup_ops, req->data);
+		path = cgroup_ops->get_cgroup(cgroup_ops, req->data,
+					      limiting_cgroup);
 	} else {
-		path = cgroup_ops->get_cgroup(cgroup_ops, NULL);
+		path = cgroup_ops->get_cgroup(cgroup_ops, NULL,
+					      limiting_cgroup);
 	}
 	if (!path)
 		return -1;
@@ -557,6 +593,20 @@ static int lxc_cmd_get_cgroup_callback(int fd, struct lxc_cmd_req *req,
 	return 0;
 }
 
+static int lxc_cmd_get_cgroup_callback(int fd, struct lxc_cmd_req *req,
+				       struct lxc_handler *handler,
+				       struct lxc_epoll_descr *descr)
+{
+	return lxc_cmd_get_cgroup_callback_do(fd, req, handler, descr, false);
+}
+
+static int lxc_cmd_get_limiting_cgroup_callback(int fd, struct lxc_cmd_req *req,
+						struct lxc_handler *handler,
+						struct lxc_epoll_descr *descr)
+{
+	return lxc_cmd_get_cgroup_callback_do(fd, req, handler, descr, true);
+}
+
 /*
  * lxc_cmd_get_config_item: Get config item the running container
  *
@@ -1366,28 +1416,48 @@ int lxc_cmd_get_cgroup2_fd(const char *name, const char *lxcpath)
 	return PTR_TO_INT(cmd.rsp.data);
 }
 
-static int lxc_cmd_get_cgroup2_fd_callback(int fd, struct lxc_cmd_req *req,
-					   struct lxc_handler *handler,
-					   struct lxc_epoll_descr *descr)
+static int lxc_cmd_get_cgroup2_fd_callback_do(int fd, struct lxc_cmd_req *req,
+					      struct lxc_handler *handler,
+					      struct lxc_epoll_descr *descr,
+					      bool limiting_cgroup)
 {
 	struct lxc_cmd_rsp rsp = {
 		.ret = -EINVAL,
 	};
 	struct cgroup_ops *ops = handler->cgroup_ops;
-	int ret;
+	int ret, send_fd;
 
 	if (!pure_unified_layout(ops) || !ops->unified)
 		return lxc_cmd_rsp_send(fd, &rsp);
 
+	send_fd = limiting_cgroup ? ops->unified->cgfd_limit
+				  : ops->unified->cgfd_con;
+
 	rsp.ret = 0;
-	ret = lxc_abstract_unix_send_fds(fd, &ops->unified->cgfd_con, 1, &rsp,
-					 sizeof(rsp));
+	ret = lxc_abstract_unix_send_fds(fd, &send_fd, 1, &rsp, sizeof(rsp));
 	if (ret < 0)
 		return log_error(LXC_CMD_REAP_CLIENT_FD, "Failed to send cgroup2 fd");
 
 	return 0;
 }
 
+static int lxc_cmd_get_cgroup2_fd_callback(int fd, struct lxc_cmd_req *req,
+					   struct lxc_handler *handler,
+					   struct lxc_epoll_descr *descr)
+{
+	return lxc_cmd_get_cgroup2_fd_callback_do(fd, req, handler, descr,
+						  false);
+}
+
+static int lxc_cmd_get_limiting_cgroup2_fd_callback(int fd,
+						    struct lxc_cmd_req *req,
+						    struct lxc_handler *handler,
+						    struct lxc_epoll_descr *descr)
+{
+	return lxc_cmd_get_cgroup2_fd_callback_do(fd, req, handler, descr,
+						  true);
+}
+
 static int lxc_cmd_process(int fd, struct lxc_cmd_req *req,
 			   struct lxc_handler *handler,
 			   struct lxc_epoll_descr *descr)
@@ -1415,6 +1485,8 @@ static int lxc_cmd_process(int fd, struct lxc_cmd_req *req,
 		[LXC_CMD_UNFREEZE]			= lxc_cmd_unfreeze_callback,
 		[LXC_CMD_GET_CGROUP2_FD]		= lxc_cmd_get_cgroup2_fd_callback,
 		[LXC_CMD_GET_INIT_PIDFD]                = lxc_cmd_get_init_pidfd_callback,
+		[LXC_CMD_GET_LIMITING_CGROUP]           = lxc_cmd_get_limiting_cgroup_callback,
+		[LXC_CMD_GET_LIMITING_CGROUP2_FD]       = lxc_cmd_get_limiting_cgroup2_fd_callback,
 	};
 
 	if (req->cmd >= LXC_CMD_MAX)
diff --git a/src/lxc/commands.h b/src/lxc/commands.h
index 9e52484249..878998832b 100644
--- a/src/lxc/commands.h
+++ b/src/lxc/commands.h
@@ -38,6 +38,8 @@ typedef enum {
 	LXC_CMD_UNFREEZE,
 	LXC_CMD_GET_CGROUP2_FD,
 	LXC_CMD_GET_INIT_PIDFD,
+	LXC_CMD_GET_LIMITING_CGROUP,
+	LXC_CMD_GET_LIMITING_CGROUP2_FD,
 	LXC_CMD_MAX,
 } lxc_cmd_t;
 
@@ -129,5 +131,9 @@ extern int lxc_cmd_add_bpf_device_cgroup(const char *name, const char *lxcpath,
 extern int lxc_cmd_freeze(const char *name, const char *lxcpath, int timeout);
 extern int lxc_cmd_unfreeze(const char *name, const char *lxcpath, int timeout);
 extern int lxc_cmd_get_cgroup2_fd(const char *name, const char *lxcpath);
+extern char *lxc_cmd_get_limiting_cgroup_path(const char *name,
+					      const char *lxcpath,
+					      const char *subsystem);
+extern int lxc_cmd_get_limiting_cgroup2_fd(const char *name, const char *lxcpath);
 
 #endif /* __commands_h */
diff --git a/src/lxc/conf.c b/src/lxc/conf.c
index 2f6be9f263..7b594e90c7 100644
--- a/src/lxc/conf.c
+++ b/src/lxc/conf.c
@@ -3832,6 +3832,9 @@ void lxc_conf_free(struct lxc_conf *conf)
 	lxc_clear_apparmor_raw(conf);
 	lxc_clear_namespace(conf);
 	free(conf->cgroup_meta.dir);
+	free(conf->cgroup_meta.monitor_dir);
+	free(conf->cgroup_meta.payload_dir);
+	free(conf->cgroup_meta.limit_prefix_dir);
 	free(conf->cgroup_meta.controllers);
 	free(conf->shmount.path_host);
 	free(conf->shmount.path_cont);
diff --git a/src/lxc/conf.h b/src/lxc/conf.h
index 64885c35ea..f037c392ef 100644
--- a/src/lxc/conf.h
+++ b/src/lxc/conf.h
@@ -60,6 +60,9 @@ struct lxc_cgroup {
 		struct /* meta */ {
 			char *controllers;
 			char *dir;
+			char *monitor_dir;
+			char *payload_dir;
+			char *limit_prefix_dir;
 			bool relative;
 		};
 	};
diff --git a/src/lxc/confile.c b/src/lxc/confile.c
index 0ca577fa3f..1ecdbe645f 100644
--- a/src/lxc/confile.c
+++ b/src/lxc/confile.c
@@ -71,6 +71,9 @@ lxc_config_define(cap_keep);
 lxc_config_define(cgroup_controller);
 lxc_config_define(cgroup2_controller);
 lxc_config_define(cgroup_dir);
+lxc_config_define(cgroup_monitor_dir);
+lxc_config_define(cgroup_payload_dir);
+lxc_config_define(cgroup_limit_prefix_dir);
 lxc_config_define(cgroup_relative);
 lxc_config_define(console_buffer_size);
 lxc_config_define(console_logfile);
@@ -170,6 +173,9 @@ static struct lxc_config_t config_jump_table[] = {
 	{ "lxc.cap.drop",                  set_config_cap_drop,                    get_config_cap_drop,                    clr_config_cap_drop,                  },
 	{ "lxc.cap.keep",                  set_config_cap_keep,                    get_config_cap_keep,                    clr_config_cap_keep,                  },
 	{ "lxc.cgroup2",                   set_config_cgroup2_controller,          get_config_cgroup2_controller,          clr_config_cgroup2_controller,        },
+	{ "lxc.cgroup.dir.monitor",        set_config_cgroup_monitor_dir,          get_config_cgroup_monitor_dir,          clr_config_cgroup_monitor_dir,        },
+	{ "lxc.cgroup.dir.payload",        set_config_cgroup_payload_dir,          get_config_cgroup_payload_dir,          clr_config_cgroup_payload_dir,        },
+	{ "lxc.cgroup.dir.limit_prefix",   set_config_cgroup_limit_prefix_dir,     get_config_cgroup_limit_prefix_dir,     clr_config_cgroup_limit_prefix_dir,   },
 	{ "lxc.cgroup.dir",                set_config_cgroup_dir,                  get_config_cgroup_dir,                  clr_config_cgroup_dir,                },
 	{ "lxc.cgroup.relative",           set_config_cgroup_relative,             get_config_cgroup_relative,             clr_config_cgroup_relative,           },
 	{ "lxc.cgroup",                    set_config_cgroup_controller,           get_config_cgroup_controller,           clr_config_cgroup_controller,         },
@@ -1721,6 +1727,38 @@ static int set_config_cgroup_dir(const char *key, const char *value,
 	return set_config_string_item(&lxc_conf->cgroup_meta.dir, value);
 }
 
+static int set_config_cgroup_monitor_dir(const char *key, const char *value,
+					 struct lxc_conf *lxc_conf, void *data)
+{
+	if (lxc_config_value_empty(value))
+		return clr_config_cgroup_monitor_dir(key, lxc_conf, NULL);
+
+	return set_config_string_item(&lxc_conf->cgroup_meta.monitor_dir,
+				      value);
+}
+
+static int set_config_cgroup_payload_dir(const char *key, const char *value,
+					 struct lxc_conf *lxc_conf, void *data)
+{
+	if (lxc_config_value_empty(value))
+		return clr_config_cgroup_payload_dir(key, lxc_conf, NULL);
+
+	return set_config_string_item(&lxc_conf->cgroup_meta.payload_dir,
+				      value);
+}
+
+static int set_config_cgroup_limit_prefix_dir(const char *key,
+					      const char *value,
+					      struct lxc_conf *lxc_conf,
+					      void *data)
+{
+	if (lxc_config_value_empty(value))
+		return clr_config_cgroup_limit_prefix_dir(key, lxc_conf, NULL);
+
+	return set_config_string_item(&lxc_conf->cgroup_meta.limit_prefix_dir,
+				      value);
+}
+
 static int set_config_cgroup_relative(const char *key, const char *value,
 				      struct lxc_conf *lxc_conf, void *data)
 {
@@ -3644,6 +3682,56 @@ static int get_config_cgroup_dir(const char *key, char *retv, int inlen,
 	return fulllen;
 }
 
+static int get_config_cgroup_monitor_dir(const char *key, char *retv, int inlen,
+					 struct lxc_conf *lxc_conf, void *data)
+{
+	int len;
+	int fulllen = 0;
+
+	if (!retv)
+		inlen = 0;
+	else
+		memset(retv, 0, inlen);
+
+	strprint(retv, inlen, "%s", lxc_conf->cgroup_meta.monitor_dir);
+
+	return fulllen;
+}
+
+static int get_config_cgroup_payload_dir(const char *key, char *retv, int inlen,
+					 struct lxc_conf *lxc_conf, void *data)
+{
+	int len;
+	int fulllen = 0;
+
+	if (!retv)
+		inlen = 0;
+	else
+		memset(retv, 0, inlen);
+
+	strprint(retv, inlen, "%s", lxc_conf->cgroup_meta.payload_dir);
+
+	return fulllen;
+}
+
+static int get_config_cgroup_limit_prefix_dir(const char *key, char *retv,
+					      int inlen,
+					      struct lxc_conf *lxc_conf,
+					      void *data)
+{
+	int len;
+	int fulllen = 0;
+
+	if (!retv)
+		inlen = 0;
+	else
+		memset(retv, 0, inlen);
+
+	strprint(retv, inlen, "%s", lxc_conf->cgroup_meta.limit_prefix_dir);
+
+	return fulllen;
+}
+
 static inline int get_config_cgroup_relative(const char *key, char *retv,
 					     int inlen, struct lxc_conf *lxc_conf,
 					     void *data)
@@ -4458,6 +4546,42 @@ static int clr_config_cgroup_dir(const char *key, struct lxc_conf *lxc_conf,
 	return 0;
 }
 
+static int clr_config_cgroup_monitor_dir(const char *key,
+					 struct lxc_conf *lxc_conf,
+					 void *data)
+{
+	if (lxc_conf->cgroup_meta.monitor_dir) {
+		free(lxc_conf->cgroup_meta.monitor_dir);
+		lxc_conf->cgroup_meta.monitor_dir = NULL;
+	}
+
+	return 0;
+}
+
+static int clr_config_cgroup_payload_dir(const char *key,
+					 struct lxc_conf *lxc_conf,
+					 void *data)
+{
+	if (lxc_conf->cgroup_meta.payload_dir) {
+		free(lxc_conf->cgroup_meta.payload_dir);
+		lxc_conf->cgroup_meta.payload_dir = NULL;
+	}
+
+	return 0;
+}
+
+static int clr_config_cgroup_limit_prefix_dir(const char *key,
+					      struct lxc_conf *lxc_conf,
+					      void *data)
+{
+	if (lxc_conf->cgroup_meta.limit_prefix_dir) {
+		free(lxc_conf->cgroup_meta.limit_prefix_dir);
+		lxc_conf->cgroup_meta.limit_prefix_dir = NULL;
+	}
+
+	return 0;
+}
+
 static inline int clr_config_cgroup_relative(const char *key,
 					     struct lxc_conf *lxc_conf,
 					     void *data)
diff --git a/src/lxc/criu.c b/src/lxc/criu.c
index 1a909bb6c4..698eec7124 100644
--- a/src/lxc/criu.c
+++ b/src/lxc/criu.c
@@ -303,7 +303,7 @@ static void exec_criu(struct cgroup_ops *cgroup_ops, struct lxc_conf *conf,
 		 * the handler the restore task created.
 		 */
 		if (!strcmp(opts->action, "dump") || !strcmp(opts->action, "pre-dump")) {
-			path = lxc_cmd_get_cgroup_path(opts->c->name, opts->c->config_path, controllers[0]);
+			path = lxc_cmd_get_limiting_cgroup_path(opts->c->name, opts->c->config_path, controllers[0]);
 			if (!path) {
 				ERROR("failed to get cgroup path for %s", controllers[0]);
 				goto err;
@@ -311,7 +311,7 @@ static void exec_criu(struct cgroup_ops *cgroup_ops, struct lxc_conf *conf,
 		} else {
 			const char *p;
 
-			p = cgroup_ops->get_cgroup(cgroup_ops, controllers[0]);
+			p = cgroup_ops->get_cgroup(cgroup_ops, controllers[0], true);
 			if (!p) {
 				ERROR("failed to get cgroup path for %s", controllers[0]);
 				goto err;
@@ -406,9 +406,9 @@ static void exec_criu(struct cgroup_ops *cgroup_ops, struct lxc_conf *conf,
 		DECLARE_ARG("-t");
 		DECLARE_ARG(pid);
 
-		freezer_relative = lxc_cmd_get_cgroup_path(opts->c->name,
-							   opts->c->config_path,
-							   "freezer");
+		freezer_relative = lxc_cmd_get_limiting_cgroup_path(opts->c->name,
+								    opts->c->config_path,
+								    "freezer");
 		if (!freezer_relative) {
 			ERROR("failed getting freezer path");
 			goto err;


More information about the lxc-devel mailing list