[lxc-devel] [PATCH 6/6] cgroup: Major rewrite of cgroup logic

Christian Seiler christian at iwakd.de
Sun Sep 8 19:44:44 UTC 2013


This patch rewrites most of the cgroup logic. It creates a set of data
structures to store the kernel state of the cgroup hierarchies and
their mountpoints.

Mainly, everything is now grouped with respect to the hierarchies of
the system. Multiple controllers may be mounted together or separately
on different hierarchies, and the data structures reflect this.

Each hierarchy may have multiple mount points (that were created
previously using the bind mount method), and each of these mount points
may point to a different prefix inside the cgroup tree. The current
code does not make any assumptions regarding the mount points; it just
parses /proc/self/mountinfo to acquire the relevant information.

The only requirement is that the current cgroup of either init (if
cgroup.pattern starts with '/' and the tools are executed as root) or
the current process (otherwise) is accessible. The root cgroup need
not be accessible.

The configuration option cgroup.pattern is introduced. For
root-executed containers, it specifies which format the cgroups should
be in. Example values may include '/lxc/%n', 'lxc/%n', '%n' or
'/machine/%n.lxc'. Any occurrence of '%n' is replaced with the name of
the container (and if clashes occur in any hierarchy, -1, -2, etc. are
appended globally). If the pattern starts with /, new containers'
cgroups will be located relative to init's cgroup; if it doesn't, they
will be located relative to the current process's cgroup.

The cgroup.h API has been changed to make it more consistent, both
with respect to naming and with respect to the parameters. This
requires corresponding changes in other parts of the code, which
are included in this patch.

There has been some testing of this functionality, but there are
probably still quite a few bugs in there, especially for people with
different configurations.

Signed-off-by: Christian Seiler <christian at iwakd.de>
---
 src/lxc/attach.c       |   19 +-
 src/lxc/cgroup.c       | 2307 +++++++++++++++++++++++++++---------------------
 src/lxc/cgroup.h       |  155 +++-
 src/lxc/commands.c     |    6 +-
 src/lxc/freezer.c      |   19 +-
 src/lxc/lxc.h          |   14 +-
 src/lxc/lxccontainer.c |    4 +-
 src/lxc/lxcutmp.c      |    2 +-
 src/lxc/start.c        |   33 +-
 src/lxc/start.h        |    2 +-
 src/lxc/state.c        |    2 +-
 src/lxc/utils.c        |    1 +
 src/tests/cgpath.c     |   18 +-
 13 files changed, 1494 insertions(+), 1088 deletions(-)

diff --git a/src/lxc/attach.c b/src/lxc/attach.c
index 413b78b..6f33252 100644
--- a/src/lxc/attach.c
+++ b/src/lxc/attach.c
@@ -727,7 +727,24 @@ int lxc_attach(const char* name, const char* lxcpath, lxc_attach_exec_t exec_fun
 
 		/* attach to cgroup, if requested */
 		if (options->attach_flags & LXC_ATTACH_MOVE_TO_CGROUP) {
-			ret = lxc_cgroup_attach(attached_pid, name, lxcpath);
+			struct cgroup_meta_data *meta_data;
+			struct cgroup_process_info *container_info;
+
+			meta_data = lxc_cgroup_load_meta();
+			if (!meta_data) {
+				ERROR("could not move attached process %ld to cgroup of container", (long)attached_pid);
+				goto cleanup_error;
+			}
+
+			container_info = lxc_cgroup_get_container_info(name, lxcpath, meta_data);
+			lxc_cgroup_put_meta(meta_data);
+			if (!container_info) {
+				ERROR("could not move attached process %ld to cgroup of container", (long)attached_pid);
+				goto cleanup_error;
+			}
+
+			ret = lxc_cgroup_enter(container_info, attached_pid, false);
+			lxc_cgroup_process_info_free(container_info);
 			if (ret < 0) {
 				ERROR("could not move attached process %ld to cgroup of container", (long)attached_pid);
 				goto cleanup_error;
diff --git a/src/lxc/cgroup.c b/src/lxc/cgroup.c
index 2bd158b..288aa2e 100644
--- a/src/lxc/cgroup.c
+++ b/src/lxc/cgroup.c
@@ -42,6 +42,7 @@
 #include "commands.h"
 #include "list.h"
 #include "conf.h"
+#include "utils.h"
 
 #include <lxc/log.h>
 #include <lxc/cgroup.h>
@@ -61,1245 +62,1365 @@
 
 lxc_log_define(lxc_cgroup, lxc);
 
-#define MTAB "/proc/mounts"
+static struct cgroup_process_info *lxc_cgroup_process_info_getx(const char *proc_pid_cgroup_str, struct cgroup_meta_data *meta);
+static char **subsystems_from_mount_options(const char *mount_options, char **kernel_list);
+static void lxc_cgroup_mount_point_free(struct cgroup_mount_point *mp);
+static void lxc_cgroup_hierarchy_free(struct cgroup_hierarchy *h);
+static bool is_valid_cgroup(const char *name);
+static int create_or_remove_cgroup(bool remove, struct cgroup_mount_point *mp, const char *path);
+static int create_cgroup(struct cgroup_mount_point *mp, const char *path);
+static int remove_cgroup(struct cgroup_mount_point *mp, const char *path);
+static char *cgroup_to_absolute_path(struct cgroup_mount_point *mp, const char *path, const char *suffix);
+static struct cgroup_process_info *find_info_for_subsystem(struct cgroup_process_info *info, const char *subsystem);
+static int do_cgroup_get(const char *cgroup_path, const char *sub_filename, char *value, size_t len);
+static int do_cgroup_set(const char *cgroup_path, const char *sub_filename, const char *value);
+static bool cgroup_devices_has_allow_or_deny(struct lxc_handler *h, char *v, bool for_allow);
+static int do_setup_cgroup(struct lxc_handler *h, struct lxc_list *cgroup_settings, bool do_devices);
+static int cgroup_recursive_task_count(const char *cgroup_path);
+static int count_lines(const char *fn);
+static int handle_clone_children(struct cgroup_mount_point *mp, char *cgroup_path);
+
+struct cgroup_meta_data *lxc_cgroup_load_meta()
+{
+	const char *cgroup_use = NULL;
+	char **cgroup_use_list = NULL;
+	struct cgroup_meta_data *md = NULL;
+	int saved_errno;
+
+	errno = 0;
+	cgroup_use = lxc_global_config_value("cgroup.use");
+	if (!cgroup_use && errno != 0)
+		return NULL;
+	if (cgroup_use) {
+		cgroup_use_list = lxc_string_split_and_trim(cgroup_use, ',');
+		if (!cgroup_use_list)
+			return NULL;
+	}
 
-/* In the case of a bind mount, there could be two long pathnames in the
- * mntent plus options so use large enough buffer size
- */
-#define LARGE_MAXPATHLEN 4 * MAXPATHLEN
+	md = lxc_cgroup_load_meta2((const char **)cgroup_use_list);
+	saved_errno = errno;
+	lxc_free_array((void **)cgroup_use_list, free);
+	errno = saved_errno;
+	return md;
+}
 
-/* Check if a mount is a cgroup hierarchy for any subsystem.
- * Return the first subsystem found (or NULL if none).
- */
-static char *mount_has_subsystem(const struct mntent *mntent)
+struct cgroup_meta_data *lxc_cgroup_load_meta2(const char **subsystem_whitelist)
 {
-	FILE *f;
-	char *c, *ret = NULL;
-	char line[MAXPATHLEN];
+	FILE *proc_cgroups = NULL;
+	FILE *proc_self_cgroup = NULL;
+	FILE *proc_self_mountinfo = NULL;
+	bool all_kernel_subsystems = true;
+	bool all_named_subsystems = false;
+	struct cgroup_meta_data *meta_data = NULL;
+	char **kernel_subsystems = NULL;
+	size_t kernel_subsystems_count = 0;
+	size_t kernel_subsystems_capacity = 0;
+	size_t hierarchy_capacity = 0;
+	size_t mount_point_capacity = 0;
+	size_t mount_point_count = 0;
+	char **tokens = NULL;
+	size_t token_capacity = 0;
+	char *line = NULL;
+	size_t sz = 0;
+	int r, saved_errno = 0;
 
-	/* read the list of subsystems from the kernel */
-	f = fopen("/proc/cgroups", "r");
-	if (!f)
-		return 0;
+	/* if the subsystem whitelist is not specified, include all
+	 * hierarchies that contain kernel subsystems by default but
+	 * no hierarchies that only contain named subsystems
+	 *
+	 * if it is specified, the specifier @all will select all
+	 * hierarchies, @kernel will select all hierarchies with
+	 * kernel subsystems and @named will select all named
+	 * hierarchies
+	 */
+	all_kernel_subsystems = subsystem_whitelist ?
+		(lxc_string_in_array("@kernel", subsystem_whitelist) || lxc_string_in_array("@all", subsystem_whitelist)) :
+		true;
+	all_named_subsystems = subsystem_whitelist ?
+		(lxc_string_in_array("@named", subsystem_whitelist) || lxc_string_in_array("@all", subsystem_whitelist)) :
+		false;
+
+	meta_data = calloc(1, sizeof(struct cgroup_meta_data));
+	if (!meta_data)
+		return NULL;
+	meta_data->ref = 1;
 
-	/* skip the first line, which contains column headings */
-	if (!fgets(line, MAXPATHLEN, f)) {
-		fclose(f);
-		return 0;
-	}
+	/* Step 1: determine all kernel subsystems */
+	proc_cgroups = fopen_cloexec("/proc/cgroups", "r");
+	if (!proc_cgroups)
+		goto out_error;
 
-	while (fgets(line, MAXPATHLEN, f)) {
-		c = strchr(line, '\t');
-		if (!c)
-			continue;
-		*c = '\0';
+	while (getline(&line, &sz, proc_cgroups) != -1) {
+		char *tab1;
+		char *tab2;
+		int hierarchy_number;
 
-		ret = hasmntopt(mntent, line);
-		if (ret)
-			break;
-	}
-
-	fclose(f);
-	return ret;
-}
+		if (line[0] == '#')
+			continue;
+		if (!line[0])
+			continue;
 
-/*
- * Determine mountpoint for a cgroup subsystem.
- * @dest: a passed-in buffer of at least size MAXPATHLEN into which the path
- * is copied.
- * @subsystem: cgroup subsystem (i.e. freezer)
- *
- * Returns true on success, false on error.
- */
-bool get_subsys_mount(char *dest, const char *subsystem)
-{
-	struct mntent mntent_r;
-	FILE *file = NULL;
-	int ret;
-	bool retv = false;
-	char buf[LARGE_MAXPATHLEN] = {0};
+		tab1 = strchr(line, '\t');
+		if (!tab1)
+			continue;  
+		*tab1++ = '\0';
+		tab2 = strchr(tab1, '\t');
+		if (!tab2)
+			continue;
+		*tab2 = '\0';
 
-	file = setmntent(MTAB, "r");
-	if (!file) {
-		SYSERROR("failed to open %s", MTAB);
-		return -1;
+		tab2 = NULL;
+		hierarchy_number = strtoul(tab1, &tab2, 10);
+		if (!tab2 || *tab2)
+			continue;
+		(void)hierarchy_number;
+
+		r = lxc_grow_array((void ***)&kernel_subsystems, &kernel_subsystems_capacity, kernel_subsystems_count + 1, 12);
+		if (r < 0)
+			goto out_error;
+		kernel_subsystems[kernel_subsystems_count] = strdup(line);
+		if (!kernel_subsystems[kernel_subsystems_count])
+			goto out_error;
+		kernel_subsystems_count++;
 	}
 
-	while ((getmntent_r(file, &mntent_r, buf, sizeof(buf)))) {
-		if (strcmp(mntent_r.mnt_type, "cgroup"))
-			continue;
+	fclose(proc_cgroups);
+	proc_cgroups = NULL;
 
-		if (subsystem) {
-			if (!hasmntopt(&mntent_r, subsystem))
-				continue;
-		} else {
-			if (!mount_has_subsystem(&mntent_r))
-				continue;
-		}
+	/* Step 2: determine all hierarchies (by reading /proc/self/cgroup),
+	 *         since mount points don't specify hierarchy number and
+	 *         /proc/cgroups does not contain named hierarchies
+	 */
+	proc_self_cgroup = fopen_cloexec("/proc/self/cgroup", "r");
+	/* if for some reason (because of setns() and pid namespace for example),
+	 * /proc/self is not valid, we try /proc/1/cgroup... */
+	if (!proc_self_cgroup)
+		proc_self_cgroup = fopen_cloexec("/proc/1/cgroup", "r");
+	if (!proc_self_cgroup)
+		goto out_error;
+
+	while (getline(&line, &sz, proc_self_cgroup) != -1) {
+		/* file format: hierarchy:subsystems:group,
+		 * we only extract hierarchy and subsystems
+		 * here */
+		char *colon1;
+		char *colon2;
+		int hierarchy_number;
+		struct cgroup_hierarchy *h = NULL;
+		char **p;
+
+		if (!line[0])
+			continue;
 
-		ret = snprintf(dest, MAXPATHLEN, "%s", mntent_r.mnt_dir);
-		if (ret < 0 || ret >= MAXPATHLEN)
-			goto fail;
+		colon1 = strchr(line, ':');
+		if (!colon1)
+			continue;  
+		*colon1++ = '\0';
+		colon2 = strchr(colon1, ':');
+		if (!colon2)
+			continue;
+		*colon2 = '\0';
 
-		retv = true;
-		goto out;
-	};
+		colon2 = NULL;
+		hierarchy_number = strtoul(line, &colon2, 10);
+		if (!colon2 || *colon2)
+			continue;
 
-fail:
-	DEBUG("Failed to find cgroup for %s\n",
-	      subsystem ? subsystem : "(NULL)");
-out:
-	endmntent(file);
-	return retv;
-}
+		if (hierarchy_number > meta_data->maximum_hierarchy) {
+			/* lxc_grow_array will never shrink, so even if we find a lower
+			* hierarchy number here, the array will never be smaller
+			*/
+			r = lxc_grow_array((void ***)&meta_data->hierarchies, &hierarchy_capacity, hierarchy_number + 1, 12);
+			if (r < 0)
+				goto out_error;
 
-/*
- * is_in_cgroup: check whether pid is found in the passed-in cgroup tasks
- * file.
- * @path:  in full path to a cgroup tasks file
- * Note that in most cases the file will simply not exist, which is ok - it
- * just means that's not our cgroup.
- */
-static bool is_in_cgroup(pid_t pid, char *path)
-{
-	int cmppid;
-	FILE *f = fopen(path, "r");
-	char *line = NULL;
-	size_t sz = 0;
+			meta_data->maximum_hierarchy = hierarchy_number;
+		}
 
-	if (!f)
-		return false;
-	while (getline(&line, &sz, f) != -1) {
-		if (sscanf(line, "%d", &cmppid) == 1 && cmppid == pid) {
-			fclose(f);
-			free(line);
-			return true;
+		/* this shouldn't happen, we had this already */
+		if (meta_data->hierarchies[hierarchy_number])
+			goto out_error;
+
+		h = calloc(1, sizeof(struct cgroup_hierarchy));
+		if (!h)
+			goto out_error;
+
+		meta_data->hierarchies[hierarchy_number] = h;
+
+		h->index = hierarchy_number;
+		h->subsystems = lxc_string_split_and_trim(colon1, ',');
+		if (!h->subsystems)
+			goto out_error;
+		/* see if this hierarchy should be considered */
+		if (!all_kernel_subsystems || !all_named_subsystems) {
+			for (p = h->subsystems; *p; p++) {
+				if (!strncmp(*p, "name=", 5)) {
+					if (all_named_subsystems || (subsystem_whitelist && lxc_string_in_array(*p, subsystem_whitelist))) {
+						h->used = true;
+						break;
+					}
+				} else {
+					if (all_kernel_subsystems || (subsystem_whitelist && lxc_string_in_array(*p, subsystem_whitelist))) {
+						h->used = true;
+						break;
+					}
+				}
+			}
+		} else {
+			/* we want all hierarchies anyway */
+			h->used = true;
 		}
 	}
-	fclose(f);
-	if (line)
-		free(line);
-	return false;
-}
 
-/*
- * lxc_cgroup_path_get: Get the absolute pathname for a cgroup
- * file for a running container.
- *
- * @subsystem : subsystem of interest (e.g. "freezer"). If NULL, then
- *              the first cgroup entry in mtab will be used.
- * @name      : name of container to connect to
- * @lxcpath   : the lxcpath in which the container is running
- *
- * This is the exported function, which determines cgpath from the
- * lxc-start of the @name container running in @lxcpath.
- *
- * Returns path on success, NULL on error. The caller must free()
- * the returned path.
- */
-char *lxc_cgroup_path_get(const char *subsystem, const char *name,
-			  const char *lxcpath)
-{
-	char *cgpath, *cgp, path[MAXPATHLEN], *pathp, *p;
-	pid_t initpid = lxc_cmd_get_init_pid(name, lxcpath);
-	int ret;
+	fclose(proc_self_cgroup);
+	proc_self_cgroup = NULL;
+	
+	/* Step 3: determine all mount points of each hierarchy */
+	proc_self_mountinfo = fopen_cloexec("/proc/self/mountinfo", "r");
+	/* if for some reason (because of setns() and pid namespace for example),
+	 * /proc/self is not valid, we try /proc/1/mountinfo... */
+	if (!proc_self_mountinfo)
+		proc_self_mountinfo = fopen_cloexec("/proc/1/mountinfo", "r");
+	if (!proc_self_mountinfo)
+		goto out_error;
+
+	while (getline(&line, &sz, proc_self_mountinfo) != -1) {
+		char *token, *saveptr = NULL;
+		size_t i, j, k;
+		struct cgroup_mount_point *mount_point;
+		struct cgroup_hierarchy *h;
+		char **subsystems;
+
+		if (line[0] && line[strlen(line) - 1] == '\n')
+			line[strlen(line) - 1] = '\0';
+
+		for (i = 0; (token = strtok_r(line, " ", &saveptr)); line = NULL) {
+			r = lxc_grow_array((void ***)&tokens, &token_capacity, i + 1, 64);
+			if (r < 0)
+				goto out_error;
+			tokens[i++] = token;
+		}
 
-	if (initpid < 0)
-		return NULL;
+		/* layout of /proc/self/mountinfo (0-based token index):
+		 *      0: id
+		 *      1: parent id
+		 *      2: device major:minor
+		 *      3: mount prefix
+		 *      4: mount point
+		 *      5: per-mount options
+		 *    [6 .. 5+X]: X optional fields (X may be 0)
+		 *    6+X: "-"
+		 *    7+X: type
+		 *    8+X: source
+		 *    9+X: per-superblock options
+		 */
+		for (j = 6; j < i && tokens[j]; j++)
+			if (!strcmp(tokens[j], "-"))
+				break;
 
-	cgpath = lxc_cmd_get_cgroup_path(name, lxcpath, subsystem);
-	if (!cgpath)
-		return NULL;
+		/* could not find separator */
+		if (j >= i || !tokens[j])
+			continue;
+		/* there should be exactly three fields after
+		 * the separator
+		 */
+		if (i != j + 4)
+			continue;
 
-	if (!get_subsys_mount(path, subsystem))
-		return NULL;
+		/* not a cgroup filesystem */
+		if (strcmp(tokens[j + 1], "cgroup") != 0)
+			continue;
 
-	pathp = path + strlen(path);
-	/*
-	 * find a mntpt where i have the subsystem mounted, then find
-	 * a subset cgpath under that which has pid in it.
-	 *
-	 * If d->mntpt is '/a/b/c/d', and the mountpoint is /x/y/z,
-	 * then look for ourselves in:
-	 *    /x/y/z/a/b/c/d/tasks
-	 *    /x/y/z/b/c/d/tasks
-	 *    /x/y/z/c/d/tasks
-	 *    /x/y/z/d/tasks
-	 *    /x/y/z/tasks
-	 */
-	cgp = cgpath;
-	while (cgp[0]) {
-		ret = snprintf(pathp, MAXPATHLEN - (pathp - path), "%s/tasks", cgp);
-		if (ret < 0 || ret >= MAXPATHLEN)
-			return NULL;
-		if (!is_in_cgroup(initpid, path)) {
-			// does not exist, try the next one
-			cgp = index(cgp+1, '/');
-			if (!cgp)
+		subsystems = subsystems_from_mount_options(tokens[j + 3], kernel_subsystems);
+		if (!subsystems)
+			goto out_error;
+
+		h = NULL;
+		for (k = 1; k <= meta_data->maximum_hierarchy; k++) {
+			if (meta_data->hierarchies[k] &&
+			    meta_data->hierarchies[k]->subsystems[0] &&
+			    lxc_string_in_array(meta_data->hierarchies[k]->subsystems[0], (const char **)subsystems)) {
+				/* TODO: we could also check if the lists really match completely,
+				 *       just to have an additional sanity check */
+				h = meta_data->hierarchies[k];
 				break;
-			continue;
+			}
 		}
-		break;
-	}
-	if (!cgp || !*cgp) {
-		// try just the path
-		ret = snprintf(pathp, MAXPATHLEN - (pathp - path), "/tasks");
-		if (ret < 0 || ret >= MAXPATHLEN)
-			return NULL;
-		if (!is_in_cgroup(initpid, path)) {
-			return NULL;
+		lxc_free_array((void **)subsystems, free);
+
+		r = lxc_grow_array((void ***)&meta_data->mount_points, &mount_point_capacity, mount_point_count + 1, 12);
+		if (r < 0)
+			goto out_error;
+
+		/* create mount point object */
+		mount_point = calloc(1, sizeof(*mount_point));
+		if (!mount_point)
+			goto out_error;
+
+		meta_data->mount_points[mount_point_count++] = mount_point;
+
+		mount_point->hierarchy = h;
+		mount_point->mount_point = strdup(tokens[4]);
+		mount_point->mount_prefix = strdup(tokens[3]);
+		if (!mount_point->mount_point || !mount_point->mount_prefix)
+			goto out_error;
+		mount_point->read_only = !lxc_string_in_list("rw", tokens[5], ',');
+
+		if (!strcmp(mount_point->mount_prefix, "/")) {
+			if (mount_point->read_only) {
+				if (!h->ro_absolute_mount_point)
+					h->ro_absolute_mount_point = mount_point;
+			} else {
+				if (!h->rw_absolute_mount_point)
+					h->rw_absolute_mount_point = mount_point;
+			}
 		}
-		return strdup("/");
-	}
-	// path still has 'tasks' on the end, drop it
-	if ((p = strrchr(path, '/')) != NULL)
-		*p = '\0';
-	return strdup(path);
-}
 
-/*
- * do_cgroup_set: Write a value into a cgroup file
- *
- * @path      : absolute path to cgroup file
- * @value     : value to write into file
- *
- * Returns 0 on success, < 0 on error.
- */
-static int do_cgroup_set(const char *path, const char *value)
-{
-	int fd, ret;
-
-	if ((fd = open(path, O_WRONLY)) < 0) {
-		SYSERROR("open %s : %s", path, strerror(errno));
-		return -1;
+		k = lxc_array_len((void **)h->all_mount_points);
+		r = lxc_grow_array((void ***)&h->all_mount_points, &h->all_mount_point_capacity, k + 1, 4);
+		if (r < 0)
+			goto out_error;
+		h->all_mount_points[k] = mount_point;
 	}
 
-	if ((ret = write(fd, value, strlen(value))) < 0) {
-		close(fd);
-		SYSERROR("write %s : %s", path, strerror(errno));
-		return ret;
+	/* oops, we couldn't find anything */
+	if (!meta_data->hierarchies || !meta_data->mount_points) {
+		errno = EINVAL;
+		goto out_error;
 	}
 
-	if ((ret = close(fd)) < 0) {
-		SYSERROR("close %s : %s", path, strerror(errno));
-		return ret;
-	}
-	return 0;
+	return meta_data;
+
+out_error:
+	saved_errno = errno;
+	if (proc_cgroups)
+		fclose(proc_cgroups);
+	if (proc_self_cgroup)
+		fclose(proc_self_cgroup);
+	if (proc_self_mountinfo)
+		fclose(proc_self_mountinfo);
+	free(line);
+	free(tokens);
+	lxc_free_array((void **)kernel_subsystems, free);
+	lxc_cgroup_put_meta(meta_data);
+	errno = saved_errno;
+	return NULL;
 }
 
-static int in_subsys_list(const char *s, const char *list)
+struct cgroup_meta_data *lxc_cgroup_get_meta(struct cgroup_meta_data *meta_data)
 {
-	char *token, *str, *saveptr = NULL;
-
-	if (!list || !s)
-		return 0;
+	meta_data->ref++;
+	return meta_data;
+}
 
-	str = alloca(strlen(list)+1);
-	strcpy(str, list);
-	for (; (token = strtok_r(str, ",", &saveptr)); str = NULL) {
-		if (strcmp(s, token) == 0)
-			return 1;
+struct cgroup_meta_data *lxc_cgroup_put_meta(struct cgroup_meta_data *meta_data)
+{
+	size_t i;
+	if (!meta_data)
+		return NULL;
+	if (--meta_data->ref > 0)
+		return meta_data;
+	lxc_free_array((void **)meta_data->mount_points, (lxc_free_fn)lxc_cgroup_mount_point_free);
+	if (meta_data->hierarchies) {
+		for (i = 0; i <= meta_data->maximum_hierarchy; i++)
+			lxc_cgroup_hierarchy_free(meta_data->hierarchies[i]);
 	}
-
-	return 0;
+	free(meta_data->hierarchies);
+	return NULL;
 }
 
-static char *cgroup_get_subsys_abspath(struct lxc_handler *handler, const char *subsys)
+struct cgroup_hierarchy *lxc_cgroup_find_hierarchy(struct cgroup_meta_data *meta_data, const char *subsystem)
 {
-	struct cgroup_desc *d;
-
-	for (d = handler->cgroup; d; d = d->next) {
-		if (in_subsys_list(subsys, d->subsystems))
-			return d->curcgroup;
+	size_t i;
+	for (i = 0; i <= meta_data->maximum_hierarchy; i++) {
+		struct cgroup_hierarchy *h = meta_data->hierarchies[i];
+		if (h && lxc_string_in_array(subsystem, (const char **)h->subsystems))
+			return h;
 	}
-
 	return NULL;
 }
 
-static bool cgroup_devices_has_deny(struct lxc_handler *h, char *v)
+struct cgroup_mount_point *lxc_cgroup_find_mount_point(struct cgroup_hierarchy *hierarchy, const char *group, bool should_be_writable)
 {
-	char *cgabspath, path[MAXPATHLEN];
-	FILE *f;
-	char *line = NULL;
-	size_t len = 0;
-	bool ret = true;
-	int r;
+	struct cgroup_mount_point **mps;
+	struct cgroup_mount_point *current_result = NULL;
+	ssize_t quality = -1;
 
-	// XXX FIXME if users could use something other than 'lxc.devices.deny = a'.
-	// not sure they ever do, but they *could*
-	// right now, I'm assuming they do NOT
-	if (strcmp(v, "a") && strcmp(v, "a *:* rwm"))
-		return false;
-	cgabspath = cgroup_get_subsys_abspath(h, "devices");
-	if (!cgabspath)
-		return false;
+	/* trivial case */
+	if (hierarchy->rw_absolute_mount_point)
+		return hierarchy->rw_absolute_mount_point;
+	if (!should_be_writable && hierarchy->ro_absolute_mount_point)
+		return hierarchy->ro_absolute_mount_point;
 
-	r = snprintf(path, MAXPATHLEN, "%s/devices.list", cgabspath);
-	if (r < 0 || r >= MAXPATHLEN) {
-		ERROR("pathname too long for devices.list");
-		return false;
-	}
+	for (mps = hierarchy->all_mount_points; mps && *mps; mps++) {
+		struct cgroup_mount_point *mp = *mps;
+		size_t prefix_len = mp->mount_prefix ? strlen(mp->mount_prefix) : 0;
 
-	if (!(f = fopen(path, "r")))
-		return false;
+		if (prefix_len == 1 && mp->mount_prefix[0] == '/')
+			prefix_len = 0;
 
-	while (getline(&line, &len, f) != -1) {
-		size_t len = strlen(line);
-		if (len > 0 && line[len-1] == '\n')
-			line[len-1] = '\0';
-		if (strcmp(line, "a *:* rwm") == 0) {
-			ret = false;
-			goto out;
+		if (should_be_writable && mp->read_only)
+			continue;
+
+		if (!prefix_len ||
+		    (strncmp(group, mp->mount_prefix, prefix_len) == 0 &&
+		     (group[prefix_len] == '\0' || group[prefix_len] == '/'))) {
+			/* search for the best quality match, i.e. the match with the
+			 * shortest prefix where this group is still contained
+			 */
+			if (quality == -1 || prefix_len < quality) {
+				current_result = mp;
+				quality = prefix_len;
+			}
 		}
 	}
 
-out:
-	fclose(f);
-	if (line)
-		free(line);
-	return ret;
+	if (!current_result)
+		errno = ENOENT;
+	return current_result;
 }
 
-static bool cgroup_devices_has_allow(struct lxc_handler *h, char *v)
+char *lxc_cgroup_find_abs_path(const char *subsystem, const char *group, bool should_be_writable, const char *suffix)
 {
-	char *cgabspath, path[MAXPATHLEN];
-	int r;
-	bool ret = false;
-	FILE *f;
-	char *line = NULL;
-	size_t len = 0;
+	struct cgroup_meta_data *meta_data;
+	struct cgroup_hierarchy *h;
+	struct cgroup_mount_point *mp;
+	char *result;
+	int saved_errno;
+
+	meta_data = lxc_cgroup_load_meta();
+	if (!meta_data)
+		return NULL;
 
-	cgabspath = cgroup_get_subsys_abspath(h, "devices");
-	if (!cgabspath)
-		return false;
+	h = lxc_cgroup_find_hierarchy(meta_data, subsystem);
+	if (!h)
+		goto out_error;
 
-	r = snprintf(path, MAXPATHLEN, "%s/devices.list", cgabspath);
-	if (r < 0 || r >= MAXPATHLEN) {
-		ERROR("pathname too long to for devices.list");
-		return false;
-	}
+	mp = lxc_cgroup_find_mount_point(h, group, should_be_writable);
+	if (!mp)
+		goto out_error;
 
-	if (!(f = fopen(path, "r")))
-		return false;
+	result = cgroup_to_absolute_path(mp, group, suffix);
+	if (!result)
+		goto out_error;
 
-	while (getline(&line, &len, f) != -1) {
-		if (len < 1)
-			goto out;
-		if (line[len-1] == '\n')
-			line[len-1] = '\0';
-		if (strcmp(line, "a *:* rwm") == 0 || strcmp(line, v) == 0) {
-			ret = true;
-			goto out;
-		}
-	}
+	lxc_cgroup_put_meta(meta_data);
+	return result;
 
-out:
-	if (line)
-		free(line);
-	fclose(f);
-	return ret;
+out_error:
+	saved_errno = errno;
+	lxc_cgroup_put_meta(meta_data);
+	errno = saved_errno;
+	return NULL;
 }
 
-/*
- * lxc_cgroup_set_bypath: Write a value into a cgroup file
- *
- * @cgrelpath : a container's relative cgroup path (e.g. "lxc/c1")
- * @filename  : the cgroup file to write (e.g. "freezer.state")
- * @value     : value to write into file
- *
- * Returns 0 on success, < 0 on error.
- */
-int lxc_cgroup_set_value(struct lxc_handler *handler, const char *filename,
-			const char *value)
+struct cgroup_process_info *lxc_cgroup_process_info_get(pid_t pid, struct cgroup_meta_data *meta)
 {
-	char *cgabspath, path[MAXPATHLEN], *p;
-	int ret;
-
-	ret = snprintf(path, MAXPATHLEN, "%s", filename);
-	if (ret < 0 || ret >= MAXPATHLEN)
-		return -1;
-	if ((p = index(path, '.')) != NULL)
-		*p = '\0';
-	cgabspath = cgroup_get_subsys_abspath(handler, path);
-	if (!cgabspath)
-		return -1;
-
-	ret = snprintf(path, MAXPATHLEN, "%s/%s", cgabspath, filename);
-	if (ret < 0 || ret >= MAXPATHLEN) {
-		ERROR("pathname too long to set cgroup value %s to %s",
-			filename, value);
-		return -1;
-	}
-
-	return do_cgroup_set(path, value);
+	char pid_buf[32];
+	snprintf(pid_buf, 32, "/proc/%lu/cgroup", (unsigned long)pid);
+	return lxc_cgroup_process_info_getx(pid_buf, meta);
 }
 
-/*
- * lxc_cgroup_set: Write a value into a cgroup file
- *
- * @name      : name of container to connect to
- * @filename  : the cgroup file to write (e.g. "freezer.state")
- * @value     : value to write into file
- * @lxcpath   : the lxcpath in which the container is running
- *
- * Returns 0 on success, < 0 on error.
- */
-int lxc_cgroup_set(const char *name, const char *filename, const char *value,
-		   const char *lxcpath)
+struct cgroup_process_info *lxc_cgroup_process_info_get_init(struct cgroup_meta_data *meta)
 {
-	int ret;
-	char *cgabspath;
-	char path[MAXPATHLEN];
-	char *subsystem = alloca(strlen(filename)+1), *p;
-	strcpy(subsystem, filename);
+	return lxc_cgroup_process_info_get(1, meta);
+}
 
-	if ((p = index(subsystem, '.')) != NULL)
-		*p = '\0';
+struct cgroup_process_info *lxc_cgroup_process_info_get_self(struct cgroup_meta_data *meta)
+{
+	struct cgroup_process_info *i;
+	i = lxc_cgroup_process_info_getx("/proc/self/cgroup", meta);
+	if (!i)
+		i = lxc_cgroup_process_info_get(getpid(), meta);
+	return i;
+}
 
-	cgabspath = lxc_cgroup_path_get(subsystem, name, lxcpath);
-	if (!cgabspath)
-		return -1;
+/* create a new cgroup */
+extern struct cgroup_process_info *lxc_cgroup_create(const char *name, const char *path_pattern, struct cgroup_meta_data *meta_data, const char *sub_pattern)
+{
+	char **cgroup_path_components;
+	char **p = NULL;
+	char *path_so_far = NULL;
+	char **new_cgroup_paths = NULL;
+	char **new_cgroup_paths_sub = NULL;
+	struct cgroup_mount_point *mp;
+	struct cgroup_hierarchy *h;
+	struct cgroup_process_info *base_info = NULL;
+	struct cgroup_process_info *info_ptr;
+	int saved_errno;
+	int r;
+	unsigned suffix = 0;
+	bool had_sub_pattern = false;
+	size_t i;
 
-	ret = snprintf(path, MAXPATHLEN, "%s/%s", cgabspath, filename);
-	if (ret < 0 || ret >= MAXPATHLEN) {
-		ERROR("pathname too long");
-		ret = -1;
-		goto out;
+	if (!is_valid_cgroup(name)) {
+		ERROR("Invalid cgroup name: '%s'", name);
+		errno = EINVAL;
+		return NULL;
 	}
 
-	ret = do_cgroup_set(path, value);
+	if (!strstr(path_pattern, "%n")) {
+		ERROR("Invalid cgroup path pattern: '%s'; contains no %%n for specifying container name", path_pattern);
+		errno = EINVAL;
+		return NULL;
+	}
 
-out:
-	free(cgabspath);
-	return ret;
-}
+	/* we will modify the result of this operation directly,
+	 * so we don't have to copy the data structure
+	 */
+	base_info = (path_pattern[0] == '/') ?
+		lxc_cgroup_process_info_get_init(meta_data) :
+		lxc_cgroup_process_info_get_self(meta_data);
+	if (!base_info)
+		return NULL;
 
-/*
- * lxc_cgroup_get: Read value from a cgroup file
- *
- * @name      : name of container to connect to
- * @filename  : the cgroup file to read (e.g. "freezer.state")
- * @value     : a pre-allocated buffer to copy the answer into
- * @len       : the length of pre-allocated @value
- * @lxcpath   : the lxcpath in which the container is running
- *
- * Returns the number of bytes read on success, < 0 on error
- *
- * If you pass in NULL value or 0 len, the return value will be the size of
- * the file, and @value will not contain the contents.
- *
- * Note that we can't get the file size quickly through stat or lseek.
- * Therefore if you pass in len > 0 but less than the file size, your only
- * indication will be that the return value will be equal to the passed-in ret.
- * We will not return the actual full file size.
- */
-int lxc_cgroup_get(const char *name, const char *filename, char *value,
-		   size_t len, const char *lxcpath)
-{
-	int fd, ret;
-	char *cgabspath;
-	char path[MAXPATHLEN];
-	char *subsystem = alloca(strlen(filename)+1), *p;
+	new_cgroup_paths = calloc(meta_data->maximum_hierarchy + 1, sizeof(char *));
+	if (!new_cgroup_paths)
+		goto out_initial_error;
+
+	new_cgroup_paths_sub = calloc(meta_data->maximum_hierarchy + 1, sizeof(char *));
+	if (!new_cgroup_paths_sub)
+		goto out_initial_error;
+
+	/* find mount points we can use */
+	for (info_ptr = base_info; info_ptr; info_ptr = info_ptr->next) {
+		h = info_ptr->hierarchy;
+		mp = lxc_cgroup_find_mount_point(h, info_ptr->cgroup_path, true);
+		if (!mp) {
+			ERROR("Could not find writable mount point for cgroup hierarchy %d while trying to create cgroup.", h->index);
+			goto out_initial_error;
+		}
+		info_ptr->designated_mount_point = mp;
 
-	strcpy(subsystem, filename);
+		if (handle_clone_children(mp, info_ptr->cgroup_path) < 0) {
+			ERROR("Could not set clone_children to 1 for cpuset hierarchy in parent cgroup.");
+			goto out_initial_error;
+		}
+	}
 
-	if ((p = index(subsystem, '.')) != NULL)
-		*p = '\0';
+	/* normalize the path */
+	cgroup_path_components = lxc_normalize_path(path_pattern);
+	if (!cgroup_path_components)
+		goto out_initial_error;
+
+	/* go through the path components to see if we can create them */
+	for (p = cgroup_path_components; *p || (sub_pattern && !had_sub_pattern); p++) {
+		/* we only want to create the same component with -1, -2, etc.
+		 * if the component contains the container name itself, otherwise
+		 * it's not an error if it already exists
+		 */
+		char *p_eff = *p ? *p : (char *)sub_pattern;
+		bool contains_name = strstr(p_eff, "%n");
+		char *current_component = NULL;
+		char *current_subpath = NULL;
+		char *current_entire_path = NULL;
+		char *parts[3];
+		size_t j = 0;
+		i = 0;
+
+		/* if we are processing the subpattern, we want to make sure
+		 * loop is ended the next time around
+		 */
+		if (!*p) {
+			had_sub_pattern = true;
+			p--;
+		}
 
-	cgabspath = lxc_cgroup_path_get(subsystem, name, lxcpath);
-	if (!cgabspath)
-		return -1;
+		goto find_name_on_this_level;
+	
+	cleanup_name_on_this_level:
+		/* This is reached if we found a name clash.
+		 * In that case, remove the cgroup from all previous hierarchies
+		 */
+		for (j = 0, info_ptr = base_info; j < i && info_ptr; info_ptr = info_ptr->next, j++) {
+			r = remove_cgroup(info_ptr->designated_mount_point, info_ptr->created_paths[info_ptr->created_paths_count - 1]);
+			if (r < 0)
+				WARN("could not clean up cgroup we created when trying to create container");
+			free(info_ptr->created_paths[info_ptr->created_paths_count - 1]);
+			info_ptr->created_paths[--info_ptr->created_paths_count] = NULL;
+		}
+		if (current_component != current_subpath)
+			free(current_subpath);
+		if (current_component != p_eff)
+			free(current_component);
+		current_component = current_subpath = NULL;
+		/* try again with another suffix */
+		++suffix;
+	
+	find_name_on_this_level:
+		/* determine name of the path component we should create */
+		if (contains_name && suffix > 0) {
+			char *buf = calloc(strlen(name) + 32, 1);
+			if (!buf)
+				goto out_initial_error;
+			snprintf(buf, strlen(name) + 32, "%s-%u", name, suffix);
+			current_component = lxc_string_replace("%n", buf, p_eff);
+			free(buf);
+		} else {
+			current_component = contains_name ? lxc_string_replace("%n", name, p_eff) : p_eff;
+		}
+		parts[0] = path_so_far;
+		parts[1] = current_component;
+		parts[2] = NULL;
+		current_subpath = path_so_far ? lxc_string_join("/", (const char **)parts, false) : current_component;
+
+		/* Now go through each hierarchy and try to create the
+		 * corresponding cgroup
+		 */
+		for (i = 0, info_ptr = base_info; info_ptr; info_ptr = info_ptr->next, i++) {
+			char *parts2[3];
+			current_entire_path = NULL;
+
+			parts2[0] = !strcmp(info_ptr->cgroup_path, "/") ? "" : info_ptr->cgroup_path;
+			parts2[1] = current_subpath;
+			parts2[2] = NULL;
+			current_entire_path = lxc_string_join("/", (const char **)parts2, false);
+
+			if (!*p) {
+				/* we are processing the subpath, so only update that one */
+				free(new_cgroup_paths_sub[i]);
+				new_cgroup_paths_sub[i] = strdup(current_entire_path);
+				if (!new_cgroup_paths_sub[i])
+					goto cleanup_from_error;
+			} else {
+				/* remember which path was used on this controller */
+				free(new_cgroup_paths[i]);
+				new_cgroup_paths[i] = strdup(current_entire_path);
+				if (!new_cgroup_paths[i])
+					goto cleanup_from_error;
+			}
 
-	ret = snprintf(path, MAXPATHLEN, "%s/%s", cgabspath, filename);
-	if (ret < 0 || ret >= MAXPATHLEN) {
-		ERROR("pathname too long");
-		ret = -1;
-		goto out;
-	}
+			r = create_cgroup(info_ptr->designated_mount_point, current_entire_path);
+			if (r < 0 && errno == EEXIST && contains_name) {
+				/* name clash => try new name with new suffix */
+				free(current_entire_path);
+				current_entire_path = NULL;
+				goto cleanup_name_on_this_level;
+			} else if (r < 0 && errno != EEXIST) {
+				SYSERROR("Could not create cgroup %s", current_entire_path);
+				goto cleanup_from_error;
+			} else if (r == 0) {
+				/* successfully created */
+				r = lxc_grow_array((void ***)&info_ptr->created_paths, &info_ptr->created_paths_capacity, info_ptr->created_paths_count + 1, 8);
+				if (r < 0)
+					goto cleanup_from_error;
+				info_ptr->created_paths[info_ptr->created_paths_count++] = current_entire_path;
+			} else {
+				/* if we didn't create the cgroup, then we have to make sure that
+				 * further cgroups will be created properly
+				 */
+				if (handle_clone_children(mp, info_ptr->cgroup_path) < 0) {
+					ERROR("Could not set clone_children to 1 for cpuset hierarchy in pre-existing cgroup.");
+					goto cleanup_from_error;
+				}
+
+				/* already existed but path component of pattern didn't contain '%n',
+				 * so this is not an error; but then we don't need current_entire_path
+				 * anymore...
+				 */
+				free(current_entire_path);
+				current_entire_path = NULL;
+			}
+		}
 
-	fd = open(path, O_RDONLY);
-	if (fd < 0) {
-		ERROR("open %s : %s", path, strerror(errno));
-		ret = -1;
-		goto out;
+		/* save path so far */
+		free(path_so_far);
+		path_so_far = strdup(current_subpath);
+		if (!path_so_far)
+			goto cleanup_from_error;
+
+		/* cleanup */
+		if (current_component != current_subpath)
+			free(current_subpath);
+		if (current_component != p_eff)
+			free(current_component);
+		current_component = current_subpath = NULL;
+		continue;
+	
+	cleanup_from_error:
+		/* called if an error occurred in the loop, so we
+		 * do some additional cleanup here
+		 */
+		saved_errno = errno;
+		if (current_component != current_subpath)
+			free(current_subpath);
+		if (current_component != p_eff)
+			free(current_component);
+		free(current_entire_path);
+		errno = saved_errno;
+		goto out_initial_error;
 	}
 
-	if (!len || !value) {
-		char buf[100];
-		int count = 0;
-		while ((ret = read(fd, buf, 100)) > 0)
-			count += ret;
-		if (ret >= 0)
-			ret = count;
-	} else {
-		memset(value, 0, len);
-		ret = read(fd, value, len);
+	/* we're done, now update the paths */
+	for (i = 0, info_ptr = base_info; info_ptr; info_ptr = info_ptr->next, i++) {
+		free(info_ptr->cgroup_path);
+		info_ptr->cgroup_path = new_cgroup_paths[i];
+		info_ptr->cgroup_path_sub = new_cgroup_paths_sub[i];
 	}
-
-	if (ret < 0)
-		ERROR("read %s : %s", path, strerror(errno));
-
-	close(fd);
-out:
-	free(cgabspath);
-	return ret;
+	/* don't use lxc_free_array since we used the array members
+	 * to store them in our result...
+	 */
+	free(new_cgroup_paths);
+	free(new_cgroup_paths_sub);
+	free(path_so_far);
+	lxc_free_array((void **)cgroup_path_components, free);
+	return base_info;
+
+out_initial_error:
+	saved_errno = errno;
+	free(path_so_far);
+	lxc_cgroup_process_info_free_and_remove(base_info);
+	lxc_free_array((void **)new_cgroup_paths, free);
+	lxc_free_array((void **)new_cgroup_paths_sub, free);
+	lxc_free_array((void **)cgroup_path_components, free);
+	errno = saved_errno;
+	return NULL;
 }
 
-int lxc_cgroup_nrtasks(struct lxc_handler *handler)
+/* get the cgroup membership of a given container */
+struct cgroup_process_info *lxc_cgroup_get_container_info(const char *name, const char *lxcpath, struct cgroup_meta_data *meta_data)
 {
-	char path[MAXPATHLEN];
-	int pid, ret;
-	FILE *file;
-
-	if (!handler->cgroup)
-		return -1;
-
-	/* XXX Should we use a specific subsystem rather than the first one we
-	 * found (handler->cgroup->curcgroup)? */
-	ret = snprintf(path, MAXPATHLEN, "%s/tasks", handler->cgroup->curcgroup);
-	if (ret < 0 || ret >= MAXPATHLEN) {
-		ERROR("pathname too long");
-		return -1;
-	}
+	struct cgroup_process_info *result = NULL;
+	int saved_errno = 0;
+	size_t i;
+	struct cgroup_process_info **cptr = &result;
+	struct cgroup_process_info *entry = NULL;
+	char *path = NULL;
+
+	for (i = 0; i <= meta_data->maximum_hierarchy; i++) {
+		struct cgroup_hierarchy *h = meta_data->hierarchies[i];
+		if (!h || !h->used)
+			continue;
 
-	file = fopen(path, "r");
-	if (!file) {
-		SYSERROR("fopen '%s' failed", path);
-		return -1;
+		/* use the command interface to look for the cgroup */
+		path = lxc_cmd_get_cgroup_path(name, lxcpath, h->subsystems[0]);
+		if (!path)
+			goto out_error;
+
+		entry = calloc(1, sizeof(struct cgroup_process_info));
+		if (!entry)
+			goto out_error;
+		entry->meta_ref = lxc_cgroup_get_meta(meta_data);
+		entry->hierarchy = h;
+		entry->cgroup_path = path;
+		path = NULL;
+
+		/* it is not an error if we don't find anything here,
+		 * it is up to the caller to decide what to do in that
+		 * case */
+		entry->designated_mount_point = lxc_cgroup_find_mount_point(h, entry->cgroup_path, true);
+
+		*cptr = entry;
+		cptr = &entry->next;
+		entry = NULL;
 	}
 
-	ret = 0;
-	while (fscanf(file, "%d", &pid) != EOF)
-		ret++;
-
-	fclose(file);
-	return ret;
+	return result;
+out_error:
+	saved_errno = errno;
+	free(path);
+	lxc_cgroup_process_info_free(result);
+	lxc_cgroup_process_info_free(entry);
+	errno = saved_errno;
+	return NULL;
 }
 
-static int subsys_lists_match(const char *list1, const char *list2)
+/* move a process to the cgroups specified by the membership */
+int lxc_cgroup_enter(struct cgroup_process_info *info, pid_t pid, bool enter_sub)
 {
-	char *token, *str, *saveptr = NULL;
-
-	if (!list1 || !list2)
-		return 0;
+	char pid_buf[32];
+	char *cgroup_tasks_fn;
+	int r;
+	struct cgroup_process_info *info_ptr;
+
+	snprintf(pid_buf, 32, "%lu", (unsigned long)pid);
+	for (info_ptr = info; info_ptr; info_ptr = info_ptr->next) {
+		char *cgroup_path = (enter_sub && info_ptr->cgroup_path_sub) ?
+			info_ptr->cgroup_path_sub :
+			info_ptr->cgroup_path;
+
+		if (!info_ptr->designated_mount_point) {
+			info_ptr->designated_mount_point = lxc_cgroup_find_mount_point(info_ptr->hierarchy, cgroup_path, true);
+			if (!info_ptr->designated_mount_point) {
+				SYSERROR("Could not add pid %lu to cgroup %s: internal error (couldn't find any writable mountpoint to cgroup filesystem)", (unsigned long)pid, cgroup_path);
+				return -1;
+			}
+		}
 
-        if (strlen(list1) != strlen(list2))
-                return 0;
+		cgroup_tasks_fn = cgroup_to_absolute_path(info_ptr->designated_mount_point, cgroup_path, "/tasks");
+		if (!cgroup_tasks_fn) {
+			SYSERROR("Could not add pid %lu to cgroup %s: internal error", (unsigned long)pid, cgroup_path);
+			return -1;
+		}
 
-	str = alloca(strlen(list1)+1);
-	strcpy(str, list1);
-	for (; (token = strtok_r(str, ",", &saveptr)); str = NULL) {
-		if (in_subsys_list(token, list2) == 0)
-			return 0;
+		r = lxc_write_to_file(cgroup_tasks_fn, pid_buf, strlen(pid_buf), false);
+		if (r < 0) {
+			SYSERROR("Could not add pid %lu to cgroup %s: internal error", (unsigned long)pid, cgroup_path);
+			return -1;
+		}
 	}
 
-	return 1;
+	return 0;
 }
 
-static void set_clone_children(struct mntent *m)
+/* free process membership information */
+void lxc_cgroup_process_info_free(struct cgroup_process_info *info)
 {
-	char path[MAXPATHLEN];
-	FILE *fout;
-	int ret;
-
-	if (!in_subsys_list("cpuset", m->mnt_opts))
+	struct cgroup_process_info *next;
+	if (!info)
 		return;
-	ret = snprintf(path, MAXPATHLEN, "%s/cgroup.clone_children", m->mnt_dir);
-	if (ret < 0 || ret > MAXPATHLEN)
-		return;
-	fout = fopen(path, "w");
-	if (!fout)
-		return;
-	fprintf(fout, "1\n");
-	fclose(fout);
+	next = info->next;
+	lxc_cgroup_put_meta(info->meta_ref);
+	free(info->cgroup_path);
+	free(info->cgroup_path_sub);
+	lxc_free_array((void **)info->created_paths, free);
+	free(info);
+	lxc_cgroup_process_info_free(next);
 }
 
-static bool have_visited(char *opts, char *visited, char *all_subsystems)
+/* free process membership information and remove cgroups that were created */
+void lxc_cgroup_process_info_free_and_remove(struct cgroup_process_info *info)
 {
-	char *str, *s = NULL, *token;
-
-	str = alloca(strlen(opts)+1);
-	strcpy(str, opts);
-	for (; (token = strtok_r(str, ",", &s)); str = NULL) {
-		if (!in_subsys_list(token, all_subsystems))
-			continue;
-		if (visited && in_subsys_list(token, visited))
-			return true;
+	struct cgroup_process_info *next;
+	char **pp;
+	if (!info)
+		return;
+	next = info->next;
+	for (pp = info->created_paths; pp && *pp; pp++);
+	for ((void)(pp && --pp); info->created_paths && pp >= info->created_paths; --pp) {
+		struct cgroup_mount_point *mp = info->designated_mount_point;
+		if (!mp)
+			mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, true);
+		if (mp)
+			/* ignore return value here, perhaps we created the
+			 * '/lxc' cgroup in this container but another container
+			 * is still running (for example)
+			 */
+			(void)remove_cgroup(mp, *pp);
+		free(*pp);
 	}
+	free(info->created_paths);
+	lxc_cgroup_put_meta(info->meta_ref);
+	free(info->cgroup_path);
+	free(info->cgroup_path_sub);
+	free(info);
+	lxc_cgroup_process_info_free(next);
+}
 
-	return false;
+char *lxc_cgroup_get_hierarchy_path_handler(const char *subsystem, struct lxc_handler *handler)
+{
+	struct cgroup_process_info *info = find_info_for_subsystem(handler->cgroup, subsystem);
+	if (!info)
+		return NULL;
+	return info->cgroup_path;
 }
 
-static bool is_in_desclist(struct cgroup_desc *d, char *opts, char *all_subsystems)
+char *lxc_cgroup_get_hierarchy_path(const char *subsystem, const char *name, const char *lxcpath)
 {
-	while (d) {
-		if (have_visited(opts, d->subsystems, all_subsystems))
-			return true;
-		d = d->next;
-	}
-	return false;
+	return lxc_cmd_get_cgroup_path(name, lxcpath, subsystem);
 }
 
-static char *record_visited(char *opts, char *all_subsystems)
+char *lxc_cgroup_get_hierarchy_abs_path_handler(const char *subsystem, struct lxc_handler *handler)
 {
-	char *s = NULL, *token, *str;
-	int oldlen = 0, newlen, toklen;
-	char *visited = NULL;
-
-	str = alloca(strlen(opts)+1);
-	strcpy(str, opts);
-	for (; (token = strtok_r(str, ",", &s)); str = NULL) {
-		if (!in_subsys_list(token, all_subsystems))
-			continue;
-		toklen = strlen(token);
-		newlen = oldlen + toklen +  1; // ',' + token or token + '\0'
-		visited = realloc(visited, newlen);
-		if (!visited)
-			return (char *)-ENOMEM;
-		if (oldlen)
-			strcat(visited, ",");
-		else
-			*visited = '\0';
-		strcat(visited, token);
-		oldlen = newlen;
+	struct cgroup_mount_point *mp = NULL;
+	struct cgroup_process_info *info = find_info_for_subsystem(handler->cgroup, subsystem);
+	if (!info)
+		return NULL;
+	if (info->designated_mount_point) {
+		mp = info->designated_mount_point; 
+	} else {
+		mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, true);
+		if (!mp)
+			return NULL;
 	}
-
-	return visited;
+	return cgroup_to_absolute_path(mp, info->cgroup_path, NULL);
 }
 
-static char *get_all_subsystems(void)
+char *lxc_cgroup_get_hierarchy_abs_path(const char *subsystem, const char *name, const char *lxcpath)
 {
-	FILE *f;
-	char *line = NULL, *ret = NULL;
-	size_t len;
-	int first = 1;
-
-	/* read the list of subsystems from the kernel */
-	f = fopen("/proc/cgroups", "r");
-	if (!f)
+	struct cgroup_meta_data *meta;
+	struct cgroup_process_info *base_info, *info;
+	struct cgroup_mount_point *mp;
+	char *result = NULL;
+	int saved_errno;
+
+	meta = lxc_cgroup_load_meta();
+	if (!meta)
 		return NULL;
+	base_info = lxc_cgroup_get_container_info(name, lxcpath, meta);
+	if (!base_info)
+		return NULL;
+	info = find_info_for_subsystem(base_info, subsystem);
+	if (!info)
+		return NULL;
+	if (info->designated_mount_point) {
+		mp = info->designated_mount_point; 
+	} else {
+		mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, true);
+		if (!mp)
+			return NULL;
+	}
+	result = cgroup_to_absolute_path(mp, info->cgroup_path, NULL);
+	saved_errno = errno;
+	lxc_cgroup_process_info_free(base_info);
+	lxc_cgroup_put_meta(meta);
+	errno = saved_errno;
+	return result;
+}
 
-	while (getline(&line, &len, f) != -1) {
-		char *c;
-		int oldlen, newlen, inc;
-
-		/* skip the first line */
-		if (first) {
-			first=0;
-			continue;
-		}
+int lxc_cgroup_set_handler(const char *filename, const char *value, struct lxc_handler *handler)
+{
+	char *subsystem = NULL, *p, *path;
+	int ret = -1;
 
-		c = strchr(line, '\t');
-		if (!c)
-			continue;
-		*c = '\0';
+	subsystem = alloca(strlen(filename) + 1);
+	strcpy(subsystem, filename);
+	if ((p = index(subsystem, '.')) != NULL)
+		*p = '\0';
 
-		oldlen = ret ? strlen(ret) : 0;
-		newlen = oldlen + strlen(line) + 2;
-		ret = realloc(ret, newlen);
-		if (!ret)
-			goto out;
-		inc = snprintf(ret + oldlen, newlen, ",%s", line);
-		if (inc < 0 || inc >= newlen) {
-			free(ret);
-			ret = NULL;
-			goto out;
-		}
+	path = lxc_cgroup_get_hierarchy_abs_path_handler(subsystem, handler);
+	if (path) {
+		ret = do_cgroup_set(path, filename, value);
+		free(path);
 	}
+	return ret;
+}
 
-out:
-	if (line)
-		free(line);
-	fclose(f);
+int lxc_cgroup_get_handler(const char *filename, char *value, size_t len, struct lxc_handler *handler)
+{
+	char *subsystem = NULL, *p, *path;
+	int ret = -1;
+
+	subsystem = alloca(strlen(filename) + 1);
+	strcpy(subsystem, filename);
+	if ((p = index(subsystem, '.')) != NULL)
+		*p = '\0';
+
+	path = lxc_cgroup_get_hierarchy_abs_path_handler(subsystem, handler);
+	if (path) {
+		ret = do_cgroup_get(path, filename, value, len);
+		free(path);
+	}
 	return ret;
 }
 
-/*
- * /etc/lxc/lxc.conf can contain lxc.cgroup.use = entries.
- * If any of those are present, then lxc will ONLY consider
- * cgroup filesystems mounted at one of the listed entries.
- */
-static char *get_cgroup_uselist()
+int lxc_cgroup_set(const char *filename, const char *value, const char *name, const char *lxcpath)
 {
-	FILE *f;
-	char *line = NULL, *ret = NULL;
-	size_t sz = 0, retsz = 0, newsz;
+	char *subsystem = NULL, *p, *path;
+	int ret = -1;
 
-	if ((f = fopen(LXC_GLOBAL_CONF, "r")) == NULL)
-		return NULL;
-	while (getline(&line, &sz, f) != -1) {
-		char *p = line;
-		while (*p && isblank(*p))
-			p++;
-		if (strncmp(p, "lxc.cgroup.use", 14) != 0)
-			continue;
-		p = index(p, '=');
-		if (!p)
-			continue;
-		p++;
-		while (*p && isblank(*p))
-			p++;
-		if (strlen(p) < 1)
-			continue;
-		newsz = retsz + strlen(p);
-		if (retsz == 0)
-			newsz += 1;  // for trailing \0
-		// the last line in the file could lack \n
-		if (p[strlen(p)-1] != '\n')
-			newsz += 1;
-		ret = realloc(ret, newsz);
-		if (!ret) {
-			ERROR("Out of memory reading cgroup uselist");
-			fclose(f);
-			free(line);
-			return (char *)-ENOMEM;
-		}
-		if (retsz == 0)
-			strcpy(ret, p);
-		else
-			strcat(ret, p);
-		if (p[strlen(p)-1] != '\n')
-			ret[newsz-2] = '\0';
-		ret[newsz-1] = '\0';
-		retsz = newsz;
-	}
+	subsystem = alloca(strlen(filename) + 1);
+	strcpy(subsystem, filename);
+	if ((p = index(subsystem, '.')) != NULL)
+		*p = '\0';
 
-	if (line)
-		free(line);
+	path = lxc_cgroup_get_hierarchy_abs_path(subsystem, name, lxcpath);
+	if (path) {
+		ret = do_cgroup_set(path, filename, value);
+		free(path);
+	}
 	return ret;
 }
 
-static bool is_in_uselist(char *uselist, struct mntent *m)
+int lxc_cgroup_get(const char *filename, char *value, size_t len, const char *name, const char *lxcpath)
 {
-	char *p;
-	if (!uselist)
-		return true;
-	if (!*uselist)
-		return false;
-	while (*uselist) {
-		p = index(uselist, '\n');
-		if (strncmp(m->mnt_dir, uselist, p - uselist) == 0)
-			return true;
-		uselist = p+1;
+	char *subsystem = NULL, *p, *path;
+	int ret = -1;
+
+	subsystem = alloca(strlen(filename) + 1);
+	strcpy(subsystem, filename);
+	if ((p = index(subsystem, '.')) != NULL)
+		*p = '\0';
+
+	path = lxc_cgroup_get_hierarchy_abs_path(subsystem, name, lxcpath);
+	if (path) {
+		ret = do_cgroup_get(path, filename, value, len);
+		free(path);
 	}
-	return false;
+	return ret;
 }
 
-static bool find_real_cgroup(struct cgroup_desc *d, char *path)
+/*
+ * lxc_cgroup_path_get: Get the absolute pathname for a cgroup
+ * file for a running container.
+ *
+ * @filename  : the file of interest (e.g. "freezer.state") or
+ *              the subsystem name (e.g. "freezer") in which case
+ *              the directory where the cgroup may be modified
+ *              will be returned
+ * @name      : name of container to connect to
+ * @lxcpath   : the lxcpath in which the container is running
+ * 
+ * This is the exported function, which determines cgpath from the
+ * lxc-start of the @name container running in @lxcpath.
+ *
+ * Returns path on success, NULL on error. The caller must free()
+ * the returned path.
+ */
+char *lxc_cgroup_path_get(const char *filename, const char *name,
+                          const char *lxcpath)
 {
-	FILE *f;
-	char *line = NULL, *p, *p2;
-	int ret = 0;
-	size_t len;
+	char *subsystem = NULL, *longer_file = NULL, *p, *group, *path;
 
-	if ((f = fopen("/proc/self/cgroup", "r")) == NULL) {
-		SYSERROR("Error opening /proc/self/cgroups");
-		return false;
+	subsystem = alloca(strlen(filename) + 1);
+	strcpy(subsystem, filename);
+	if ((p = index(subsystem, '.')) != NULL) {
+		*p = '\0';
+		longer_file = alloca(strlen(filename) + 2);
+		longer_file[0] = '/';
+		strcpy(longer_file + 1, filename);
 	}
 
-	// If there is no subsystem, ignore the mount.  Note we may want
-	// to change this, so that unprivileged users can use a unbound
-	// cgroup mount to arrange their container tasks.
-	if (!d->subsystems) {
-		fclose(f);
-		return false;
-	}
-	while (getline(&line, &len, f) != -1) {
-		if (!(p = index(line, ':')))
-			continue;
-		if (!(p2 = index(++p, ':')))
-			continue;
-		*p2 = '\0';
-		// remove trailing newlines
-		if (*(p2 + 1) && p2[strlen(p2 + 1)] == '\n')
-		        p2[strlen(p2 + 1)] = '\0';
-		// in case of multiple mounts it may be more correct to
-		// insist all subsystems be the same
-		if (subsys_lists_match(p, d->subsystems))
-			goto found;
-       }
-
-	if (line)
-		free(line);
-	fclose(f);
-	return false;;
+	group = lxc_cgroup_get_hierarchy_path(subsystem, name, lxcpath);
+	if (!group)
+		return NULL;
 
-found:
-	fclose(f);
-	ret = snprintf(path, MAXPATHLEN, "%s", p2+1);
-	if (ret < 0 || ret >= MAXPATHLEN) {
-		free(line);
-		return false;
-	}
-	free(line);
-	return true;
+	path = lxc_cgroup_find_abs_path(subsystem, group, true, *p ? longer_file : NULL);
+	free(group);
+	return path;
 }
 
+int lxc_setup_cgroup_without_devices(struct lxc_handler *h, struct lxc_list *cgroup_settings)
+{
+	return do_setup_cgroup(h, cgroup_settings, false);
+}
 
-/*
- * for a given cgroup mount entry, and a to-be-created container,
- * 1. Figure out full path of the cgroup we are currently in,
- * 2. Find a new free cgroup which is $path / $lxc_name with an
- *    optional '-$n' where n is an ever-increasing integer.
- */
-static char *find_free_cgroup(struct cgroup_desc *d, const char *lxc_name)
+int lxc_setup_cgroup_devices(struct lxc_handler *h, struct lxc_list *cgroup_settings)
 {
-	char tail[20], cgpath[MAXPATHLEN], *cgp, path[MAXPATHLEN];
-	int i = 0, ret;
-	size_t l;
+	return do_setup_cgroup(h, cgroup_settings, true);
+}
 
-	if (!find_real_cgroup(d, cgpath)) {
-		ERROR("Failed to find current cgroup");
-		return NULL;
-	}
+int lxc_cgroup_nrtasks_handler(struct lxc_handler *handler)
+{
+	struct cgroup_process_info *info = handler->cgroup;
+	struct cgroup_mount_point *mp = NULL;
+	char *abs_path = NULL;
+	int ret;
 
-	/*
-	 * If d->mntpt is '/a/b/c/d', and the mountpoint is /x/y/z,
-	 * then look for ourselves in:
-	 *    /x/y/z/a/b/c/d/tasks
-	 *    /x/y/z/b/c/d/tasks
-	 *    /x/y/z/c/d/tasks
-	 *    /x/y/z/d/tasks
-	 *    /x/y/z/tasks
-	 */
-	cgp = cgpath;
-	while (cgp[0]) {
-		ret = snprintf(path, MAXPATHLEN, "%s%s/tasks", d->mntpt, cgp);
-		if (ret < 0 || ret >= MAXPATHLEN)
-			return NULL;
-		if (!is_in_cgroup(getpid(), path)) {
-			// does not exist, try the next one
-			cgp = index(cgp+1, '/');
-			if (!cgp)
-				break;
-			continue;
-		}
-		break;
-	}
-	if (!cgp || !*cgp) {
-		// try just the path
-		ret = snprintf(path, MAXPATHLEN, "%s/tasks", d->mntpt);
-		if (ret < 0 || ret >= MAXPATHLEN)
-			return NULL;
-		if (!is_in_cgroup(getpid(), path))
-			return NULL;
-	}
-	// found it
-	// path has '/tasks' at end, drop that
-	if (!(cgp = strrchr(path, '/'))) {
-		ERROR("Got nonsensical path name %s\n", path);
-		return NULL;
+	if (!info) {
+		errno = ENOENT;
+		return -1;
 	}
-	*cgp = '\0';
 
-	if (strlen(path) + strlen(lxc_name) + 20 > MAXPATHLEN) {
-		ERROR("Error: cgroup path too long");
-		return NULL;
-	}
-	tail[0] = '\0';
-	while (1) {
-		struct stat sb;
-		int freebytes = MAXPATHLEN - (cgp - path);
-
-		if (i) {
-			ret = snprintf(tail, 20, "-%d", i);
-			if (ret < 0 || ret >= 20)
-				return NULL;
-		}
-		ret = snprintf(cgp, freebytes, "/%s%s", lxc_name, tail);
-		if (ret < 0 || ret >= freebytes)
-			return NULL;
-		if (stat(path, &sb) == -1)
-			break;
-		i++;
+	if (info->designated_mount_point) {
+		mp = info->designated_mount_point; 
+	} else {
+		mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, false);
+		if (!mp)
+			return -1;
 	}
 
-	l = strlen(cgpath);
-	ret = snprintf(cgpath + l, MAXPATHLEN - l, "/%s%s", lxc_name, tail);
-	if (ret < 0 || ret >= (MAXPATHLEN - l)) {
-		ERROR("Out of memory");
-		return NULL;
-	}
-	if ((d->realcgroup = strdup(cgpath)) == NULL) {
-		ERROR("Out of memory");
-		return NULL;
-	}
-	l = strlen(d->realcgroup);
-	if (l > 0 && d->realcgroup[l-1] == '\n')
-		d->realcgroup[l-1] = '\0';
-	return strdup(path);
+	abs_path = cgroup_to_absolute_path(mp, info->cgroup_path, NULL);
+	if (!abs_path)
+		return -1;
+
+	ret = cgroup_recursive_task_count(abs_path);
+	free(abs_path);
+	return ret;
 }
 
-/*
- * For a new container, find a cgroup path which is unique in all cgroup mounts.
- * I.e. if r1 is already running, then /lxc/r1-1 may be used.
- *
- * @lxcgroup: the cgroup 'group' the contaienr should run in.  By default, this
- * is just 'lxc'.  Admins may wish to group some containers into other groups,
- * i.e. 'build', to take advantage of cgroup hierarchy to simplify group
- * administration.  Also, unprivileged users who are placed into a cgroup by
- * libcgroup_pam will be using that cgroup rather than the system-wide 'lxc'
- * group.
- * @name: the name of the container
- *
- * The chosen cgpath is returned as a strdup'd string.  The caller will have to
- * free that eventually, however the lxc monitor will keep that string so as to
- * return it in response to a LXC_COMMAND_CGROUP query.
- *
- * Note the path is relative to cgroup mounts.  I.e. if the freezer subsystem
- * is at /sys/fs/cgroup/freezer, and this fn returns '/lxc/r1', then the
- * freezer cgroup's full path will be /sys/fs/cgroup/freezer/lxc/r1/.
- *
- * Races won't be determintal, you'll just end up with leftover unused cgroups
- */
-struct cgroup_desc *lxc_cgroup_path_create(const char *name)
+struct cgroup_process_info *lxc_cgroup_process_info_getx(const char *proc_pid_cgroup_str, struct cgroup_meta_data *meta)
 {
-	struct cgroup_desc *retdesc = NULL, *newdesc = NULL;
-	FILE *file = NULL;
-	struct mntent mntent_r;
-	char buf[LARGE_MAXPATHLEN] = {0};
-	char *all_subsystems = get_all_subsystems();
-	char *cgroup_uselist = get_cgroup_uselist();
-
-	if (cgroup_uselist == (char *)-ENOMEM) {
-		if (all_subsystems)
-			free(all_subsystems);
-		return NULL;
-	}
-	if (!all_subsystems) {
-		ERROR("failed to get a list of all cgroup subsystems");
-		if (cgroup_uselist)
-			free(cgroup_uselist);
-		return NULL;
-	}
-	file = setmntent(MTAB, "r");
-	if (!file) {
-		SYSERROR("failed to open %s", MTAB);
-		free(all_subsystems);
-		if (cgroup_uselist)
-			free(cgroup_uselist);
+	struct cgroup_process_info *result = NULL;
+	FILE *proc_pid_cgroup = NULL;
+	char *line = NULL;
+	size_t sz = 0;
+	int saved_errno = 0;
+	struct cgroup_process_info **cptr = &result;
+	struct cgroup_process_info *entry = NULL;
+
+	proc_pid_cgroup = fopen_cloexec(proc_pid_cgroup_str, "r");
+	if (!proc_pid_cgroup)
 		return NULL;
-	}
 
-	while ((getmntent_r(file, &mntent_r, buf, sizeof(buf)))) {
+	while (getline(&line, &sz, proc_pid_cgroup) != -1) {
+		/* file format: hierarchy:subsystems:group */
+		char *colon1;
+		char *colon2;
+		char *endptr;
+		int hierarchy_number;
+		struct cgroup_hierarchy *h = NULL;
 
-		if (strcmp(mntent_r.mnt_type, "cgroup"))
+		if (!line[0])
 			continue;
 
-		if (cgroup_uselist && !is_in_uselist(cgroup_uselist, &mntent_r))
+		if (line[strlen(line) - 1] == '\n')
+			line[strlen(line) - 1] = '\0';
+
+		colon1 = strchr(line, ':');
+		if (!colon1)
+			continue;  
+		*colon1++ = '\0';
+		colon2 = strchr(colon1, ':');
+		if (!colon2)
 			continue;
+		*colon2++ = '\0';
 
-		/* make sure we haven't checked this subsystem already */
-		if (is_in_desclist(retdesc, mntent_r.mnt_opts, all_subsystems))
+		endptr = NULL;
+		hierarchy_number = strtoul(line, &endptr, 10);
+		if (!endptr || *endptr)
 			continue;
 
-		if (!(newdesc = malloc(sizeof(struct cgroup_desc)))) {
-			ERROR("Out of memory reading cgroups");
-			goto fail;
+		if (hierarchy_number > meta->maximum_hierarchy) {
+			/* we encountered a hierarchy we didn't have before,
+			 * so probably somebody remounted some stuff in the
+			 * mean time...
+			 */
+			errno = EAGAIN;
+			goto out_error;
 		}
-		newdesc->subsystems = record_visited(mntent_r.mnt_opts, all_subsystems);
-		if (newdesc->subsystems == (char *)-ENOMEM) {
-			ERROR("Out of memory recording cgroup subsystems");
-			free(newdesc);
-			newdesc = NULL;
-			goto fail;
+
+		h = meta->hierarchies[hierarchy_number];
+		if (!h) {
+			/* we encountered a hierarchy that was thought to be
+			 * dead before, so probably somebody remounted some
+			 * stuff in the mean time...
+			 */
+			errno = EAGAIN;
+			goto out_error;
 		}
-		if (!newdesc->subsystems) {
-			free(newdesc);
-			newdesc = NULL;
+
+		/* we are told that we should ignore this hierarchy */
+		if (!h->used)
 			continue;
-		}
-		newdesc->mntpt = strdup(mntent_r.mnt_dir);
-		newdesc->realcgroup = NULL;
-		newdesc->curcgroup = find_free_cgroup(newdesc, name);
-		if (!newdesc->mntpt || !newdesc->curcgroup) {
-			ERROR("Out of memory reading cgroups");
-			goto fail;
-		}
 
-		set_clone_children(&mntent_r);
+		entry = calloc(1, sizeof(struct cgroup_process_info));
+		if (!entry)
+			goto out_error;
 
-		if (mkdir(newdesc->curcgroup, 0755)) {
-			ERROR("Error creating cgroup %s", newdesc->curcgroup);
-			goto fail;
-		}
-		newdesc->next = retdesc;
-		retdesc = newdesc;
-	}
+		entry->meta_ref = lxc_cgroup_get_meta(meta);
+		entry->hierarchy = h;
+		entry->cgroup_path = strdup(colon2);
+		if (!entry->cgroup_path)
+			goto out_error;
 
-	endmntent(file);
-	free(all_subsystems);
-	if (cgroup_uselist)
-		free(cgroup_uselist);
-	return retdesc;
-
-fail:
-	endmntent(file);
-	free(all_subsystems);
-	if (cgroup_uselist)
-		free(cgroup_uselist);
-	if (newdesc) {
-		if (newdesc->mntpt)
-			free(newdesc->mntpt);
-		if (newdesc->subsystems)
-			free(newdesc->subsystems);
-		if (newdesc->curcgroup)
-			free(newdesc->curcgroup);
-		if (newdesc->realcgroup)
-			free(newdesc->realcgroup);
-		free(newdesc);
+		*cptr = entry;
+		cptr = &entry->next;
+		entry = NULL;
 	}
-	while (retdesc) {
-		struct cgroup_desc *t = retdesc;
-		retdesc = retdesc->next;
-		if (t->mntpt)
-			free(t->mntpt);
-		if (t->subsystems)
-			free(t->subsystems);
-		if (t->curcgroup)
-			free(t->curcgroup);
-		if (t->realcgroup)
-			free(t->realcgroup);
-		free(t);
 
-	}
+	fclose(proc_pid_cgroup);
+	free(line);
+	return result;
+
+out_error:
+	saved_errno = errno;
+	if (proc_pid_cgroup)
+		fclose(proc_pid_cgroup);
+	lxc_cgroup_process_info_free(result);
+	lxc_cgroup_process_info_free(entry);
+	free(line);
+	errno = saved_errno;
 	return NULL;
 }
 
-static bool lxc_cgroup_enter_one(const char *dir, int pid)
+char **subsystems_from_mount_options(const char *mount_options, char **kernel_list)
 {
-	char path[MAXPATHLEN];
-	int ret;
-	FILE *fout;
+	char *token, *str, *saveptr = NULL;
+	char **result = NULL;
+	size_t result_capacity = 0;
+	size_t result_count = 0;   
+	int saved_errno;
+	int r;
 
-	ret = snprintf(path, MAXPATHLEN, "%s/tasks", dir);
-	if (ret < 0 || ret >= MAXPATHLEN) {
-		ERROR("Error entering cgroup");
-		return false;
-	}
-	fout = fopen(path, "w");
-	if (!fout) {
-		SYSERROR("Error entering cgroup");
-		return false;
-	}
-	if (fprintf(fout, "%d\n", (int)pid) < 0) {
-		ERROR("Error writing pid to %s to enter cgroup", path);
-		fclose(fout);
-		return false;
-	}
-	if (fclose(fout) < 0) {
-		SYSERROR("Error writing pid to %s to enter cgroup", path);
-		return false;
+	str = alloca(strlen(mount_options)+1);
+	strcpy(str, mount_options);
+	for (; (token = strtok_r(str, ",", &saveptr)); str = NULL) {
+		/* we have a subsystem if it's either in the list of
+		 * subsystems provided by the kernel OR if it starts
+		 * with name= for named hierarchies
+		 */
+		if (!strncmp(token, "name=", 5) || lxc_string_in_array(token, (const char **)kernel_list)) {
+			r = lxc_grow_array((void ***)&result, &result_capacity, result_count + 1, 12);
+			if (r < 0)
+				goto out_free;
+			result[result_count + 1] = NULL;
+			result[result_count] = strdup(token);
+			if (!result[result_count])
+				goto out_free;
+			result_count++;
+		}
 	}
 
-	return true;
+	return result;
+
+out_free:
+	saved_errno = errno;
+	lxc_free_array((void**)result, free);
+	errno = saved_errno;
+	return NULL;
 }
 
-int lxc_cgroup_enter(struct cgroup_desc *cgroups, pid_t pid)
+void lxc_cgroup_mount_point_free(struct cgroup_mount_point *mp)
 {
-	while (cgroups) {
-		if (!cgroups->subsystems)
-			goto next;
-
-		if (!lxc_cgroup_enter_one(cgroups->curcgroup, pid))
-			return -1;
-next:
-		cgroups = cgroups->next;
-	}
-	return 0;
+	if (!mp)
+		return;
+	free(mp->mount_point);
+	free(mp->mount_prefix);
+	free(mp);
 }
 
-static int cgroup_rmdir(char *dirname)
+void lxc_cgroup_hierarchy_free(struct cgroup_hierarchy *h)
 {
-	struct dirent dirent, *direntp;
-	DIR *dir;
-	int ret;
-	char pathname[MAXPATHLEN];
-
-	dir = opendir(dirname);
-	if (!dir) {
-		WARN("failed to open directory: %m");
-		return -1;
-	}
-
-	while (!readdir_r(dir, &dirent, &direntp)) {
-		struct stat mystat;
-		int rc;
-
-		if (!direntp)
-			break;
-
-		if (!strcmp(direntp->d_name, ".") ||
-				!strcmp(direntp->d_name, ".."))
-			continue;
+	if (!h)
+		return;
+	lxc_free_array((void **)h->subsystems, free);
+	free(h);
+}
 
-		rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
-		if (rc < 0 || rc >= MAXPATHLEN) {
-			ERROR("pathname too long");
-			continue;
-		}
-		ret = stat(pathname, &mystat);
-		if (ret)
-			continue;
-		if (S_ISDIR(mystat.st_mode))
-			cgroup_rmdir(pathname);
+bool is_valid_cgroup(const char *name)
+{
+	const char *p;
+	for (p = name; *p; p++) {
+		if (*p < 32 || *p == 127 || *p == '/')
+			return false;
 	}
+	return strcmp(name, ".") != 0 && strcmp(name, "..") != 0;
+}
 
-	ret = rmdir(dirname);
+int create_or_remove_cgroup(bool do_remove, struct cgroup_mount_point *mp, const char *path)
+{
+	int r, saved_errno = 0;
+	char *buf = cgroup_to_absolute_path(mp, path, NULL);
+	if (!buf)
+		return -1;
 
-	if (closedir(dir))
-		ERROR("failed to close directory");
-	return ret;
+	/* create or remove directory */
+	r = do_remove ?
+		rmdir(buf) :
+		mkdir(buf, 0777);
+	saved_errno = errno;
+	free(buf);
+	errno = saved_errno;
+	return r;
 }
 
-/*
- * for each mounted cgroup, destroy the cgroup for the container
- */
-void lxc_cgroup_destroy_desc(struct cgroup_desc *cgroups)
+int create_cgroup(struct cgroup_mount_point *mp, const char *path)
 {
-	while (cgroups) {
-		struct cgroup_desc *next = cgroups->next;
-		if (cgroup_rmdir(cgroups->curcgroup) < 0)
-			SYSERROR("Error removing cgroup directory %s", cgroups->curcgroup);
-		free(cgroups->mntpt);
-		free(cgroups->subsystems);
-		free(cgroups->curcgroup);
-		free(cgroups->realcgroup);
-		free(cgroups);
-		cgroups = next;
-	}
+	return create_or_remove_cgroup(false, mp, path);
 }
 
-int lxc_cgroup_attach(pid_t pid, const char *name, const char *lxcpath)
+int remove_cgroup(struct cgroup_mount_point *mp, const char *path)
 {
-	FILE *f;
-	char *line = NULL, ret = 0;
-	size_t len = 0;
-	int first = 1;
-	char *dirpath;
+	return create_or_remove_cgroup(true, mp, path);
+}
 
-	/* read the list of subsystems from the kernel */
-	f = fopen("/proc/cgroups", "r");
-	if (!f)
-		return -1;
+char *cgroup_to_absolute_path(struct cgroup_mount_point *mp, const char *path, const char *suffix)
+{
+	/* first we have to make sure we subtract the mount point's prefix */
+	char *prefix = mp->mount_prefix;
+	char *buf;
+	ssize_t len, rv;
+
+	/* we want to make sure only absolute paths to cgroups are passed to us */
+	if (path[0] != '/') {
+		errno = EINVAL;
+		return NULL;
+	}
 
-	while (getline(&line, &len, f) != -1) {
-		char *c;
+	if (prefix && !strcmp(prefix, "/"))
+		prefix = NULL;
 
-		/* skip the first line */
-		if (first) {
-			first=0;
-			continue;
-		}
+	/* prefix doesn't match */
+	if (prefix && strncmp(prefix, path, strlen(prefix)) != 0) {
+		errno = EINVAL;
+		return NULL;
+	}
+	/* if prefix is /foo and path is /foobar */
+	if (prefix && path[strlen(prefix)] != '/' && path[strlen(prefix)] != '\0') {
+		errno = EINVAL;
+		return NULL;
+	}
 
-		c = strchr(line, '\t');
-		if (!c)
-			continue;
-		*c = '\0';
-		dirpath = lxc_cgroup_path_get(line, name, lxcpath);
-		if (!dirpath)
-			continue;
+	/* remove prefix from path */
+	path += prefix ? strlen(prefix) : 0;
 
-		INFO("joining pid %d to cgroup %s", pid, dirpath);
-		if (!lxc_cgroup_enter_one(dirpath, pid)) {
-			ERROR("Failed joining %d to %s\n", pid, dirpath);
-			ret = -1;
-			continue;
-		}
+	len = strlen(mp->mount_point) + strlen(path) + (suffix ? strlen(suffix) : 0);
+	buf = calloc(len + 1, 1);
+	rv = snprintf(buf, len + 1, "%s%s%s", mp->mount_point, path, suffix ? suffix : "");
+	if (rv > len) { 
+		free(buf);
+		errno = ENOMEM;
+		return NULL; 
 	}
 
-	if (line)
-		free(line);
-	fclose(f);
-	return ret;
+	return buf;
 }
 
-bool is_in_subcgroup(int pid, const char *subsystem, struct cgroup_desc *d)
+struct cgroup_process_info *find_info_for_subsystem(struct cgroup_process_info *info, const char *subsystem)
 {
-	char filepath[MAXPATHLEN], *line = NULL, v1[MAXPATHLEN], v2[MAXPATHLEN];
-	FILE *f;
-	int ret, junk;
-	size_t sz = 0, l1, l2;
-	char *end = index(subsystem, '.');
-	int len = end ? (end - subsystem) : strlen(subsystem);
-	const char *cgpath = NULL;
-
-	while (d) {
-		if (in_subsys_list("devices", d->subsystems)) {
-			cgpath = d->realcgroup;
-			l1 = strlen(cgpath);
-			break;
-		}
-		d = d->next;
+	struct cgroup_process_info *info_ptr;
+	for (info_ptr = info; info_ptr; info_ptr = info_ptr->next) {
+		struct cgroup_hierarchy *h = info_ptr->hierarchy;
+		if (lxc_string_in_array(subsystem, (const char **)h->subsystems))
+			return info_ptr;
 	}
-	if (!d)
-		return false;
+	errno = ENOENT;
+	return NULL;
+}
 
-	ret = snprintf(filepath, MAXPATHLEN, "/proc/%d/cgroup", pid);
-	if (ret < 0 || ret >= MAXPATHLEN)
-		return false;
-	if ((f = fopen(filepath, "r")) == NULL)
-		return false;
-	while (getline(&line, &sz, f) != -1) {
-		// nr:subsystem:path
-		v2[0] = v2[1] = '\0';
-		ret = sscanf(line, "%d:%[^:]:%s", &junk, v1, v2);
-		if (ret != 3) {
-			fclose(f);
-			free(line);
-			return false;
-		}
-		len = end ? end - subsystem : strlen(subsystem);
-		if (strncmp(v1, subsystem, len) != 0)
-			continue;
-		// v2 will start with '/', skip it by using v2+1
-		// we must be in SUBcgroup, so make sure l2 > l1
-		l2 = strlen(v2+1);
-		if (l2 > l1 && strncmp(v2+1, cgpath, l1) == 0) {
-			fclose(f);
-			free(line);
-			return true;
-		}
-	}
-	fclose(f);
-	if (line)
-		free(line);
-	return false;
+int do_cgroup_get(const char *cgroup_path, const char *sub_filename, char *value, size_t len)
+{
+	const char *parts[3] = {
+		cgroup_path,
+		sub_filename,
+		NULL
+	};
+	char *filename;
+	int ret, saved_errno;
+
+	filename = lxc_string_join("/", parts, false);
+	if (!filename)
+		return -1;
+
+	ret = lxc_read_from_file(filename, value, len);
+	saved_errno = errno;
+	free(filename);
+	errno = saved_errno;
+	return ret;
 }
 
-char *cgroup_get_subsys_path(struct lxc_handler *handler, const char *subsys)
+int do_cgroup_set(const char *cgroup_path, const char *sub_filename, const char *value)
 {
-	struct cgroup_desc *d;
+	const char *parts[3] = {
+		cgroup_path,
+		sub_filename,
+		NULL
+	};
+	char *filename;
+	int ret, saved_errno;
 
-	for (d = handler->cgroup; d; d = d->next) {
-		if (in_subsys_list(subsys, d->subsystems))
-			return d->realcgroup;
-	}
+	filename = lxc_string_join("/", parts, false);
+	if (!filename)
+		return -1;
 
-	return NULL;
+	ret = lxc_write_to_file(filename, value, strlen(value), false);
+	saved_errno = errno;
+	free(filename);
+	errno = saved_errno;
+	return ret;
 }
 
-static int _setup_cgroup(struct lxc_handler *h, struct lxc_list *cgroups,
-			  int devices)
+int do_setup_cgroup(struct lxc_handler *h, struct lxc_list *cgroup_settings, bool do_devices)
 {
 	struct lxc_list *iterator;
 	struct lxc_cgroup *cg;
 	int ret = -1;
 
-	if (lxc_list_empty(cgroups))
+	if (lxc_list_empty(cgroup_settings))
 		return 0;
 
-	lxc_list_for_each(iterator, cgroups) {
+	lxc_list_for_each(iterator, cgroup_settings) {
 		cg = iterator->elem;
 
-		if (devices == !strncmp("devices", cg->subsystem, 7)) {
+		if (do_devices == !strncmp("devices", cg->subsystem, 7)) {
 			if (strcmp(cg->subsystem, "devices.deny") == 0 &&
-					cgroup_devices_has_deny(h, cg->value))
+					cgroup_devices_has_allow_or_deny(h, cg->value, false))
 				continue;
 			if (strcmp(cg->subsystem, "devices.allow") == 0 &&
-					cgroup_devices_has_allow(h, cg->value))
+					cgroup_devices_has_allow_or_deny(h, cg->value, true))
 				continue;
-			if (lxc_cgroup_set_value(h, cg->subsystem, cg->value)) {
+			if (lxc_cgroup_set_handler(cg->subsystem, cg->value, h)) {
 				ERROR("Error setting %s to %s for %s\n",
 				      cg->subsystem, cg->value, h->name);
 				goto out;
@@ -1315,12 +1436,156 @@ out:
 	return ret;
 }
 
-int setup_cgroup_devices(struct lxc_handler *h, struct lxc_list *cgroups)
+bool cgroup_devices_has_allow_or_deny(struct lxc_handler *h, char *v, bool for_allow)
+{
+	char *path;
+	FILE *devices_list;
+	char *line = NULL; 
+	size_t sz = 0;
+	bool ret = !for_allow;
+	const char *parts[3] = {
+		NULL,
+		"devices.list",
+		NULL
+	};
+
+	// XXX FIXME if users could use something other than 'lxc.devices.deny = a'.
+	// not sure they ever do, but they *could*
+	// right now, I'm assuming they do NOT
+	if (!for_allow && strcmp(v, "a") != 0 && strcmp(v, "a *:* rwm") != 0)
+		return false;
+
+	parts[0] = (const char *)lxc_cgroup_get_hierarchy_abs_path_handler("devices", h);
+	if (!parts[0])
+		return false;
+	path = lxc_string_join("/", parts, false);
+	if (!path) {
+		free((void *)parts[0]);
+		return false;
+	}
+
+	devices_list = fopen_cloexec(path, "r");
+	if (!devices_list) {
+		free(path);
+		return false;
+	}
+
+	while (getline(&line, &sz, devices_list) != -1) {
+		size_t len = strlen(line);
+		if (len > 0 && line[len-1] == '\n')
+			line[len-1] = '\0';
+		if (strcmp(line, "a *:* rwm") == 0) {
+			ret = for_allow;
+			goto out;
+		} else if (for_allow && strcmp(line, v) == 0) {
+			ret = true;
+			goto out;  
+		}
+	}
+
+out:
+	fclose(devices_list);
+	free(line);
+	free(path);
+	return ret;
+}
+
+int cgroup_recursive_task_count(const char *cgroup_path)
 {
-	return _setup_cgroup(h, cgroups, 1);
+	DIR *d;
+	struct dirent *dent_buf;
+	struct dirent *dent;
+	ssize_t name_max;   
+	int n = 0, r;
+
+	/* see man readdir_r(3) */
+	name_max = pathconf(cgroup_path, _PC_NAME_MAX);
+	if (name_max <= 0)
+		name_max = 255;
+	dent_buf = malloc(offsetof(struct dirent, d_name) + name_max + 1);
+	if (!dent_buf)
+		return -1;
+
+	d = opendir(cgroup_path);
+	if (!d)
+		return 0;
+
+	while (readdir_r(d, dent_buf, &dent) == 0 && dent) {
+		const char *parts[3] = {
+			cgroup_path,
+			dent->d_name,
+			NULL
+		};
+		char *sub_path;
+		struct stat st;
+
+		if (!strcmp(dent->d_name, ".") || !strcmp(dent->d_name, ".."))
+			continue;
+		sub_path = lxc_string_join("/", parts, false);
+		if (!sub_path) {
+			closedir(d);
+			free(dent_buf);
+			return -1;
+		}
+		r = stat(sub_path, &st);
+		if (r < 0) {
+			closedir(d);
+			free(dent_buf);
+			free(sub_path);
+			return -1;
+		}
+		if (S_ISDIR(st.st_mode)) {
+			r = cgroup_recursive_task_count(sub_path);
+			if (r >= 0)
+				n += r;
+		} else if (!strcmp(dent->d_name, "tasks")) {
+			r = count_lines(sub_path);
+			if (r >= 0)
+				n += r;
+		}
+		free(sub_path);
+	}
+	closedir(d);
+	free(dent_buf);
+
+	return n;
+}
+
+int count_lines(const char *fn)  
+{
+	FILE *f;
+	char *line = NULL;
+	size_t sz = 0;
+	int n = 0;
+
+	f = fopen_cloexec(fn, "r");
+	if (!f)
+		return -1;
+
+	while (getline(&line, &sz, f) != -1) {
+		n++;
+	}
+	free(line);
+	fclose(f);
+	return n;
 }
 
-int setup_cgroup(struct lxc_handler *h, struct lxc_list *cgroups)
+int handle_clone_children(struct cgroup_mount_point *mp, char *cgroup_path)
 {
-	return _setup_cgroup(h, cgroups, 0);
+	int r, saved_errno = 0;
+	/* if this is a cpuset hierarchy, we have to set cgroup.clone_children in
+	 * the base cgroup, otherwise containers will start with an empty cpuset.mems
+	 * and cpuset.cpus
+	 */
+	if (lxc_string_in_array("cpuset", (const char **)mp->hierarchy->subsystems)) {
+		char *cc_path = cgroup_to_absolute_path(mp, cgroup_path, "/cgroup.clone_children");
+		if (!cc_path)
+			return -1;
+		r = lxc_write_to_file(cc_path, "1", 1, false);
+		saved_errno = errno;
+		free(cc_path);
+		errno = saved_errno;
+		return r < 0 ? -1 : 0;
+	}
+	return 0;
 }
diff --git a/src/lxc/cgroup.h b/src/lxc/cgroup.h
index 01ee931..7185ef8 100644
--- a/src/lxc/cgroup.h
+++ b/src/lxc/cgroup.h
@@ -20,38 +20,145 @@
  * License along with this library; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
-#ifndef _cgroup_h
-#define _cgroup_h
+#ifndef _ncgroup_h
+#define _ncgroup_h
 #include <stdbool.h>
+#include <stdint.h>
+#include <stddef.h>
+
+struct cgroup_hierarchy;
+struct cgroup_meta_data;
+struct cgroup_mount_point;
+
+/*
+ * cgroup_meta_data: the metadata about the cgroup infrastructure on this
+ *                   host
+ */
+struct cgroup_meta_data {
+	ptrdiff_t ref; /* simple refcount */
+	struct cgroup_hierarchy **hierarchies;
+	struct cgroup_mount_point **mount_points;
+	int maximum_hierarchy;
+};
+
+/*
+ * cgroup_hierarchy: describes a single cgroup hierarchy
+ *                   (may have multiple mount points)
+ */
+struct cgroup_hierarchy {
+	int index;
+	bool used; /* false if the hierarchy should be ignored by lxc */
+	char **subsystems;
+	struct cgroup_mount_point *rw_absolute_mount_point;
+	struct cgroup_mount_point *ro_absolute_mount_point;
+	struct cgroup_mount_point **all_mount_points;
+	size_t all_mount_point_capacity;
+};
+
+/*
+ * cgroup_mount_point: a mount point to where a hierarchy
+ *                     is mounted to
+ */
+struct cgroup_mount_point {
+	struct cgroup_hierarchy *hierarchy;
+	char *mount_point;
+	char *mount_prefix;
+	bool read_only;
+};
 
 /*
- * cgroup_desc: describe a container's cgroup membership
+ * cgroup_process_info: describes the membership of a
+ *                      process to the different cgroup
+ *                      hierarchies
  */
-struct cgroup_desc {
-	char *mntpt; /* where this is mounted */
-	char *subsystems; /* comma-separated list of subsystems, or NULL */
-	char *curcgroup; /* task's current cgroup, full pathanme */
-	char *realcgroup; /* the cgroup as known in /proc/self/cgroup */
-	struct cgroup_desc *next;
+struct cgroup_process_info {
+	struct cgroup_process_info *next;
+	struct cgroup_meta_data *meta_ref;
+	struct cgroup_hierarchy *hierarchy;
+	char *cgroup_path;
+	char *cgroup_path_sub;
+	char **created_paths;
+	size_t created_paths_capacity;
+	size_t created_paths_count;
+	struct cgroup_mount_point *designated_mount_point;
 };
 
+/* meta data management:
+ *    lxc_cgroup_load_meta  loads the meta data (using subsystem
+ *                          whitelist from main lxc configuration)
+ *    lxc_cgroup_load_meta2 does the same, but allows one to specify
+ *                          a custom whitelist
+ *    lxc_cgroup_get_meta   increments the refcount of a meta data
+ *                          object
+ *    lxc_cgroup_put_meta   decrements the refcount of a meta data
+ *                          object, potentially destroying it
+ */
+extern struct cgroup_meta_data *lxc_cgroup_load_meta();
+extern struct cgroup_meta_data *lxc_cgroup_load_meta2(const char **subsystem_whitelist);
+extern struct cgroup_meta_data *lxc_cgroup_get_meta(struct cgroup_meta_data *meta_data);
+extern struct cgroup_meta_data *lxc_cgroup_put_meta(struct cgroup_meta_data *meta_data);
+
+/* find the hierarchy corresponding to a given subsystem */
+extern struct cgroup_hierarchy *lxc_cgroup_find_hierarchy(struct cgroup_meta_data *meta_data, const char *subsystem);
+
+/* find a mount point for a given hierarchy that has access to the cgroup in 'group' and (if wanted) is writable */
+extern struct cgroup_mount_point *lxc_cgroup_find_mount_point(struct cgroup_hierarchy *hierarchy, const char *group, bool should_be_writable);
+
+/* all-in-one: find a mount point for a given hierarchy that has access to the cgroup and return the correct path within */
+extern char *lxc_cgroup_find_abs_path(const char *subsystem, const char *group, bool should_be_writable, const char *suffix);
+
+/* determine the cgroup membership of a given process */
+extern struct cgroup_process_info *lxc_cgroup_process_info_get(pid_t pid, struct cgroup_meta_data *meta);
+extern struct cgroup_process_info *lxc_cgroup_process_info_get_init(struct cgroup_meta_data *meta);
+extern struct cgroup_process_info *lxc_cgroup_process_info_get_self(struct cgroup_meta_data *meta);
+
+/* create a new cgroup */
+extern struct cgroup_process_info *lxc_cgroup_create(const char *name, const char *path_pattern, struct cgroup_meta_data *meta_data, const char *sub_pattern);
+
+/* get the cgroup membership of a given container */
+extern struct cgroup_process_info *lxc_cgroup_get_container_info(const char *name, const char *lxcpath, struct cgroup_meta_data *meta_data);
+
+/* move a process to the cgroups specified by the membership */
+extern int lxc_cgroup_enter(struct cgroup_process_info *info, pid_t pid, bool enter_sub);
+
+/* free process membership information */
+extern void lxc_cgroup_process_info_free(struct cgroup_process_info *info);
+extern void lxc_cgroup_process_info_free_and_remove(struct cgroup_process_info *info);
+
 struct lxc_handler;
-extern void lxc_cgroup_destroy_desc(struct cgroup_desc *cgroups);
-extern char *lxc_cgroup_path_get(const char *subsystem, const char *name,
-			      const char *lxcpath);
-extern int lxc_cgroup_nrtasks(struct lxc_handler *handler);
-struct cgroup_desc *lxc_cgroup_path_create(const char *name);
-extern int lxc_cgroup_enter(struct cgroup_desc *cgroups, pid_t pid);
-extern int lxc_cgroup_attach(pid_t pid, const char *name, const char *lxcpath);
-extern char *cgroup_path_get(const char *subsystem, const char *cgpath);
-extern bool get_subsys_mount(char *dest, const char *subsystem);
-extern bool is_in_subcgroup(int pid, const char *subsystem, struct cgroup_desc *d);
+extern char *lxc_cgroup_get_hierarchy_path_handler(const char *subsystem, struct lxc_handler *handler);
+extern char *lxc_cgroup_get_hierarchy_path(const char *subsystem, const char *name, const char *lxcpath);
+extern char *lxc_cgroup_get_hierarchy_abs_path_handler(const char *subsystem, struct lxc_handler *handler);
+extern char *lxc_cgroup_get_hierarchy_abs_path(const char *subsystem, const char *name, const char *lxcpath);
+extern int lxc_cgroup_set_handler(const char *filename, const char *value, struct lxc_handler *handler);
+extern int lxc_cgroup_get_handler(const char *filename, char *value, size_t len, struct lxc_handler *handler);
+extern int lxc_cgroup_set(const char *filename, const char *value, const char *name, const char *lxcpath);
+extern int lxc_cgroup_get(const char *filename, char *value, size_t len, const char *name, const char *lxcpath);
+
 /*
- * Called by commands.c by a container's monitor to find out the
- * container's cgroup path in a specific subsystem
+ * lxc_cgroup_path_get: Get the absolute pathname for a cgroup
+ * file for a running container.
+ *
+ * @subsystem : the file of interest (e.g. "freezer.state") or
+ *              the subsystem name (e.g. "freezer") in which case
+ *              the directory where the cgroup may be modified
+ *              will be returned
+ * @name      : name of container to connect to
+ * @lxcpath   : the lxcpath in which the container is running
+ * 
+ * This is the exported function, which determines cgpath from the
+ * lxc-start of the @name container running in @lxcpath.
+ *
+ * Returns path on success, NULL on error. The caller must free()
+ * the returned path.
  */
-extern char *cgroup_get_subsys_path(struct lxc_handler *handler, const char *subsys);
+extern char *lxc_cgroup_path_get(const char *subsystem, const char *name,
+                                 const char *lxcpath);
+
 struct lxc_list;
-extern int setup_cgroup(struct lxc_handler *h, struct lxc_list *cgroups);
-extern int setup_cgroup_devices(struct lxc_handler *h, struct lxc_list *cgroups);
+extern int lxc_setup_cgroup_without_devices(struct lxc_handler *h, struct lxc_list *cgroup_settings);
+extern int lxc_setup_cgroup_devices(struct lxc_handler *h, struct lxc_list *cgroup_settings);
+
+extern int lxc_cgroup_nrtasks_handler(struct lxc_handler *handler);
+
 #endif
diff --git a/src/lxc/commands.c b/src/lxc/commands.c
index 0c05810..f12ae2d 100644
--- a/src/lxc/commands.c
+++ b/src/lxc/commands.c
@@ -38,6 +38,7 @@
 #include <lxc/conf.h>
 #include <lxc/start.h>	/* for struct lxc_handler */
 #include <lxc/utils.h>
+#include <lxc/cgroup.h>
 
 #include "commands.h"
 #include "console.h"
@@ -351,7 +352,6 @@ static int lxc_cmd_get_clone_flags_callback(int fd, struct lxc_cmd_req *req,
 	return lxc_cmd_rsp_send(fd, &rsp);
 }
 
-extern char *cgroup_get_subsys_path(struct lxc_handler *handler, const char *subsys);
 /*
  * lxc_cmd_get_cgroup_path: Calculate a container's cgroup path for a
  * particular subsystem. This is the cgroup path relative to the root
@@ -404,7 +404,7 @@ static int lxc_cmd_get_cgroup_callback(int fd, struct lxc_cmd_req *req,
 	if (req->datalen < 1)
 		return -1;
         
-	path = cgroup_get_subsys_path(handler, req->data);
+	path = lxc_cgroup_get_hierarchy_path_handler(req->data, handler);
 	if (!path)
 		return -1;
 	rsp.datalen = strlen(path) + 1,
@@ -560,7 +560,7 @@ static int lxc_cmd_stop_callback(int fd, struct lxc_cmd_req *req,
 	memset(&rsp, 0, sizeof(rsp));
 	rsp.ret = kill(handler->pid, stopsignal);
 	if (!rsp.ret) {
-		char *path = cgroup_get_subsys_path(handler, "freezer");
+		char *path = lxc_cgroup_get_hierarchy_path_handler("freezer", handler);
 		if (!path) {
 			ERROR("container %s:%s is not in a freezer cgroup",
 				handler->lxcpath, handler->name);
diff --git a/src/lxc/freezer.c b/src/lxc/freezer.c
index 7c8c61e..be97d75 100644
--- a/src/lxc/freezer.c
+++ b/src/lxc/freezer.c
@@ -123,7 +123,7 @@ static int freeze_unfreeze(const char *name, int freeze, const char *lxcpath)
 	char *cgabspath;
 	int ret;
 
-	cgabspath = lxc_cgroup_path_get("freezer", name, lxcpath);
+	cgabspath = lxc_cgroup_get_hierarchy_abs_path("freezer", name, lxcpath);
 	if (!cgabspath)
 		return -1;
 
@@ -145,17 +145,14 @@ int lxc_unfreeze(const char *name, const char *lxcpath)
 
 int lxc_unfreeze_bypath(const char *cgrelpath)
 {
-	char cgabspath[MAXPATHLEN];
-	int len, ret;
+	char *cgabspath;
+	int ret;
 
-	if (!get_subsys_mount(cgabspath, "freezer"))
-		return -1;
-	len = strlen(cgabspath);
-	ret = snprintf(cgabspath+len, MAXPATHLEN-len, "/%s", cgrelpath);
-	if (ret < 0 || ret >= MAXPATHLEN-len) {
-		ERROR("freezer path name too long");
+	cgabspath = lxc_cgroup_find_abs_path("freezer", cgrelpath, true, NULL);
+	if (!cgabspath)
 		return -1;
-	}
 
-	return do_unfreeze(cgabspath, 0, NULL, NULL);
+	ret = do_unfreeze(cgabspath, 0, NULL, NULL);
+	free(cgabspath);
+	return ret;
 }
diff --git a/src/lxc/lxc.h b/src/lxc/lxc.h
index 3477e83..84bcc04 100644
--- a/src/lxc/lxc.h
+++ b/src/lxc/lxc.h
@@ -141,37 +141,35 @@ struct lxc_handler;
 /*
  * Set a specified value for a specified subsystem. The specified
  * subsystem must be fully specified, eg. "cpu.shares"
- * @d         : the cgroup descriptor for the container
  * @filename  : the cgroup attribute filename
  * @value     : the value to be set
+ * @handler   : the lxc_handler structure of the container
  * Returns 0 on success, < 0 otherwise
  */
-extern int lxc_cgroup_set_value(struct lxc_handler *hander, const char *filename,
-				const char *value);
+extern int lxc_cgroup_set_handler(const char *filename, const char *value, struct lxc_handler *handler);
 
 /*
  * Set a specified value for a specified subsystem. The specified
  * subsystem must be fully specified, eg. "cpu.shares"
- * @name      : the name of the container
  * @filename  : the cgroup attribute filename
  * @value     : the value to be set
+ * @name      : the name of the container
  * @lxcpath   : lxc config path for container
  * Returns 0 on success, < 0 otherwise
  */
-extern int lxc_cgroup_set(const char *name, const char *filename, const char *value, const char *lxcpath);
+extern int lxc_cgroup_set(const char *filename, const char *value, const char *name, const char *lxcpath);
 
 /*
  * Get a specified value for a specified subsystem. The specified
  * subsystem must be fully specified, eg. "cpu.shares"
- * @name      : the name of the container
  * @filename  : the cgroup attribute filename
  * @value     : the value to be set
  * @len       : the len of the value variable
+ * @name      : the name of the container
  * @lxcpath   : lxc config path for container
  * Returns the number of bytes read, < 0 on error
  */
-extern int lxc_cgroup_get(const char *name, const char *filename,
-			  char *value, size_t len, const char *lxcpath);
+extern int lxc_cgroup_get(const char *filename, char *value, size_t len, const char *name, const char *lxcpath);
 
 /*
  * Retrieve the error string associated with the error returned by
diff --git a/src/lxc/lxccontainer.c b/src/lxc/lxccontainer.c
index 3c657ca..17f757a 100644
--- a/src/lxc/lxccontainer.c
+++ b/src/lxc/lxccontainer.c
@@ -1665,7 +1665,7 @@ static bool lxcapi_set_cgroup_item(struct lxc_container *c, const char *subsys,
 	if (container_disk_lock(c))
 		return false;
 
-	ret = lxc_cgroup_set(c->name, subsys, value, c->config_path);
+	ret = lxc_cgroup_set(subsys, value, c->name, c->config_path);
 
 	container_disk_unlock(c);
 	return ret == 0;
@@ -1684,7 +1684,7 @@ static int lxcapi_get_cgroup_item(struct lxc_container *c, const char *subsys, c
 	if (container_disk_lock(c))
 		return -1;
 
-	ret = lxc_cgroup_get(c->name, subsys, retv, inlen, c->config_path);
+	ret = lxc_cgroup_get(subsys, retv, inlen, c->name, c->config_path);
 
 	container_disk_unlock(c);
 	return ret;
diff --git a/src/lxc/lxcutmp.c b/src/lxc/lxcutmp.c
index ee51f87..8736f3f 100644
--- a/src/lxc/lxcutmp.c
+++ b/src/lxc/lxcutmp.c
@@ -283,7 +283,7 @@ static int utmp_get_ntasks(struct lxc_handler *handler)
 {
 	int ntasks;
 
-	ntasks = lxc_cgroup_nrtasks(handler);
+	ntasks = lxc_cgroup_nrtasks_handler(handler);
 
 	if (ntasks < 0) {
 		ERROR("failed to get the number of tasks");
diff --git a/src/lxc/start.c b/src/lxc/start.c
index e841bac..0356fc0 100644
--- a/src/lxc/start.c
+++ b/src/lxc/start.c
@@ -384,7 +384,7 @@ static void lxc_fini(const char *name, struct lxc_handler *handler)
 	handler->conf->maincmd_fd = -1;
 	free(handler->name);
 	if (handler->cgroup) {
-		lxc_cgroup_destroy_desc(handler->cgroup);
+		lxc_cgroup_process_info_free_and_remove(handler->cgroup);
 		handler->cgroup = NULL;
 	}
 	free(handler);
@@ -603,11 +603,12 @@ int save_phys_nics(struct lxc_conf *conf)
 	return 0;
 }
 
-extern bool is_in_subcgroup(int pid, const char *subsystem, struct cgroup_desc *d);
 int lxc_spawn(struct lxc_handler *handler)
 {
 	int failed_before_rename = 0;
 	const char *name = handler->name;
+	struct cgroup_meta_data *cgroup_meta = NULL;
+	const char *cgroup_pattern = NULL;
 
 	if (lxc_sync_init(handler))
 		return -1;
@@ -646,6 +647,22 @@ int lxc_spawn(struct lxc_handler *handler)
 		goto out_abort;
 	}
 
+	cgroup_meta = lxc_cgroup_load_meta();
+	if (!cgroup_meta) {
+		ERROR("failed to detect cgroup metadata");
+		goto out_delete_net;
+	}
+
+	/* if we are running as root, use system cgroup pattern, otherwise
+	 * just create a cgroup under the current one. But also fall back to
+	 * that if for some reason reading the configuration fails and no
+	 * default value is available
+	 */
+	if (getuid() == 0)
+		cgroup_pattern = lxc_global_config_value("cgroup.pattern");
+	if (!cgroup_pattern)
+		cgroup_pattern = "%n";
+
 	/*
 	 * if the rootfs is not a blockdev, prevent the container from
 	 * marking it readonly.
@@ -669,15 +686,17 @@ int lxc_spawn(struct lxc_handler *handler)
 	if (lxc_sync_wait_child(handler, LXC_SYNC_CONFIGURE))
 		failed_before_rename = 1;
 
-	if ((handler->cgroup = lxc_cgroup_path_create(name)) == NULL)
+	if ((handler->cgroup = lxc_cgroup_create(name, cgroup_pattern, cgroup_meta, NULL)) == NULL) {
+		ERROR("failed to create cgroups for '%s'", name);
 		goto out_delete_net;
+	}
 
-	if (setup_cgroup(handler, &handler->conf->cgroup)) {
+	if (lxc_setup_cgroup_without_devices(handler, &handler->conf->cgroup)) {
 		ERROR("failed to setup the cgroups for '%s'", name);
 		goto out_delete_net;
 	}
 
-	if (lxc_cgroup_enter(handler->cgroup, handler->pid) < 0)
+	if (lxc_cgroup_enter(handler->cgroup, handler->pid, false) < 0)
 		goto out_delete_net;
 
 	if (failed_before_rename)
@@ -707,7 +726,7 @@ int lxc_spawn(struct lxc_handler *handler)
 	if (lxc_sync_barrier_child(handler, LXC_SYNC_POST_CONFIGURE))
 		goto out_delete_net;
 
-	if (setup_cgroup_devices(handler, &handler->conf->cgroup)) {
+	if (lxc_setup_cgroup_devices(handler, &handler->conf->cgroup)) {
 		ERROR("failed to setup the devices cgroup for '%s'", name);
 		goto out_delete_net;
 	}
@@ -739,6 +758,7 @@ int lxc_spawn(struct lxc_handler *handler)
 		goto out_abort;
 	}
 
+	lxc_cgroup_put_meta(cgroup_meta);
 	lxc_sync_fini(handler);
 
 	return 0;
@@ -747,6 +767,7 @@ out_delete_net:
 	if (handler->clone_flags & CLONE_NEWNET)
 		lxc_delete_network(handler);
 out_abort:
+	lxc_cgroup_put_meta(cgroup_meta);
 	lxc_abort(name, handler);
 	lxc_sync_fini(handler);
 	if (handler->pinfd >= 0) {
diff --git a/src/lxc/start.h b/src/lxc/start.h
index 3e5ad64..9bf6024 100644
--- a/src/lxc/start.h
+++ b/src/lxc/start.h
@@ -55,7 +55,7 @@ struct lxc_handler {
 #endif
 	int pinfd;
 	const char *lxcpath;
-	struct cgroup_desc *cgroup;
+	struct cgroup_process_info *cgroup;
 };
 
 extern struct lxc_handler *lxc_init(const char *name, struct lxc_conf *, const char *);
diff --git a/src/lxc/state.c b/src/lxc/state.c
index 5492634..398833a 100644
--- a/src/lxc/state.c
+++ b/src/lxc/state.c
@@ -75,7 +75,7 @@ static lxc_state_t freezer_state(const char *name, const char *lxcpath)
 	FILE *file;
 	int ret;
 
-	cgabspath = lxc_cgroup_path_get("freezer", name, lxcpath);
+	cgabspath = lxc_cgroup_get_hierarchy_abs_path("freezer", name, lxcpath);
 	if (!cgabspath)
 		return -1;
 
diff --git a/src/lxc/utils.c b/src/lxc/utils.c
index 02336d7..e7bd562 100644
--- a/src/lxc/utils.c
+++ b/src/lxc/utils.c
@@ -233,6 +233,7 @@ const char *lxc_global_config_value(const char *option_name)
 		{ "zfsroot",         DEFAULT_ZFSROOT },
 		{ "lxcpath",         LXCPATH         },
 		{ "cgroup.pattern",  DEFAULT_CGROUP_PATTERN },
+		{ "cgroup.use",      NULL            },
 		{ NULL, NULL },
 	};
 	static const char *values[sizeof(options) / sizeof(options[0])] = { 0 };
diff --git a/src/tests/cgpath.c b/src/tests/cgpath.c
index 6761af4..f9aaced 100644
--- a/src/tests/cgpath.c
+++ b/src/tests/cgpath.c
@@ -75,21 +75,21 @@ static int test_running_container(const char *lxcpath,
 	}
 
 	/* test get/set value using memory.swappiness file */
-	ret = lxc_cgroup_get(c->name, "memory.swappiness", value,
-			     sizeof(value), c->config_path);
+	ret = lxc_cgroup_get("memory.swappiness", value, sizeof(value),
+			     c->name, c->config_path);
 	if (ret < 0) {
 		TSTERR("lxc_cgroup_get failed");
 		goto err3;
 	}
 	strcpy(value_save, value);
 
-	ret = lxc_cgroup_set(c->name, "memory.swappiness", "100", c->config_path);
+	ret = lxc_cgroup_set("memory.swappiness", "100", c->name, c->config_path);
 	if (ret < 0) {
 		TSTERR("lxc_cgroup_set_bypath failed");
 		goto err3;
 	}
-	ret = lxc_cgroup_get(c->name, "memory.swappiness", value,
-			     sizeof(value), c->config_path);
+	ret = lxc_cgroup_get("memory.swappiness", value, sizeof(value),
+			     c->name, c->config_path);
 	if (ret < 0) {
 		TSTERR("lxc_cgroup_get failed");
 		goto err3;
@@ -100,14 +100,14 @@ static int test_running_container(const char *lxcpath,
 	}
 
 	/* restore original value */
-	ret = lxc_cgroup_set(c->name, "memory.swappiness", value_save,
-			     c->config_path);
+	ret = lxc_cgroup_set("memory.swappiness", value_save,
+			     c->name, c->config_path);
 	if (ret < 0) {
 		TSTERR("lxc_cgroup_set failed");
 		goto err3;
 	}
-	ret = lxc_cgroup_get(c->name, "memory.swappiness", value,
-			     sizeof(value), c->config_path);
+	ret = lxc_cgroup_get("memory.swappiness", value, sizeof(value),
+			     c->name, c->config_path);
 	if (ret < 0) {
 		TSTERR("lxc_cgroup_get failed");
 		goto err3;
-- 
1.7.10.4





More information about the lxc-devel mailing list