[lxc-devel] [lxcfs/master] bindings: add infrastructure for cgroup2 support

brauner on Github lxc-bot at linuxcontainers.org
Thu Feb 20 15:31:48 UTC 2020


A non-text attachment was scrubbed...
Name: not available
Type: text/x-mailbox
Size: 414 bytes
Desc: not available
URL: <http://lists.linuxcontainers.org/pipermail/lxc-devel/attachments/20200220/864a7793/attachment-0001.bin>
-------------- next part --------------
From 4f8198790acda0337010090255aac90b9f943902 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner at ubuntu.com>
Date: Thu, 20 Feb 2020 16:30:47 +0100
Subject: [PATCH] bindings: add infrastructure for cgroup2 support

Mostly based on code I've written for liblxc.

Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
---
 Makefile.am               |  10 +-
 bindings.c                | 497 ++++++------------------
 bindings.h                |   4 +-
 cgroups/cgfsng.c          | 787 ++++++++++++++++++++++++++++++++++++++
 cgroups/cgroup.c          |  79 ++++
 cgroups/cgroup.h          | 150 ++++++++
 cgroups/cgroup2_devices.c | 457 ++++++++++++++++++++++
 cgroups/cgroup2_devices.h | 154 ++++++++
 cgroups/cgroup_utils.c    | 726 +++++++++++++++++++++++++++++++++++
 cgroups/cgroup_utils.h    |  72 ++++
 configure.ac              |   9 +
 macro.h                   |  56 ++-
 memory_utils.h            |   2 +
 sysfs_fuse.c              |   4 +-
 14 files changed, 2618 insertions(+), 389 deletions(-)
 create mode 100644 cgroups/cgfsng.c
 create mode 100644 cgroups/cgroup.c
 create mode 100644 cgroups/cgroup.h
 create mode 100644 cgroups/cgroup2_devices.c
 create mode 100644 cgroups/cgroup2_devices.h
 create mode 100644 cgroups/cgroup_utils.c
 create mode 100644 cgroups/cgroup_utils.h

diff --git a/Makefile.am b/Makefile.am
index 13fb1e3..e783f29 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -13,6 +13,10 @@ AM_LDFLAGS = $(FUSE_LIBS) -pthread
 AM_CFLAGS += -DRUNTIME_PATH=\"$(RUNTIME_PATH)\"
 
 liblxcfs_la_SOURCES = bindings.c bindings.h \
+		      cgroups/cgfsng.c \
+		      cgroups/cgroup.c cgroups/cgroup.h \
+		      cgroups/cgroup2_devices.c cgroups/cgroup2_devices.h \
+		      cgroups/cgroup_utils.c cgroups/cgroup_utils.h \
 		      cpuset.c \
 		      memory_utils.h \
 		      sysfs_fuse.c sysfs_fuse.h
@@ -20,13 +24,17 @@ liblxcfs_la_CFLAGS = $(AM_CFLAGS)
 liblxcfs_la_LDFLAGS = $(AM_CFLAGS) -module -avoid-version -shared
 
 liblxcfstest_la_SOURCES = bindings.c bindings.h \
+			  cgroups/cgfsng.c \
+			  cgroups/cgroup.c cgroups/cgroup.h \
+			  cgroups/cgroup2_devices.c cgroups/cgroup2_devices.h \
+			  cgroups/cgroup_utils.c cgroups/cgroup_utils.h \
 			  cpuset.c \
 			  memory_utils.h \
 			  sysfs_fuse.c sysfs_fuse.h
 liblxcfstest_la_CFLAGS = $(AM_CFLAGS) -DRELOADTEST
 liblxcfstest_la_LDFLAGS = $(AM_CFLAGS) -module -avoid-version -shared
 
-noinst_HEADERS = bindings.h macro.h memory_utils.h sysfs_fuse.h
+noinst_HEADERS = bindings.h cgroups/cgroup.h cgroups/cgroup2_devices.h cgroups/cgroup_utils.h macro.h memory_utils.h sysfs_fuse.h
 
 sodir=$(libdir)
 lxcfs_LTLIBRARIES = liblxcfs.la
diff --git a/bindings.c b/bindings.c
index 4a8a421..ab0cd71 100644
--- a/bindings.c
+++ b/bindings.c
@@ -38,6 +38,8 @@
 #include <sys/vfs.h>
 
 #include "bindings.h"
+#include "cgroups/cgroup.h"
+#include "cgroups/cgroup_utils.h"
 #include "memory_utils.h"
 #include "config.h"
 
@@ -410,25 +412,8 @@ static void lock_mutex(pthread_mutex_t *l)
 	}
 }
 
-/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
- * Number of hierarchies mounted. */
-static int num_hierarchies;
+static struct cgroup_ops *cgroup_ops;
 
-/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
- * Hierachies mounted {cpuset, blkio, ...}:
- * Initialized via __constructor__ collect_and_mount_subsystems(). */
-static char **hierarchies;
-
-/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
- * Open file descriptors:
- * @fd_hierarchies[i] refers to cgroup @hierarchies[i]. They are mounted in a
- * private mount namespace.
- * Initialized via __constructor__ collect_and_mount_subsystems().
- * @fd_hierarchies[i] can be used to perform file operations on the cgroup
- * mounts and respective files in the private namespace even when located in
- * another namespace using the *at() family of functions
- * {openat(), fchownat(), ...}. */
-static int *fd_hierarchies;
 static int cgroup_mount_ns_fd = -1;
 
 static void unlock_mutex(pthread_mutex_t *l)
@@ -599,70 +584,6 @@ static int is_dir(const char *path, int fd)
 	return 0;
 }
 
-static char *must_copy_string(const char *str)
-{
-	char *dup = NULL;
-	if (!str)
-		return NULL;
-	do {
-		dup = strdup(str);
-	} while (!dup);
-
-	return dup;
-}
-
-static inline void drop_trailing_newlines(char *s)
-{
-	int l;
-
-	for (l=strlen(s); l>0 && s[l-1] == '\n'; l--)
-		s[l-1] = '\0';
-}
-
-#define BATCH_SIZE 50
-static void dorealloc(char **mem, size_t oldlen, size_t newlen)
-{
-	int newbatches = (newlen / BATCH_SIZE) + 1;
-	int oldbatches = (oldlen / BATCH_SIZE) + 1;
-
-	if (!*mem || newbatches > oldbatches) {
-		char *tmp;
-		do {
-			tmp = realloc(*mem, newbatches * BATCH_SIZE);
-		} while (!tmp);
-		*mem = tmp;
-	}
-}
-static void append_line(char **contents, size_t *len, char *line, ssize_t linelen)
-{
-	size_t newlen = *len + linelen;
-	dorealloc(contents, *len, newlen + 1);
-	memcpy(*contents + *len, line, linelen+1);
-	*len = newlen;
-}
-
-static char *slurp_file(const char *from, int fd)
-{
-	char *line = NULL;
-	char *contents = NULL;
-	FILE *f = fdopen(fd, "r");
-	size_t len = 0, fulllen = 0;
-	ssize_t linelen;
-
-	if (!f)
-		return NULL;
-
-	while ((linelen = getline(&line, &len, f)) != -1) {
-		append_line(&contents, &fulllen, line, linelen);
-	}
-	fclose(f);
-
-	if (contents)
-		drop_trailing_newlines(contents);
-	free(line);
-	return contents;
-}
-
 static int preserve_ns(const int pid, const char *ns)
 {
 	int ret;
@@ -776,79 +697,29 @@ struct cgfs_files {
 	uint32_t mode;
 };
 
-#define ALLOC_NUM 20
-static bool store_hierarchy(char *stridx, char *h)
-{
-	if (num_hierarchies % ALLOC_NUM == 0) {
-		size_t n = (num_hierarchies / ALLOC_NUM) + 1;
-		n *= ALLOC_NUM;
-		char **tmp = realloc(hierarchies, n * sizeof(char *));
-		if (!tmp) {
-			lxcfs_error("%s\n", strerror(errno));
-			exit(1);
-		}
-		hierarchies = tmp;
-	}
-
-	hierarchies[num_hierarchies++] = must_copy_string(h);
-	return true;
-}
-
 static void print_subsystems(void)
 {
-	int i;
+	int i = 0;
 
 	fprintf(stderr, "mount namespace: %d\n", cgroup_mount_ns_fd);
 	fprintf(stderr, "hierarchies:\n");
-	for (i = 0; i < num_hierarchies; i++) {
-		if (hierarchies[i])
-			fprintf(stderr, " %2d: fd: %3d: %s\n", i,
-				fd_hierarchies[i], hierarchies[i]);
+	for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++, i++) {
+		__do_free char *controllers = lxc_string_join(",", (const char **)(*h)->controllers, false);
+		fprintf(stderr, " %2d: fd: %3d: %s\n", i, (*h)->fd, controllers ?: "");
 	}
 }
 
-static bool in_comma_list(const char *needle, const char *haystack)
-{
-	const char *s = haystack, *e;
-	size_t nlen = strlen(needle);
-
-	while (*s && (e = strchr(s, ','))) {
-		if (nlen != e - s) {
-			s = e + 1;
-			continue;
-		}
-		if (strncmp(needle, s, nlen) == 0)
-			return true;
-		s = e + 1;
-	}
-	if (strcmp(needle, s) == 0)
-		return true;
-	return false;
-}
-
 /* do we need to do any massaging here?  I'm not sure... */
 /* Return the mounted controller and store the corresponding open file descriptor
  * referring to the controller mountpoint in the private lxcfs namespace in
  * @cfd.
  */
-static char *find_mounted_controller(const char *controller, int *cfd)
+static int find_mounted_controller(const char *controller)
 {
-	int i;
-
-	for (i = 0; i < num_hierarchies; i++) {
-		if (!hierarchies[i])
-			continue;
-		if (strcmp(hierarchies[i], controller) == 0) {
-			*cfd = fd_hierarchies[i];
-			return hierarchies[i];
-		}
-		if (in_comma_list(controller, hierarchies[i])) {
-			*cfd = fd_hierarchies[i];
-			return hierarchies[i];
-		}
-	}
+	struct hierarchy *h;
 
-	return NULL;
+	h = cgroup_ops->get_hierarchy(cgroup_ops, controller);
+	return h ? h->fd : -EBADF;
 }
 
 bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
@@ -856,10 +727,10 @@ bool cgfs_set_value(const char *controller, const char *cgroup, const char *file
 {
 	int ret, fd, cfd;
 	size_t len;
-	char *fnam, *tmpc;
+	char *fnam;
 
-	tmpc = find_mounted_controller(controller, &cfd);
-	if (!tmpc)
+	cfd = find_mounted_controller(controller);
+	if (cfd < 0)
 		return false;
 
 	/* Make sure we pass a relative path to *at() family of functions.
@@ -922,10 +793,10 @@ int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
 {
 	int cfd;
 	size_t len;
-	char *dirnam, *tmpc;
+	char *dirnam;
 
-	tmpc = find_mounted_controller(controller, &cfd);
-	if (!tmpc)
+	cfd = find_mounted_controller(controller);
+	if (cfd < 0)
 		return -EINVAL;
 
 	/* Make sure we pass a relative path to *at() family of functions.
@@ -1012,11 +883,11 @@ bool cgfs_remove(const char *controller, const char *cg)
 {
 	int fd, cfd;
 	size_t len;
-	char *dirnam, *tmpc;
+	char *dirnam;
 	bool bret;
 
-	tmpc = find_mounted_controller(controller, &cfd);
-	if (!tmpc)
+	cfd = find_mounted_controller(controller);
+	if (cfd < 0)
 		return false;
 
 	/* Make sure we pass a relative path to *at() family of functions.
@@ -1039,10 +910,10 @@ bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
 {
 	int cfd;
 	size_t len;
-	char *pathname, *tmpc;
+	char *pathname;
 
-	tmpc = find_mounted_controller(controller, &cfd);
-	if (!tmpc)
+	cfd = find_mounted_controller(controller);
+	if (cfd < 0)
 		return false;
 
 	/* Make sure we pass a relative path to *at() family of functions.
@@ -1076,11 +947,11 @@ int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t g
 {
 	int cfd;
 	size_t len;
-	char *pathname, *tmpc;
+	char *pathname;
 
-	tmpc = find_mounted_controller(controller, &cfd);
-	if (!tmpc)
-		return -EINVAL;
+	cfd = find_mounted_controller(controller);
+	if (cfd < 0)
+		return -EINVAL;
 
 	/* Make sure we pass a relative path to *at() family of functions.
 	 * . + /file + \0
@@ -1102,11 +973,11 @@ FILE *open_pids_file(const char *controller, const char *cgroup)
 {
 	int fd, cfd;
 	size_t len;
-	char *pathname, *tmpc;
+	char *pathname;
 
-	tmpc = find_mounted_controller(controller, &cfd);
-	if (!tmpc)
-		return NULL;
+	cfd = find_mounted_controller(controller);
+	if (cfd < 0)
+		return NULL;
 
 	/* Make sure we pass a relative path to *at() family of functions.
 	 * . + /cgroup + / "cgroup.procs" + \0
@@ -1128,15 +999,15 @@ static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool
 {
 	int cfd, fd, ret;
 	size_t len;
-	char *cg, *tmpc;
+	char *cg;
 	char pathname[MAXPATHLEN];
 	size_t sz = 0, asz = 0;
 	struct dirent *dirent;
 	DIR *dir;
 
-	tmpc = find_mounted_controller(controller, &cfd);
+	cfd = find_mounted_controller(controller);
 	*list = NULL;
-	if (!tmpc)
+	if (cfd < 0)
 		return false;
 
 	/* Make sure we pass a relative path to *at() family of functions. */
@@ -1233,12 +1104,12 @@ void free_keys(struct cgfs_files **keys)
 
 bool cgfs_get_value(const char *controller, const char *cgroup, const char *file, char **value)
 {
-	int ret, fd, cfd;
+	int ret, cfd;
 	size_t len;
-	char *fnam, *tmpc;
+	char *fnam;
 
-	tmpc = find_mounted_controller(controller, &cfd);
-	if (!tmpc)
+	cfd = find_mounted_controller(controller);
+	if (cfd < 0)
 		return false;
 
 	/* Make sure we pass a relative path to *at() family of functions.
@@ -1250,11 +1121,7 @@ bool cgfs_get_value(const char *controller, const char *cgroup, const char *file
 	if (ret < 0 || (size_t)ret >= len)
 		return false;
 
-	fd = openat(cfd, fnam, O_RDONLY);
-	if (fd < 0)
-		return false;
-
-	*value = slurp_file(fnam, fd);
+	*value = readat_file(cfd, fnam);
 	return *value != NULL;
 }
 
@@ -1262,10 +1129,10 @@ bool cgfs_param_exist(const char *controller, const char *cgroup, const char *fi
 {
 	int ret, cfd;
 	size_t len;
-	char *fnam, *tmpc;
+	char *fnam;
 
-	tmpc = find_mounted_controller(controller, &cfd);
-	if (!tmpc)
+	cfd = find_mounted_controller(controller);
+	if (cfd < 0)
 		return false;
 
 	/* Make sure we pass a relative path to *at() family of functions.
@@ -1284,12 +1151,12 @@ struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, cons
 {
 	int ret, cfd;
 	size_t len;
-	char *fnam, *tmpc;
+	char *fnam;
 	struct stat sb;
 	struct cgfs_files *newkey;
 
-	tmpc = find_mounted_controller(controller, &cfd);
-	if (!tmpc)
+	cfd = find_mounted_controller(controller);
+	if (cfd < 0)
 		return false;
 
 	if (file && *file == '/')
@@ -1347,12 +1214,12 @@ bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
 {
 	int cfd;
 	size_t len;
-	char *fnam, *tmpc;
+	char *fnam;
 	int ret;
 	struct stat sb;
 
-	tmpc = find_mounted_controller(controller, &cfd);
-	if (!tmpc)
+	cfd = find_mounted_controller(controller);
+	if (cfd < 0)
 		return false;
 
 	/* Make sure we pass a relative path to *at() family of functions.
@@ -1707,58 +1574,18 @@ static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
 	return start;
 }
 
-static void stripnewline(char *x)
-{
-	size_t l = strlen(x);
-	if (l && x[l-1] == '\n')
-		x[l-1] = '\0';
-}
-
 char *get_pid_cgroup(pid_t pid, const char *contrl)
 {
 	int cfd;
-	char fnam[PROCLEN];
-	FILE *f;
-	char *answer = NULL;
-	char *line = NULL;
-	size_t len = 0;
-	int ret;
-	const char *h = find_mounted_controller(contrl, &cfd);
-	if (!h)
-		return NULL;
 
-	ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
-	if (ret < 0 || ret >= PROCLEN)
-		return NULL;
-	if (!(f = fopen(fnam, "r")))
-		return NULL;
+	cfd = find_mounted_controller(contrl);
+	if (cfd < 0)
+		return NULL;
 
-	while (getline(&line, &len, f) != -1) {
-		char *c1, *c2;
-		if (!line[0])
-			continue;
-		c1 = strchr(line, ':');
-		if (!c1)
-			goto out;
-		c1++;
-		c2 = strchr(c1, ':');
-		if (!c2)
-			goto out;
-		*c2 = '\0';
-		if (strcmp(c1, h) != 0)
-			continue;
-		c2++;
-		stripnewline(c2);
-		do {
-			answer = strdup(c2);
-		} while (!answer);
-		break;
-	}
+	if (pure_unified_layout(cgroup_ops))
+		return cg_unified_get_current_cgroup(pid);
 
-out:
-	fclose(f);
-	free(line);
-	return answer;
+	return cg_legacy_get_current_cgroup(pid, contrl);
 }
 
 /*
@@ -1939,10 +1766,9 @@ static char *pick_controller_from_path(struct fuse_context *fc, const char *path
 	if (slash)
 		*slash = '\0';
 
-	int i;
-	for (i = 0; i < num_hierarchies; i++) {
-		if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0)
-			return hierarchies[i];
+	for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
+		if ((*h)->__controllers && strcmp((*h)->__controllers, contr) == 0)
+			return (*h)->__controllers;
 	}
 	errno = ENOENT;
 	return NULL;
@@ -2005,7 +1831,7 @@ int cg_getattr(const char *path, struct stat *sb)
 	int ret = -ENOENT;
 
 
-	if (!fc)
+	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
 		return -EIO;
 
 	memset(sb, 0, sizeof(struct stat));
@@ -2110,7 +1936,7 @@ int cg_opendir(const char *path, struct fuse_file_info *fi)
 	struct file_info *dir_info;
 	char *controller = NULL;
 
-	if (!fc)
+	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
 		return -EIO;
 
 	if (strcmp(path, "/cgroup") == 0) {
@@ -2164,6 +1990,9 @@ int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset
 	struct fuse_context *fc = fuse_get_context();
 	char **clist = NULL;
 
+	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
+		return -EIO;
+
 	if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0)
 		return -EIO;
 
@@ -2172,14 +2001,18 @@ int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset
 		return -EIO;
 	}
 	if (!d->cgroup && !d->controller) {
-		// ls /var/lib/lxcfs/cgroup - just show list of controllers
-		int i;
+		/*
+		 * ls /var/lib/lxcfs/cgroup - just show list of controllers.
+		 * This only works with the legacy hierarchy.
+		 */
+		for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
+			if (is_unified_hierarchy(*h))
+				continue;
 
-		for (i = 0;  i < num_hierarchies; i++) {
-			if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) {
+			if ((*h)->__controllers && filler(buf, (*h)->__controllers, NULL, 0))
 				return -EIO;
-			}
 		}
+
 		return 0;
 	}
 
@@ -2274,7 +2107,7 @@ int cg_open(const char *path, struct fuse_file_info *fi)
 	struct fuse_context *fc = fuse_get_context();
 	int ret;
 
-	if (!fc)
+	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
 		return -EIO;
 
 	controller = pick_controller_from_path(fc, path);
@@ -2342,12 +2175,12 @@ int cg_access(const char *path, int mode)
 	struct cgfs_files *k = NULL;
 	struct fuse_context *fc = fuse_get_context();
 
+	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
+		return -EIO;
+
 	if (strcmp(path, "/cgroup") == 0)
 		return 0;
 
-	if (!fc)
-		return -EIO;
-
 	controller = pick_controller_from_path(fc, path);
 	if (!controller)
 		return -errno;
@@ -2758,6 +2591,9 @@ int cg_read(const char *path, char *buf, size_t size, off_t offset,
 	int ret, s;
 	bool r;
 
+	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
+		return -EIO;
+
 	if (f->type != LXC_TYPE_CGFILE) {
 		lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
 		return -EIO;
@@ -2766,9 +2602,6 @@ int cg_read(const char *path, char *buf, size_t size, off_t offset,
 	if (offset)
 		return 0;
 
-	if (!fc)
-		return -EIO;
-
 	if (!f->controller)
 		return -EINVAL;
 
@@ -3068,6 +2901,9 @@ int cg_write(const char *path, const char *buf, size_t size, off_t offset,
 	struct file_info *f = (struct file_info *)fi->fh;
 	bool r;
 
+	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
+		return -EIO;
+
 	if (f->type != LXC_TYPE_CGFILE) {
 		lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write.");
 		return -EIO;
@@ -3076,9 +2912,6 @@ int cg_write(const char *path, const char *buf, size_t size, off_t offset,
 	if (offset)
 		return 0;
 
-	if (!fc)
-		return -EIO;
-
 	localbuf = alloca(size+1);
 	localbuf[size] = '\0';
 	memcpy(localbuf, buf, size);
@@ -3118,7 +2951,7 @@ int cg_chown(const char *path, uid_t uid, gid_t gid)
 	const char *cgroup;
 	int ret;
 
-	if (!fc)
+	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
 		return -EIO;
 
 	if (strcmp(path, "/cgroup") == 0)
@@ -3184,7 +3017,7 @@ int cg_chmod(const char *path, mode_t mode)
 	const char *cgroup;
 	int ret;
 
-	if (!fc)
+	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
 		return -EIO;
 
 	if (strcmp(path, "/cgroup") == 0)
@@ -3252,7 +3085,7 @@ int cg_mkdir(const char *path, mode_t mode)
 	const char *cgroup;
 	int ret;
 
-	if (!fc)
+	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
 		return -EIO;
 
 	controller = pick_controller_from_path(fc, path);
@@ -3306,7 +3139,7 @@ int cg_rmdir(const char *path)
 	const char *cgroup;
 	int ret;
 
-	if (!fc)
+	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
 		return -EIO;
 
 	controller = pick_controller_from_path(fc, path);
@@ -3427,7 +3260,7 @@ static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *
 	}
 }
 
-int read_file(const char *path, char *buf, size_t size, struct file_info *d)
+int read_file_fuse(const char *path, char *buf, size_t size, struct file_info *d)
 {
 	size_t linelen = 0, total_len = 0, rv = 0;
 	char *line = NULL;
@@ -3538,7 +3371,7 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset,
 		initpid = fc->pid;
 	cg = get_pid_cgroup(initpid, "memory");
 	if (!cg)
-		return read_file("/proc/meminfo", buf, size, d);
+		return read_file_fuse("/proc/meminfo", buf, size, d);
 	prune_init_slice(cg);
 
 	memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
@@ -3828,14 +3661,13 @@ static double exact_cpu_count(const char *cg)
 bool use_cpuview(const char *cg)
 {
 	int cfd;
-	char *tmpc;
 
-	tmpc = find_mounted_controller("cpu", &cfd);
-	if (!tmpc)
+	cfd = find_mounted_controller("cpu");
+	if (cfd < 0)
 		return false;
 
-	tmpc = find_mounted_controller("cpuacct", &cfd);
-	if (!tmpc)
+	cfd = find_mounted_controller("cpuacct");
+	if (cfd < 0)
 		return false;
 
 	return true;
@@ -3885,7 +3717,7 @@ static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
 		initpid = fc->pid;
 	cg = get_pid_cgroup(initpid, "cpuset");
 	if (!cg)
-		return read_file("proc/cpuinfo", buf, size, d);
+		return read_file_fuse("proc/cpuinfo", buf, size, d);
 	prune_init_slice(cg);
 
 	cpuset = get_cpuset(cg);
@@ -4988,13 +4820,13 @@ static int proc_stat_read(char *buf, size_t size, off_t offset,
 	 * in some case cpuacct_usage.all in "/" will larger then /proc/stat
 	 */
 	if (initpid == 1) {
-	    return read_file("/proc/stat", buf, size, d);
+	    return read_file_fuse("/proc/stat", buf, size, d);
 	}
 
 	cg = get_pid_cgroup(initpid, "cpuset");
 	lxcfs_v("cg: %s\n", cg);
 	if (!cg)
-		return read_file("/proc/stat", buf, size, d);
+		return read_file_fuse("/proc/stat", buf, size, d);
 	prune_init_slice(cg);
 
 	cpuset = get_cpuset(cg);
@@ -5333,7 +5165,7 @@ static int proc_diskstats_read(char *buf, size_t size, off_t offset,
 		initpid = fc->pid;
 	cg = get_pid_cgroup(initpid, "blkio");
 	if (!cg)
-		return read_file("/proc/diskstats", buf, size, d);
+		return read_file_fuse("/proc/diskstats", buf, size, d);
 	prune_init_slice(cg);
 
 	if (!cgfs_get_value("blkio", cg, "blkio.io_serviced_recursive", &io_serviced_str))
@@ -5455,7 +5287,7 @@ static int proc_swaps_read(char *buf, size_t size, off_t offset,
 		initpid = fc->pid;
 	cg = get_pid_cgroup(initpid, "memory");
 	if (!cg)
-		return read_file("/proc/swaps", buf, size, d);
+		return read_file_fuse("/proc/swaps", buf, size, d);
 	prune_init_slice(cg);
 
 	memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
@@ -5810,14 +5642,14 @@ static int proc_loadavg_read(char *buf, size_t size, off_t offset,
 		return total_len;
 	}
 	if (!loadavg)
-		return read_file("/proc/loadavg", buf, size, d);
+		return read_file_fuse("/proc/loadavg", buf, size, d);
 
 	initpid = lookup_initpid_in_store(fc->pid);
 	if (initpid <= 1 || is_shared_pidns(initpid))
 		initpid = fc->pid;
 	cg = get_pid_cgroup(initpid, "cpu");
 	if (!cg)
-		return read_file("/proc/loadavg", buf, size, d);
+		return read_file_fuse("/proc/loadavg", buf, size, d);
 
 	prune_init_slice(cg);
 	hash = calc_hash(cg) % LOAD_SIZE;
@@ -5825,7 +5657,8 @@ static int proc_loadavg_read(char *buf, size_t size, off_t offset,
 
 	/* First time */
 	if (n == NULL) {
-		if (!find_mounted_controller("cpu", &cfd)) {
+		cfd = find_mounted_controller("cpu");
+		if (cfd < 0) {
 			/*
 			 * In locate_node() above, pthread_rwlock_unlock() isn't used
 			 * because delete is not allowed before read has ended.
@@ -6069,30 +5902,6 @@ int proc_read(const char *path, char *buf, size_t size, off_t offset,
  * Functions needed to setup cgroups in the __constructor__.
  */
 
-static bool mkdir_p(const char *dir, mode_t mode)
-{
-	const char *tmp = dir;
-	const char *orig = dir;
-	char *makeme;
-
-	do {
-		dir = tmp + strspn(tmp, "/");
-		tmp = dir + strcspn(dir, "/");
-		makeme = strndup(orig, dir - orig);
-		if (!makeme)
-			return false;
-		if (mkdir(makeme, mode) && errno != EEXIST) {
-			lxcfs_error("Failed to create directory '%s': %s.\n",
-				makeme, strerror(errno));
-			free(makeme);
-			return false;
-		}
-		free(makeme);
-	} while(tmp != dir);
-
-	return true;
-}
-
 static bool umount_if_mounted(void)
 {
 	if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
@@ -6345,45 +6154,19 @@ static bool cgfs_prepare_mounts(void)
 
 static bool cgfs_mount_hierarchies(void)
 {
-	char *target;
-	size_t clen, len;
-	int i, ret;
-
-	for (i = 0; i < num_hierarchies; i++) {
-		char *controller = hierarchies[i];
-
-		clen = strlen(controller);
-		len = strlen(BASEDIR) + clen + 2;
-		target = malloc(len);
-		if (!target)
-			return false;
+	if (!mkdir_p(BASEDIR DEFAULT_CGROUP_MOUNTPOINT, 0755))
+		return false;
 
-		ret = snprintf(target, len, "%s/%s", BASEDIR, controller);
-		if (ret < 0 || ret >= len) {
-			free(target);
-			return false;
-		}
-		if (mkdir(target, 0755) < 0 && errno != EEXIST) {
-			free(target);
-			return false;
-		}
-		if (!strcmp(controller, "unified"))
-			ret = mount("none", target, "cgroup2", 0, NULL);
-		else
-			ret = mount(controller, target, "cgroup", 0, controller);
-		if (ret < 0) {
-			lxcfs_error("Failed mounting cgroup %s: %s\n", controller, strerror(errno));
-			free(target);
-			return false;
-		}
+	if (!cgroup_ops->mount(cgroup_ops, BASEDIR))
+		return false;
 
-		fd_hierarchies[i] = open(target, O_DIRECTORY);
-		if (fd_hierarchies[i] < 0) {
-			free(target);
+	for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
+		__do_free char *path = must_make_path(BASEDIR, (*h)->mountpoint, NULL);
+		(*h)->fd = open(path, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW);
+		if ((*h)->fd < 0)
 			return false;
-		}
-		free(target);
 	}
+
 	return true;
 }
 
@@ -6405,45 +6188,13 @@ static bool cgfs_setup_controllers(void)
 
 static void __attribute__((constructor)) collect_and_mount_subsystems(void)
 {
-	FILE *f;
-	char *cret, *line = NULL;
+	char *cret;
 	char cwd[MAXPATHLEN];
-	size_t len = 0;
-	int i, init_ns = -1;
-	bool found_unified = false;
+	int init_ns = -1;
 
-	if ((f = fopen("/proc/self/cgroup", "r")) == NULL) {
-		lxcfs_error("Error opening /proc/self/cgroup: %s\n", strerror(errno));
+	cgroup_ops = cgroup_init();
+	if (!cgroup_ops)
 		return;
-	}
-
-	while (getline(&line, &len, f) != -1) {
-		char *idx, *p, *p2;
-
-		p = strchr(line, ':');
-		if (!p)
-			goto out;
-		idx = line;
-		*(p++) = '\0';
-
-		p2 = strrchr(p, ':');
-		if (!p2)
-			goto out;
-		*p2 = '\0';
-
-		/* With cgroupv2 /proc/self/cgroup can contain entries of the
-		 * form: 0::/ This will cause lxcfs to fail the cgroup mounts
-		 * because it parses out the empty string "" and later on passes
-		 * it to mount(). Let's skip such entries.
-		 */
-		if (!strcmp(p, "") && !strcmp(idx, "0") && !found_unified) {
-			found_unified = true;
-			p = "unified";
-		}
-
-		if (!store_hierarchy(line, p))
-			goto out;
-	}
 
 	/* Preserve initial namespace. */
 	init_ns = preserve_mnt_ns(getpid());
@@ -6452,15 +6203,6 @@ static void __attribute__((constructor)) collect_and_mount_subsystems(void)
 		goto out;
 	}
 
-	fd_hierarchies = malloc(sizeof(int) * num_hierarchies);
-	if (!fd_hierarchies) {
-		lxcfs_error("%s\n", strerror(errno));
-		goto out;
-	}
-
-	for (i = 0; i < num_hierarchies; i++)
-		fd_hierarchies[i] = -1;
-
 	cret = getcwd(cwd, MAXPATHLEN);
 	if (!cret)
 		lxcfs_debug("Could not retrieve current working directory: %s.\n", strerror(errno));
@@ -6488,26 +6230,15 @@ static void __attribute__((constructor)) collect_and_mount_subsystems(void)
 	print_subsystems();
 
 out:
-	free(line);
-	fclose(f);
 	if (init_ns >= 0)
 		close(init_ns);
 }
 
 static void __attribute__((destructor)) free_subsystems(void)
 {
-	int i;
-
 	lxcfs_debug("%s\n", "Running destructor for liblxcfs.");
 
-	for (i = 0; i < num_hierarchies; i++) {
-		if (hierarchies[i])
-			free(hierarchies[i]);
-		if (fd_hierarchies && fd_hierarchies[i] >= 0)
-			close(fd_hierarchies[i]);
-	}
-	free(hierarchies);
-	free(fd_hierarchies);
+	cgroup_exit(cgroup_ops);
 	free_cpuview();
 
 	if (cgroup_mount_ns_fd >= 0)
diff --git a/bindings.h b/bindings.h
index 250bbac..229d64c 100644
--- a/bindings.h
+++ b/bindings.h
@@ -75,8 +75,8 @@ extern int stop_load_daemon(pthread_t pid);
 
 extern pid_t lookup_initpid_in_store(pid_t qpid);
 extern char *get_pid_cgroup(pid_t pid, const char *contrl);
-extern int read_file(const char *path, char *buf, size_t size,
-		     struct file_info *d);
+extern int read_file_fuse(const char *path, char *buf, size_t size,
+			  struct file_info *d);
 extern void prune_init_slice(char *cg);
 extern char *get_cpuset(const char *cg);
 extern bool use_cpuview(const char *cg);
diff --git a/cgroups/cgfsng.c b/cgroups/cgfsng.c
new file mode 100644
index 0000000..08b719d
--- /dev/null
+++ b/cgroups/cgfsng.c
@@ -0,0 +1,787 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+
+/*
+ * cgfs-ng.c: this is a new, simplified implementation of a filesystem
+ * cgroup backend.  The original cgfs.c was designed to be as flexible
+ * as possible.  It would try to find cgroup filesystems no matter where
+ * or how you had them mounted, and deduce the most usable mount for
+ * each controller.
+ *
+ * This new implementation assumes that cgroup filesystems are mounted
+ * under /sys/fs/cgroup/clist where clist is either the controller, or
+ * a comma-separated list of controllers.
+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE 1
+#endif
+#include <ctype.h>
+#include <dirent.h>
+#include <errno.h>
+#include <grp.h>
+#include <linux/kdev_t.h>
+#include <linux/types.h>
+#include <poll.h>
+#include <signal.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mount.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "cgroup.h"
+#include "cgroup2_devices.h"
+#include "cgroup_utils.h"
+#include "macro.h"
+#include "memory_utils.h"
+
+static void free_string_list(char **clist)
+{
+	char **cur;
+
+	/* Release each string up to the NULL terminator; a NULL @clist
+	 * skips the loop entirely.
+	 */
+	for (cur = clist; cur && *cur; cur++)
+		free(*cur);
+
+	free(clist); /* free(NULL) is a no-op */
+}
+
+/* Grow the NULL-terminated pointer array *@list by one slot. The array is
+ * reallocated via must_realloc() and the new last slot is set to NULL. The
+ * returned index is the slot just before that terminator, which the caller is
+ * expected to fill in. A NULL *@list is treated as an empty array.
+ */
+static int append_null_to_list(void ***list)
+{
+	void **arr = *list;
+	int used = 0;
+
+	while (arr && arr[used])
+		used++;
+
+	*list = arr = must_realloc(arr, (used + 2) * sizeof(void **));
+	arr[used + 1] = NULL;
+	return used;
+}
+
+/* Report whether @entry exactly matches (via strcmp()) one of the strings in
+ * the NULL-terminated array @list. A NULL @list is treated as empty and never
+ * matches.
+ */
+static bool string_in_list(char **list, const char *entry)
+{
+	char **it;
+
+	for (it = list; it && *it; it++) {
+		if (!strcmp(*it, entry))
+			return true;
+	}
+
+	/* Reached the terminator (or had no list) without a match. */
+	return false;
+}
+
+/* Build a freshly allocated "name=<entry>" string, e.g. "systemd" becomes
+ * "name=systemd". The buffer comes from must_realloc(); the caller frees it.
+ */
+static char *cg_legacy_must_prefix_named(char *entry)
+{
+	const size_t prefix_len = STRLITERALLEN("name=");
+	const size_t entry_len = strlen(entry);
+	char *out;
+
+	/* prefix + entry + terminating NUL byte */
+	out = must_realloc(NULL, prefix_len + entry_len + 1);
+	memcpy(out, "name=", prefix_len);
+	memcpy(out + prefix_len, entry, entry_len);
+	out[prefix_len + entry_len] = '\0';
+
+	return out;
+}
+
+/* Append a controller name to the NULL-terminated list *@clist, which must be
+ * NULL on the first call. Never fails.
+ *
+ * Named subsystems are handled here as well: an @entry that is not a kernel
+ * subsystem (not in @klist) gets "name=" prepended unless it already carries
+ * that prefix. An @entry that is both a kernel and a named subsystem (in
+ * @klist and @nlist) is skipped, because we cannot tell which one we have.
+ * (TODO: We could work around this in some cases by just remounting to be
+ * unambiguous, or by comparing mountpoint contents with current cgroup.)
+ * The last entry of *@clist is always NULL.
+ */
+static void must_append_controller(char **klist, char **nlist, char ***clist,
+				   char *entry)
+{
+	bool is_kernel = string_in_list(klist, entry);
+	char *copy;
+	int slot;
+
+	if (is_kernel && string_in_list(nlist, entry))
+		return;
+
+	slot = append_null_to_list((void ***)clist);
+
+	/* Already prefixed names and kernel controllers are stored verbatim. */
+	if (is_kernel || strncmp(entry, "name=", 5) == 0)
+		copy = must_copy_string(entry);
+	else
+		copy = cg_legacy_must_prefix_named(entry);
+
+	(*clist)[slot] = copy;
+}
+
+/* Given a handler's cgroup data, return the struct hierarchy for the controller
+ * @controller, or NULL if there is none.
+ */
+static struct hierarchy *cgfsng_get_hierarchy(struct cgroup_ops *ops,
+					      const char *controller)
+{
+	int i;
+
+	errno = ENOENT; /* set up front; it is not reset when a match is found */
+
+	if (!ops->hierarchies)
+		return NULL;
+
+	for (i = 0; ops->hierarchies[i]; i++) {
+		if (!controller) {
+			/* NULL @controller: look for a hierarchy whose list is empty. */
+			if (ops->hierarchies[i]->controllers &&
+			    !ops->hierarchies[i]->controllers[0])
+				return ops->hierarchies[i];
+			continue;
+		} else if (pure_unified_layout(ops) &&
+			   strcmp(controller, "devices") == 0) {
+			if (ops->unified->bpf_device_controller)
+				return ops->unified;
+			break; /* no bpf device controller flagged: report "not found" */
+		}
+
+		if (string_in_list(ops->hierarchies[i]->controllers, controller))
+			return ops->hierarchies[i];
+	}
+
+	return NULL;
+}
+
+static inline struct hierarchy *get_hierarchy(struct cgroup_ops *ops,
+					      const char *controller)
+{
+	return cgfsng_get_hierarchy(ops, controller); /* plain forwarder, installed as ops->get_hierarchy */
+}
+
+/* Report whether the NULL-terminated string lists @l1 and @l2 share at least
+ * one string. A NULL list is treated as empty, so the result is then false.
+ */
+static bool controller_lists_intersect(char **l1, char **l2)
+{
+	char **cur;
+
+	/* Nothing can be shared with a missing list. */
+	if (!(l1 && l2))
+		return false;
+
+	for (cur = l1; *cur; cur++)
+		if (string_in_list(l2, *cur))
+			return true;
+
+	return false;
+}
+
+/* Report whether any controller in the NULL-terminated list @clist already
+ * appears in one of the hierarchies of the NULL-terminated array @hlist.
+ * Realistically, if one is present, all must be present.
+ */
+static bool controller_list_is_dup(struct hierarchy **hlist, char **clist)
+{
+	struct hierarchy **h;
+
+	for (h = hlist; h && *h; h++) {
+		if (!controller_lists_intersect((*h)->controllers, clist))
+			continue;
+
+		return true;
+	}
+
+	return false;
+}
+
+/* Get the controllers from a mountinfo line. There are other ways we could get
+ * this info. For lxcfs, field 3 is /cgroup/controller-list. For cgroupfs, we
+ * could parse the mount options. But we simply assume that the mountpoint must
+ * be /sys/fs/cgroup/controller-list.
+ */
+static char **cg_hybrid_get_controllers(char **klist, char **nlist, char *line,
+					int type, char **controllers)
+{
+	/* The fifth field (reached by skipping four spaces) is
+	 * /sys/fs/cgroup/comma-delimited-controller-list for legacy hierarchies.
+	 */
+	int i;
+	char *p2, *tok;
+	char *p = line, *sep = ",";
+	char **aret = NULL;
+
+	for (i = 0; i < 4; i++) {
+		p = strchr(p, ' ');
+		if (!p)
+			return NULL;
+		p++;
+	}
+
+	/* Note, if we change how mountinfo works, then our caller will need to
+	 * verify /sys/fs/cgroup/ in this field.
+	 */
+	if (strncmp(p, DEFAULT_CGROUP_MOUNTPOINT "/", 15) != 0)
+		return NULL;
+
+	p += 15; /* 15 == strlen("/sys/fs/cgroup/") */
+	p2 = strchr(p, ' ');
+	if (!p2)
+		return NULL;
+	*p2 = '\0'; /* cut @line after the field; undone before returning aret */
+
+	if (type == CGROUP_SUPER_MAGIC) {
+		__do_free char *dup = NULL;
+
+		/* strdup() here for v1 hierarchies. Otherwise
+		 * lxc_iterate_parts() will destroy mountpoints such as
+		 * "/sys/fs/cgroup/cpu,cpuacct".
+		 */
+		dup = must_copy_string(p);
+		if (!dup)
+			return NULL;
+
+		lxc_iterate_parts (tok, dup, sep)
+			must_append_controller(klist, nlist, &aret, tok);
+		*controllers = move_ptr(dup); /* the copy is handed to the caller */
+	}
+	*p2 = ' ';
+
+	return aret;
+}
+
+static char **cg_unified_make_empty_controller(void)
+{
+	char **empty = NULL;
+	int slot = append_null_to_list((void ***)&empty);
+
+	/* No controllers: the first slot itself is the NULL terminator. */
+	empty[slot] = NULL;
+	return empty;
+}
+
+static char **cg_unified_get_controllers(const char *file)
+{
+	__do_free char *content = NULL;
+	char *sep = " \t\n";
+	char **names = NULL;
+	char *name;
+
+	/* Returns NULL when the file cannot be read or holds no tokens. */
+	content = read_file(file);
+	if (!content)
+		return NULL;
+
+	/* Names are separated by blanks, tabs or newlines. */
+	lxc_iterate_parts(name, content, sep) {
+		int slot;
+
+		slot = append_null_to_list((void ***)&names);
+		names[slot] = must_copy_string(name);
+	}
+
+	return names;
+}
+
+static struct hierarchy *add_hierarchy(struct hierarchy ***h, char **clist, char *mountpoint,
+				       char *container_base_path, int type)
+{
+	struct hierarchy *new;
+	int newentry;
+
+	new = zalloc(sizeof(*new)); /* zeroed; the pointers below are stored, not copied */
+	new->controllers = clist;
+	new->mountpoint = mountpoint;
+	new->container_base_path = container_base_path;
+	new->version = type;
+	new->fd = -EBADF; /* not opened yet; a zeroed fd would alias stdin in cgroup_exit() */
+	newentry = append_null_to_list((void ***)h);
+	(*h)[newentry] = new;
+	return new;
+}
+
+/* Get a copy of the mountpoint from @line, which is a line from
+ * /proc/self/mountinfo. Note: on success @line is left cut after the mountpoint.
+ */
+static char *cg_hybrid_get_mountpoint(char *line)
+{
+	int i;
+	size_t len;
+	char *p2;
+	char *p = line, *sret = NULL;
+
+	for (i = 0; i < 4; i++) { /* skip four spaces to reach the fifth field */
+		p = strchr(p, ' ');
+		if (!p)
+			return NULL;
+		p++;
+	}
+
+	if (strncmp(p, DEFAULT_CGROUP_MOUNTPOINT "/", 15) != 0)
+		return NULL;
+
+	p2 = strchr(p + 15, ' '); /* 15 == strlen("/sys/fs/cgroup/") */
+	if (!p2)
+		return NULL;
+	*p2 = '\0'; /* not restored afterwards */
+
+	len = strlen(p);
+	sret = must_realloc(NULL, len + 1);
+	memcpy(sret, p, len);
+	sret[len] = '\0';
+	return sret;
+}
+
+static void must_append_string(char ***list, char *entry)
+{
+	int slot;
+
+	/* Grow *@list by one slot and store a private copy of @entry in it;
+	 * the list stays NULL-terminated. */
+	slot = append_null_to_list((void ***)list);
+	(*list)[slot] = must_copy_string(entry);
+}
+
+static int get_existing_subsystems(char ***klist, char ***nlist)
+{
+	__do_free char *line = NULL;
+	__do_fclose FILE *f = NULL;
+	size_t len = 0;
+	/* Fill @klist (plain names) and @nlist ("name=" entries) from /proc/self/cgroup. */
+	f = fopen("/proc/self/cgroup", "r");
+	if (!f)
+		return -1;
+
+	while (getline(&line, &len, f) != -1) {
+		char *p, *p2, *tok;
+		p = strchr(line, ':'); /* controller list sits between the first two ':' */
+		if (!p)
+			continue;
+		p++;
+		p2 = strchr(p, ':');
+		if (!p2)
+			continue;
+		*p2 = '\0';
+
+		/* If the kernel has cgroup v2 support, then /proc/self/cgroup
+		 * contains an entry of the form:
+		 *
+		 *	0::/some/path
+		 *
+		 * In this case we use "cgroup2" as controller name.
+		 */
+		if ((p2 - p) == 0) {
+			must_append_string(klist, "cgroup2");
+			continue;
+		}
+
+		lxc_iterate_parts(tok, p, ",") {
+			if (strncmp(tok, "name=", 5) == 0)
+				must_append_string(nlist, tok);
+			else
+				must_append_string(klist, tok);
+		}
+	}
+
+	return 0; /* -1 is only returned when the file cannot be opened */
+}
+
+static void trim(char *s)
+{
+	char *last = s + strlen(s);
+
+	/* Chop trailing newlines, but never touch the first character. */
+	while (last - s > 1 && last[-1] == '\n')
+		*--last = '\0';
+}
+
+/* __cg_mount_direct
+ *
+ * Mount the hierarchy @h at @controllerpath directly, i.e. without going
+ * through a bind-mount. Legacy hierarchies are mounted as "cgroup" with their
+ * comma-joined controller list as mount data; everything else is mounted as
+ * "cgroup2" without mount data.
+ *
+ * Returns 0 on success, -ENOMEM if the controller list could not be joined
+ * and -1 if mount() failed.
+ */
+static int __cg_mount_direct(struct hierarchy *h, const char *controllerpath)
+{
+	__do_free char *data = NULL;
+	const char *fstype = "cgroup2";
+	const unsigned long mntflags =
+		MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_RELATIME;
+	int ret;
+
+	/* Only legacy (v1) mounts need a controller list as mount data. */
+	if (h->version != CGROUP2_SUPER_MAGIC) {
+		data = lxc_string_join(",", (const char **)h->controllers, false);
+		if (!data)
+			return -ENOMEM;
+		fstype = "cgroup";
+	}
+
+	ret = mount("cgroup", controllerpath, fstype, mntflags, data);
+	if (ret < 0)
+		return -1;
+
+	return 0;
+}
+
+static inline int cg_mount_cgroup_full(struct hierarchy *h,
+				       const char *controllerpath)
+{
+	return __cg_mount_direct(h, controllerpath); /* plain forwarder; same return values */
+}
+
+static bool cgfsng_mount(struct cgroup_ops *ops, const char *root)
+{
+	__do_free char *cgroup_root = NULL;
+	int ret;
+	bool retval = false;
+	/* Mount the known hierarchies at the path built from @root and DEFAULT_CGROUP_MOUNTPOINT. */
+	if (!ops)
+		return ret_set_errno(false, ENOENT);
+
+	if (!ops->hierarchies)
+		return true; /* nothing to mount counts as success */
+
+	cgroup_root = must_make_path(root, DEFAULT_CGROUP_MOUNTPOINT, NULL);
+	if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED)
+		return cg_mount_cgroup_full(ops->unified, cgroup_root) == 0;
+
+	/* Non-unified layouts: mount a tmpfs to hold one directory per hierarchy. */
+	ret = safe_mount(NULL, cgroup_root, "tmpfs",
+			 MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
+			 "size=10240k,mode=755", root);
+	if (ret < 0)
+		goto on_error;
+
+	for (int i = 0; ops->hierarchies[i]; i++) {
+		__do_free char *controllerpath = NULL;
+		struct hierarchy *h = ops->hierarchies[i];
+		char *controller = strrchr(h->mountpoint, '/'); /* last path component of the mountpoint */
+
+		if (!controller)
+			continue;
+		controller++;
+
+		controllerpath = must_make_path(cgroup_root, controller, NULL);
+		if (dir_exists(controllerpath))
+			continue; /* an existing directory is skipped, not mounted over */
+
+		ret = mkdir(controllerpath, 0755);
+		if (ret < 0)
+			log_error_errno(goto on_error, errno,
+					"Error creating cgroup path: %s",
+					controllerpath);
+
+		ret = cg_mount_cgroup_full( h, controllerpath);
+		if (ret < 0)
+			goto on_error;
+	}
+	retval = true;
+
+on_error:
+	return retval;
+}
+
+/* Sum lxc_count_file_lines() over the "cgroup.procs" file of @dirname and of
+ * every subdirectory; a directory that cannot be opened contributes 0. */
+static int recursive_count_nrtasks(char *dirname)
+{
+	__do_free char *path = NULL;
+	__do_closedir DIR *dir = NULL;
+	struct dirent *direntp;
+	int count = 0, ret;
+
+	dir = opendir(dirname);
+	if (!dir)
+		return 0;
+
+	while ((direntp = readdir(dir))) {
+		/* Per-iteration scope: freed before the next entry is built. */
+		__do_free char *entry = NULL;
+		struct stat mystat;
+
+		if (!strcmp(direntp->d_name, ".") ||
+		    !strcmp(direntp->d_name, ".."))
+			continue;
+
+		entry = must_make_path(dirname, direntp->d_name, NULL);
+		if (lstat(entry, &mystat) || !S_ISDIR(mystat.st_mode))
+			continue;
+
+		count += recursive_count_nrtasks(entry);
+	}
+
+	path = must_make_path(dirname, "cgroup.procs", NULL);
+	ret = lxc_count_file_lines(path);
+	if (ret != -1)
+		count += ret;
+
+	return count;
+}
+
+static int cgfsng_nrtasks(struct cgroup_ops *ops)
+{
+	__do_free char *path = NULL;
+	/* Count tasks below the first hierarchy's container_full_path; -1 on bad input. */
+	if (!ops)
+		return ret_set_errno(-1, ENOENT);
+
+	if (!ops->container_cgroup || !ops->hierarchies)
+		return ret_set_errno(-1, EINVAL);
+
+	path = must_make_path(ops->hierarchies[0]->container_full_path, NULL);
+	return recursive_count_nrtasks(path);
+}
+
+static int cgfsng_num_hierarchies(struct cgroup_ops *ops)
+{
+	struct hierarchy **h;
+	int total = 0;
+
+	if (!ops)
+		return ret_set_errno(-1, ENOENT);
+
+	/* Count entries up to the NULL terminator; a missing array simply
+	 * yields 0. */
+	for (h = ops->hierarchies; h && *h; h++)
+		total++;
+
+	return total;
+}
+
+static bool cgfsng_get_hierarchies(struct cgroup_ops *ops, int n, char ***out)
+{
+	int i;
+	/* Store the controller list of hierarchy number @n in @out. */
+	if (!ops)
+		return ret_set_errno(false, ENOENT);
+
+	if (!ops->hierarchies || n < 0)
+		return false;
+
+	/* sanity check n: entries 0..n must all exist, including n itself */
+	for (i = 0; i <= n; i++)
+		if (!ops->hierarchies[i])
+			return ret_set_errno(false, ENOENT);
+
+	*out = ops->hierarchies[n]->controllers; /* borrowed pointer, not a copy */
+
+	return true;
+}
+
+/* At startup, cg_hybrid_init() finds all the info we need about cgroup
+ * mountpoints and current cgroups, and stores it in @ops.
+ */
+static int cg_hybrid_init(struct cgroup_ops *ops)
+{
+	__do_free char *basecginfo = NULL;
+	__do_free char *line = NULL;
+	__do_fclose FILE *f = NULL;
+	int ret;
+	size_t len = 0;
+	char **klist = NULL, **nlist = NULL;
+
+	/* Root spawned containers escape the current cgroup, so use init's
+	 * cgroups as our base in that case.
+	 */
+	basecginfo = read_file("/proc/1/cgroup");
+	if (!basecginfo)
+		return ret_set_errno(-1, ENOMEM);
+
+	ret = get_existing_subsystems(&klist, &nlist);
+	if (ret < 0)
+		return log_error_errno(-1, errno, "Failed to retrieve available legacy cgroup controllers");
+	/* NOTE(review): klist/nlist are not freed if the fopen() below fails. */
+	f = fopen("/proc/self/mountinfo", "r");
+	if (!f)
+		return log_error_errno(-1, errno, "Failed to open \"/proc/self/mountinfo\"");
+
+	while (getline(&line, &len, f) != -1) {
+		int type;
+		struct hierarchy *new;
+		char *base_cgroup = NULL, *mountpoint = NULL;
+		char **controller_list = NULL;
+		__do_free char *controllers = NULL;
+
+		type = get_cgroup_version(line);
+		if (type == 0)
+			continue;
+
+		if (type == CGROUP2_SUPER_MAGIC && ops->unified)
+			continue; /* only the first cgroup2 mount is recorded */
+
+		if (ops->cgroup_layout == CGROUP_LAYOUT_UNKNOWN) {
+			if (type == CGROUP2_SUPER_MAGIC)
+				ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
+			else if (type == CGROUP_SUPER_MAGIC)
+				ops->cgroup_layout = CGROUP_LAYOUT_LEGACY;
+		} else if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED) {
+			if (type == CGROUP_SUPER_MAGIC)
+				ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
+		} else if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) {
+			if (type == CGROUP2_SUPER_MAGIC)
+				ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
+		}
+
+		controller_list = cg_hybrid_get_controllers(klist, nlist, line,
+							    type, &controllers);
+		if (!controller_list && type == CGROUP_SUPER_MAGIC)
+			continue;
+
+		if (type == CGROUP_SUPER_MAGIC)
+			if (controller_list_is_dup(ops->hierarchies, controller_list))
+				ret_set_errno(goto next, EEXIST);
+
+		mountpoint = cg_hybrid_get_mountpoint(line);
+		if (!mountpoint)
+			log_error_errno(goto next, EINVAL, "Failed parsing mountpoint from \"%s\"", line);
+
+		if (type == CGROUP_SUPER_MAGIC) {
+			base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, controller_list[0], CGROUP_SUPER_MAGIC);
+		} else {
+			base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, NULL, CGROUP2_SUPER_MAGIC);
+		}
+		if (!base_cgroup)
+			log_error_errno(goto next, EINVAL, "Failed to find current cgroup %s", mountpoint);
+
+		trim(base_cgroup);
+		prune_init_scope(base_cgroup);
+
+		if (type == CGROUP2_SUPER_MAGIC) {
+			char *cgv2_ctrl_path;
+
+			cgv2_ctrl_path = must_make_path(mountpoint, base_cgroup,
+							"cgroup.controllers",
+							NULL);
+
+			controller_list = cg_unified_get_controllers(cgv2_ctrl_path);
+			free(cgv2_ctrl_path);
+			if (!controller_list)
+				controller_list = cg_unified_make_empty_controller();
+		}
+
+		new = add_hierarchy(&ops->hierarchies, controller_list, mountpoint, base_cgroup, type);
+		new->__controllers = move_ptr(controllers);
+		if (type == CGROUP2_SUPER_MAGIC && !ops->unified)
+			ops->unified = new;
+
+		continue;
+
+	next: /* this line yields no hierarchy: drop what was gathered for it */
+		free_string_list(controller_list);
+		free(mountpoint);
+		free(base_cgroup);
+	}
+
+	free_string_list(klist);
+	free_string_list(nlist);
+
+	return 0;
+}
+
+static int cg_unified_init(struct cgroup_ops *ops)
+{
+	__do_free char *subtree_path = NULL;
+	int ret;
+	char *mountpoint;
+	char **delegatable;
+	struct hierarchy *new;
+	char *base_cgroup = NULL;
+	/* Probe for a pure cgroup2 layout and, if found, register it in @ops. */
+	ret = unified_cgroup_hierarchy();
+	if (ret == -ENOMEDIUM)
+		return ret_errno(ENOMEDIUM);
+
+	if (ret != CGROUP2_SUPER_MAGIC)
+		return 0; /* not pure cgroup2: @ops is left untouched */
+
+	base_cgroup = cg_unified_get_current_cgroup(1);
+	if (!base_cgroup)
+		return ret_errno(EINVAL);
+	prune_init_scope(base_cgroup);
+
+	/*
+	 * We assume that the cgroup we're currently in has been delegated to
+	 * us and we are free to further delegate all of the controllers listed
+	 * in cgroup.controllers further down the hierarchy.
+	 */
+	mountpoint = must_copy_string(DEFAULT_CGROUP_MOUNTPOINT);
+	subtree_path = must_make_path(mountpoint, base_cgroup, "cgroup.controllers", NULL);
+	delegatable = cg_unified_get_controllers(subtree_path);
+	if (!delegatable)
+		delegatable = cg_unified_make_empty_controller();
+
+	/* TODO: If the user requested specific controllers via lxc.cgroup.use
+	 * we should verify here. The reason I'm not doing it right now is that
+	 * I'm not convinced that lxc.cgroup.use will be the future since it is a
+	 * global property. I much rather have an option that lets you request
+	 * controllers per container.
+	 */
+
+	new = add_hierarchy(&ops->hierarchies, delegatable, mountpoint, base_cgroup, CGROUP2_SUPER_MAGIC);
+
+	if (bpf_devices_cgroup_supported())
+		new->bpf_device_controller = 1;
+
+	ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
+	ops->unified = new;
+	return CGROUP2_SUPER_MAGIC; /* tells cg_init() that no hybrid scan is needed */
+}
+
+static int cg_init(struct cgroup_ops *ops)
+{
+	int unified = cg_unified_init(ops);
+
+	if (unified < 0)
+		return -1;
+
+	/* A pure cgroup2 host is done; otherwise scan the mount table. */
+	if (unified != CGROUP2_SUPER_MAGIC)
+		return cg_hybrid_init(ops);
+
+	return 0;
+}
+
+struct cgroup_ops *cgfsng_ops_init(void)
+{
+	__do_free struct cgroup_ops *ops = NULL;
+
+	/* Start from an all-zero object with the layout still undetermined. */
+	ops = calloc(1, sizeof(*ops));
+	if (!ops)
+		return ret_set_errno(NULL, ENOMEM);
+	ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN;
+
+	/* Any non-zero result from cg_init() aborts the setup. */
+	if (cg_init(ops) != 0)
+		return NULL;
+
+	ops->driver = "cgfsng";
+	ops->version = "1.0.0";
+	ops->mount = cgfsng_mount;
+	ops->nrtasks = cgfsng_nrtasks;
+	ops->num_hierarchies = cgfsng_num_hierarchies;
+	ops->get_hierarchies = cgfsng_get_hierarchies;
+	ops->get_hierarchy = get_hierarchy;
+
+	return move_ptr(ops);
+}
diff --git a/cgroups/cgroup.c b/cgroups/cgroup.c
new file mode 100644
index 0000000..aebafbd
--- /dev/null
+++ b/cgroups/cgroup.c
@@ -0,0 +1,79 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE 1
+#endif
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "cgroup.h"
+#include "cgroup2_devices.h"
+
+extern struct cgroup_ops *cgfsng_ops_init(void);
+
+struct cgroup_ops *cgroup_init(void)
+{
+	struct cgroup_ops *cgroup_ops;
+
+	cgroup_ops = cgfsng_ops_init(); /* the only driver tried here */
+	if (!cgroup_ops)
+		return log_error_errno(NULL, errno, "Failed to initialize cgroup driver");
+
+	return cgroup_ops; /* to be released with cgroup_exit() */
+}
+
+void cgroup_exit(struct cgroup_ops *ops)
+{
+	struct hierarchy **it;
+	/* Release @ops and everything it owns; a NULL @ops is ignored. */
+	if (!ops)
+		return;
+
+	free(ops->container_cgroup);
+	free(ops->monitor_cgroup);
+
+	for (it = ops->hierarchies; it && *it; it++) {
+		char **p;
+
+		for (p = (*it)->controllers; p && *p; p++)
+			free(*p);
+		free((*it)->controllers);
+		free((*it)->__controllers);
+
+		if ((*it)->fd >= 0) /* NOTE(review): 0 counts as open here; verify fd is never left zero-initialized */
+			close((*it)->fd);
+
+		free((*it)->mountpoint);
+		free((*it)->container_base_path);
+		free((*it)->container_full_path);
+		free((*it)->monitor_full_path);
+		free(*it);
+	}
+	free(ops->hierarchies); /* ops->unified points into this array's entries and is not freed separately */
+
+	free(ops);
+
+	return;
+}
+
+#define INIT_SCOPE "/init.scope"
+/* Strip a trailing "/init.scope" from @cg in place; a bare "/init.scope"
+ * becomes "/". Lengths are compared first so no pointer before @cg is formed.
+ */
+void prune_init_scope(char *cg)
+{
+	size_t len, scope_len = strlen(INIT_SCOPE);
+
+	len = cg ? strlen(cg) : 0;
+	if (len < scope_len)
+		return;
+
+	cg += len - scope_len;
+	if (strcmp(cg, INIT_SCOPE) != 0)
+		return;
+
+	/* Keep the leading '/' when nothing else would remain. */
+	cg[len == scope_len ? 1 : 0] = '\0';
+}
diff --git a/cgroups/cgroup.h b/cgroups/cgroup.h
new file mode 100644
index 0000000..8895533
--- /dev/null
+++ b/cgroups/cgroup.h
@@ -0,0 +1,150 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+
+#ifndef __LXC_CGROUP_H
+#define __LXC_CGROUP_H
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <sys/types.h>
+
+#include "macro.h"
+
+#define DEFAULT_CGROUP_MOUNTPOINT "/sys/fs/cgroup"
+
+typedef enum {
+        CGROUP_LAYOUT_UNKNOWN = -1, /* layout could not be (or is not yet) determined */
+        CGROUP_LAYOUT_LEGACY  =  0, /* all controllers in legacy hierarchies */
+        CGROUP_LAYOUT_HYBRID  =  1, /* legacy hierarchies plus a unified mountpoint */
+        CGROUP_LAYOUT_UNIFIED =  2, /* pure unified hierarchy */
+} cgroup_layout_t;
+
+/* A descriptor for a mounted hierarchy
+ *
+ * @controllers
+ * - legacy hierarchy
+ *   Either NULL, or a null-terminated list of all the co-mounted controllers.
+ * - unified hierarchy
+ *   Either NULL, or a null-terminated list of all enabled controllers.
+ *
+ * @mountpoint
+ * - The mountpoint we will use.
+ * - legacy hierarchy
+ *   It will be either /sys/fs/cgroup/controller or
+ *   /sys/fs/cgroup/controllerlist.
+ * - unified hierarchy
+ *   It will either be /sys/fs/cgroup or /sys/fs/cgroup/<mountpoint-name>
+ *   depending on whether this is a hybrid cgroup layout (mix of legacy and
+ *   unified hierarchies) or a pure unified cgroup layout.
+ *
+ * @container_base_path
+ * - The cgroup under which the container cgroup path
+ *   is created. This will be either the caller's cgroup (if not root), or
+ *   init's cgroup (if root).
+ *
+ * @container_full_path
+ * - The full path to the container's cgroup.
+ *
+ * @monitor_full_path
+ * - The full path to the monitor's cgroup.
+ *
+ * @version
+ * - legacy hierarchy
+ *   If the hierarchy is a legacy hierarchy this will be set to
+ *   CGROUP_SUPER_MAGIC.
+ * - unified hierarchy
+ *   If the hierarchy is a unified hierarchy this will be set to
+ *   CGROUP2_SUPER_MAGIC.
+ */
+struct hierarchy {
+	/*
+	 * @controllers: null-terminated list of controller names (see above).
+	 * @__controllers: buffer handed back by cg_hybrid_get_controllers().
+	 */
+	char **controllers;
+	char *__controllers;
+	char *mountpoint;
+	char *container_base_path;
+	char *container_full_path;
+	char *monitor_full_path;
+	int version;
+
+	/* cgroup2 only */
+	unsigned int bpf_device_controller:1;
+	int fd; /* closed by cgroup_exit() when >= 0 */
+};
+
+struct cgroup_ops {
+	/* string constant */
+	const char *driver;
+
+	/* string constant */
+	const char *version;
+
+	/* Heap strings released by cgroup_exit(). */
+	char *container_cgroup;
+	char *monitor_cgroup;
+
+	/* @hierarchies
+	 * - A NULL-terminated array of struct hierarchy, one per legacy
+	 *   hierarchy. No duplicates. First sufficient, writeable mounted
+	 *   hierarchy wins.
+	 */
+	struct hierarchy **hierarchies;
+	/* Pointer to the unified hierarchy. Do not free! */
+	struct hierarchy *unified;
+
+	/*
+	 * @cgroup_layout
+	 * - What cgroup layout the container is running with.
+	 *   - CGROUP_LAYOUT_UNKNOWN
+	 *     The cgroup layout could not be determined. This should be treated
+	 *     as an error condition.
+	 *   - CGROUP_LAYOUT_LEGACY
+	 *     The container is running with all controllers mounted into legacy
+	 *     cgroup hierarchies.
+	 *   - CGROUP_LAYOUT_HYBRID
+	 *     The container is running with at least one controller mounted
+	 *     into a legacy cgroup hierarchy and a mountpoint for the unified
+	 *     hierarchy. The unified hierarchy can be empty (no controllers
+	 *     enabled) or non-empty (controllers enabled).
+	 *   - CGROUP_LAYOUT_UNIFIED
+	 *     The container is running on a pure unified cgroup hierarchy. The
+	 *     unified hierarchy can be empty (no controllers enabled) or
+	 *     non-empty (controllers enabled).
+	 */
+	cgroup_layout_t cgroup_layout;
+
+	int (*num_hierarchies)(struct cgroup_ops *ops); /* number of entries in @hierarchies */
+	bool (*get_hierarchies)(struct cgroup_ops *ops, int n, char ***out); /* controller list of entry @n */
+	bool (*mount)(struct cgroup_ops *ops, const char *root); /* mount the hierarchies below @root */
+	int (*nrtasks)(struct cgroup_ops *ops);
+	struct hierarchy *(*get_hierarchy)(struct cgroup_ops *ops,
+					   const char *controller); /* lookup by controller name */
+};
+
+extern struct cgroup_ops *cgroup_init(void);
+extern void cgroup_exit(struct cgroup_ops *ops);
+
+extern void prune_init_scope(char *cg);
+
+static inline void __auto_cgroup_exit__(struct cgroup_ops **ops)
+{
+	if (*ops) /* cleanup handler behind __do_cgroup_exit; receives the variable's address */
+		cgroup_exit(*ops);
+}
+
+extern int cgroup_attach(const char *name, const char *lxcpath, int64_t pid);
+
+#define __do_cgroup_exit __attribute__((__cleanup__(__auto_cgroup_exit__)))
+
+static inline bool pure_unified_layout(const struct cgroup_ops *ops)
+{
+	return ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED; /* false for legacy, hybrid and unknown */
+}
+
+static inline bool is_unified_hierarchy(const struct hierarchy *h)
+{
+	return h->version == CGROUP2_SUPER_MAGIC; /* @version is set when the hierarchy is added */
+}
+
+#endif
diff --git a/cgroups/cgroup2_devices.c b/cgroups/cgroup2_devices.c
new file mode 100644
index 0000000..92df160
--- /dev/null
+++ b/cgroups/cgroup2_devices.c
@@ -0,0 +1,457 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+
+/* Parts of this taken from systemd's implementation. */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE 1
+#endif
+#include <errno.h>
+#include <fcntl.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "cgroup2_devices.h"
+#include "macro.h"
+#include "memory_utils.h"
+
+#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
+#include <linux/bpf.h>
+#include <linux/filter.h>
+
+/*
+ * Append @count instructions from @instructions to the instruction buffer of
+ * @prog, growing the buffer with realloc().
+ *
+ * A program that already has a kernel fd is refused (EBUSY); a failed
+ * reallocation is reported with ENOMEM and leaves the old buffer in place.
+ * Returns 0 on success, otherwise whatever log_error_errno(-1, ...) yields.
+ */
+static int bpf_program_add_instructions(struct bpf_program *prog,
+					const struct bpf_insn *instructions,
+					size_t count)
+{
+	size_t total = count + prog->n_instructions;
+	struct bpf_insn *grown;
+
+	if (prog->kernel_fd >= 0)
+		return log_error_errno(-1, EBUSY, "Refusing to update bpf cgroup program that's already loaded");
+
+	grown = realloc(prog->instructions, sizeof(struct bpf_insn) * total);
+	if (!grown)
+		return log_error_errno(-1, ENOMEM, "Failed to reallocate bpf cgroup program");
+
+	/* New instructions go right behind the ones already stored. */
+	memcpy(grown + prog->n_instructions, instructions,
+	       sizeof(struct bpf_insn) * count);
+	prog->instructions = grown;
+	prog->n_instructions = total;
+
+	return 0;
+}
+
+/*
+ * Release @prog: try to detach it from its cgroup (the result is ignored),
+ * close the kernel fd if one is open, then free the instruction buffer, the
+ * recorded cgroup path and the object itself. A NULL @prog is a no-op.
+ */
+void bpf_program_free(struct bpf_program *prog)
+{
+	if (prog) {
+		(void)bpf_program_cgroup_detach(prog);
+
+		if (prog->kernel_fd >= 0)
+			close(prog->kernel_fd);
+
+		free(prog->instructions);
+		free(prog->attached_path);
+		free(prog);
+	}
+}
+
+/*
+ * Helper macros that each build one struct bpf_insn compound literal.
+ */
+
+/* Memory load, dst_reg = *(uint *) (src_reg + off16) */
+#define BPF_LDX_MEM(SIZE, DST, SRC, OFF)                               \
+	((struct bpf_insn){.code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM, \
+			   .dst_reg = DST,                             \
+			   .src_reg = SRC,                             \
+			   .off = OFF,                                 \
+			   .imm = 0})
+
+/* ALU ops on immediates, bpf_add|sub|...: dst_reg += imm32 */
+#define BPF_ALU32_IMM(OP, DST, IMM)                              \
+	((struct bpf_insn){.code = BPF_ALU | BPF_OP(OP) | BPF_K, \
+			   .dst_reg = DST,                       \
+			   .src_reg = 0,                         \
+			   .off = 0,                             \
+			   .imm = IMM})
+
+/* 64-bit move of an immediate, dst_reg = imm32 */
+#define BPF_MOV64_IMM(DST, IMM)                                 \
+	((struct bpf_insn){.code = BPF_ALU64 | BPF_MOV | BPF_K, \
+			   .dst_reg = DST,                      \
+			   .src_reg = 0,                        \
+			   .off = 0,                            \
+			   .imm = IMM})
+
+/* 32-bit register move, dst_reg = src_reg */
+#define BPF_MOV32_REG(DST, SRC)                               \
+	((struct bpf_insn){.code = BPF_ALU | BPF_MOV | BPF_X, \
+			   .dst_reg = DST,                    \
+			   .src_reg = SRC,                    \
+			   .off = 0,                          \
+			   .imm = 0})
+
+/* Conditional jumps against registers, if (dst_reg 'op' src_reg) goto pc + off16 */
+#define BPF_JMP_REG(OP, DST, SRC, OFF)                           \
+	((struct bpf_insn){.code = BPF_JMP | BPF_OP(OP) | BPF_X, \
+			   .dst_reg = DST,                       \
+			   .src_reg = SRC,                       \
+			   .off = OFF,                           \
+			   .imm = 0})
+
+/* Conditional jumps against immediates, if (dst_reg 'op' imm32) goto pc + off16 */
+#define BPF_JMP_IMM(OP, DST, IMM, OFF)                           \
+	((struct bpf_insn){.code = BPF_JMP | BPF_OP(OP) | BPF_K, \
+			   .dst_reg = DST,                       \
+			   .src_reg = 0,                         \
+			   .off = OFF,                           \
+			   .imm = IMM})
+
+/* Program exit */
+#define BPF_EXIT_INSN()                                \
+	((struct bpf_insn){.code = BPF_JMP | BPF_EXIT, \
+			   .dst_reg = 0,               \
+			   .src_reg = 0,               \
+			   .off = 0,                   \
+			   .imm = 0})
+
+/*
+ * Translate an access string made of 'r', 'w' and 'm' characters into the
+ * matching BPF_DEVCG_ACC_* bit mask.
+ *
+ * A NULL or empty string yields 0; any other character yields -EINVAL.
+ */
+static int bpf_access_mask(const char *acc)
+{
+	int bits = 0;
+	const char *cur;
+
+	if (!acc)
+		return 0;
+
+	for (cur = acc; *cur != '\0'; cur++) {
+		if (*cur == 'r')
+			bits |= BPF_DEVCG_ACC_READ;
+		else if (*cur == 'w')
+			bits |= BPF_DEVCG_ACC_WRITE;
+		else if (*cur == 'm')
+			bits |= BPF_DEVCG_ACC_MKNOD;
+		else
+			return -EINVAL;
+	}
+
+	return bits;
+}
+
+/*
+ * Map a device type character to its BPF_DEVCG_DEV_* value: 'a' maps to 0,
+ * 'b' to BPF_DEVCG_DEV_BLOCK and 'c' to BPF_DEVCG_DEV_CHAR. Any other
+ * character yields -1.
+ */
+static int bpf_device_type(char type)
+{
+	if (type == 'a')
+		return 0;
+
+	if (type == 'b')
+		return BPF_DEVCG_DEV_BLOCK;
+
+	if (type == 'c')
+		return BPF_DEVCG_DEV_CHAR;
+
+	return -1;
+}
+
+/* True when @access_mask is exactly read, write and mknod combined. */
+static inline bool bpf_device_all_access(int access_mask)
+{
+	const int everything = BPF_DEVCG_ACC_READ | BPF_DEVCG_ACC_WRITE |
+			       BPF_DEVCG_ACC_MKNOD;
+
+	return access_mask == everything;
+}
+
+/*
+ * Allocate a zeroed bpf_program of type @prog_type that is not loaded yet
+ * (kernel_fd is -EBADF). Returns NULL when the allocation fails; the caller
+ * owns the result.
+ */
+struct bpf_program *bpf_program_new(uint32_t prog_type)
+{
+	struct bpf_program *fresh;
+
+	fresh = calloc(1, sizeof(struct bpf_program));
+	if (!fresh)
+		return NULL;
+
+	fresh->kernel_fd = -EBADF;
+	fresh->prog_type = prog_type;
+	/*
+	 * By default a whitelist is used unless the user tells us otherwise.
+	 */
+	fresh->device_list_type = LXC_BPF_DEVICE_CGROUP_WHITELIST;
+
+	return fresh;
+}
+
+/*
+ * Append the common prologue to @prog. It unpacks the fields of
+ * struct bpf_cgroup_dev_ctx (reached through r1) into r2-r5 so that the
+ * per-device rules can compare against them.
+ *
+ * Returns the result of bpf_program_add_instructions(); a NULL @prog is
+ * rejected through ret_set_errno(-1, EINVAL).
+ */
+int bpf_program_init(struct bpf_program *prog)
+{
+	if (!prog)
+		return ret_set_errno(-1, EINVAL);
+
+	const int access_off = offsetof(struct bpf_cgroup_dev_ctx, access_type);
+	const int major_off = offsetof(struct bpf_cgroup_dev_ctx, major);
+	const int minor_off = offsetof(struct bpf_cgroup_dev_ctx, minor);
+	const struct bpf_insn prologue[] = {
+	    /* r2 = device type: the low 16 bits of access_type */
+	    BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, access_off),
+	    BPF_ALU32_IMM(BPF_AND, BPF_REG_2, 0xFFFF),
+
+	    /* r3 = access type: access_type shifted right by 16 */
+	    BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, access_off),
+	    BPF_ALU32_IMM(BPF_RSH, BPF_REG_3, 16),
+
+	    /* r4 = major number */
+	    BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1, major_off),
+
+	    /* r5 = minor number */
+	    BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1, minor_off),
+	};
+
+	return bpf_program_add_instructions(prog, prologue, ARRAY_SIZE(prologue));
+}
+
+/*
+ * Append the instructions implementing one device rule to @prog.
+ *
+ * A rule is a chain of conditional jumps (device type, access mask, major,
+ * minor) followed by two decision instructions that load device->allow into
+ * r0 and exit. Every check that does not match jumps past the decision
+ * instructions, i.e. to whatever is appended next.
+ *
+ * A global rule (device->global_rule above LXC_BPF_DEVICE_CGROUP_LOCAL_RULE)
+ * only records the list type in @prog and emits nothing.
+ *
+ * Returns 0 on success. NULL arguments, an unknown device type or an invalid
+ * access string are rejected with EINVAL.
+ */
+int bpf_program_append_device(struct bpf_program *prog, struct device_item *device)
+{
+	int ret;
+	/*
+	 * jump_nr is 1 plus the number of check instructions still to be
+	 * emitted. For a single-instruction check that equals the number of
+	 * instructions between the jump and the end of the rule.
+	 */
+	int jump_nr = 1;
+	int access_mask;
+	int device_type;
+
+	/* Validate before touching @device; it is dereferenced below. */
+	if (!prog || !device)
+		return ret_set_errno(-1, EINVAL);
+
+	struct bpf_insn bpf_access_decision[] = {
+	    BPF_MOV64_IMM(BPF_REG_0, device->allow),
+	    BPF_EXIT_INSN(),
+	};
+
+	/* This is a global rule so no need to append anything. */
+	if (device->global_rule > LXC_BPF_DEVICE_CGROUP_LOCAL_RULE) {
+		prog->device_list_type = device->global_rule;
+		return 0;
+	}
+
+	device_type = bpf_device_type(device->type);
+	if (device_type < 0)
+		return log_error_errno(-1, EINVAL, "Invalid bpf cgroup device type %c", device->type);
+
+	/* bpf_access_mask() reports unknown access characters as -EINVAL. */
+	access_mask = bpf_access_mask(device->access);
+	if (access_mask < 0)
+		return log_error_errno(-1, EINVAL, "Invalid bpf cgroup device access %s", device->access);
+
+	if (device_type > 0)
+		jump_nr++;
+
+	if (!bpf_device_all_access(access_mask))
+		jump_nr += 3;
+
+	/*
+	 * Count major/minor with the same condition that emits them below,
+	 * otherwise the jump offsets no longer match the emitted code.
+	 */
+	if (device->major >= 0)
+		jump_nr++;
+
+	if (device->minor >= 0)
+		jump_nr++;
+
+	if (device_type > 0) {
+		struct bpf_insn ins[] = {
+		    BPF_JMP_IMM(BPF_JNE, BPF_REG_2, device_type, jump_nr--),
+		};
+
+		ret = bpf_program_add_instructions(prog, ins, ARRAY_SIZE(ins));
+		if (ret)
+			return log_error_errno(-1, errno, "Failed to add instructions to bpf cgroup program");
+	}
+
+	if (!bpf_device_all_access(access_mask)) {
+		/*
+		 * The jump is the last of these three instructions, so the two
+		 * instructions in front of it must not be counted in its
+		 * offset: jump_nr - 2.
+		 */
+		struct bpf_insn ins[] = {
+		    BPF_MOV32_REG(BPF_REG_1, BPF_REG_3),
+		    BPF_ALU32_IMM(BPF_AND, BPF_REG_1, access_mask),
+		    BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_3, jump_nr - 2),
+		};
+
+		jump_nr -= 3;
+		ret = bpf_program_add_instructions(prog, ins, ARRAY_SIZE(ins));
+		if (ret)
+			return log_error_errno(-1, errno, "Failed to add instructions to bpf cgroup program");
+	}
+
+	if (device->major >= 0) {
+		struct bpf_insn ins[] = {
+		    BPF_JMP_IMM(BPF_JNE, BPF_REG_4, device->major, jump_nr--),
+		};
+
+		ret = bpf_program_add_instructions(prog, ins, ARRAY_SIZE(ins));
+		if (ret)
+			return log_error_errno(-1, errno, "Failed to add instructions to bpf cgroup program");
+	}
+
+	if (device->minor >= 0) {
+		struct bpf_insn ins[] = {
+		    BPF_JMP_IMM(BPF_JNE, BPF_REG_5, device->minor, jump_nr--),
+		};
+
+		ret = bpf_program_add_instructions(prog, ins, ARRAY_SIZE(ins));
+		if (ret)
+			return log_error_errno(-1, errno, "Failed to add instructions to bpf cgroup program");
+	}
+
+	ret = bpf_program_add_instructions(prog, bpf_access_decision,
+					    ARRAY_SIZE(bpf_access_decision));
+	if (ret)
+		return log_error_errno(-1, errno, "Failed to add instructions to bpf cgroup program");
+
+	return 0;
+}
+
+/*
+ * Append the fallthrough instructions to @prog: they load the program's
+ * device_list_type into r0 and exit, which is the verdict for every device
+ * that matched none of the appended rules.
+ *
+ * Returns the result of bpf_program_add_instructions(); a NULL @prog is
+ * rejected through ret_set_errno(-1, EINVAL).
+ */
+int bpf_program_finalize(struct bpf_program *prog)
+{
+	/* Check first: the initializer below dereferences @prog. */
+	if (!prog)
+		return ret_set_errno(-1, EINVAL);
+
+	struct bpf_insn ins[] = {
+	    BPF_MOV64_IMM(BPF_REG_0, prog->device_list_type),
+	    BPF_EXIT_INSN(),
+	};
+
+	TRACE("Implementing %s bpf device cgroup program",
+	      prog->device_list_type == LXC_BPF_DEVICE_CGROUP_BLACKLIST
+		  ? "blacklist"
+		  : "whitelist");
+	return bpf_program_add_instructions(prog, ins, ARRAY_SIZE(ins));
+}
+
+/*
+ * Load the instructions of @prog into the kernel with BPF_PROG_LOAD and store
+ * the resulting fd in prog->kernel_fd.
+ *
+ * @log_buf/@log_size describe an optional buffer handed to the kernel as the
+ * load log; logging is enabled only when @log_buf is non-NULL. If the program
+ * is already loaded nothing is loaded again and a supplied log buffer is
+ * zeroed.
+ *
+ * Returns 0 on success, otherwise the result of log_error_errno(-1, errno, ...).
+ */
+static int bpf_program_load_kernel(struct bpf_program *prog, char *log_buf,
+				   size_t log_size)
+{
+	union bpf_attr attr;
+
+	if (prog->kernel_fd >= 0) {
+		/* Callers pass NULL/0 when they want no log: skip the memset. */
+		if (log_buf && log_size > 0)
+			memset(log_buf, 0, log_size);
+		return 0;
+	}
+
+	attr = (union bpf_attr){
+	    .prog_type	= prog->prog_type,
+	    .insns	= PTR_TO_UINT64(prog->instructions),
+	    .insn_cnt	= prog->n_instructions,
+	    .license	= PTR_TO_UINT64("GPL"),
+	    .log_buf	= PTR_TO_UINT64(log_buf),
+	    .log_level	= !!log_buf,
+	    .log_size	= log_size,
+	};
+
+	prog->kernel_fd = bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
+	if (prog->kernel_fd < 0)
+		return log_error_errno(-1, errno, "Failed to load bpf program");
+
+	return 0;
+}
+
+/*
+ * Load @prog into the kernel (if needed) and attach it to the cgroup directory
+ * at @path with BPF_PROG_ATTACH.
+ *
+ * @type:  attach type passed to the kernel.
+ * @flags: only BPF_F_ALLOW_OVERRIDE and BPF_F_ALLOW_MULTI are accepted.
+ *
+ * If the program is already attached, a differing @type or @flags is refused
+ * with EBUSY; with matching values the call is a successful no-op unless
+ * @flags is exactly BPF_F_ALLOW_OVERRIDE, in which case it attaches again.
+ *
+ * On success the path, type and flags are remembered in @prog so that
+ * bpf_program_cgroup_detach() can undo the attachment. Returns 0 on success
+ * and a negative value on failure.
+ */
+int bpf_program_cgroup_attach(struct bpf_program *prog, int type,
+			      const char *path, uint32_t flags)
+{
+	__do_free char *copy = NULL;
+	__do_close_prot_errno int fd = -EBADF;
+	union bpf_attr attr;
+	int ret;
+
+	if (!prog)
+		return ret_set_errno(-1, EINVAL);
+
+	/* Both flags must be OR-ed into the mask of permitted bits. */
+	if (flags & ~(BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI))
+		return log_error_errno(-1, EINVAL, "Invalid flags for bpf program");
+
+	if (prog->attached_path) {
+		if (prog->attached_type != type)
+			return log_error_errno(-1, EBUSY, "Wrong type for bpf program");
+
+		if (prog->attached_flags != flags)
+			return log_error_errno(-1, EBUSY, "Wrong flags for bpf program");
+
+		/* Already attached as requested: report success (0). */
+		if (flags != BPF_F_ALLOW_OVERRIDE)
+			return 0;
+	}
+
+	ret = bpf_program_load_kernel(prog, NULL, 0);
+	if (ret < 0)
+		/* Propagate the errno set by the loader, not its -1 result. */
+		return log_error_errno(-1, errno, "Failed to load bpf program");
+
+	copy = strdup(path);
+	if (!copy)
+		return log_error_errno(-1, ENOMEM, "Failed to duplicate cgroup path %s", path);
+
+	fd = open(path, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
+	if (fd < 0)
+		return log_error_errno(-1, errno, "Failed to open cgroup path %s", path);
+
+	attr = (union bpf_attr){
+	    .attach_type	= type,
+	    .target_fd		= fd,
+	    .attach_bpf_fd	= prog->kernel_fd,
+	    .attach_flags	= flags,
+	};
+
+	ret = bpf(BPF_PROG_ATTACH, &attr, sizeof(attr));
+	if (ret < 0)
+		return log_error_errno(-1, errno, "Failed to attach bpf program");
+
+	free_replace_move_ptr(prog->attached_path, copy);
+	prog->attached_type = type;
+	prog->attached_flags = flags;
+
+	TRACE("Loaded and attached bpf program to cgroup %s", prog->attached_path);
+	return 0;
+}
+
+/*
+ * Detach @prog from the cgroup it was attached to and forget the recorded
+ * path.
+ *
+ * A NULL or unattached program is a successful no-op. A cgroup directory that
+ * no longer exists (ENOENT) is not an error; the path is simply dropped.
+ * Returns 0 on success and the result of log_error_errno(-1, ...) otherwise,
+ * in which case the recorded path is kept.
+ */
+int bpf_program_cgroup_detach(struct bpf_program *prog)
+{
+	__do_close_prot_errno int fd = -EBADF;
+	int ret;
+
+	if (!prog || !prog->attached_path)
+		return 0;
+
+	fd = open(prog->attached_path, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
+	if (fd >= 0) {
+		union bpf_attr attr;
+
+		attr = (union bpf_attr){
+		    .attach_type	= prog->attached_type,
+		    .target_fd		= fd,
+		    .attach_bpf_fd	= prog->kernel_fd,
+		};
+
+		ret = bpf(BPF_PROG_DETACH, &attr, sizeof(attr));
+		if (ret < 0)
+			return log_error_errno(-1, errno, "Failed to detach bpf program from cgroup %s",
+					       prog->attached_path);
+	} else if (errno != ENOENT) {
+		return log_error_errno(-1, errno, "Failed to open attach cgroup %s",
+				       prog->attached_path);
+	}
+
+	free(prog->attached_path);
+	prog->attached_path = NULL;
+
+	return 0;
+}
+
+/*
+ * Probe whether bpf device cgroup programs can be used: requires an effective
+ * uid of 0 and a kernel that accepts a trivial BPF_PROG_TYPE_CGROUP_DEVICE
+ * program. The probe program is freed again on return.
+ */
+bool bpf_devices_cgroup_supported(void)
+{
+	/* Smallest valid program: r0 = 1; exit. */
+	const struct bpf_insn dummy[] = {
+	    BPF_MOV64_IMM(BPF_REG_0, 1),
+	    BPF_EXIT_INSN(),
+	};
+
+	__do_bpf_program_free struct bpf_program *prog = NULL;
+	int ret;
+
+	if (geteuid() != 0)
+		return log_trace(false,
+				 "The bpf device cgroup requires real root");
+
+	/* bpf_program_new() signals failure with NULL, not a negative value. */
+	prog = bpf_program_new(BPF_PROG_TYPE_CGROUP_DEVICE);
+	if (!prog)
+		return log_trace(false, "Failed to allocate new bpf device cgroup program");
+
+	ret = bpf_program_add_instructions(prog, dummy, ARRAY_SIZE(dummy));
+	if (ret < 0)
+		return log_trace(false, "Failed to add new instructions to bpf device cgroup program");
+
+	ret = bpf_program_load_kernel(prog, NULL, 0);
+	if (ret < 0)
+		return log_trace(false, "Failed to load new bpf device cgroup program");
+
+	return log_trace(true, "The bpf device cgroup is supported");
+}
+#endif
diff --git a/cgroups/cgroup2_devices.h b/cgroups/cgroup2_devices.h
new file mode 100644
index 0000000..4fee779
--- /dev/null
+++ b/cgroups/cgroup2_devices.h
@@ -0,0 +1,154 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+
+/* Parts of this taken from systemd's implementation. */
+
+#ifndef __LXC_CGROUP2_DEVICES_H
+#define __LXC_CGROUP2_DEVICES_H
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#endif
+
+#if !HAVE_BPF
+#if !(defined __NR_bpf && __NR_bpf > 0)
+#if defined __NR_bpf
+#undef __NR_bpf
+#endif
+#if defined __i386__
+#define __NR_bpf 357
+#elif defined __x86_64__
+#define __NR_bpf 321
+#elif defined __aarch64__
+#define __NR_bpf 280
+#elif defined __arm__
+#define __NR_bpf 386
+#elif defined __sparc__
+#define __NR_bpf 349
+#elif defined __s390__
+#define __NR_bpf 351
+#elif defined __tilegx__
+#define __NR_bpf 280
+#else
+#warning "__NR_bpf not defined for your architecture"
+#endif
+#endif
+
+union bpf_attr;
+
+/*
+ * Fallback for bpf(), compiled only when HAVE_BPF is not set: issues the raw
+ * syscall when a syscall number is known, otherwise fails with ENOSYS.
+ */
+static inline int missing_bpf(int cmd, union bpf_attr *attr, size_t size)
+{
+#ifdef __NR_bpf
+	return (int)syscall(__NR_bpf, cmd, attr, size);
+#else
+	/* No syscall number available for this architecture. */
+	errno = ENOSYS;
+	return -1;
+#endif
+}
+
+#define bpf missing_bpf
+#endif
+
+/*
+ * A bpf device cgroup program that is assembled in userspace, loaded into the
+ * kernel and attached to a cgroup.
+ */
+struct bpf_program {
+	/*
+	 * Value loaded into r0 by the fallthrough instructions that
+	 * bpf_program_finalize() appends; bpf_program_new() defaults it to
+	 * LXC_BPF_DEVICE_CGROUP_WHITELIST.
+	 */
+	int device_list_type;
+	/* fd returned by BPF_PROG_LOAD; -EBADF while the program is not loaded. */
+	int kernel_fd;
+	/* Program type handed to BPF_PROG_LOAD. */
+	uint32_t prog_type;
+
+	/* Number of entries in @instructions. */
+	size_t n_instructions;
+#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
+	/* Heap buffer of assembled instructions, grown with realloc(). */
+	struct bpf_insn *instructions;
+#endif
+
+	/* Heap copy of the cgroup path the program is attached to, or NULL. */
+	char *attached_path;
+	/* Attach type and flags recorded by bpf_program_cgroup_attach(). */
+	int attached_type;
+	uint32_t attached_flags;
+};
+
+#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
+struct bpf_program *bpf_program_new(uint32_t prog_type);
+int bpf_program_init(struct bpf_program *prog);
+int bpf_program_append_device(struct bpf_program *prog,
+			      struct device_item *device);
+int bpf_program_finalize(struct bpf_program *prog);
+int bpf_program_cgroup_attach(struct bpf_program *prog, int type,
+			      const char *path, uint32_t flags);
+int bpf_program_cgroup_detach(struct bpf_program *prog);
+void bpf_program_free(struct bpf_program *prog);
+bool bpf_devices_cgroup_supported(void);
+/*
+ * Cleanup handler used by the __do_bpf_program_free attribute: frees the
+ * pointed-to program and clears the pointer.
+ */
+static inline void __auto_bpf_program_free__(struct bpf_program **prog)
+{
+	if (!*prog)
+		return;
+
+	bpf_program_free(*prog);
+	*prog = NULL;
+}
+#else
+/*
+ * Stubs used when HAVE_STRUCT_BPF_CGROUP_DEV_CTX is not defined. Operations
+ * that would need bpf fail with ENOSYS; the free helpers do nothing and the
+ * support probe reports false.
+ */
+static inline struct bpf_program *bpf_program_new(uint32_t prog_type)
+{
+	errno = ENOSYS;
+	return NULL;
+}
+
+static inline int bpf_program_init(struct bpf_program *prog)
+{
+	errno = ENOSYS;
+	return -1;
+}
+
+/*
+ * NOTE(review): this stub takes (type, major, minor, access, allow) while the
+ * prototype in the HAVE_STRUCT_BPF_CGROUP_DEV_CTX branch takes a
+ * struct device_item pointer. Callers written against one form will not
+ * compile against the other; confirm which signature is intended.
+ */
+static inline int bpf_program_append_device(struct bpf_program *prog, char type,
+					    int major, int minor,
+					    const char *access, int allow)
+{
+	errno = ENOSYS;
+	return -1;
+}
+
+static inline int bpf_program_finalize(struct bpf_program *prog)
+{
+	errno = ENOSYS;
+	return -1;
+}
+
+static inline int bpf_program_cgroup_attach(struct bpf_program *prog, int type,
+					    const char *path, uint32_t flags)
+{
+	errno = ENOSYS;
+	return -1;
+}
+
+static inline int bpf_program_cgroup_detach(struct bpf_program *prog)
+{
+	errno = ENOSYS;
+	return -1;
+}
+
+/* Nothing can have been allocated by the stubs, so there is nothing to free. */
+static inline void bpf_program_free(struct bpf_program *prog)
+{
+}
+
+
+static inline bool bpf_devices_cgroup_supported(void)
+{
+	return false;
+}
+
+/* Cleanup handler counterpart for __do_bpf_program_free; intentionally empty. */
+static inline void __auto_bpf_program_free__(struct bpf_program **prog)
+{
+}
+
+#endif
+
+#define __do_bpf_program_free \
+	__attribute__((__cleanup__(__auto_bpf_program_free__)))
+
+#endif /* __LXC_CGROUP2_DEVICES_H */
diff --git a/cgroups/cgroup_utils.c b/cgroups/cgroup_utils.c
new file mode 100644
index 0000000..26e7438
--- /dev/null
+++ b/cgroups/cgroup_utils.c
@@ -0,0 +1,726 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE 1
+#endif
+#include <fcntl.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/vfs.h>
+#include <unistd.h>
+
+#include "cgroup.h"
+#include "cgroup_utils.h"
+#include "macro.h"
+#include "memory_utils.h"
+
+/*
+ * Classify a mountinfo @line: CGROUP_SUPER_MAGIC for a cgroup v1 mount,
+ * CGROUP2_SUPER_MAGIC for a cgroup v2 mount, 0 for anything else.
+ */
+int get_cgroup_version(char *line)
+{
+	int version = 0;
+
+	if (is_cgroupfs_v1(line))
+		version = CGROUP_SUPER_MAGIC;
+	else if (is_cgroupfs_v2(line))
+		version = CGROUP2_SUPER_MAGIC;
+
+	return version;
+}
+
+/*
+ * True when the first " - " separator in @line is immediately followed by the
+ * filesystem type "cgroup".
+ */
+bool is_cgroupfs_v1(char *line)
+{
+	static const char marker[] = " - cgroup ";
+	const char *sep = strstr(line, " - ");
+
+	if (sep == NULL)
+		return false;
+
+	return strncmp(sep, marker, sizeof(marker) - 1) == 0;
+}
+
+/*
+ * True when the first " - " separator in @line is immediately followed by the
+ * filesystem type "cgroup2".
+ */
+bool is_cgroupfs_v2(char *line)
+{
+	static const char marker[] = " - cgroup2 ";
+	const char *sep = strstr(line, " - ");
+
+	if (sep == NULL)
+		return false;
+
+	return strncmp(sep, marker, sizeof(marker) - 1) == 0;
+}
+
+/*
+ * Check the filesystem mounted at DEFAULT_CGROUP_MOUNTPOINT.
+ *
+ * Returns CGROUP2_SUPER_MAGIC when it is a cgroup2 filesystem, 0 when it is
+ * something else, and -ENOMEDIUM when statfs() fails.
+ */
+int unified_cgroup_hierarchy(void)
+{
+	struct statfs sfs;
+
+	if (statfs(DEFAULT_CGROUP_MOUNTPOINT, &sfs) < 0)
+		return -ENOMEDIUM;
+
+	if (!is_fs_type(&sfs, CGROUP2_SUPER_MAGIC))
+		return 0;
+
+	return CGROUP2_SUPER_MAGIC;
+}
+
+/*
+ * realloc() wrapper that never returns NULL: the allocation is retried until
+ * it succeeds.
+ *
+ * A request for zero bytes is rounded up to one byte. realloc(ptr, 0) is
+ * allowed to free @orig and return NULL, which would make the retry loop
+ * operate on a pointer that has already been released.
+ */
+void *must_realloc(void *orig, size_t sz)
+{
+	void *ret;
+
+	if (sz == 0)
+		sz = 1;
+
+	do {
+		ret = realloc(orig, sz);
+	} while (!ret);
+
+	return ret;
+}
+
+/*
+ * Join @first and the following NULL-terminated list of strings into one
+ * newly allocated path. A "/" is inserted in front of every component that
+ * does not itself start with one. The caller frees the result.
+ */
+char *must_make_path(const char *first, ...)
+{
+	va_list ap;
+	const char *part;
+	char *path;
+	size_t used, need;
+
+	used = need = strlen(first);
+	path = must_copy_string(first);
+
+	va_start(ap, first);
+	for (part = va_arg(ap, char *); part != NULL; part = va_arg(ap, char *)) {
+		const size_t part_len = strlen(part);
+		const bool add_slash = (part[0] != '/');
+
+		need += part_len + (add_slash ? 1 : 0);
+		path = must_realloc(path, need + 1);
+
+		if (add_slash)
+			path[used++] = '/';
+
+		memcpy(path + used, part, part_len);
+		used += part_len;
+	}
+	va_end(ap);
+
+	path[used] = '\0';
+	return path;
+}
+
+/* True when the filesystem type recorded in @fs equals @magic_val. */
+bool is_fs_type(const struct statfs *fs, fs_type_magic magic_val)
+{
+	const fs_type_magic actual = (fs_type_magic)fs->f_type;
+
+	return actual == magic_val;
+}
+
+/*
+ * Duplicate @entry, retrying strdup() until it succeeds. Returns NULL only
+ * when @entry itself is NULL. The caller frees the copy.
+ */
+char *must_copy_string(const char *entry)
+{
+	char *copy = NULL;
+
+	if (entry == NULL)
+		return NULL;
+
+	while (copy == NULL)
+		copy = strdup(entry);
+
+	return copy;
+}
+
+/*
+ * Concatenate the NULL-terminated array @parts, placing @sep between
+ * neighbouring entries and, when @use_as_prefix is set, also in front of the
+ * first one. Returns a newly allocated string, or NULL if calloc() fails.
+ */
+char *lxc_string_join(const char *sep, const char **parts, bool use_as_prefix)
+{
+	const size_t sep_len = strlen(sep);
+	size_t total = use_as_prefix ? sep_len : 0;
+	size_t i, cap;
+	char *joined;
+
+	/* First pass: work out how long the result will be. */
+	for (i = 0; parts[i]; i++) {
+		if (i > 0)
+			total += sep_len;
+
+		total += strlen(parts[i]);
+	}
+
+	cap = total + 1;
+	joined = calloc(cap, 1);
+	if (!joined)
+		return NULL;
+
+	if (use_as_prefix)
+		(void)strlcpy(joined, sep, cap);
+
+	/* Second pass: append the pieces. */
+	for (i = 0; parts[i]; i++) {
+		if (i > 0)
+			(void)strlcat(joined, sep, cap);
+
+		(void)strlcat(joined, parts[i], cap);
+	}
+
+	return joined;
+}
+
+/*
+ * Count how many lines getline() can read from file @fn.
+ * Returns -1 if the file cannot be opened.
+ */
+int lxc_count_file_lines(const char *fn)
+{
+	char *linebuf = NULL;
+	size_t cap = 0;
+	int count;
+	FILE *fp;
+
+	fp = fopen_cloexec(fn, "r");
+	if (!fp)
+		return -1;
+
+	for (count = 0; getline(&linebuf, &cap, fp) != -1; count++)
+		;
+
+	free(linebuf);
+	fclose(fp);
+	return count;
+}
+
+/*
+ * True when stat() succeeds on @path and reports a directory. Any stat()
+ * failure, whatever the reason, is answered with false.
+ */
+bool dir_exists(const char *path)
+{
+	struct stat st;
+
+	/* The error may be something other than ENOENT; just say "no". */
+	if (stat(path, &st) < 0)
+		return false;
+
+	return S_ISDIR(st.st_mode);
+}
+
+/*
+ * Step to the next segment of a tokenized path.
+ *
+ * @path:    a pathname in which every '/' has been replaced with '\0'.
+ * @offsetp: offset of the segment seen last; updated to the offset of the next
+ *           segment (or to @fulllen when there is none). Left untouched when
+ *           it already is at or beyond @fulllen.
+ * @fulllen: length of the original path.
+ *
+ * Returns a pointer to the next segment, or NULL if there is none.
+ */
+static char *get_nextpath(char *path, int *offsetp, int fulllen)
+{
+	int pos = *offsetp;
+
+	if (pos >= fulllen)
+		return NULL;
+
+	/* Skip what is left of the current segment ... */
+	for (; pos < fulllen && path[pos] != '\0'; pos++)
+		;
+
+	/* ... and the separators that follow it. */
+	for (; pos < fulllen && path[pos] == '\0'; pos++)
+		;
+
+	*offsetp = pos;
+	if (pos >= fulllen)
+		return NULL;
+
+	return path + pos;
+}
+
+/*
+ * Check that @subdir is @dir itself or lies below it. @len is the length of
+ * @dir (to avoid having to recalculate it).
+ *
+ * The comparison is purely textual: @subdir must start with @dir, and the
+ * match must end on a path boundary (a '/' or the end of @subdir) unless @dir
+ * already ends in '/'. An empty @dir (@len == 0) is handled without looking
+ * in front of the string.
+ */
+static bool is_subdir(const char *subdir, const char *dir, size_t len)
+{
+	size_t subdirlen = strlen(subdir);
+
+	if (subdirlen < len)
+		return false;
+
+	if (strncmp(subdir, dir, len) != 0)
+		return false;
+
+	/* Only look at the last character of @dir if there is one. */
+	if (len > 0 && dir[len - 1] == '/')
+		return true;
+
+	return subdir[len] == '/' || subdirlen == len;
+}
+
+/*
+ * Inspect an open fd: returns -ELOOP if it refers to a symlink, -ENOENT if
+ * fstat() fails, and 0 otherwise.
+ */
+static int check_symlink(int fd)
+{
+	struct stat st;
+
+	if (fstat(fd, &st) < 0)
+		return -ENOENT;
+
+	return S_ISLNK(st.st_mode) ? -ELOOP : 0;
+}
+
+/*
+ * Open @nextpath relative to @dirfd, refusing to follow a symlink in the
+ * final component.
+ *
+ * Returns the new fd, or a negative value: -1 with errno set by openat(), or
+ * the negative code from check_symlink().
+ *
+ * CAVEAT: This function must not be used for other purposes than container
+ * setup before executing the container's init
+ */
+static int open_if_safe(int dirfd, const char *nextpath)
+{
+	int fd, ret;
+
+	fd = openat(dirfd, nextpath, O_RDONLY | O_NOFOLLOW);
+	if (fd >= 0) /* Was not a symlink, all good. */
+		return fd;
+
+	/*
+	 * Only a permission problem is worth a second attempt. ELOOP (a
+	 * symlink) and every other error are handed back unchanged.
+	 */
+	if (errno != EPERM && errno != EACCES)
+		return fd;
+
+	/* We're not root (cause we got EPERM) so try opening with O_PATH. */
+	fd = openat(dirfd, nextpath, O_PATH | O_NOFOLLOW);
+	if (fd < 0)
+		return fd;
+
+	/*
+	 * O_PATH will return an fd for symlinks. We know nextpath wasn't a
+	 * symlink at the last openat(), so if the fd is now a link, then
+	 * something fishy is going on.
+	 */
+	ret = check_symlink(fd);
+	if (ret < 0) {
+		close(fd);
+		return ret;
+	}
+
+	return fd;
+}
+
+/*
+ * Open a path intending for mounting, ensuring that the final path
+ * is inside the container's rootfs.
+ *
+ * The path is walked one component at a time with open_if_safe(), so a
+ * symlink in any component after @prefix_skip makes the walk fail.
+ *
+ * CAVEAT: This function must not be used for other purposes than container
+ * setup before executing the container's init
+ *
+ * @target: path to be opened
+ * @prefix_skip: a part of @target in which to ignore symbolic links.  This
+ * would be the container's rootfs. NULL or "" means "/".
+ *
+ * Return an open fd for the path, or <0 on error. The error is either a
+ * negative errno value (-EINVAL, -ENOMEM, or a code from open_if_safe()) or -1
+ * with errno set by the failing open call.
+ */
+static int open_without_symlink(const char *target, const char *prefix_skip)
+{
+	int curlen = 0, dirfd, fulllen, i;
+	char *dup;
+
+	fulllen = strlen(target);
+
+	/* make sure prefix-skip makes sense */
+	if (prefix_skip && strlen(prefix_skip) > 0) {
+		curlen = strlen(prefix_skip);
+		if (!is_subdir(target, prefix_skip, curlen))
+			return -EINVAL;
+
+		/*
+		 * get_nextpath() expects the curlen argument to be
+		 * on a  (turned into \0) / or before it, so decrement
+		 * curlen to make sure that happens
+		 */
+		if (curlen)
+			curlen--;
+	} else {
+		prefix_skip = "/";
+		curlen = 0;
+	}
+
+	/* Make a copy of target which we can hack up, and tokenize it */
+	if ((dup = strdup(target)) == NULL)
+		return -ENOMEM;
+
+	/* Every '/' becomes a '\0' so each component is its own C string. */
+	for (i = 0; i < fulllen; i++) {
+		if (dup[i] == '/')
+			dup[i] = '\0';
+	}
+
+	/* The walk starts at the trusted prefix, which may contain symlinks. */
+	dirfd = open(prefix_skip, O_RDONLY);
+	if (dirfd < 0)
+		goto out;
+
+	for (;;) {
+		int newfd, saved_errno;
+		char *nextpath;
+
+		/* No component left: dirfd is the fd of the full target. */
+		if ((nextpath = get_nextpath(dup, &curlen, fulllen)) == NULL)
+			goto out;
+
+		/* close() may clobber errno, so keep the one from the open. */
+		newfd = open_if_safe(dirfd, nextpath);
+		saved_errno = errno;
+		close(dirfd);
+
+		dirfd = newfd;
+		if (newfd < 0) {
+			errno = saved_errno;
+			goto out;
+		}
+	}
+
+out:
+	free(dup);
+	return dirfd;
+}
+
+/*
+ * Safely mount a path into a container, ensuring that the mount target
+ * is under the container's @rootfs.  (If @rootfs is NULL, then the container
+ * uses the host's /)
+ *
+ * The destination (and, for bind mounts with a relative source, the source)
+ * is opened with open_without_symlink() and the mount is then performed on the
+ * matching /proc/self/fd/<fd> path.
+ *
+ * Returns 0 on success. On failure the value is negative: the code returned by
+ * open_without_symlink(), -EINVAL if a /proc/self/fd path does not fit its
+ * buffer, or the result of mount() with errno preserved.
+ *
+ * CAVEAT: This function must not be used for other purposes than container
+ * setup before executing the container's init
+ */
+int safe_mount(const char *src, const char *dest, const char *fstype,
+		unsigned long flags, const void *data, const char *rootfs)
+{
+	int destfd, ret, saved_errno;
+	/* Only needs enough for /proc/self/fd/<fd>. */
+	char srcbuf[50], destbuf[50];
+	int srcfd = -1;
+	const char *mntsrc = src;
+
+	if (!rootfs)
+		rootfs = "";
+
+	/* todo - allow symlinks for relative paths if 'allowsymlinks' option is passed */
+	if (flags & MS_BIND && src && src[0] != '/') {
+
+		/* Relative bind source: resolve it without following symlinks. */
+		srcfd = open_without_symlink(src, NULL);
+		if (srcfd < 0)
+			return srcfd;
+
+		ret = snprintf(srcbuf, sizeof(srcbuf), "/proc/self/fd/%d", srcfd);
+		if (ret < 0 || ret >= (int)sizeof(srcbuf)) {
+			close(srcfd);
+			return -EINVAL;
+		}
+		mntsrc = srcbuf;
+	}
+
+	destfd = open_without_symlink(dest, rootfs);
+	if (destfd < 0) {
+		if (srcfd != -1) {
+			/* Keep errno from the failed open across close(). */
+			saved_errno = errno;
+			close(srcfd);
+			errno = saved_errno;
+		}
+
+		return destfd;
+	}
+
+	ret = snprintf(destbuf, sizeof(destbuf), "/proc/self/fd/%d", destfd);
+	if (ret < 0 || ret >= (int)sizeof(destbuf)) {
+		if (srcfd != -1)
+			close(srcfd);
+
+		close(destfd);
+		return -EINVAL;
+	}
+
+	/* Mount onto the already-opened fd rather than the textual path. */
+	ret = mount(mntsrc, destbuf, fstype, flags, data);
+	saved_errno = errno;
+	if (srcfd != -1)
+		close(srcfd);
+
+	close(destfd);
+	if (ret < 0) {
+		errno = saved_errno;
+		return ret;
+	}
+
+	return 0;
+}
+
+#ifndef HAVE_STRLCPY
+/*
+ * Copy @src into @dest, writing at most @size bytes including the terminating
+ * NUL. Nothing is written when @size is 0. Returns strlen(@src), so a result
+ * that is >= @size means the copy was truncated.
+ */
+size_t strlcpy(char *dest, const char *src, size_t size)
+{
+	const size_t srclen = strlen(src);
+	size_t ncopy;
+
+	if (size == 0)
+		return srclen;
+
+	ncopy = (srclen < size) ? srclen : size - 1;
+	memcpy(dest, src, ncopy);
+	dest[ncopy] = '\0';
+
+	return srclen;
+}
+#endif
+
+#ifndef HAVE_STRLCAT
+/*
+ * Append @s to the string in @d, where @n is the total size of @d. Returns
+ * the length the combined string would have had without truncation. If no NUL
+ * is found within the first @n bytes of @d, nothing is appended.
+ */
+size_t strlcat(char *d, const char *s, size_t n)
+{
+	const size_t used = strnlen(d, n);
+
+	if (used != n)
+		return used + strlcpy(d + used, s, n - used);
+
+	return used + strlen(s);
+}
+#endif
+
+/*
+ * fopen() replacement that always opens the underlying fd with O_CLOEXEC.
+ *
+ * The leading "r", "r+", "w", "w+", "a" or "a+" of @mode selects the open(2)
+ * flags; an 'x' anywhere after that prefix adds O_EXCL. New files are created
+ * with mode 0660. A @mode with none of these prefixes leaves the access flags
+ * at 0 (NOTE(review): that is O_RDONLY on Linux - confirm this is intended).
+ *
+ * Returns the stream, or NULL with errno set if open() or fdopen() fails.
+ */
+FILE *fopen_cloexec(const char *path, const char *mode)
+{
+	int open_mode = 0;
+	int step = 0;
+	int fd;
+	int saved_errno = 0;
+	FILE *ret;
+
+	/* Two-character forms are tested before their one-character prefix. */
+	if (!strncmp(mode, "r+", 2)) {
+		open_mode = O_RDWR;
+		step = 2;
+	} else if (!strncmp(mode, "r", 1)) {
+		open_mode = O_RDONLY;
+		step = 1;
+	} else if (!strncmp(mode, "w+", 2)) {
+		open_mode = O_RDWR | O_TRUNC | O_CREAT;
+		step = 2;
+	} else if (!strncmp(mode, "w", 1)) {
+		open_mode = O_WRONLY | O_TRUNC | O_CREAT;
+		step = 1;
+	} else if (!strncmp(mode, "a+", 2)) {
+		open_mode = O_RDWR | O_CREAT | O_APPEND;
+		step = 2;
+	} else if (!strncmp(mode, "a", 1)) {
+		open_mode = O_WRONLY | O_CREAT | O_APPEND;
+		step = 1;
+	}
+	/* Scan the remaining mode characters for the exclusive-create flag. */
+	for (; mode[step]; step++)
+		if (mode[step] == 'x')
+			open_mode |= O_EXCL;
+	open_mode |= O_CLOEXEC;
+
+	fd = open(path, open_mode, 0660);
+	if (fd < 0)
+		return NULL;
+
+	/* On fdopen() failure the fd is closed; errno from fdopen() is kept. */
+	ret = fdopen(fd, mode);
+	saved_errno = errno;
+	if (!ret)
+		close(fd);
+	errno = saved_errno;
+	return ret;
+}
+
+/*
+ * Given a multi-line string, return a newly allocated, NUL-terminated copy of
+ * the text up to (not including) the next '\n'. Returns NULL when @p contains
+ * no newline.
+ */
+static char *copy_to_eol(char *p)
+{
+	const char *nl = strchr(p, '\n');
+	size_t linelen;
+	char *copy;
+
+	if (nl == NULL)
+		return NULL;
+
+	linelen = nl - p;
+	copy = must_realloc(NULL, linelen + 1);
+	memcpy(copy, p, linelen);
+	copy[linelen] = '\0';
+
+	return copy;
+}
+
+/*
+ * Make sure *@mem can hold @newlen bytes. Memory is handed out in multiples
+ * of BATCH_SIZE, so nothing happens while @newlen still falls into the batch
+ * count that already covers @oldlen. A NULL *@mem is always allocated.
+ */
+static void batch_realloc(char **mem, size_t oldlen, size_t newlen)
+{
+	int have = (oldlen / BATCH_SIZE) + 1;
+	int need = (newlen / BATCH_SIZE) + 1;
+
+	if (*mem && need <= have)
+		return;
+
+	*mem = must_realloc(*mem, need * BATCH_SIZE);
+}
+
+/*
+ * Append @new to the buffer *@dest, which currently holds @oldlen bytes. The
+ * buffer is grown as needed. @newlen + 1 bytes are copied, i.e. the byte after
+ * the line is taken along (assumes new[newlen] is the NUL terminator).
+ */
+void append_line(char **dest, size_t oldlen, char *new, size_t newlen)
+{
+	/* One extra byte so the terminator always fits. */
+	batch_realloc(dest, oldlen, oldlen + newlen + 1);
+
+	memcpy(&(*dest)[oldlen], new, newlen + 1);
+}
+
+/* Overwrite every '\n' at the end of @s with '\0', in place. */
+static inline void drop_trailing_newlines(char *s)
+{
+	int end = strlen(s);
+
+	while (end > 0 && s[end - 1] == '\n') {
+		end--;
+		s[end] = '\0';
+	}
+}
+
+/*
+ * Slurp in a whole file.
+ *
+ * Returns a newly allocated, NUL-terminated buffer with the contents of
+ * @fnam, or NULL if the file cannot be opened or yields no line at all. The
+ * caller frees the buffer.
+ *
+ * The stream is opened with the "e" (close-on-exec) mode flag, matching
+ * readat_file(), and the getline() result is kept in an ssize_t.
+ */
+char *read_file(const char *fnam)
+{
+	__do_free char *line = NULL;
+	__do_fclose FILE *f = NULL;
+	ssize_t linelen;
+	char *buf = NULL;
+	size_t len = 0, fulllen = 0;
+
+	f = fopen(fnam, "re");
+	if (!f)
+		return NULL;
+	while ((linelen = getline(&line, &len, f)) != -1) {
+		append_line(&buf, fulllen, line, linelen);
+		fulllen += linelen;
+	}
+	return buf;
+}
+
+/*
+ * Like read_file(), but with all trailing newlines removed from the result.
+ * Returns NULL whenever read_file() does.
+ */
+char *read_file_strip_newline(const char *fnam)
+{
+	char *contents = read_file(fnam);
+
+	if (!contents)
+		return NULL;
+
+	drop_trailing_newlines(contents);
+	return contents;
+}
+
+/*
+ * Get the current cgroup in the cgroupfs v2 hierarchy from /proc/<pid>/cgroup.
+ * A @pid that is not positive selects pid 1.
+ *
+ * Returns a newly allocated copy of the path that follows "0::" on the v2
+ * line, or NULL if the file cannot be read, has no such entry, or the entry is
+ * not newline-terminated.
+ */
+char *cg_unified_get_current_cgroup(pid_t pid)
+{
+	__do_free char *basecginfo = NULL;
+	char path[STRLITERALLEN("/proc//cgroup") + INTTYPE_TO_STRLEN(pid_t) + 1];
+	char *base_cgroup;
+
+	/*
+	 * Spell the ternary out: "pid > 0 ?: 1" yields the value of the
+	 * comparison (1), never the pid itself.
+	 */
+	snprintf(path, sizeof(path), "/proc/%d/cgroup", pid > 0 ? pid : 1);
+	basecginfo = read_file(path);
+	if (!basecginfo)
+		return NULL;
+
+	base_cgroup = strstr(basecginfo, "0::/");
+	if (!base_cgroup)
+		return NULL;
+
+	/* Step over "0::" so the copy starts at the leading '/'. */
+	base_cgroup = base_cgroup + 3;
+	return copy_to_eol(base_cgroup);
+}
+
+/*
+ * @cgline points just behind the first ':' of a line in a \n-terminated
+ * /proc/<pid>/cgroup file, i.e. at the comma-separated controller list.
+ * Returns true if controller @c is one of the entries of that list, false if
+ * it is not or the list is not terminated by a ':'.
+ */
+static bool controller_in_clist(char *cgline, const char *c)
+{
+	__do_free char *clist = NULL;
+	char *entry;
+	const char *end;
+	size_t clist_len;
+
+	end = strchr(cgline, ':');
+	if (end == NULL)
+		return false;
+
+	/* Work on a private copy; splitting modifies the string. */
+	clist_len = end - cgline;
+	clist = must_realloc(NULL, clist_len + 1);
+	memcpy(clist, cgline, clist_len);
+	clist[clist_len] = '\0';
+
+	lxc_iterate_parts(entry, clist, ",") {
+		if (strcmp(entry, c) == 0)
+			return true;
+	}
+
+	return false;
+}
+
+/* @basecginfo is a copy of /proc/$$/cgroup. Return the current cgroup for
+ * @controller.
+ *
+ * With @type == CGROUP2_SUPER_MAGIC the first line that starts with '0' is
+ * used regardless of @controller. Otherwise the first line whose controller
+ * list contains @controller is used. The result is a newly allocated copy of
+ * the path field of that line; NULL is returned if no line matches or the
+ * matching line is malformed.
+ */
+char *cg_hybrid_get_current_cgroup(char *basecginfo, const char *controller, int type)
+{
+	char *p = basecginfo;
+
+	/* Each iteration looks at one "id:controllers:path" line. */
+	for (;;) {
+		bool is_cgv2_base_cgroup = false;
+
+		/* cgroup v2 entry in "/proc/<pid>/cgroup": "0::/some/path" */
+		if ((type == CGROUP2_SUPER_MAGIC) && (*p == '0'))
+			is_cgv2_base_cgroup = true;
+
+		/* Move behind the first ':' to the controller list. */
+		p = strchr(p, ':');
+		if (!p)
+			return NULL;
+		p++;
+
+		if (is_cgv2_base_cgroup || (controller && controller_in_clist(p, controller))) {
+			/* Move behind the second ':' to the cgroup path. */
+			p = strchr(p, ':');
+			if (!p)
+				return NULL;
+			p++;
+			return copy_to_eol(p);
+		}
+
+		/* No match: continue with the next line. */
+		p = strchr(p, '\n');
+		if (!p)
+			return NULL;
+		p++;
+	}
+}
+
+/*
+ * Return the cgroup of @pid for the legacy (v1) @controller, read from
+ * /proc/<pid>/cgroup. A @pid that is not positive selects pid 1.
+ *
+ * Returns a newly allocated string, or NULL if the file cannot be read (errno
+ * is then set to ENOMEM through ret_set_errno) or no matching line exists.
+ */
+char *cg_legacy_get_current_cgroup(pid_t pid, const char *controller)
+{
+	__do_free char *basecginfo = NULL;
+	char path[STRLITERALLEN("/proc//cgroup") + INTTYPE_TO_STRLEN(pid_t) + 1];
+
+	/*
+	 * Spell the ternary out: "pid > 0 ?: 1" yields the value of the
+	 * comparison (1), never the pid itself.
+	 */
+	snprintf(path, sizeof(path), "/proc/%d/cgroup", pid > 0 ? pid : 1);
+	basecginfo = read_file(path);
+	if (!basecginfo)
+		return ret_set_errno(NULL, ENOMEM);
+
+	return cg_hybrid_get_current_cgroup(basecginfo, controller,
+					    CGROUP_SUPER_MAGIC);
+}
+
+
+/*
+ * Read the whole file @path, resolved relative to @dirfd, without following a
+ * symlink in the final component. Trailing newlines are stripped.
+ *
+ * Returns a newly allocated buffer, or NULL if the file cannot be opened or
+ * yields no line at all. The caller frees the buffer.
+ */
+char *readat_file(int dirfd, const char *path)
+{
+	__do_close_prot_errno int fd = -EBADF;
+	__do_free char *line = NULL;
+	__do_fclose FILE *f = NULL;
+	char *buf = NULL;
+	size_t len = 0, fulllen = 0;
+	ssize_t linelen;
+
+	fd = openat(dirfd, path, O_NOFOLLOW | O_RDONLY | O_CLOEXEC);
+	if (fd < 0)
+		return NULL;
+
+	f = fdopen(fd, "re");
+	if (!f)
+		return NULL;
+	/*
+	 * Transfer ownership of fd to the stream only once fdopen() has
+	 * succeeded. If it fails, the fd must still be closed by the cleanup
+	 * handler instead of leaking.
+	 */
+	(void)move_fd(fd);
+
+	while ((linelen = getline(&line, &len, f)) != -1) {
+		append_line(&buf, fulllen, line, linelen);
+		fulllen += linelen;
+	}
+
+	if (buf)
+		drop_trailing_newlines(buf);
+
+	return buf;
+}
+
+/*
+ * Create directory @dir and every missing parent, each with @mode.
+ * Components that already exist are fine. Returns false if an allocation or a
+ * mkdir() (other than with EEXIST) fails, true otherwise.
+ */
+bool mkdir_p(const char *dir, mode_t mode)
+{
+	const char *start = dir;
+	const char *comp_end = dir;
+
+	for (;;) {
+		const char *comp;
+		char *prefix;
+
+		/* Skip slashes, then find the end of the next component. */
+		comp = comp_end + strspn(comp_end, "/");
+		comp_end = comp + strcspn(comp, "/");
+
+		/* Create everything in front of that component. */
+		prefix = strndup(start, comp - start);
+		if (!prefix)
+			return false;
+
+		if (mkdir(prefix, mode) && errno != EEXIST) {
+			lxcfs_error("Failed to create directory '%s': %s.\n",
+				prefix, strerror(errno));
+			free(prefix);
+			return false;
+		}
+		free(prefix);
+
+		/* An empty component means the whole path has been handled. */
+		if (comp_end == comp)
+			break;
+	}
+
+	return true;
+}
diff --git a/cgroups/cgroup_utils.h b/cgroups/cgroup_utils.h
new file mode 100644
index 0000000..d4df757
--- /dev/null
+++ b/cgroups/cgroup_utils.h
@@ -0,0 +1,72 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+
+#ifndef __LXC_CGROUP_UTILS_H
+#define __LXC_CGROUP_UTILS_H
+
+#include <stdbool.h>
+#include <stdio.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/vfs.h>
+
+/* Retrieve the cgroup version of a given entry from /proc/<pid>/mountinfo. */
+extern int get_cgroup_version(char *line);
+
+/* Check if given entry from /proc/<pid>/mountinfo is a cgroupfs v1 mount. */
+extern bool is_cgroupfs_v1(char *line);
+
+/* Check if given entry from /proc/<pid>/mountinfo is a cgroupfs v2 mount. */
+extern bool is_cgroupfs_v2(char *line);
+
+/* Given a v1 hierarchy @mountpoint and base @path, verify that we can create
+ * directories underneath it.
+ */
+extern bool test_writeable_v1(char *mountpoint, char *path);
+
+/* Given a v2 hierarchy @mountpoint and base @path, verify that we can create
+ * directories underneath it and that we have write access to the cgroup's
+ * "cgroup.procs" file.
+ */
+extern bool test_writeable_v2(char *mountpoint, char *path);
+
+extern int unified_cgroup_hierarchy(void);
+
+extern void *must_realloc(void *orig, size_t sz);
+
+extern char *must_make_path(const char *first, ...);
+
+extern char *must_copy_string(const char *entry);
+
+/* __typeof__ should be safe to use with all compilers. */
+typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;
+extern bool is_fs_type(const struct statfs *fs, fs_type_magic magic_val);
+
+extern char *lxc_string_join(const char *sep, const char **parts,
+			     bool use_as_prefix);
+extern int lxc_count_file_lines(const char *fn);
+
+extern bool dir_exists(const char *path);
+
+extern int safe_mount(const char *src, const char *dest, const char *fstype,
+		      unsigned long flags, const void *data, const char *rootfs);
+
+#ifndef HAVE_STRLCPY
+extern size_t strlcpy(char *, const char *, size_t);
+#endif
+
+#ifndef HAVE_STRLCAT
+extern size_t strlcat(char *d, const char *s, size_t n);
+#endif
+
+extern FILE *fopen_cloexec(const char *path, const char *mode);
+extern void append_line(char **dest, size_t oldlen, char *new, size_t newlen);
+extern char *read_file(const char *fnam);
+extern char *readat_file(int fd, const char *path); /* @fd: directory fd that @path is opened against */
+extern char *read_file_strip_newline(const char *fnam);
+extern char *cg_unified_get_current_cgroup(pid_t pid);
+extern char *cg_hybrid_get_current_cgroup(char *basecginfo,
+					  const char *controller, int type);
+extern char *cg_legacy_get_current_cgroup(pid_t pid, const char *controller); /* NULL on failure */
+extern bool mkdir_p(const char *dir, mode_t mode); /* false if strndup() or a mkdir() (other than EEXIST) failed */
+
+#endif /* __LXC_CGROUP_UTILS_H */
diff --git a/configure.ac b/configure.ac
index 81027cd..63cd934 100644
--- a/configure.ac
+++ b/configure.ac
@@ -162,4 +162,13 @@ AC_ARG_WITH([rootfs-path],
 
 AS_AC_EXPAND(LIBDIR, "$libdir")
 
+AC_CHECK_FUNCS([strlcpy],
+	AM_CONDITIONAL(HAVE_STRLCPY, true)
+	AC_DEFINE(HAVE_STRLCPY,1,[Have strlcpy]),
+	AM_CONDITIONAL(HAVE_STRLCPY, false)) # HAVE_STRLCPY: conditional plus define, set when strlcpy is found
+AC_CHECK_FUNCS([strlcat],
+	AM_CONDITIONAL(HAVE_STRLCAT, true)
+	AC_DEFINE(HAVE_STRLCAT,1,[Have strlcat]),
+	AM_CONDITIONAL(HAVE_STRLCAT, false)) # HAVE_STRLCAT: conditional plus define, set when strlcat is found
+
 AC_OUTPUT
diff --git a/macro.h b/macro.h
index 3e9ef82..4ec3876 100644
--- a/macro.h
+++ b/macro.h
@@ -1,9 +1,22 @@
 #ifndef __LXCFS_MACRO_H
 #define __LXCFS_MACRO_H
 
+#include <stdio.h>
+
+#define BATCH_SIZE 50
+
+/* filesystem magic values */
+#ifndef CGROUP_SUPER_MAGIC
+#define CGROUP_SUPER_MAGIC 0x27e0eb
+#endif
+
+#ifndef CGROUP2_SUPER_MAGIC
+#define CGROUP2_SUPER_MAGIC 0x63677270
+#endif
+
 #define lxcfs_debug_stream(stream, format, ...)                                \
 	do {                                                                   \
-		fprintf(stream, "%s: %d: %s: " format, __FILE__, __LINE__,     \
+		fprintf(stream, "%s: %d: %s: " format "\n", __FILE__, __LINE__,     \
 			__func__, ##__VA_ARGS__);                                \
 	} while (false)
 
@@ -21,4 +34,45 @@
 #define lxcfs_v(format, ...)
 #endif /* VERBOSE */
 
+/* Failure-path helper: store @__errno__ in errno, report @format (plus any
+ * extra arguments) through lxcfs_error(), and yield @__ret__ as the value of
+ * the whole GNU statement expression, e.g. "return log_error_errno(...)".
+ */
+#define log_error_errno(__ret__, __errno__, format, ...) \
+	({ errno = __errno__; lxcfs_error(format, ##__VA_ARGS__); __ret__; })
+
+#define STRLITERALLEN(x) (sizeof(""x"") - 1) /* length of string literal x, not counting the \0 */
+
+/* Upper bound on the chars needed to print any value of integer @type as a
+ * decimal C string: max digits for its size plus 2 for '-' and the \0 byte.
+ * Types wider than 8 bytes yield a negative array size (compile error).
+ * This is based on systemd. */
+#define INTTYPE_TO_STRLEN(type)                   \
+	(2 + (sizeof(type) <= 1                   \
+		  ? 3                             \
+		  : sizeof(type) <= 2             \
+			? 5                       \
+			: sizeof(type) <= 4       \
+			      ? 10                \
+			      : sizeof(type) <= 8 \
+				    ? 20          \
+				    : sizeof(int[-2 * (sizeof(type) > 8)])))
+
+#define ret_errno(__errno__)                                                  \
+	({ /* Set errno to @__errno__; the expression yields -@__errno__. */   \
+		errno = (__errno__); /* argument is evaluated exactly once */  \
+		-errno; /* so e.g. ret_errno(a + b) negates the whole sum */   \
+	})
+
+/* Store @__errno__ in errno and evaluate to @__ret__ (a GNU statement
+ * expression), so callers can write "return ret_set_errno(NULL, ENOMEM);".
+ */
+#define ret_set_errno(__ret__, __errno__) \
+	({ errno = __errno__; __ret__; })
+
+#define lxc_iterate_parts(__iterator, __splitme, __separators)                  \
+	for (char *__p = NULL, *__it = strtok_r(__splitme, __separators, &__p); \
+	     (__iterator = __it); /* loop ends when no token is left (NULL) */  \
+	     __iterator = __it = strtok_r(NULL, __separators, &__p)) /* NB: strtok_r() modifies @__splitme */
+
 #endif /* __LXCFS_MACRO_H */
diff --git a/memory_utils.h b/memory_utils.h
index 73e04fc..ac00b10 100644
--- a/memory_utils.h
+++ b/memory_utils.h
@@ -67,4 +67,6 @@ static inline void __auto_close__(int *fd)
 		__internal_fd__;            \
 	})
 
+#define zalloc(__size__) (calloc(1, __size__)) /* one zero-initialized block of @__size__ bytes */
+
 #endif /* __LXCFS_MEMORY_UTILS_H */
diff --git a/sysfs_fuse.c b/sysfs_fuse.c
index 32a59b7..d2b187b 100644
--- a/sysfs_fuse.c
+++ b/sysfs_fuse.c
@@ -65,7 +65,7 @@ static int sys_devices_system_cpu_online_read(char *buf, size_t size,
 		initpid = fc->pid;
 	cg = get_pid_cgroup(initpid, "cpuset");
 	if (!cg)
-		return read_file("/sys/devices/system/cpu/online", buf, size, d);
+		return read_file_fuse("/sys/devices/system/cpu/online", buf, size, d);
 	prune_init_slice(cg);
 
 	cpuset = get_cpuset(cg);
@@ -78,7 +78,7 @@ static int sys_devices_system_cpu_online_read(char *buf, size_t size,
 		max_cpus = max_cpu_count(cg);
 
 	if (max_cpus == 0)
-		return read_file("/sys/devices/system/cpu/online", buf, size, d);
+		return read_file_fuse("/sys/devices/system/cpu/online", buf, size, d);
 	if (max_cpus > 1)
 		total_len = snprintf(d->buf, d->buflen, "0-%d\n", max_cpus - 1);
 	else


More information about the lxc-devel mailing list