[lxc-devel] [lxc/master] [WIP]: cgroups: add cgroup2 device controller support

Thu Nov 28 16:34:17 UTC 2019

A non-text attachment was scrubbed...
Name: not available
Type: text/x-mailbox
Size: 364 bytes
Desc: not available
URL: <http://lists.linuxcontainers.org/pipermail/lxc-devel/attachments/20191128/c1469d78/attachment-0001.bin>
-------------- next part --------------
From ceaa8d4be7ae2a549925f9c24b86169aa64c82d8 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner at ubuntu.com>
Date: Thu, 28 Nov 2019 16:22:36 +0100
Subject: [PATCH] [WIP]: cgroups: add cgroup2 device controller support

Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
---
 src/lxc/Makefile.am               |   2 +
 src/lxc/cgroups/cgfsng.c          | 163 +++++++++++-
 src/lxc/cgroups/cgroup.h          |   2 +
 src/lxc/cgroups/cgroup2_devices.c | 411 ++++++++++++++++++++++++++++++
 src/lxc/cgroups/cgroup2_devices.h |  81 ++++++
 src/lxc/conf.c                    |   2 +
 src/lxc/conf.h                    |   1 +
 src/lxc/macro.h                   |  78 ++++++
 src/lxc/start.c                   |   6 +
 9 files changed, 737 insertions(+), 9 deletions(-)
 create mode 100644 src/lxc/cgroups/cgroup2_devices.c
 create mode 100644 src/lxc/cgroups/cgroup2_devices.h

diff --git a/src/lxc/Makefile.am b/src/lxc/Makefile.am
index 4b18ac5d82..56c64f596a 100644
--- a/src/lxc/Makefile.am
+++ b/src/lxc/Makefile.am
@@ -7,6 +7,7 @@ noinst_HEADERS = api_extensions.h \
 		 caps.h \
 		 cgroups/cgroup.h \
 		 cgroups/cgroup_utils.h \
+		 cgroups/cgroup2_devices.h \
 		 compiler.h \
 		 conf.h \
 		 confile.h \
@@ -95,6 +96,7 @@ liblxc_la_SOURCES = af_unix.c af_unix.h \
 		    caps.c caps.h \
 		    cgroups/cgfsng.c \
 		    cgroups/cgroup.c cgroups/cgroup.h \
+		    cgroups/cgroup2_devices.c cgroups/cgroup2_devices.h \
 		    cgroups/cgroup_utils.c cgroups/cgroup_utils.h \
 		    compiler.h \
 		    commands.c commands.h \
diff --git a/src/lxc/cgroups/cgfsng.c b/src/lxc/cgroups/cgfsng.c
index 1e6a45cff2..3db0602dbc 100644
--- a/src/lxc/cgroups/cgfsng.c
+++ b/src/lxc/cgroups/cgfsng.c
@@ -54,6 +54,7 @@
 
 #include "caps.h"
 #include "cgroup.h"
+#include "cgroup2_devices.h"
 #include "cgroup_utils.h"
 #include "commands.h"
 #include "conf.h"
@@ -2474,8 +2475,17 @@ static bool __cg_legacy_setup_limits(struct cgroup_ops *ops,
 	return ret;
 }
 
+struct dev_exception_item {
+	char type;
+	int major;
+	int minor;
+	char access[100];
+	int allow;
+};
+
 static bool __cg_unified_setup_limits(struct cgroup_ops *ops,
-				      struct lxc_list *cgroup_settings)
+				      struct lxc_list *cgroup_settings,
+				      struct lxc_conf *conf)
 {
 	struct lxc_list *iterator;
 	struct hierarchy *h = ops->unified;
@@ -2486,17 +2496,130 @@ static bool __cg_unified_setup_limits(struct cgroup_ops *ops,
 	if (!h)
 		return false;
 
-	lxc_list_for_each(iterator, cgroup_settings) {
+	lxc_list_for_each (iterator, cgroup_settings) {
 		__do_free char *fullpath = NULL;
 		int ret;
 		struct lxc_cgroup *cg = iterator->elem;
 
-		fullpath = must_make_path(h->container_full_path, cg->subsystem, NULL);
-		ret = lxc_write_to_file(fullpath, cg->value, strlen(cg->value), false, 0666);
-		if (ret < 0) {
-			SYSERROR("Failed to set \"%s\" to \"%s\"",
-				 cg->subsystem, cg->value);
-			return false;
+		if (strncmp("devices", cg->subsystem, 7) == 0) {
+			const char *val = cg->value;
+			struct dev_exception_item ex = {0};
+			int count, rc = 0;
+			char temp[50];
+			struct bpf_program *device;
+
+			if (conf->cgroup2_devices) {
+				device = conf->cgroup2_devices;
+			} else {
+				device = bpf_program_new(BPF_PROG_TYPE_CGROUP_DEVICE);
+				if (device)
+					device = bpf_program_init(device);
+			}
+			if (!device) {
+				ERROR("Failed to create new ebpf device program");
+				return false;
+			}
+
+			switch (*val) {
+			case 'a':
+				__fallthrough;
+			case 'b':
+				__fallthrough;
+			case 'c':
+				ex.type = *val;
+				break;
+			default:
+				return false;
+			}
+
+			val++;
+			if (!isspace(*val))
+				return false;
+			val++;
+			if (*val == '*') {
+				ex.major = ~0;
+				val++;
+			} else if (isdigit(*val)) {
+				memset(temp, 0, sizeof(temp));
+				for (count = 0; count < sizeof(temp) - 1;
+				     count++) {
+					temp[count] = *val;
+					val++;
+					if (!isdigit(*val))
+						break;
+				}
+				rc = lxc_safe_uint(temp, &ex.major);
+				if (rc)
+					return false;
+			} else {
+				return false;
+			}
+			if (*val != ':')
+				return false;
+			val++;
+
+			/* read minor */
+			if (*val == '*') {
+				ex.minor = ~0;
+				val++;
+			} else if (isdigit(*val)) {
+				memset(temp, 0, sizeof(temp));
+				for (count = 0; count < sizeof(temp) - 1;
+				     count++) {
+					temp[count] = *val;
+					val++;
+					if (!isdigit(*val))
+						break;
+				}
+				rc = lxc_safe_uint(temp, &ex.minor);
+				if (rc)
+					return false;
+			} else {
+				return false;
+			}
+			if (!isspace(*val))
+				return false;
+			for (val++, count = 0; count < 3; count++, val++) {
+				switch (*val) {
+				case 'r':
+					ex.access[count] = *val;
+					break;
+				case 'w':
+					ex.access[count] = *val;
+					break;
+				case 'm':
+					ex.access[count] = *val;
+					break;
+				case '\n':
+				case '\0':
+					count = 3;
+					break;
+				default:
+					return false;
+				}
+			}
+
+			if (strcmp("devices.allow", cg->subsystem) == 0)
+				ex.allow = 1;
+
+			device = bpf_program_append_device(device, ex.type,
+							   ex.major, ex.minor,
+							   ex.access, ex.allow);
+			if (!device) {
+				ERROR("Failed to add new rule to bpf device program");
+				return false;
+			}
+		} else {
+
+			fullpath = must_make_path(h->container_full_path,
+						  cg->subsystem, NULL);
+			ret = lxc_write_to_file(fullpath, cg->value,
+						strlen(cg->value), false, 0666);
+			if (ret < 0) {
+				SYSERROR("Failed to set \"%s\" to \"%s\"",
+					 cg->subsystem, cg->value);
+				return false;
+			}
 		}
 		TRACE("Set \"%s\" to \"%s\"", cg->subsystem, cg->value);
 	}
@@ -2505,6 +2628,27 @@ static bool __cg_unified_setup_limits(struct cgroup_ops *ops,
 	return true;
 }
 
+__cgfsng_ops bool cgfsng_devices_activate(struct cgroup_ops *ops,
+					  struct lxc_handler *handler)
+{
+	struct hierarchy *h = ops->unified;
+	struct bpf_program *device = handler->conf->cgroup2_devices;
+
+	/* Without a cgroup2 device program there is nothing to activate; report
+	 * success so that setups without a unified hierarchy still start. */
+	if (!device)
+		return true;
+	/* A device program exists but there is no unified hierarchy for it. */
+	if (!h)
+		return false;
+	/* Append the fallback verdict, then load and attach the program. */
+	if (!bpf_program_complete_finalize(device))
+		return false;
+
+	return bpf_program_cgroup_attach(device, BPF_CGROUP_DEVICE,
+					 h->container_full_path,
+					 BPF_F_ALLOW_MULTI);
+}
+
 __cgfsng_ops static bool cgfsng_setup_limits(struct cgroup_ops *ops,
 					     struct lxc_conf *conf,
 					     bool do_devices)
@@ -2512,7 +2656,7 @@ __cgfsng_ops static bool cgfsng_setup_limits(struct cgroup_ops *ops,
 	if (!__cg_legacy_setup_limits(ops, &conf->cgroup, do_devices))
 		return false;
 
-	return __cg_unified_setup_limits(ops, &conf->cgroup2);
+	return __cg_unified_setup_limits(ops, &conf->cgroup2, conf);
 }
 
 static bool cgroup_use_wants_controllers(const struct cgroup_ops *ops,
@@ -2893,6 +3037,7 @@ struct cgroup_ops *cgfsng_ops_init(struct lxc_conf *conf)
 	cgfsng_ops->chown = cgfsng_chown;
 	cgfsng_ops->mount = cgfsng_mount;
 	cgfsng_ops->nrtasks = cgfsng_nrtasks;
+	cgfsng_ops->devices_activate = cgfsng_devices_activate;
 
 	return move_ptr(cgfsng_ops);
 }
diff --git a/src/lxc/cgroups/cgroup.h b/src/lxc/cgroups/cgroup.h
index 6ab5187c25..bb6c91cce8 100644
--- a/src/lxc/cgroups/cgroup.h
+++ b/src/lxc/cgroups/cgroup.h
@@ -164,6 +164,8 @@ struct cgroup_ops {
 	bool (*mount)(struct cgroup_ops *ops, struct lxc_handler *handler,
 		      const char *root, int type);
 	int (*nrtasks)(struct cgroup_ops *ops);
+	bool (*devices_activate)(struct cgroup_ops *ops,
+				 struct lxc_handler *handler);
 };
 
 extern struct cgroup_ops *cgroup_init(struct lxc_conf *conf);
diff --git a/src/lxc/cgroups/cgroup2_devices.c b/src/lxc/cgroups/cgroup2_devices.c
new file mode 100644
index 0000000000..c3c897a011
--- /dev/null
+++ b/src/lxc/cgroups/cgroup2_devices.c
@@ -0,0 +1,411 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+#include <fcntl.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <stddef.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "cgroup2_devices.h"
+#include "macro.h"
+#include "memory_utils.h"
+
+static struct bpf_program *bpf_program_add_instructions(struct bpf_program *prog,
+							const struct bpf_insn *instructions,
+							size_t count)
+{
+
+	struct bpf_insn *new_insn;
+
+	/* Don't allow modification after we loaded things into the kernel. */
+	if (prog->kernel_fd >= 0)
+		return NULL;
+
+	new_insn =
+	    realloc(prog->instructions,
+		    sizeof(struct bpf_insn) * (count + prog->n_instructions));
+	if (!new_insn)
+		return NULL;
+
+	prog->instructions = new_insn;
+	memcpy(prog->instructions + prog->n_instructions, instructions,
+	       sizeof(struct bpf_insn) * count);
+	prog->n_instructions += count;
+
+	return prog;
+}
+
+static struct bpf_program *bpf_program_free(struct bpf_program *prog)
+{
+	/* Detach @prog from its cgroup, close its kernel fd and release all
+	 * memory owned by it; always returns NULL.
+	 *
+	 * The kernel currently doesn't implicitly detach BPF programs from
+	 * their cgroups when the last fd to the BPF program is closed. An
+	 * abnormally terminated process that attached a program to a cgroup
+	 * therefore leaves it pinned until the cgroup is removed. That is
+	 * particularly problematic for the root cgroup (or a cgroup of a
+	 * service that cannot be restarted, such as dbus), where the memory
+	 * can only be reclaimed through a reboot. To counter this, we track
+	 * which cgroup a program was attached to and detach it ourselves
+	 * whenever we close the BPF fd. */
+	(void)bpf_program_cgroup_detach(prog);
+
+	close(prog->kernel_fd); /* still negative if never loaded into the kernel */
+	free(prog->instructions);
+	free(prog->attached_path);
+	free(prog);
+
+	return NULL;
+}
+
+/* Memory load, dst_reg = *(uint *) (src_reg + off16) */
+#define BPF_LDX_MEM(SIZE, DST, SRC, OFF)                               \
+	((struct bpf_insn){.code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM, \
+			   .dst_reg = DST,                             \
+			   .src_reg = SRC,                             \
+			   .off = OFF,                                 \
+			   .imm = 0})
+
+/* ALU ops on immediates, bpf_add|sub|...: dst_reg += imm32 */
+#define BPF_ALU32_IMM(OP, DST, IMM)                              \
+	((struct bpf_insn){.code = BPF_ALU | BPF_OP(OP) | BPF_K, \
+			   .dst_reg = DST,                       \
+			   .src_reg = 0,                         \
+			   .off = 0,                             \
+			   .imm = IMM})
+
+/* 64-bit mov of an immediate, dst_reg = imm32 */
+#define BPF_MOV64_IMM(DST, IMM)                                 \
+	((struct bpf_insn){.code = BPF_ALU64 | BPF_MOV | BPF_K, \
+			   .dst_reg = DST,                      \
+			   .src_reg = 0,                        \
+			   .off = 0,                            \
+			   .imm = IMM})
+
+#define BPF_MOV32_REG(DST, SRC)                               \
+	((struct bpf_insn){.code = BPF_ALU | BPF_MOV | BPF_X, \
+			   .dst_reg = DST,                    \
+			   .src_reg = SRC,                    \
+			   .off = 0,                          \
+			   .imm = 0})
+
+/* Conditional jumps against registers, if (dst_reg 'op' src_reg) goto pc + off16 */
+#define BPF_JMP_REG(OP, DST, SRC, OFF)                           \
+	((struct bpf_insn){.code = BPF_JMP | BPF_OP(OP) | BPF_X, \
+			   .dst_reg = DST,                       \
+			   .src_reg = SRC,                       \
+			   .off = OFF,                           \
+			   .imm = 0})
+
+/* Conditional jumps against immediates, if (dst_reg 'op' imm32) goto pc + off16 */
+#define BPF_JMP_IMM(OP, DST, IMM, OFF)                           \
+	((struct bpf_insn){.code = BPF_JMP | BPF_OP(OP) | BPF_K, \
+			   .dst_reg = DST,                       \
+			   .src_reg = 0,                         \
+			   .off = OFF,                           \
+			   .imm = IMM})
+
+/* Program exit */
+#define BPF_EXIT_INSN()                                \
+	((struct bpf_insn){.code = BPF_JMP | BPF_EXIT, \
+			   .dst_reg = 0,               \
+			   .src_reg = 0,               \
+			   .off = 0,                   \
+			   .imm = 0})
+
+static int bpf_access_mask(const char *acc)
+{
+	int mask = 0;
+
+	for (; *acc; acc++)
+		switch (*acc) {
+		case 'r':
+			mask |= BPF_DEVCG_ACC_READ;
+			break;
+		case 'w':
+			mask |= BPF_DEVCG_ACC_WRITE;
+			break;
+		case 'm':
+			mask |= BPF_DEVCG_ACC_MKNOD;
+			break;
+		default:
+			return -EINVAL;
+		}
+
+	return mask;
+}
+
+static int bpf_device_type(char type)
+{
+	switch (type) {
+	case 'a':
+		return 0;
+	case 'b':
+		return BPF_DEVCG_DEV_BLOCK;
+	case 'c':
+		return BPF_DEVCG_DEV_CHAR;
+	}
+
+	return -1;
+}
+
+static inline bool bpf_device_all_access(int access_mask)
+{
+	return (access_mask == (BPF_DEVCG_ACC_READ | BPF_DEVCG_ACC_WRITE |
+				BPF_DEVCG_ACC_MKNOD));
+}
+
+struct bpf_program *bpf_program_new(uint32_t prog_type)
+{
+	__do_free struct bpf_program *prog = NULL;
+
+	prog = calloc(1, sizeof(struct bpf_program));
+	if (!prog)
+		return NULL;
+
+	prog->prog_type = prog_type;
+	prog->kernel_fd = -EBADF;
+
+	return move_ptr(prog);
+}
+
+struct bpf_program *bpf_program_init(struct bpf_program *prog)
+{
+	const struct bpf_insn pre_insn[] = {
+	    /* load device type to r2 */
+	    BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct bpf_cgroup_dev_ctx, access_type)),
+	    BPF_ALU32_IMM(BPF_AND, BPF_REG_2, 0xFFFF),
+
+	    /* load access type to r3 */
+	    BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct bpf_cgroup_dev_ctx, access_type)),
+	    BPF_ALU32_IMM(BPF_RSH, BPF_REG_3, 16),
+
+	    /* load major number to r4 */
+	    BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1, offsetof(struct bpf_cgroup_dev_ctx, major)),
+
+	    /* load minor number to r5 */
+	    BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1, offsetof(struct bpf_cgroup_dev_ctx, minor)),
+	};
+
+	return bpf_program_add_instructions(prog, pre_insn, ARRAY_SIZE(pre_insn));
+}
+
+/* Append one rule: when type, access, major and minor all match (wildcards
+ * emit no test) the program exits with @allow, otherwise control falls
+ * through to whatever follows. Returns @prog, or NULL on failure. */
+struct bpf_program *bpf_program_append_device(struct bpf_program *prog,
+					      char type, int major, int minor,
+					      const char *access, int allow)
+{
+	int jump_nr = 1; /* offset from the next test to just past the verdict */
+	struct bpf_insn bpf_access_decision[] = {
+	    BPF_MOV64_IMM(BPF_REG_0, allow),
+	    BPF_EXIT_INSN(),
+	};
+	int access_mask, device_type;
+
+	device_type = bpf_device_type(type);
+	if (device_type < 0)
+		return NULL;
+
+	access_mask = bpf_access_mask(access);
+	if (access_mask < 0)
+		return NULL;
+
+	/* Count the instructions of every test this rule is going to emit. */
+	if (device_type > 0)
+		jump_nr++;
+	if (!bpf_device_all_access(access_mask))
+		jump_nr += 3;
+	if (major >= 0)
+		jump_nr++;
+	if (minor >= 0)
+		jump_nr++;
+
+	if (device_type > 0) {
+		/* Skip this rule unless the device type in r2 matches. */
+		struct bpf_insn ins[] = {
+		    BPF_JMP_IMM(BPF_JNE, BPF_REG_2, device_type, jump_nr--),
+		};
+		if (!bpf_program_add_instructions(prog, ins, ARRAY_SIZE(ins)))
+			return NULL;
+	}
+
+	if (!bpf_device_all_access(access_mask)) {
+		/* Skip this rule if r3 requests access outside access_mask; jump_nr
+		 * still counts the two insns preceding the jump, hence the - 2. */
+		struct bpf_insn ins[] = {
+		    BPF_MOV32_REG(BPF_REG_1, BPF_REG_3),
+		    BPF_ALU32_IMM(BPF_AND, BPF_REG_1, access_mask),
+		    BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_3, jump_nr - 2),	/* compare access type */
+		};
+		jump_nr -= 3;
+		if (!bpf_program_add_instructions(prog, ins, ARRAY_SIZE(ins)))
+			return NULL;
+	}
+
+	if (major >= 0) {
+		/* Skip this rule unless the major number in r4 matches. */
+		struct bpf_insn ins[] = {
+		    BPF_JMP_IMM(BPF_JNE, BPF_REG_4, major, jump_nr--),
+		};
+		if (!bpf_program_add_instructions(prog, ins, ARRAY_SIZE(ins)))
+			return NULL;
+	}
+
+	if (minor >= 0) {
+		/* Skip this rule unless the minor number in r5 matches. */
+		struct bpf_insn ins[] = {
+		    BPF_JMP_IMM(BPF_JNE, BPF_REG_5, minor, jump_nr--),
+		};
+		if (!bpf_program_add_instructions(prog, ins, ARRAY_SIZE(ins)))
+			return NULL;
+	}
+
+	return bpf_program_add_instructions(prog, bpf_access_decision,
+					    ARRAY_SIZE(bpf_access_decision));
+}
+
+struct bpf_program *bpf_program_complete_finalize(struct bpf_program *prog)
+{
+	struct bpf_insn ins[] = {
+	    BPF_MOV64_IMM(BPF_REG_0, 0 /* This determines blacklist or whitelist. */),
+	    BPF_EXIT_INSN(),
+	};
+
+	return bpf_program_add_instructions(prog, ins, ARRAY_SIZE(ins));
+}
+
+static int bpf_program_load_kernel(struct bpf_program *prog, char *log_buf,
+				   size_t log_size)
+{
+	union bpf_attr attr;
+
+	if (prog->kernel_fd >= 0) {
+		memset(log_buf, 0, log_size);
+		return 0;
+	}
+
+	attr = (union bpf_attr){
+	    .prog_type	= prog->prog_type,
+	    .insns	= PTR_TO_UINT64(prog->instructions),
+	    .insn_cnt	= prog->n_instructions,
+	    .license	= PTR_TO_UINT64("GPL"),
+	    .log_buf	= PTR_TO_UINT64(log_buf),
+	    .log_level	= !!log_buf,
+	    .log_size	= log_size,
+	};
+
+	prog->kernel_fd = bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
+	if (prog->kernel_fd < 0)
+		return -1;
+
+	return 0;
+}
+
+/* Load @prog into the kernel if necessary and attach it to the cgroup
+ * directory @path with attach type @type and attach flags @flags.
+ * Returns @prog on success and NULL on failure. */
+struct bpf_program *bpf_program_cgroup_attach(struct bpf_program *prog, int type,
+					      const char *path, uint32_t flags)
+{
+	__do_free char *copy = NULL;
+	__do_close_prot_errno int fd = -EBADF;
+	union bpf_attr attr;
+	int r;
+
+	if (flags & ~(BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI))
+		return NULL;
+
+	/* We need to track which cgroup the program is attached to, and we can
+	 * only track one attachment, hence let's refuse this early. */
+	if (prog->attached_path) {
+		if (prog->attached_type != type)
+			return NULL;
+		if (prog->attached_flags != flags)
+			return NULL;
+
+		/* Shortcut: a program we already attached need not be attached
+		 * again, except in BPF_F_ALLOW_OVERRIDE mode where someone else
+		 * might have replaced it since, so reattach to be safe. With
+		 * flags == 0 nobody else can replace our program, and with
+		 * BPF_F_ALLOW_MULTI other programs are installed in addition
+		 * to ours, so ours stays in effect. */
+		if (flags != BPF_F_ALLOW_OVERRIDE)
+			return prog;
+	}
+
+	/* Ensure we have a kernel object for this. */
+	r = bpf_program_load_kernel(prog, NULL, 0);
+	if (r < 0)
+		return NULL;
+
+	copy = strdup(path);
+	if (!copy)
+		return NULL;
+
+	fd = open(path, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
+	if (fd < 0)
+		return NULL;
+
+	attr = (union bpf_attr){
+	    .attach_type	= type,
+	    .target_fd		= fd,
+	    .attach_bpf_fd	= prog->kernel_fd,
+	    .attach_flags	= flags,
+	};
+
+	if (bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)) < 0)
+		return NULL;
+
+	free_and_replace(prog->attached_path, copy);
+	prog->attached_type = type;
+	prog->attached_flags = flags;
+
+	return prog;
+}
+
+struct bpf_program *bpf_program_cgroup_detach(struct bpf_program *prog)
+{
+	__do_close_prot_errno int fd = -EBADF;
+
+	if (!prog->attached_path)
+		return NULL;
+
+	fd = open(prog->attached_path, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
+	if (fd < 0) {
+		if (errno != ENOENT)
+			return NULL;
+
+		/* If the cgroup does not exist anymore, then we don't have to
+		 * explicitly detach, it got detached implicitly by the removal, hence don't complain */
+
+	} else {
+		union bpf_attr attr;
+
+		attr = (union bpf_attr){
+		    .attach_type	= prog->attached_type,
+		    .target_fd		= fd,
+		    .attach_bpf_fd	= prog->kernel_fd,
+		};
+
+		if (bpf(BPF_PROG_DETACH, &attr, sizeof(attr)) < 0)
+			return NULL;
+	}
+
+	free(prog->attached_path);
+	prog->attached_path = NULL;
+
+	return prog;
+}
+
+void lxc_clear_cgroup2_devices(struct lxc_conf *conf)
+{
+	if (conf->cgroup2_devices) /* may be NULL; freeing also detaches */
+		conf->cgroup2_devices = bpf_program_free(conf->cgroup2_devices);
+}
diff --git a/src/lxc/cgroups/cgroup2_devices.h b/src/lxc/cgroups/cgroup2_devices.h
new file mode 100644
index 0000000000..e9893ce21f
--- /dev/null
+++ b/src/lxc/cgroups/cgroup2_devices.h
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+
+#ifndef __LXC_CGROUP2_DEVICES_H
+#define __LXC_CGROUP2_DEVICES_H
+
+#include <fcntl.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "conf.h"
+
+#if !HAVE_BPF
+#if !(defined __NR_bpf && __NR_bpf > 0)
+#if defined __NR_bpf
+#undef __NR_bpf
+#endif
+#if defined __i386__
+#define __NR_bpf 357
+#elif defined __x86_64__
+#define __NR_bpf 321
+#elif defined __aarch64__
+#define __NR_bpf 280
+#elif defined __arm__
+#define __NR_bpf 386
+#elif defined __sparc__
+#define __NR_bpf 349
+#elif defined __s390__
+#define __NR_bpf 351
+#elif defined __tilegx__
+#define __NR_bpf 280
+#else
+#warning "__NR_bpf not defined for your architecture"
+#endif
+#endif
+
+union bpf_attr;
+
+static inline int missing_bpf(int cmd, union bpf_attr *attr, size_t size)
+{
+#ifdef __NR_bpf
+	return (int)syscall(__NR_bpf, cmd, attr, size);
+#else
+	errno = ENOSYS;
+	return -1;
+#endif
+}
+
+#define bpf missing_bpf
+#endif
+
+struct bpf_program {
+	int kernel_fd;
+	uint32_t prog_type;
+
+	size_t n_instructions;
+	struct bpf_insn *instructions;
+
+	char *attached_path;
+	int attached_type;
+	uint32_t attached_flags;
+};
+
+struct bpf_program *bpf_program_new(uint32_t prog_type);
+struct bpf_program *bpf_program_init(struct bpf_program *prog);
+struct bpf_program *bpf_program_append_device(struct bpf_program *prog,
+					      char type, int major, int minor,
+					      const char *access, int allow);
+struct bpf_program *bpf_program_complete_finalize(struct bpf_program *prog);
+struct bpf_program *bpf_program_cgroup_attach(struct bpf_program *prog, int type,
+					      const char *path, uint32_t flags);
+struct bpf_program *bpf_program_cgroup_detach(struct bpf_program *prog);
+void lxc_clear_cgroup2_devices(struct lxc_conf *conf);
+
+#endif /* __LXC_CGROUP2_DEVICES_H */
diff --git a/src/lxc/conf.c b/src/lxc/conf.c
index 06e4adcc38..c03b663835 100644
--- a/src/lxc/conf.c
+++ b/src/lxc/conf.c
@@ -57,6 +57,7 @@
 #include "af_unix.h"
 #include "caps.h"
 #include "cgroup.h"
+#include "cgroup2_devices.h"
 #include "conf.h"
 #include "config.h"
 #include "confile.h"
@@ -4118,6 +4119,7 @@ void lxc_conf_free(struct lxc_conf *conf)
 	lxc_clear_config_keepcaps(conf);
 	lxc_clear_cgroups(conf, "lxc.cgroup", CGROUP_SUPER_MAGIC);
 	lxc_clear_cgroups(conf, "lxc.cgroup2", CGROUP2_SUPER_MAGIC);
+	lxc_clear_cgroup2_devices(conf);
 	lxc_clear_hooks(conf, "lxc.hook");
 	lxc_clear_mount_entries(conf);
 	lxc_clear_idmaps(conf);
diff --git a/src/lxc/conf.h b/src/lxc/conf.h
index 9f4a93d0b2..741ac4f096 100644
--- a/src/lxc/conf.h
+++ b/src/lxc/conf.h
@@ -241,6 +241,7 @@ struct lxc_conf {
 	struct {
 		struct lxc_list cgroup;
 		struct lxc_list cgroup2;
+		struct bpf_program *cgroup2_devices;
 	};
 
 	struct {
diff --git a/src/lxc/macro.h b/src/lxc/macro.h
index f96a90019e..6f3379b3c4 100644
--- a/src/lxc/macro.h
+++ b/src/lxc/macro.h
@@ -429,6 +429,8 @@ enum {
 #define PTR_TO_INTMAX(p) ((intmax_t)((intptr_t)(p)))
 #define INTMAX_TO_PTR(u) ((void *)((intptr_t)(u)))
 
+#define PTR_TO_UINT64(p) ((uint64_t)((uintptr_t)(p))) /* uintptr_t: no sign extension on 32-bit */
+
 #define LXC_INVALID_UID ((uid_t)-1)
 #define LXC_INVALID_GID ((gid_t)-1)
 
@@ -465,4 +467,80 @@ enum {
 #define LXC_TIMESTAMP_FNAME   "ts"
 #define LXC_COMMENT_FNAME     "comment"
 
+/* Taken from systemd. */
+#define free_and_replace(a, b) \
+	({                     \
+		free(a);       \
+		(a) = (b);     \
+		(b) = NULL;    \
+		0;             \
+	})
+
+#define XCONCATENATE(x, y) x##y
+#define CONCATENATE(x, y) XCONCATENATE(x, y)
+#define UNIQ_T(x, uniq) CONCATENATE(__unique_prefix_, CONCATENATE(x, uniq))
+#define UNIQ __COUNTER__
+#undef MIN
+#define MIN(a, b) __MIN(UNIQ, (a), UNIQ, (b))
+#define __MIN(aq, a, bq, b)                                                    \
+	({                                                                     \
+		const typeof(a) UNIQ_T(A, aq) = (a);                           \
+		const typeof(b) UNIQ_T(B, bq) = (b);                           \
+		UNIQ_T(A, aq) < UNIQ_T(B, bq) ? UNIQ_T(A, aq) : UNIQ_T(B, bq); \
+	})
+
+/* Taken from the kernel. */
+
+/*
+ * min()/max()/clamp() macros must accomplish three things:
+ *
+ * - avoid multiple evaluations of the arguments (so side-effects like
+ *   "x++" happen only once) when non-constant.
+ * - perform strict type-checking (to generate warnings instead of
+ *   nasty runtime surprises). See the "unnecessary" pointer comparison
+ *   in __typecheck().
+ * - retain result as a constant expression when called with only
+ *   constant expressions (to avoid tripping VLA warnings in stack
+ *   allocation usage).
+ */
+#define __typecheck(x, y) (!!(sizeof((typeof(x) *)1 == (typeof(y) *)1)))
+
+/*
+ * This returns a constant expression while determining if an argument is
+ * a constant expression, most importantly without evaluating the argument.
+ * Glory to Martin Uecker <Martin.Uecker at med.uni-goettingen.de>
+ */
+#define __is_constexpr(x) \
+	(sizeof(int) == sizeof(*(8 ? ((void *)((long)(x)*0l)) : (int *)8)))
+
+#define __no_side_effects(x, y) (__is_constexpr(x) && __is_constexpr(y))
+
+#define __safe_cmp(x, y) (__typecheck(x, y) && __no_side_effects(x, y))
+
+#define __cmp(x, y, op) ((x)op(y) ? (x) : (y))
+
+#define __cmp_once(x, y, unique_x, unique_y, op) \
+	({                                       \
+		typeof(x) unique_x = (x);        \
+		typeof(y) unique_y = (y);        \
+		__cmp(unique_x, unique_y, op);   \
+	})
+
+#define __careful_cmp(x, y, op)                                  \
+	__builtin_choose_expr(__safe_cmp(x, y), __cmp(x, y, op), \
+			      __cmp_once(x, y, __UNIQUE_ID(__x), \
+					 __UNIQUE_ID(__y), op))
+
+/**
+ * min - return minimum of two values of the same or compatible types
+ * @x: first value
+ * @y: second value
+ */
+#define min(x, y) __careful_cmp(x, y, <)
+
+#define ARRAY_SIZE(x)                                                        \
+	(__builtin_choose_expr(!__builtin_types_compatible_p(typeof(x),      \
+							     typeof(&*(x))), \
+			       sizeof(x) / sizeof((x)[0]), ((void)0)))
+
 #endif /* __LXC_MACRO_H */
diff --git a/src/lxc/start.c b/src/lxc/start.c
index 3cfc8b2f57..ec1557fdec 100644
--- a/src/lxc/start.c
+++ b/src/lxc/start.c
@@ -1912,6 +1912,12 @@ static int lxc_spawn(struct lxc_handler *handler)
 	}
 	TRACE("Set up legacy device cgroup controller limits");
 
+	if (!cgroup_ops->devices_activate(cgroup_ops, handler)) {
+		ERROR("Failed to setup cgroup2 device controller limits");
+		goto out_delete_net;
+	}
+	TRACE("Set up cgroup2 device controller limits");
+
 	if (handler->ns_clone_flags & CLONE_NEWCGROUP) {
 		/* Now we're ready to preserve the cgroup namespace */
 		ret = lxc_try_preserve_ns(handler->pid, "cgroup");