[lxc-devel] [RFC PATCH 1/1] Enable running on top of unified hierarchy (with cgmanager)

Serge Hallyn serge.hallyn at ubuntu.com
Tue Jun 16 14:34:24 UTC 2015


1. Log, but don't fail, when we cannot chmod the tasks file, as it
does not exist in the unified hierarchy.

2. devcg: work around inability to switch to whitelist

The kernel does not allow switching a devices cgroup from blacklist to
whitelist once it has sub-cgroups.  With the unified hierarchy, tasks
must live in leaf nodes, so we must create a sub-cgroup to move the
container's init into.  This causes a chicken-and-egg problem: we
cannot move the container task into the cgroup any later than we
currently do, because an unprivileged user would not have the
privilege to do so, and we cannot blindly apply the container's
whitelist earlier, because then privileged containers would not be
able to mount devices during setup.

So, when the config asks for a whitelist, switch to it early on, but
also add catch-all whitelist entries covering all block and character
devices.  Then, right before applying the container's own device
whitelist entries, remove those catch-all entries and skip the
'devices.deny = a' rule from the container config, since the switch
has already happened.  An illustrative config is shown below.
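
As an illustration (the exact allow lines are made up), the kind of
config stanza that triggers this path looks like:

    # switch the devices cgroup to a whitelist ...
    lxc.cgroup.devices.deny = a
    # ... then allow a few nodes, e.g. /dev/null and /dev/zero
    lxc.cgroup.devices.allow = c 1:3 rwm
    lxc.cgroup.devices.allow = c 1:5 rwm

The 'devices.deny = a' line is what check_for_devices_whitelist()
keys on; it is now applied early by setup_full_whitelist() and
skipped during the normal limits pass.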

Signed-off-by: Serge Hallyn <serge.hallyn at ubuntu.com>
---
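
(Not part of the patch - a sketch for reviewers of the effective
sequence on the container's devices cgroup, written as direct
cgroupfs writes.  It assumes a legacy devices controller mounted at
/sys/fs/cgroup/devices and a hypothetical cgroup path "lxc/c1"; the
patch performs the equivalent writes through
cgmanager_set_value_sync().)

#include <stdio.h>

/* Write "value" into one control file of the hypothetical cgroup. */
static int write_cg(const char *file, const char *value)
{
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/fs/cgroup/devices/lxc/c1/%s", file);
	f = fopen(path, "w");
	if (!f) {
		perror(path);
		return -1;
	}
	if (fputs(value, f) == EOF) {
		perror(path);
		fclose(f);
		return -1;
	}
	return fclose(f);
}

int main(void)
{
	/* Phase 1 (clear == false): switch to a whitelist while there
	 * are no sub-cgroups yet, but allow everything for now. */
	write_cg("devices.deny",  "a");
	write_cg("devices.allow", "b *:* rwm");
	write_cg("devices.allow", "c *:* rwm");

	/* ... container setup runs here with full device access ... */

	/* Phase 2 (clear == true): drop the catch-all entries right
	 * before the container's own devices.allow lines are applied;
	 * the "devices.deny = a" config entry is skipped because the
	 * switch to a whitelist already happened in phase 1. */
	write_cg("devices.deny", "b *:* rwm");
	write_cg("devices.deny", "c *:* rwm");

	return 0;
}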
 src/lxc/cgmanager.c | 85 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 82 insertions(+), 3 deletions(-)

diff --git a/src/lxc/cgmanager.c b/src/lxc/cgmanager.c
index c143bea..bc70d33 100644
--- a/src/lxc/cgmanager.c
+++ b/src/lxc/cgmanager.c
@@ -503,7 +503,7 @@ static bool chown_cgroup(const char *cgroup_path, struct lxc_conf *conf)
 		if (!lxc_cgmanager_chmod(slist[i], cgroup_path, "", 0775))
 			return false;
 		if (!lxc_cgmanager_chmod(slist[i], cgroup_path, "tasks", 0775))
-			return false;
+			WARN("Failed to chmod tasks file - might be the unified hierarchy");
 		if (!lxc_cgmanager_chmod(slist[i], cgroup_path, "cgroup.procs", 0775))
 			return false;
 	}
@@ -1345,6 +1345,76 @@ static bool cgm_unfreeze(void *hdata)
 	return ret;
 }
 
+static bool check_for_devices_whitelist(struct lxc_list *settings)
+{
+	struct lxc_list *iterator;
+
+	lxc_list_for_each(iterator, settings) {
+		struct lxc_cgroup *cg = iterator->elem;
+		if (strcmp("devices.deny", cg->subsystem) != 0)
+			continue;
+		if (strcmp(cg->value, "a") == 0)
+			return true;
+	}
+	return false;
+}
+
+static bool add_devices_rule(const char *cg, const char *f, const char *v)
+{
+	if (cgmanager_set_value_sync(NULL, cgroup_manager, "devices",
+				 cg, f, v) != 0) {
+		NihError *nerr;
+		nerr = nih_error_get();
+		ERROR("call to cgmanager_set_value_sync failed: %s", nerr->message);
+		nih_free(nerr);
+		ERROR("Error setting cgroup devices file %s to %s for %s",
+			f, v, cg);
+		return false;
+	}
+INFO("Set cg devices file %s to %s for %s", f, v, cg);
+	return true;
+}
+
+/*
+ * The kernel does not allow switching a devices cgroup from blacklist to
+ * whitelist (i.e. echo a > devices.deny) once it has a child cgroup.  Since
+ * the unified hierarchy only allows placing tasks in leaf nodes, we cannot
+ * place the container's future init task into the cgroup and then switch to
+ * a whitelist.  We also cannot postpone moving the init task into the cgroup,
+ * because an unprivileged user would not be allowed to do the move later
+ * (i.e. uid 1000 would not be allowed to move a task owned by uid 100000;
+ * cgmanager would refuse it).  Finally, we want to wait as long as possible
+ * before restricting device access, so that the container setup can still
+ * use block devices etc.
+ *
+ * So, whenever the config asks for a device whitelist, we first do effectively:
+ *	echo a > devices.deny
+ *	echo "b *:* rwm" > devices.allow
+ *	echo "c *:* rwm" > devices.allow
+ * This lets the container setup use any device it might need.  Right before
+ * applying the container's own whitelist entries, we remove these catch-all
+ * entries again and skip the 'devices.deny = a' entry from the config.
+ */
+static bool setup_full_whitelist(struct lxc_list *settings, const char *cg,
+		bool clear)
+{
+	if (!check_for_devices_whitelist(settings))
+		return true;
+
+	if (clear) {
+		if (!add_devices_rule(cg, "devices.deny", "b *:* rwm") ||
+				!add_devices_rule(cg, "devices.deny", "c *:* rwm"))
+			return false;
+	} else {
+		if (!add_devices_rule(cg, "devices.deny", "a") ||
+				!add_devices_rule(cg, "devices.allow", "b *:* rwm") ||
+				!add_devices_rule(cg, "devices.allow", "c *:* rwm"))
+			return false;
+	}
+
+	return true;
+}
+
 static bool cgm_setup_limits(void *hdata, struct lxc_list *cgroup_settings, bool do_devices)
 {
 	struct cgm_data *d = hdata;
@@ -1368,9 +1438,18 @@ static bool cgm_setup_limits(void *hdata, struct lxc_list *cgroup_settings, bool
 		return false;
 	}
 
+	if (!setup_full_whitelist(cgroup_settings, d->cgroup_path, do_devices))
+		goto out;
+
 	lxc_list_for_each(iterator, sorted_cgroup_settings) {
 		char controller[100], *p;
 		cg = iterator->elem;
+
+		/* The switch to a whitelist was already done by setup_full_whitelist() */
+		if (strcmp(cg->subsystem, "devices.deny") == 0 &&
+				strcmp(cg->value, "a") == 0)
+			continue;
+
 		if (do_devices != !strncmp("devices", cg->subsystem, 7))
 			continue;
 		if (strlen(cg->subsystem) > 100) // i smell a rat
@@ -1385,8 +1464,8 @@ static bool cgm_setup_limits(void *hdata, struct lxc_list *cgroup_settings, bool
 			nerr = nih_error_get();
 			ERROR("call to cgmanager_set_value_sync failed: %s", nerr->message);
 			nih_free(nerr);
-			ERROR("Error setting cgroup %s:%s limit type %s", controller,
-				d->cgroup_path, cg->subsystem);
+			ERROR("Error setting cgroup %s:%s limit type %s value %s", controller,
+				d->cgroup_path, cg->subsystem, cg->value);
 			goto out;
 		}
 
-- 
2.1.4


