[lxc-devel] [PATCH v3 2/6] lxc-attach: Remodel cgroup attach logic and attach to namespaces again in parent process

Thu May 24 13:55:55 UTC 2012

With the introduction of lxc-attach's functionality to attach to cgroups,
the setns() calls were put in the child process after the fork() and not the
parent process before the fork() so the parent process remained outside the
namespaces and could add the child to the correct cgroup.

Unfortunately, the pid namespace really affects only children of the current
process and not the process itself, which has several drawbacks: The
attached program does not have a pid inside the container and the context
that is used when remounting /proc from that process is wrong. Thus, the
previous logic of first setting the namespaces and then forking so the child
process (which then exec()s to the desired program) is a real member of the
container.

However, inside the container, there is no guarantee that the cgroup
filesystem is still be mounted and that we are allowed to write to it (which
is why the setns() was moved in the first place).

To work around both problems, we separate the cgroup attach functionality
into two parts: Preparing the attach process, which just opens the tasks
files of all cgroups and keeps the file descriptors open and the writing to
those fds part. This allows us to open all the tasks files in lxc_attach,
then call setns(), then fork, in the child process close them completely and
in the parent process just write the pid of the child process to all those
fds.

Signed-off-by: Christian Seiler <christian at iwakd.de>
Cc: Daniel Lezcano <daniel.lezcano at free.fr>
Cc: Serge Hallyn <serge.hallyn at canonical.com>
---
 src/lxc/cgroup.c     |  152 ++++++++++++++++++++++++++++++++++++++++++++-----
 src/lxc/cgroup.h     |    3 +
 src/lxc/lxc_attach.c |   62 +++++++++++++++-----
 3 files changed, 186 insertions(+), 31 deletions(-)

diff --git a/src/lxc/cgroup.c b/src/lxc/cgroup.c
index e124499..f1461f4 100644
--- a/src/lxc/cgroup.c
+++ b/src/lxc/cgroup.c
@@ -254,13 +254,37 @@ static int cgroup_enable_clone_children(const char *path)
 	return ret;
 }
 
-static int lxc_one_cgroup_attach(const char *name,
-				 struct mntent *mntent, pid_t pid)
+static int lxc_one_cgroup_finish_attach(int fd, pid_t pid)
 {
-	FILE *f;
+	char buf[32];
+	int ret;
+
+	snprintf(buf, 32, "%ld", (long)pid);
+
+	ret = write(fd, buf, strlen(buf));
+	if (ret <= 0) {
+		SYSERROR("failed to write pid '%ld' to fd '%d'", (long)pid, fd);
+		ret = -1;
+	} else {
+		ret = 0;
+	}
+
+	close(fd);
+	return ret;
+}
+
+static int lxc_one_cgroup_dispose_attach(int fd)
+{
+	close(fd);
+	return 0;
+}
+
+static int lxc_one_cgroup_prepare_attach(const char *name, struct mntent *mntent)
+{
+	int fd;
 	char tasks[MAXPATHLEN], initcgroup[MAXPATHLEN];
 	char *cgmnt = mntent->mnt_dir;
-	int flags, ret = 0;
+	int flags;
 
 	flags = get_cgroup_flags(mntent);
 
@@ -269,31 +293,83 @@ static int lxc_one_cgroup_attach(const char *name,
 	         (flags & CGROUP_NS_CGROUP) ? "" : "/lxc",
 	         name);
 
-	f = fopen(tasks, "w");
-	if (!f) {
+	fd = open(tasks, O_WRONLY);
+	if (fd < 0) {
 		SYSERROR("failed to open '%s'", tasks);
 		return -1;
 	}
 
-	if (fprintf(f, "%d", pid) <= 0) {
-		SYSERROR("failed to write pid '%d' to '%s'", pid, tasks);
-		ret = -1;
+	return fd;
+}
+
+static int lxc_one_cgroup_attach(const char *name, struct mntent *mntent, pid_t pid)
+{
+	int fd;
+
+	fd = lxc_one_cgroup_prepare_attach(name, mntent);
+	if (fd < 0) {
+		return -1;
 	}
 
-	fclose(f);
+	return lxc_one_cgroup_finish_attach(fd, pid);
+}
+
+int lxc_cgroup_dispose_attach(void *data)
+{
+	int *fds = data;
+	int ret, err;
+
+	if (!fds) {
+		return 0;
+	}
+
+	ret = 0;
+
+	for (; *fds >= 0; fds++) {
+		err = lxc_one_cgroup_dispose_attach(*fds);
+		if (err) {
+			ret = err;
+		}
+	}
+
+	free(data);
 
 	return ret;
 }
 
-/*
- * for each mounted cgroup, attach a pid to the cgroup for the container
- */
-int lxc_cgroup_attach(const char *name, pid_t pid)
+int lxc_cgroup_finish_attach(void *data, pid_t pid)
+{
+	int *fds = data;
+	int err;
+
+	if (!fds) {
+		return 0;
+	}
+
+	for (; *fds >= 0; fds++) {
+		err = lxc_one_cgroup_finish_attach(*fds, pid);
+		if (err) {
+			/* get rid of the rest of them */
+			lxc_cgroup_dispose_attach(data);
+			return -1;
+		}
+		*fds = -1;
+	}
+
+	free(data);
+
+	return 0;
+}
+
+int lxc_cgroup_prepare_attach(const char *name, void **data)
 {
 	struct mntent *mntent;
 	FILE *file = NULL;
 	int err = -1;
 	int found = 0;
+	int *fds;
+	int i;
+	static const int MAXFDS = 256;
 
 	file = setmntent(MTAB, "r");
 	if (!file) {
@@ -301,7 +377,29 @@ int lxc_cgroup_attach(const char *name, pid_t pid)
 		return -1;
 	}
 
+	/* create a large enough buffer for all practical
+	 * use cases
+	 */
+	fds = malloc(sizeof(int) * MAXFDS);
+	if (!fds) {
+		err = -1;
+		goto out;
+	}
+	for (i = 0; i < MAXFDS; i++) {
+		fds[i] = -1;
+	}
+
+	err = 0;
+	i = 0;
 	while ((mntent = getmntent(file))) {
+		if (i >= MAXFDS - 1) {
+			ERROR("too many cgroups to attach to, aborting");
+			lxc_cgroup_dispose_attach(fds);
+			errno = ENOMEM;
+			err = -1;
+			goto out;
+		}
+
 		DEBUG("checking '%s' (%s)", mntent->mnt_dir, mntent->mnt_type);
 
 		if (strcmp(mntent->mnt_type, "cgroup"))
@@ -312,20 +410,42 @@ int lxc_cgroup_attach(const char *name, pid_t pid)
 		INFO("[%d] found cgroup mounted at '%s',opts='%s'",
 		     ++found, mntent->mnt_dir, mntent->mnt_opts);
 
-		err = lxc_one_cgroup_attach(name, mntent, pid);
-		if (err)
+		fds[i] = lxc_one_cgroup_prepare_attach(name, mntent);
+		if (fds[i] < 0) {
+			err = fds[i];
+			lxc_cgroup_dispose_attach(fds);
 			goto out;
+		}
+		i++;
 	};
 
 	if (!found)
 		ERROR("No cgroup mounted on the system");
 
+	*data = fds;
+
 out:
 	endmntent(file);
 	return err;
 }
 
 /*
+ * for each mounted cgroup, attach a pid to the cgroup for the container
+ */
+int lxc_cgroup_attach(const char *name, pid_t pid)
+{
+	void *data = NULL;
+	int ret;
+
+	ret = lxc_cgroup_prepare_attach(name, &data);
+	if (ret < 0) {
+		return ret;
+	}
+
+	return lxc_cgroup_finish_attach(data, pid);
+}
+
+/*
  * rename cgname, which is under cgparent, to a new name starting
  * with 'cgparent/dead'.  That way cgname can be reused.  Return
  * 0 on success, -1 on failure.
diff --git a/src/lxc/cgroup.h b/src/lxc/cgroup.h
index 3c90696..8167f39 100644
--- a/src/lxc/cgroup.h
+++ b/src/lxc/cgroup.h
@@ -31,5 +31,8 @@ extern int lxc_cgroup_destroy(const char *name);
 extern int lxc_cgroup_path_get(char **path, const char *subsystem, const char *name);
 extern int lxc_cgroup_nrtasks(const char *name);
 extern int lxc_cgroup_attach(const char *name, pid_t pid);
+extern int lxc_cgroup_prepare_attach(const char *name, void **data);
+extern int lxc_cgroup_finish_attach(void *data, pid_t pid);
+extern int lxc_cgroup_dispose_attach(void *data);
 extern int lxc_ns_is_mounted(void);
 #endif
diff --git a/src/lxc/lxc_attach.c b/src/lxc/lxc_attach.c
index 955e9f4..e4f604b 100644
--- a/src/lxc/lxc_attach.c
+++ b/src/lxc/lxc_attach.c
@@ -96,6 +96,7 @@ int main(int argc, char *argv[])
 	struct passwd *passwd;
 	struct lxc_proc_context_info *init_ctx;
 	struct lxc_handler *handler;
+	void *cgroup_data = NULL;
 	uid_t uid;
 	char *curdir;
 
@@ -124,6 +125,35 @@ int main(int argc, char *argv[])
 		return -1;
 	}
 
+	if (!elevated_privileges) {
+	        /* we have to do this now since /sys/fs/cgroup may not
+	         * be available inside the container or we may not have
+	         * the required permissions anymore
+	         */
+		ret = lxc_cgroup_prepare_attach(my_args.name, &cgroup_data);
+		if (ret < 0) {
+			ERROR("failed to prepare attaching to cgroup");
+			return -1;
+		}
+	}
+
+	curdir = get_current_dir_name();
+
+	/* we need to attach before we fork since certain namespaces
+	 * (such as pid namespaces) only really affect children of the
+	 * current process and not the process itself
+	 */
+	ret = lxc_attach_to_ns(init_pid);
+	if (ret < 0) {
+		ERROR("failed to enter the namespace");
+		return -1;
+	}
+
+	if (curdir && chdir(curdir))
+		WARN("could not change directory to '%s'", curdir);
+
+	free(curdir);
+
 	/* hack: we need sync.h infrastructure - and that needs a handler */
 	handler = calloc(1, sizeof(*handler));
 
@@ -150,8 +180,22 @@ int main(int argc, char *argv[])
 		if (lxc_sync_wait_child(handler, LXC_SYNC_CONFIGURE))
 			return -1;
 
-		if (!elevated_privileges && lxc_cgroup_attach(my_args.name, pid))
-			return -1;
+		/* now that we are done with all privileged operations,
+		 * we can add ourselves to the cgroup. Since we smuggled in
+		 * the fds earlier, we still have write permission
+		 */
+		if (!elevated_privileges) {
+			/* since setns() for pid namespaces only really
+			 * affects child processes, the pid we have is
+			 * still valid outside the container, so this is
+			 * fine
+			 */
+			ret = lxc_cgroup_finish_attach(cgroup_data, pid);
+			if (ret < 0) {
+				ERROR("failed to attach process to cgroup");
+				return -1;
+			}
+		}
 
 		/* tell the child we are done initializing */
 		if (lxc_sync_wake_child(handler, LXC_SYNC_POST_CONFIGURE))
@@ -175,19 +219,7 @@ int main(int argc, char *argv[])
 
 	if (!pid) {
 		lxc_sync_fini_parent(handler);
-
-		curdir = get_current_dir_name();
-
-		ret = lxc_attach_to_ns(init_pid);
-		if (ret < 0) {
-			ERROR("failed to enter the namespace");
-			return -1;
-		}
-
-		if (curdir && chdir(curdir))
-			WARN("could not change directory to '%s'", curdir);
-
-		free(curdir);
+		lxc_cgroup_dispose_attach(cgroup_data);
 
 		if (new_personality < 0)
 			new_personality = init_ctx->personality;
-- 
1.7.2.5