[lxc-devel] [PATCH 1/2] lxc_attach: fix break with user namespaces (v3)

Christian Seiler christian at iwakd.de
Sun Mar 3 12:55:09 UTC 2013


When you clone a new user_ns, the child cannot write to the fds
opened by the parent.  Hnadle this by doing an extra fork.  The
grandparent hangs around and waits for its child to tell it the
pid of of the grandchild, which will be the one attached to the
container.  The grandparent then moves the grandchild into the
right cgroup, then waits for the child who in turn is waiting on
the grandchild to complete.

Secondly, when attaching to a new user namespace, your old uid is
not valid, so you are uid -1.  This patch simply does setid+setuid
to 0 if that is the case.  We probably want to be smarter, but
for now this allows lxc-attach to work.

Signed-off-by: Christian Seiler <christian at iwakd.de>
---
 src/lxc/lxc_attach.c |  178 ++++++++++++++++++++++++++++++++++++++++++--------
 1 files changed, 150 insertions(+), 28 deletions(-)

diff --git a/src/lxc/lxc_attach.c b/src/lxc/lxc_attach.c
index e1511ef..1f60266 100644
--- a/src/lxc/lxc_attach.c
+++ b/src/lxc/lxc_attach.c
@@ -28,6 +28,7 @@
 #include <stdlib.h>
 #include <sys/param.h>
 #include <sys/types.h>
+#include <sys/socket.h>
 #include <sys/wait.h>
 
 #include "attach.h"
@@ -128,9 +129,9 @@ int main(int argc, char *argv[])
 	struct passwd *passwd;
 	struct lxc_proc_context_info *init_ctx;
 	struct lxc_handler *handler;
-	void *cgroup_data = NULL;
 	uid_t uid;
 	char *curdir;
+	int cgroup_ipc_sockets[2];
 
 	ret = lxc_caps_init();
 	if (ret)
@@ -157,18 +158,6 @@ int main(int argc, char *argv[])
 		return -1;
 	}
 
-	if (!elevated_privileges) {
-	        /* we have to do this now since /sys/fs/cgroup may not
-	         * be available inside the container or we may not have
-	         * the required permissions anymore
-	         */
-		ret = lxc_cgroup_prepare_attach(my_args.name, &cgroup_data);
-		if (ret < 0) {
-			ERROR("failed to prepare attaching to cgroup");
-			return -1;
-		}
-	}
-
 	curdir = getcwd(NULL, 0);
 
 	/* determine which namespaces the container was created with
@@ -184,6 +173,106 @@ int main(int argc, char *argv[])
 		}
 	}
 
+	/* For the cgroup attaching logic to work in conjunction with pid and user namespaces,
+	 * we need to have the following hierarchy:
+	 *
+	 *     lxc-attach [process executed externally]
+	 *         | socketpair(cgroup_ipc_sockets)
+	 *         | fork()           -> child
+	 *         |                       | setns()
+	 *         |                       | fork()    -> grandchild
+	 *         |                       |                   | initialize
+	 *         |                       |                   | signal parent
+	 *         |                       |<------------------|----+
+	 *         |                       | signal parent     |
+	 *         |<----------------------|-----+             |
+	 *         | add to cgroups        |                   |
+	 *         | signal child -------->|                   |
+	 *         |                       | signal child ---->|
+	 *         | waitpid()             | waitpid()         | exec()
+	 *         |                       |<------------------| exit()
+	 *         |<----------------------| exit()
+	 *         | exit()
+	 *
+	 * The rationale is the following: The first parent is needed because after
+	 * setns() (mount + user namespace) we can't access the cgroup filesystem
+	 * to add the pid to the corresponding cgroup. Therefore, we need to do that
+	 * in a process executed on the host, so that's why we need to fork and wait
+	 * for it to have done some initialization (cgroups may restrict certain
+	 * operations so we have to do that in the end) and use IPC for signaling.
+	 *
+	 * Then in the child process we do the setns(). However, a process is never
+	 * really attached to a pid namespace (never changes its pid, doesn't appear
+	 * in the pid namespace /proc), only child processes of that process are
+	 * truely inside the new pid namespace. That's why we need to fork() again
+	 * after setns() before performing final initializations, then signal our
+	 * parent, which signals the primary process, which does cgroup adding,
+	 * which then signals to the grandchild that it can exec().
+	 */
+	ret = socketpair(PF_LOCAL, SOCK_STREAM, 0, cgroup_ipc_sockets);
+	if (ret < 0) {
+		SYSERROR("could not set up required IPC mechanism for attaching");
+		return -1;
+	}
+
+	pid = fork();
+	if (pid < 0) {
+		SYSERROR("failed to create first subprocess");
+		return -1;
+	}
+
+	if (pid) {
+		int status;
+		pid_t grandchild;
+
+		close(cgroup_ipc_sockets[1]);
+
+	gparent_reread:
+		ret = read(cgroup_ipc_sockets[0], &grandchild, sizeof(grandchild));
+		if (ret <= 0) {
+			if (ret < 0 && (errno == EAGAIN || errno == EINTR))
+				goto gparent_reread;
+			ERROR("failed to get pid of attached process to add to cgroup");
+			return -1;
+		}
+
+		if (!elevated_privileges) {
+			ret = lxc_cgroup_attach(my_args.name, grandchild);
+			if (ret < 0) {
+				ERROR("failed to attach process to cgroup");
+				return -1;
+			}
+		}
+
+		status = 0;
+		ret = write(cgroup_ipc_sockets[0], &status, sizeof(status));
+		if (ret <= 0) {
+			ERROR("failed to signal child that cgroup logic has finished");
+			return -1;
+		}
+
+		close(cgroup_ipc_sockets[0]);
+
+	gparent_again:
+		ret = waitpid(pid, &status, 0);
+		if (ret < 0) {
+			if (errno == EINTR)
+				goto gparent_again;
+			SYSERROR("failed to wait for process '%d'", pid);
+			return -1;
+		}
+
+		if (WIFEXITED(status))
+			return WEXITSTATUS(status);
+
+		return -1;
+	}
+
+	/* at this point we are in the 'parent' process so we need to close the
+	 * socket reserved for the 'grandparent' process
+	 */
+	close(cgroup_ipc_sockets[0]);
+
 	/* we need to attach before we fork since certain namespaces
 	 * (such as pid namespaces) only really affect children of the
 	 * current process and not the process itself
@@ -199,7 +288,10 @@ int main(int argc, char *argv[])
 
 	free(curdir);
 
-	/* hack: we need sync.h infrastructure - and that needs a handler */
+	/* hack: we need sync.h infrastructure - and that needs a handler
+	 * FIXME: perhaps we should also just use a very simple socketpair()
+	 * here? - like with the grandparent <-> parent communication?
+	 */
 	handler = calloc(1, sizeof(*handler));
 
 	if (lxc_sync_init(handler)) {
@@ -225,23 +317,40 @@ int main(int argc, char *argv[])
 		if (lxc_sync_wait_child(handler, LXC_SYNC_CONFIGURE))
 			return -1;
 
-		/* now that we are done with all privileged operations,
-		 * we can add ourselves to the cgroup. Since we smuggled in
-		 * the fds earlier, we still have write permission
+		/* ask grandparent to add child to cgroups, the grandparent will
+		 * itself check whether that's actually necessary
 		 */
-		if (!elevated_privileges) {
-			/* since setns() for pid namespaces only really
-			 * affects child processes, the pid we have is
-			 * still valid outside the container, so this is
-			 * fine
+		ret = write(cgroup_ipc_sockets[1], &pid, sizeof(pid));
+		if (ret != sizeof(pid)) {
+			ERROR("error using IPC to notify main process of pid to add to the cgroups of the container");
+			return -1;
+		}
+
+	parent_reread:
+		/* we need some mechanism to check whether the grandparent could
+		 * add us to the cgroups or not - so we await a dummy integer
+		 * on the same socket (that's why we don't use a pipe - we need
+		 * two-way communication). So if the parent fails and exits, that
+		 * will close the socket, which will cause a read of 0 bytes for
+		 * us, so we just terminate. If we read at least a byte, we don't
+		 * care about the contents...
+		 */
+		ret = read(cgroup_ipc_sockets[1], &status, sizeof(status));
+		if (ret <= 0) {
+			if (ret < 0 && (errno == EAGAIN || errno == EINTR))
+				goto parent_reread;
+			/* only print someting if we can't assume the parent already
+			 * gave an error message, that will reduce confusion for the
+			 * user
 			 */
-			ret = lxc_cgroup_finish_attach(cgroup_data, pid);
-			if (ret < 0) {
-				ERROR("failed to attach process to cgroup");
-				return -1;
-			}
+			if (ret != 0)
+				ERROR("failed to get notification that the child process was added to the container's cgroups");
+			return -1;
 		}
 
+		/* we don't need that IPC interface anymore */
+		close(cgroup_ipc_sockets[1]);
+
 		/* tell the child we are done initializing */
 		if (lxc_sync_wake_child(handler, LXC_SYNC_POST_CONFIGURE))
 			return -1;
@@ -264,7 +373,7 @@ int main(int argc, char *argv[])
 
 	if (!pid) {
 		lxc_sync_fini_parent(handler);
-		lxc_cgroup_dispose_attach(cgroup_data);
+		close(cgroup_ipc_sockets[1]);
 
 		if (attach_apparmor(init_ctx->aa_profile) < 0) {
 			ERROR("failed switching apparmor profiles");
@@ -307,6 +416,19 @@ int main(int argc, char *argv[])
 
 		lxc_sync_fini(handler);
 
+		if (namespace_flags & CLONE_NEWUSER) {
+			/* XXX FIXME this should get the uid of the container init and setuid to that */
+			/* XXX FIXME or perhaps try to map in the lxc-attach caller's uid? */
+			if (setgid(0)) {
+				SYSERROR("switching to container gid");
+				return -1;
+			}
+			if (setuid(0)) {
+				SYSERROR("switching to container uid");
+				return -1;
+			}
+		}
+
 		if (my_args.argc) {
 			execvp(my_args.argv[0], my_args.argv);
 			SYSERROR("failed to exec '%s'", my_args.argv[0]);
-- 
1.7.8.6





More information about the lxc-devel mailing list