[lxc-devel] [PATCH 1/2] lxc_attach: fix break with user namespaces (v3)

Serge Hallyn serge.hallyn at ubuntu.com
Mon Mar 4 16:06:12 UTC 2013


Quoting Christian Seiler (christian at iwakd.de):
> When you clone a new user_ns, the child cannot write to the fds
> opened by the parent.  Hnadle this by doing an extra fork.  The
> grandparent hangs around and waits for its child to tell it the
> pid of of the grandchild, which will be the one attached to the
> container.  The grandparent then moves the grandchild into the
> right cgroup, then waits for the child who in turn is waiting on
> the grandchild to complete.
> 
> Secondly, when attaching to a new user namespace, your old uid is
> not valid, so you are uid -1.  This patch simply does setid+setuid
> to 0 if that is the case.  We probably want to be smarter, but
> for now this allows lxc-attach to work.
> 
> Signed-off-by: Christian Seiler <christian at iwakd.de>

Acked-by: Serge E. Hallyn <serge.hallyn at ubuntu.com>

Thanks, Christian, this looks good.

> ---
>  src/lxc/lxc_attach.c |  178 ++++++++++++++++++++++++++++++++++++++++++--------
>  1 files changed, 150 insertions(+), 28 deletions(-)
> 
> diff --git a/src/lxc/lxc_attach.c b/src/lxc/lxc_attach.c
> index e1511ef..1f60266 100644
> --- a/src/lxc/lxc_attach.c
> +++ b/src/lxc/lxc_attach.c
> @@ -28,6 +28,7 @@
>  #include <stdlib.h>
>  #include <sys/param.h>
>  #include <sys/types.h>
> +#include <sys/socket.h>
>  #include <sys/wait.h>
>  
>  #include "attach.h"
> @@ -128,9 +129,9 @@ int main(int argc, char *argv[])
>  	struct passwd *passwd;
>  	struct lxc_proc_context_info *init_ctx;
>  	struct lxc_handler *handler;
> -	void *cgroup_data = NULL;
>  	uid_t uid;
>  	char *curdir;
> +	int cgroup_ipc_sockets[2];
>  
>  	ret = lxc_caps_init();
>  	if (ret)
> @@ -157,18 +158,6 @@ int main(int argc, char *argv[])
>  		return -1;
>  	}
>  
> -	if (!elevated_privileges) {
> -	        /* we have to do this now since /sys/fs/cgroup may not
> -	         * be available inside the container or we may not have
> -	         * the required permissions anymore
> -	         */
> -		ret = lxc_cgroup_prepare_attach(my_args.name, &cgroup_data);
> -		if (ret < 0) {
> -			ERROR("failed to prepare attaching to cgroup");
> -			return -1;
> -		}
> -	}
> -
>  	curdir = getcwd(NULL, 0);
>  
>  	/* determine which namespaces the container was created with
> @@ -184,6 +173,106 @@ int main(int argc, char *argv[])
>  		}
>  	}
>  
> +	/* For the cgroup attaching logic to work in conjunction with pid and user namespaces,
> +	 * we need to have the following hierarchy:
> +	 *
> +	 *     lxc-attach [process executed externally]
> +	 *         | socketpair(cgroup_ipc_sockets)
> +	 *         | fork()           -> child
> +	 *         |                       | setns()
> +	 *         |                       | fork()    -> grandchild
> +	 *         |                       |                   | initialize
> +	 *         |                       |                   | signal parent
> +	 *         |                       |<------------------|----+
> +	 *         |                       | signal parent     |
> +	 *         |<----------------------|-----+             |
> +	 *         | add to cgroups        |                   |
> +	 *         | signal child -------->|                   |
> +	 *         |                       | signal child ---->|
> +	 *         | waitpid()             | waitpid()         | exec()
> +	 *         |                       |<------------------| exit()
> +	 *         |<----------------------| exit()
> +	 *         | exit()
> +	 *
> +	 * The rationale is the following: The first parent is needed because after
> +	 * setns() (mount + user namespace) we can't access the cgroup filesystem
> +	 * to add the pid to the corresponding cgroup. Therefore, we need to do that
> +	 * in a process executed on the host, so that's why we need to fork and wait
> +	 * for it to have done some initialization (cgroups may restrict certain
> +	 * operations so we have to do that in the end) and use IPC for signaling.
> +	 *
> +	 * Then in the child process we do the setns(). However, a process is never
> +	 * really attached to a pid namespace (never changes its pid, doesn't appear
> +	 * in the pid namespace /proc), only child processes of that process are
> +	 * truely inside the new pid namespace. That's why we need to fork() again
> +	 * after setns() before performing final initializations, then signal our
> +	 * parent, which signals the primary process, which does cgroup adding,
> +	 * which then signals to the grandchild that it can exec().
> +	 */
> +	ret = socketpair(PF_LOCAL, SOCK_STREAM, 0, cgroup_ipc_sockets);
> +	if (ret < 0) {
> +		SYSERROR("could not set up required IPC mechanism for attaching");
> +		return -1;
> +	}
> +
> +	pid = fork();
> +	if (pid < 0) {
> +		SYSERROR("failed to create first subprocess");
> +		return -1;
> +	}
> +
> +	if (pid) {
> +		int status;
> +		pid_t grandchild;
> +
> +		close(cgroup_ipc_sockets[1]);
> +
> +	gparent_reread:
> +		ret = read(cgroup_ipc_sockets[0], &grandchild, sizeof(grandchild));
> +		if (ret <= 0) {
> +			if (ret < 0 && (errno == EAGAIN || errno == EINTR))
> +				goto gparent_reread;
> +			ERROR("failed to get pid of attached process to add to cgroup");
> +			return -1;
> +		}
> +
> +		if (!elevated_privileges) {
> +			ret = lxc_cgroup_attach(my_args.name, grandchild);
> +			if (ret < 0) {
> +				ERROR("failed to attach process to cgroup");
> +				return -1;
> +			}
> +		}
> +
> +		status = 0;
> +		ret = write(cgroup_ipc_sockets[0], &status, sizeof(status));
> +		if (ret <= 0) {
> +			ERROR("failed to signal child that cgroup logic has finished");
> +			return -1;
> +		}
> +
> +		close(cgroup_ipc_sockets[0]);
> +
> +	gparent_again:
> +		ret = waitpid(pid, &status, 0);
> +		if (ret < 0) {
> +			if (errno == EINTR)
> +				goto gparent_again;
> +			SYSERROR("failed to wait for process '%d'", pid);
> +			return -1;
> +		}
> +
> +		if (WIFEXITED(status))
> +			return WEXITSTATUS(status);
> +
> +		return -1;
> +	}
> +
> +	/* at this point we are in the 'parent' process so we need to close the
> +	 * socket reserved for the 'grandparent' process
> +	 */
> +	close(cgroup_ipc_sockets[0]);
> +
>  	/* we need to attach before we fork since certain namespaces
>  	 * (such as pid namespaces) only really affect children of the
>  	 * current process and not the process itself
> @@ -199,7 +288,10 @@ int main(int argc, char *argv[])
>  
>  	free(curdir);
>  
> -	/* hack: we need sync.h infrastructure - and that needs a handler */
> +	/* hack: we need sync.h infrastructure - and that needs a handler
> +	 * FIXME: perhaps we should also just use a very simple socketpair()
> +	 * here? - like with the grandparent <-> parent communication?
> +	 */
>  	handler = calloc(1, sizeof(*handler));
>  
>  	if (lxc_sync_init(handler)) {
> @@ -225,23 +317,40 @@ int main(int argc, char *argv[])
>  		if (lxc_sync_wait_child(handler, LXC_SYNC_CONFIGURE))
>  			return -1;
>  
> -		/* now that we are done with all privileged operations,
> -		 * we can add ourselves to the cgroup. Since we smuggled in
> -		 * the fds earlier, we still have write permission
> +		/* ask grandparent to add child to cgroups, the grandparent will
> +		 * itself check whether that's actually necessary
>  		 */
> -		if (!elevated_privileges) {
> -			/* since setns() for pid namespaces only really
> -			 * affects child processes, the pid we have is
> -			 * still valid outside the container, so this is
> -			 * fine
> +		ret = write(cgroup_ipc_sockets[1], &pid, sizeof(pid));
> +		if (ret != sizeof(pid)) {
> +			ERROR("error using IPC to notify main process of pid to add to the cgroups of the container");
> +			return -1;
> +		}
> +
> +	parent_reread:
> +		/* we need some mechanism to check whether the grandparent could
> +		 * add us to the cgroups or not - so we await a dummy integer
> +		 * on the same socket (that's why we don't use a pipe - we need
> +		 * two-way communication). So if the parent fails and exits, that
> +		 * will close the socket, which will cause a read of 0 bytes for
> +		 * us, so we just terminate. If we read at least a byte, we don't
> +		 * care about the contents...
> +		 */
> +		ret = read(cgroup_ipc_sockets[1], &status, sizeof(status));
> +		if (ret <= 0) {
> +			if (ret < 0 && (errno == EAGAIN || errno == EINTR))
> +				goto parent_reread;
> +			/* only print someting if we can't assume the parent already
> +			 * gave an error message, that will reduce confusion for the
> +			 * user
>  			 */
> -			ret = lxc_cgroup_finish_attach(cgroup_data, pid);
> -			if (ret < 0) {
> -				ERROR("failed to attach process to cgroup");
> -				return -1;
> -			}
> +			if (ret != 0)
> +				ERROR("failed to get notification that the child process was added to the container's cgroups");
> +			return -1;
>  		}
>  
> +		/* we don't need that IPC interface anymore */
> +		close(cgroup_ipc_sockets[1]);
> +
>  		/* tell the child we are done initializing */
>  		if (lxc_sync_wake_child(handler, LXC_SYNC_POST_CONFIGURE))
>  			return -1;
> @@ -264,7 +373,7 @@ int main(int argc, char *argv[])
>  
>  	if (!pid) {
>  		lxc_sync_fini_parent(handler);
> -		lxc_cgroup_dispose_attach(cgroup_data);
> +		close(cgroup_ipc_sockets[1]);
>  
>  		if (attach_apparmor(init_ctx->aa_profile) < 0) {
>  			ERROR("failed switching apparmor profiles");
> @@ -307,6 +416,19 @@ int main(int argc, char *argv[])
>  
>  		lxc_sync_fini(handler);
>  
> +		if (namespace_flags & CLONE_NEWUSER) {
> +			/* XXX FIXME this should get the uid of the container init and setuid to that */
> +			/* XXX FIXME or perhaps try to map in the lxc-attach caller's uid? */
> +			if (setgid(0)) {
> +				SYSERROR("switching to container gid");
> +				return -1;
> +			}
> +			if (setuid(0)) {
> +				SYSERROR("switching to container uid");
> +				return -1;
> +			}
> +		}
> +
>  		if (my_args.argc) {
>  			execvp(my_args.argv[0], my_args.argv);
>  			SYSERROR("failed to exec '%s'", my_args.argv[0]);
> -- 
> 1.7.8.6
> 
> 
> ------------------------------------------------------------------------------
> Everyone hates slow websites. So do we.
> Make your web apps faster with AppDynamics
> Download AppDynamics Lite for free today:
> http://p.sf.net/sfu/appdyn_d2d_feb
> _______________________________________________
> Lxc-devel mailing list
> Lxc-devel at lists.sourceforge.net
> https://lists.sourceforge.net/lists/listinfo/lxc-devel




More information about the lxc-devel mailing list