[lxc-devel] [PATCH 1/2] lxc_attach: fix break with user namespaces (v3)
Serge Hallyn
serge.hallyn at ubuntu.com
Mon Mar 4 16:06:12 UTC 2013
Quoting Christian Seiler (christian at iwakd.de):
> When you clone a new user_ns, the child cannot write to the fds
> opened by the parent. Hnadle this by doing an extra fork. The
> grandparent hangs around and waits for its child to tell it the
> pid of of the grandchild, which will be the one attached to the
> container. The grandparent then moves the grandchild into the
> right cgroup, then waits for the child who in turn is waiting on
> the grandchild to complete.
>
> Secondly, when attaching to a new user namespace, your old uid is
> not valid, so you are uid -1. This patch simply does setid+setuid
> to 0 if that is the case. We probably want to be smarter, but
> for now this allows lxc-attach to work.
>
> Signed-off-by: Christian Seiler <christian at iwakd.de>
Acked-by: Serge E. Hallyn <serge.hallyn at ubuntu.com>
Thanks, Christian, this looks good.
> ---
> src/lxc/lxc_attach.c | 178 ++++++++++++++++++++++++++++++++++++++++++--------
> 1 files changed, 150 insertions(+), 28 deletions(-)
>
> diff --git a/src/lxc/lxc_attach.c b/src/lxc/lxc_attach.c
> index e1511ef..1f60266 100644
> --- a/src/lxc/lxc_attach.c
> +++ b/src/lxc/lxc_attach.c
> @@ -28,6 +28,7 @@
> #include <stdlib.h>
> #include <sys/param.h>
> #include <sys/types.h>
> +#include <sys/socket.h>
> #include <sys/wait.h>
>
> #include "attach.h"
> @@ -128,9 +129,9 @@ int main(int argc, char *argv[])
> struct passwd *passwd;
> struct lxc_proc_context_info *init_ctx;
> struct lxc_handler *handler;
> - void *cgroup_data = NULL;
> uid_t uid;
> char *curdir;
> + int cgroup_ipc_sockets[2];
>
> ret = lxc_caps_init();
> if (ret)
> @@ -157,18 +158,6 @@ int main(int argc, char *argv[])
> return -1;
> }
>
> - if (!elevated_privileges) {
> - /* we have to do this now since /sys/fs/cgroup may not
> - * be available inside the container or we may not have
> - * the required permissions anymore
> - */
> - ret = lxc_cgroup_prepare_attach(my_args.name, &cgroup_data);
> - if (ret < 0) {
> - ERROR("failed to prepare attaching to cgroup");
> - return -1;
> - }
> - }
> -
> curdir = getcwd(NULL, 0);
>
> /* determine which namespaces the container was created with
> @@ -184,6 +173,106 @@ int main(int argc, char *argv[])
> }
> }
>
> + /* For the cgroup attaching logic to work in conjunction with pid and user namespaces,
> + * we need to have the following hierarchy:
> + *
> + * lxc-attach [process executed externally]
> + * | socketpair(cgroup_ipc_sockets)
> + * | fork() -> child
> + * | | setns()
> + * | | fork() -> grandchild
> + * | | | initialize
> + * | | | signal parent
> + * | |<------------------|----+
> + * | | signal parent |
> + * |<----------------------|-----+ |
> + * | add to cgroups | |
> + * | signal child -------->| |
> + * | | signal child ---->|
> + * | waitpid() | waitpid() | exec()
> + * | |<------------------| exit()
> + * |<----------------------| exit()
> + * | exit()
> + *
> + * The rationale is the following: The first parent is needed because after
> + * setns() (mount + user namespace) we can't access the cgroup filesystem
> + * to add the pid to the corresponding cgroup. Therefore, we need to do that
> + * in a process executed on the host, so that's why we need to fork and wait
> + * for it to have done some initialization (cgroups may restrict certain
> + * operations so we have to do that in the end) and use IPC for signaling.
> + *
> + * Then in the child process we do the setns(). However, a process is never
> + * really attached to a pid namespace (never changes its pid, doesn't appear
> + * in the pid namespace /proc), only child processes of that process are
> + * truely inside the new pid namespace. That's why we need to fork() again
> + * after setns() before performing final initializations, then signal our
> + * parent, which signals the primary process, which does cgroup adding,
> + * which then signals to the grandchild that it can exec().
> + */
> + ret = socketpair(PF_LOCAL, SOCK_STREAM, 0, cgroup_ipc_sockets);
> + if (ret < 0) {
> + SYSERROR("could not set up required IPC mechanism for attaching");
> + return -1;
> + }
> +
> + pid = fork();
> + if (pid < 0) {
> + SYSERROR("failed to create first subprocess");
> + return -1;
> + }
> +
> + if (pid) {
> + int status;
> + pid_t grandchild;
> +
> + close(cgroup_ipc_sockets[1]);
> +
> + gparent_reread:
> + ret = read(cgroup_ipc_sockets[0], &grandchild, sizeof(grandchild));
> + if (ret <= 0) {
> + if (ret < 0 && (errno == EAGAIN || errno == EINTR))
> + goto gparent_reread;
> + ERROR("failed to get pid of attached process to add to cgroup");
> + return -1;
> + }
> +
> + if (!elevated_privileges) {
> + ret = lxc_cgroup_attach(my_args.name, grandchild);
> + if (ret < 0) {
> + ERROR("failed to attach process to cgroup");
> + return -1;
> + }
> + }
> +
> + status = 0;
> + ret = write(cgroup_ipc_sockets[0], &status, sizeof(status));
> + if (ret <= 0) {
> + ERROR("failed to signal child that cgroup logic has finished");
> + return -1;
> + }
> +
> + close(cgroup_ipc_sockets[0]);
> +
> + gparent_again:
> + ret = waitpid(pid, &status, 0);
> + if (ret < 0) {
> + if (errno == EINTR)
> + goto gparent_again;
> + SYSERROR("failed to wait for process '%d'", pid);
> + return -1;
> + }
> +
> + if (WIFEXITED(status))
> + return WEXITSTATUS(status);
> +
> + return -1;
> + }
> +
> + /* at this point we are in the 'parent' process so we need to close the
> + * socket reserved for the 'grandparent' process
> + */
> + close(cgroup_ipc_sockets[0]);
> +
> /* we need to attach before we fork since certain namespaces
> * (such as pid namespaces) only really affect children of the
> * current process and not the process itself
> @@ -199,7 +288,10 @@ int main(int argc, char *argv[])
>
> free(curdir);
>
> - /* hack: we need sync.h infrastructure - and that needs a handler */
> + /* hack: we need sync.h infrastructure - and that needs a handler
> + * FIXME: perhaps we should also just use a very simple socketpair()
> + * here? - like with the grandparent <-> parent communication?
> + */
> handler = calloc(1, sizeof(*handler));
>
> if (lxc_sync_init(handler)) {
> @@ -225,23 +317,40 @@ int main(int argc, char *argv[])
> if (lxc_sync_wait_child(handler, LXC_SYNC_CONFIGURE))
> return -1;
>
> - /* now that we are done with all privileged operations,
> - * we can add ourselves to the cgroup. Since we smuggled in
> - * the fds earlier, we still have write permission
> + /* ask grandparent to add child to cgroups, the grandparent will
> + * itself check whether that's actually necessary
> */
> - if (!elevated_privileges) {
> - /* since setns() for pid namespaces only really
> - * affects child processes, the pid we have is
> - * still valid outside the container, so this is
> - * fine
> + ret = write(cgroup_ipc_sockets[1], &pid, sizeof(pid));
> + if (ret != sizeof(pid)) {
> + ERROR("error using IPC to notify main process of pid to add to the cgroups of the container");
> + return -1;
> + }
> +
> + parent_reread:
> + /* we need some mechanism to check whether the grandparent could
> + * add us to the cgroups or not - so we await a dummy integer
> + * on the same socket (that's why we don't use a pipe - we need
> + * two-way communication). So if the parent fails and exits, that
> + * will close the socket, which will cause a read of 0 bytes for
> + * us, so we just terminate. If we read at least a byte, we don't
> + * care about the contents...
> + */
> + ret = read(cgroup_ipc_sockets[1], &status, sizeof(status));
> + if (ret <= 0) {
> + if (ret < 0 && (errno == EAGAIN || errno == EINTR))
> + goto parent_reread;
> + /* only print someting if we can't assume the parent already
> + * gave an error message, that will reduce confusion for the
> + * user
> */
> - ret = lxc_cgroup_finish_attach(cgroup_data, pid);
> - if (ret < 0) {
> - ERROR("failed to attach process to cgroup");
> - return -1;
> - }
> + if (ret != 0)
> + ERROR("failed to get notification that the child process was added to the container's cgroups");
> + return -1;
> }
>
> + /* we don't need that IPC interface anymore */
> + close(cgroup_ipc_sockets[1]);
> +
> /* tell the child we are done initializing */
> if (lxc_sync_wake_child(handler, LXC_SYNC_POST_CONFIGURE))
> return -1;
> @@ -264,7 +373,7 @@ int main(int argc, char *argv[])
>
> if (!pid) {
> lxc_sync_fini_parent(handler);
> - lxc_cgroup_dispose_attach(cgroup_data);
> + close(cgroup_ipc_sockets[1]);
>
> if (attach_apparmor(init_ctx->aa_profile) < 0) {
> ERROR("failed switching apparmor profiles");
> @@ -307,6 +416,19 @@ int main(int argc, char *argv[])
>
> lxc_sync_fini(handler);
>
> + if (namespace_flags & CLONE_NEWUSER) {
> + /* XXX FIXME this should get the uid of the container init and setuid to that */
> + /* XXX FIXME or perhaps try to map in the lxc-attach caller's uid? */
> + if (setgid(0)) {
> + SYSERROR("switching to container gid");
> + return -1;
> + }
> + if (setuid(0)) {
> + SYSERROR("switching to container uid");
> + return -1;
> + }
> + }
> +
> if (my_args.argc) {
> execvp(my_args.argv[0], my_args.argv);
> SYSERROR("failed to exec '%s'", my_args.argv[0]);
> --
> 1.7.8.6
>
>
> ------------------------------------------------------------------------------
> Everyone hates slow websites. So do we.
> Make your web apps faster with AppDynamics
> Download AppDynamics Lite for free today:
> http://p.sf.net/sfu/appdyn_d2d_feb
> _______________________________________________
> Lxc-devel mailing list
> Lxc-devel at lists.sourceforge.net
> https://lists.sourceforge.net/lists/listinfo/lxc-devel
More information about the lxc-devel
mailing list