[lxc-devel] [PATCH 2/8] lxc-attach: Completely rework lxc-attach and move to API function

Christian Seiler christian at iwakd.de
Tue Aug 13 21:56:14 UTC 2013


 - Move attach functionality to a completely new API function for
   attaching to containers. The API function accepts the name of the
   container, the lxcpath, a structure indicating options for attaching
   and returns the pid of the attached process. The calling thread may
   then use waitpid() or similar to wait for the attached process to
   finish. lxc-attach itself is just a simple wrapper around the new
   API function.

 - Use CLONE_PARENT when creating the attached process from the
   intermediate process. This allows the intermediate process to exit
   immediately after attach and the original thread may supervise the
   attached process directly.

 - Since the intermediate process exits quickly, its only job is to
   send the original process the pid of the attached process (as seen
   from outside the pidns) and exit. This allows us to simplify the
   synchronisation logic by quite a bit.

 - Use O_CLOEXEC / SOCK_CLOEXEC on (hopefully) all FDs opened in the
   main thread by the attach logic so that other threads of the same
   program may safely fork+exec off. Also, use shutdown() on the
   synchronisation socket, so that if another thread forks off without
   exec'ing, the synchronisation will not fail. (Not tested whether
   this solves this issue.)

 - Instead of directly specifying a program to execute on the API
   level, one specifies a callback function and a payload. This allows
   code using the API to execute a custom function directly inside the
   container without having to execute a program. Two default callbacks
   are provided directly, one to execute an arbitrary program, another
   to execute a shell. The lxc-attach utility will always use either
   one of these default callbacks.

 - More fine-grained control of the attached process on the API level
   (not implemented in lxc-attach utility yet, some may not be sensible):
     * Specify which file descriptors should be stdin/stdout/stderr of
       the newly created process. If fds other than 0/1/2 are
       specified, they will be dup'd in the attached process (and the
       originals closed). This allows e.g. threaded applications to
       specify pipes for communication with the attached process
       without having to modify its own stdin/stdout/stderr before
       running lxc-attach.
     * Specify user and group id for the newly attached process.
     * Specify initial working directory for the newly attached
       process.
     * Fine-grained control on whether to do any, all or none of the
       following: move attached process into the container's init's
       cgroup, drop capabilities of the process, set the process's
       personality, load the proper apparmor profile and (for partial
       attaches to any but not mount-namespaces) whether to unshare the
       mount namespace and remount /sys and /proc. If additional
       features (SELinux policy, SMACK policy, ...) are implemented,
       flags for those may also be provided.

Signed-off-by: Christian Seiler <christian at iwakd.de>
---
 src/lxc/attach.c         |  495 +++++++++++++++++++++++++++++++++++++++++++++-
 src/lxc/attach.h         |    8 +-
 src/lxc/attach_options.h |  120 +++++++++++
 src/lxc/lxc_attach.c     |  407 +++-----------------------------------
 src/lxc/utils.h          |    1 +
 5 files changed, 643 insertions(+), 388 deletions(-)
 create mode 100644 src/lxc/attach_options.h

diff --git a/src/lxc/attach.c b/src/lxc/attach.c
index 5061b93..742ce76 100644
--- a/src/lxc/attach.c
+++ b/src/lxc/attach.c
@@ -47,6 +47,12 @@
 #include "config.h"
 #include "apparmor.h"
 #include "utils.h"
+#include "commands.h"
+#include "cgroup.h"
+
+#if HAVE_SYS_PERSONALITY_H
+#include <sys/personality.h>
+#endif
 
 lxc_log_define(lxc_attach, lxc);
 
@@ -151,7 +157,7 @@ int lxc_attach_to_ns(pid_t pid, int which)
 		}
 
 		snprintf(path, MAXPATHLEN, "/proc/%d/ns/%s", pid, ns[i]);
-		fd[i] = open(path, O_RDONLY);
+		fd[i] = open(path, O_RDONLY | O_CLOEXEC);
 		if (fd[i] < 0) {
 			saved_errno = errno;
 
@@ -476,3 +482,490 @@ void lxc_attach_get_init_uidgid(uid_t* init_uid, gid_t* init_gid)
 	/* TODO: we should also parse supplementary groups and use
 	 * setgroups() to set them */
 }
+
+struct attach_clone_payload {	/* argument handed to attach_child_main() through lxc_clone() */
+	int ipc_socket;				/* socket end used to synchronise with the initial process */
+	lxc_attach_options_t* options;		/* attach options in effect for this attach */
+	struct lxc_proc_context_info* init_ctx;	/* context info obtained for the container's init pid */
+	lxc_attach_exec_t exec_function;	/* called last in the attached process ... */
+	void* exec_payload;			/* ... with this pointer as its only argument */
+};
+
+static int attach_child_main(void* data);
+
+/* help the optimizer along if it doesn't know that exit always exits; evaluates c once (no reserved __ name) */
+#define rexit(c)  do { int rexit_code_ = (c); exit(rexit_code_); return rexit_code_; } while(0)
+
+/* define default options if no options are supplied by the user */
+static lxc_attach_options_t attach_static_default_options = LXC_ATTACH_OPTIONS_DEFAULT;
+
+/* Attach a new process to the running container `name` under `lxcpath`.
+ * exec_function(exec_payload) is called inside the attached process once its
+ * setup is complete. options may be NULL to request the defaults; it is
+ * copied, so neither the caller's structure nor the shared defaults are
+ * modified when the namespaces are auto-detected. Returns 0 and stores the
+ * pid of the attached process (a direct child of the caller, to be reaped
+ * with waitpid() or similar) in *attached_process, or returns -1 on error.
+ */
+int lxc_attach(const char* name, const char* lxcpath, lxc_attach_exec_t exec_function, void* exec_payload, lxc_attach_options_t* options, pid_t* attached_process)
+{
+	int ret, status;
+	pid_t init_pid, pid, attached_pid;
+	struct lxc_proc_context_info *init_ctx;
+	char* cwd;
+	char* new_cwd;
+	int ipc_sockets[2];
+
+	lxc_attach_options_t options_copy = options ? *options : attach_static_default_options;
+	options = &options_copy;
+
+	init_pid = lxc_cmd_get_init_pid(name, lxcpath);
+	if (init_pid < 0) {
+		ERROR("failed to get the init pid");
+		return -1;
+	}
+
+	init_ctx = lxc_proc_get_context_info(init_pid);
+	if (!init_ctx) {
+		ERROR("failed to get context of the init process, pid = %ld", (long)init_pid);
+		return -1;
+	}
+
+	cwd = getcwd(NULL, 0);
+
+	/* ask lxc-start which namespaces the container was created with, if necessary */
+	if (options->namespaces == -1) {
+		options->namespaces = lxc_cmd_get_clone_flags(name, lxcpath);
+		/* call failed */
+		if (options->namespaces == -1) {
+			ERROR("failed to automatically determine the "
+			      "namespaces which the container unshared");
+			free(cwd);
+			free(init_ctx->aa_profile);
+			free(init_ctx);
+			return -1;
+		}
+	}
+
+	/* create a socket pair for IPC communication; set SOCK_CLOEXEC in order
+	 * to make sure we don't irritate other threads that want to fork+exec away
+	 *
+	 * IMPORTANT: if the initial process is multithreaded and another call
+	 * just fork()s away without exec'ing directly after, the socket fd will
+	 * exist in the forked process from the other thread and any close() in
+	 * our own child process will not really cause the socket to close properly,
+	 * potentially causing the parent to hang.
+	 *
+	 * For this reason, while IPC is still active, we have to use shutdown()
+	 * if the child exits prematurely in order to signal that the socket
+	 * is closed and cannot assume that the child exiting will automatically
+	 * do that.
+	 *
+	 * IPC mechanism: (X is receiver)
+	 *   initial process        intermediate          attached
+	 *        X           <---  send pid of
+	 *                          attached proc,
+	 *                          then exit
+	 *    send 0 ------------------------------------>    X
+	 *                                              [do initialization]
+	 *        X  <------------------------------------  send 1
+	 *   [add to cgroup, ...]
+	 *    send 2 ------------------------------------>    X
+	 *   close socket                                 close socket
+	 *                                                run program
+	 */
+	ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+	if (ret < 0) {
+		SYSERROR("could not set up required IPC mechanism for attaching");
+		free(cwd);
+		free(init_ctx->aa_profile);
+		free(init_ctx);
+		return -1;
+	}
+
+	/* create intermediate subprocess, three reasons:
+	 *       1. runs all pthread_atfork handlers and the
+	 *          child will no longer be threaded
+	 *          (we can't properly setns() in a threaded process)
+	 *       2. we can't setns() in the child itself, since
+	 *          we want to make sure we are properly attached to
+	 *          the pidns
+	 *       3. also, the initial thread has to put the attached
+	 *          process into the cgroup, which we can only do if
+	 *          we didn't already setns() (otherwise, user
+	 *          namespaces will hate us)
+	 */
+	pid = fork();
+
+	if (pid < 0) {
+		SYSERROR("failed to create first subprocess");
+		free(cwd);
+		free(init_ctx->aa_profile);
+		free(init_ctx);
+		return -1;
+	}
+
+	if (pid) {
+		pid_t to_cleanup_pid = pid;
+		int expected = 0;
+
+		/* initial thread: close the socket end that is for the subprocesses */
+		close(ipc_sockets[1]);
+		free(cwd);
+
+		/* get pid from intermediate process */
+		ret = lxc_read_nointr_expect(ipc_sockets[0], &attached_pid, sizeof(attached_pid), NULL);
+		if (ret <= 0) {
+			if (ret != 0)
+				ERROR("error using IPC to receive pid of attached process");
+			goto cleanup_error;
+		}
+
+		/* reap intermediate process */
+		ret = wait_for_pid(pid);
+		if (ret < 0)
+			goto cleanup_error;
+
+		/* we will always have to reap the grandchild now */
+		to_cleanup_pid = attached_pid;
+
+		/* tell attached process it may start initializing */
+		status = 0;
+		ret = lxc_write_nointr(ipc_sockets[0], &status, sizeof(status));
+		if (ret <= 0) {
+			ERROR("error using IPC to notify attached process for initialization (0)");
+			goto cleanup_error;
+		}
+
+		/* wait for the attached process to finish initializing */
+		expected = 1;
+		ret = lxc_read_nointr_expect(ipc_sockets[0], &status, sizeof(status), &expected);
+		if (ret <= 0) {
+			if (ret != 0)
+				ERROR("error using IPC to receive notification from attached process (1)");
+			goto cleanup_error;
+		}
+
+		/* attach to cgroup, if requested */
+		if (options->attach_flags & LXC_ATTACH_MOVE_TO_CGROUP) {
+			ret = lxc_cgroup_attach(attached_pid, name, lxcpath);
+			if (ret < 0) {
+				ERROR("could not move attached process %ld to cgroup of container", (long)attached_pid);
+				goto cleanup_error;
+			}
+		}
+
+		/* tell attached process we're done */
+		status = 2;
+		ret = lxc_write_nointr(ipc_sockets[0], &status, sizeof(status));
+		if (ret <= 0) {
+			ERROR("error using IPC to notify attached process for initialization (2)");
+			goto cleanup_error;
+		}
+
+		/* now shut down communication with child, we're done */
+		shutdown(ipc_sockets[0], SHUT_RDWR);
+		close(ipc_sockets[0]);
+		free(init_ctx->aa_profile);
+		free(init_ctx);
+
+		/* we're done, the attached process should now execute whatever the user
+		 * requested; the caller can track it with waitpid() or similar */
+
+		*attached_process = attached_pid;
+		return 0;
+
+	cleanup_error:
+		/* first shut down the socket, then wait for the pid,
+		 * otherwise the pid we're waiting for may never exit
+		 */
+		shutdown(ipc_sockets[0], SHUT_RDWR);
+		close(ipc_sockets[0]);
+		if (to_cleanup_pid)
+			(void) wait_for_pid(to_cleanup_pid);
+		free(init_ctx->aa_profile);
+		free(init_ctx);
+		return -1;
+	}
+
+	/* first subprocess begins here: close the socket end that is for the initial thread */
+	close(ipc_sockets[0]);
+
+	/* attach now, create another subprocess later, since pid namespaces
+	 * only really affect the children of the current process
+	 */
+	ret = lxc_attach_to_ns(init_pid, options->namespaces);
+	if (ret < 0) {
+		ERROR("failed to enter the namespace");
+		shutdown(ipc_sockets[1], SHUT_RDWR);
+		rexit(-1);
+	}
+
+	/* attach succeeded, try to cwd (cwd is NULL if getcwd() failed above) */
+	if (options->initial_cwd)
+		new_cwd = options->initial_cwd;
+	else
+		new_cwd = cwd;
+	ret = new_cwd ? chdir(new_cwd) : -1;
+	if (ret < 0)
+		WARN("could not change directory to '%s'", new_cwd ? new_cwd : "(unknown)");
+	free(cwd);
+
+	/* now create the real child process */
+	{
+		struct attach_clone_payload payload = {
+			.ipc_socket = ipc_sockets[1],
+			.options = options,
+			.init_ctx = init_ctx,
+			.exec_function = exec_function,
+			.exec_payload = exec_payload
+		};
+		/* We use clone_parent here to make this subprocess a direct child of
+		 * the initial process. Then this intermediate process can exit and
+		 * the parent can directly track the attached process.
+		 */
+		pid = lxc_clone(attach_child_main, &payload, CLONE_PARENT);
+	}
+
+	/* shouldn't happen, clone() should always return positive pid */
+	if (pid <= 0) {
+		SYSERROR("failed to create subprocess");
+		shutdown(ipc_sockets[1], SHUT_RDWR);
+		rexit(-1);
+	}
+
+	/* tell grandparent the pid of the newly created child */
+	ret = lxc_write_nointr(ipc_sockets[1], &pid, sizeof(pid));
+	if (ret != sizeof(pid)) {
+		/* if this really happens here, this is very unfortunate, since the
+		 * parent will not know the pid of the attached process and will
+		 * not be able to wait for it (and we won't either due to CLONE_PARENT)
+		 * so the parent won't be able to reap it and the attached process
+		 * will remain a zombie
+		 */
+		ERROR("error using IPC to notify main process of pid of the attached process");
+		shutdown(ipc_sockets[1], SHUT_RDWR);
+		rexit(-1);
+	}
+
+	/* the rest is in the hands of the initial and the attached process */
+	rexit(0);
+}
+
+/* Entry point of the attached process (started through lxc_clone() by
+ * lxc_attach()). data points to a struct attach_clone_payload. Synchronises
+ * with the initial process over the IPC socket, applies the requested attach
+ * options and finally exits with the return value of the exec function.
+ */
+int attach_child_main(void* data)
+{
+	struct attach_clone_payload* payload = (struct attach_clone_payload*)data;
+	int ipc_socket = payload->ipc_socket;
+	lxc_attach_options_t* options = payload->options;
+	struct lxc_proc_context_info* init_ctx = payload->init_ctx;
+	long new_personality;
+	int ret;
+	int status;
+	int expected;
+	long flags;
+	int fd;
+	uid_t new_uid;
+	gid_t new_gid;
+
+	/* wait for the initial thread to signal that we may start initializing */
+	expected = 0;
+	status = -1;
+	ret = lxc_read_nointr_expect(ipc_socket, &status, sizeof(status), &expected);
+	if (ret <= 0) {
+		ERROR("error using IPC to receive notification from initial process (0)");
+		shutdown(ipc_socket, SHUT_RDWR);
+		rexit(-1);
+	}
+
+	/* load apparmor profile */
+	if ((options->namespaces & CLONE_NEWNS) && (options->attach_flags & LXC_ATTACH_APPARMOR)) {
+		ret = attach_apparmor(init_ctx->aa_profile);
+		if (ret < 0) {
+			shutdown(ipc_socket, SHUT_RDWR);
+			rexit(-1);
+		}
+	}
+
+	/* A description of the purpose of this functionality is
+	 * provided in the lxc-attach(1) manual page. We have to
+	 * remount here and not in the parent process, otherwise
+	 * /proc may not properly reflect the new pid namespace.
+	 */
+	if (!(options->namespaces & CLONE_NEWNS) && (options->attach_flags & LXC_ATTACH_REMOUNT_PROC_SYS)) {
+		ret = lxc_attach_remount_sys_proc();
+		if (ret < 0) {
+			shutdown(ipc_socket, SHUT_RDWR);
+			rexit(-1);
+		}
+	}
+
+	/* now perform additional attachments*/
+#if HAVE_SYS_PERSONALITY_H
+	if (options->personality < 0)
+		new_personality = init_ctx->personality;
+	else
+		new_personality = options->personality;
+
+	if (options->attach_flags & LXC_ATTACH_SET_PERSONALITY) {
+		ret = personality(new_personality);
+		if (ret < 0) {
+			SYSERROR("could not ensure correct architecture");
+			shutdown(ipc_socket, SHUT_RDWR);
+			rexit(-1);
+		}
+	}
+#endif
+
+	if (options->attach_flags & LXC_ATTACH_DROP_CAPABILITIES) {
+		ret = lxc_attach_drop_privs(init_ctx);
+		if (ret < 0) {
+			ERROR("could not drop privileges");
+			shutdown(ipc_socket, SHUT_RDWR);
+			rexit(-1);
+		}
+	}
+
+	/* always set the environment (specify (LXC_ATTACH_KEEP_ENV, NULL, NULL) if you want this to be a no-op) */
+	ret = lxc_attach_set_environment(options->env_policy, options->extra_env_vars, options->extra_keep_env);
+	if (ret < 0) {
+		ERROR("could not set initial environment for attached process");
+		shutdown(ipc_socket, SHUT_RDWR);
+		rexit(-1);
+	}
+
+	/* set user / group id */
+	new_uid = 0;
+	new_gid = 0;
+	/* ignore errors, we will fall back to root in that case
+	 * (/proc was not mounted etc.)
+	 */
+	if (options->namespaces & CLONE_NEWUSER)
+		lxc_attach_get_init_uidgid(&new_uid, &new_gid);
+
+	if (options->uid != (uid_t)-1)
+		new_uid = options->uid;
+	if (options->gid != (gid_t)-1)
+		new_gid = options->gid;
+
+	/* try to set the uid/gid combination */
+	if ((new_gid != 0 || options->namespaces & CLONE_NEWUSER) && setgid(new_gid)) {
+		SYSERROR("switching to container gid");
+		shutdown(ipc_socket, SHUT_RDWR);
+		rexit(-1);
+	}
+	if ((new_uid != 0 || options->namespaces & CLONE_NEWUSER) && setuid(new_uid)) {
+		SYSERROR("switching to container uid");
+		shutdown(ipc_socket, SHUT_RDWR);
+		rexit(-1);
+	}
+
+	/* tell initial process it may now put us into the cgroups */
+	status = 1;
+	ret = lxc_write_nointr(ipc_socket, &status, sizeof(status));
+	if (ret != sizeof(status)) {
+		ERROR("error using IPC to notify initial process for initialization (1)");
+		shutdown(ipc_socket, SHUT_RDWR);
+		rexit(-1);
+	}
+
+	/* wait until the initial thread signals it has done the cgroup setup etc. for us */
+	expected = 2;
+	status = -1;
+	ret = lxc_read_nointr_expect(ipc_socket, &status, sizeof(status), &expected);
+	if (ret <= 0) {
+		ERROR("error using IPC to receive final notification from initial process (2)");
+		shutdown(ipc_socket, SHUT_RDWR);
+		rexit(-1);
+	}
+
+	shutdown(ipc_socket, SHUT_RDWR);
+	close(ipc_socket);
+	free(init_ctx->aa_profile);
+	free(init_ctx);
+
+	/* The following is done after the communication socket is
+	 * shut down. That way, all errors that might (though
+	 * unlikely) occur up until this point will have their messages
+	 * printed to the original stderr (if logging is so configured)
+	 * and not the fd the user supplied, if any.
+	 */
+
+	/* fd handling for stdin, stdout and stderr; ignore errors here,
+	 * user may want to make sure the fds are closed, for example */
+	if (options->stdin_fd >= 0 && options->stdin_fd != 0)
+		dup2(options->stdin_fd, 0);
+	if (options->stdout_fd >= 0 && options->stdout_fd != 1)
+		dup2(options->stdout_fd, 1);
+	if (options->stderr_fd >= 0 && options->stderr_fd != 2)
+		dup2(options->stderr_fd, 2);
+
+	/* close the old fds */
+	if (options->stdin_fd > 2)
+		close(options->stdin_fd);
+	if (options->stdout_fd > 2)
+		close(options->stdout_fd);
+	if (options->stderr_fd > 2)
+		close(options->stderr_fd);
+
+	/* try to remove CLOEXEC flag from stdin/stdout/stderr, ignoring errors;
+	 * FD_CLOEXEC is a descriptor flag, hence F_GETFD/F_SETFD (not F_GETFL/F_SETFL) */
+	for (fd = 0; fd <= 2; fd++) {
+		flags = fcntl(fd, F_GETFD);
+		if (flags < 0)
+			continue;
+		if (flags & FD_CLOEXEC)
+			fcntl(fd, F_SETFD, flags & ~FD_CLOEXEC);
+	}
+
+	/* we're done, so we can now do whatever the user intended us to do */
+	rexit(payload->exec_function(payload->exec_payload));
+}
+
+int lxc_attach_run_command(void* payload) /* default exec function; payload is a lxc_attach_command_t* */
+{
+	lxc_attach_command_t* cmd = (lxc_attach_command_t*)payload;
+
+	execvp(cmd->program, cmd->argv); /* replaces the process image; only returns on failure */
+	SYSERROR("failed to exec '%s'", cmd->program);
+	return -1; /* reached only when the exec failed */
+}
+
+/* Default exec function: exec the shell from the current user's passwd entry,
+ * falling back on /bin/sh; payload is ignored. Returns -1 if every exec fails. */
+int lxc_attach_run_shell(void* payload)
+{
+	uid_t uid;
+	struct passwd *passwd;
+	char *user_shell;
+
+	/* ignore payload parameter */
+	(void)payload;
+
+	uid = getuid();
+	passwd = getpwuid(uid);
+
+	/* this probably happens because of incompatible nss
+	 * implementations in host and container (remember, this
+	 * code is still using the host's glibc but our mount
+	 * namespace is in the container)
+	 * we may try to get the information by spawning a
+	 * [getent passwd uid] process and parsing the result
+	 */
+	if (!passwd)
+		user_shell = lxc_attach_getpwshell(uid);
+	else
+		user_shell = passwd->pw_shell;
+
+	if (user_shell)
+		execlp(user_shell, user_shell, (char *)NULL); /* variadic sentinel must be a char pointer */
+
+	/* executed if there is no passwd shell or its exec failed: fall back on /bin/sh */
+	execlp("/bin/sh", "/bin/sh", (char *)NULL);
+	SYSERROR("failed to exec shell");
+	return -1;
+}
diff --git a/src/lxc/attach.h b/src/lxc/attach.h
index 151445a..1e7ce10 100644
--- a/src/lxc/attach.h
+++ b/src/lxc/attach.h
@@ -25,6 +25,7 @@
 #define _attach_h
 
 #include <sys/types.h>
+#include "attach_options.h"
 
 struct lxc_proc_context_info {
 	char *aa_profile;
@@ -34,11 +35,6 @@ struct lxc_proc_context_info {
 
 extern struct lxc_proc_context_info *lxc_proc_get_context_info(pid_t pid);
 
-typedef enum lxc_attach_env_policy_t {
-	LXC_ATTACH_KEEP_ENV,
-	LXC_ATTACH_CLEAR_ENV
-} lxc_attach_env_policy_t;
-
 extern int lxc_attach_to_ns(pid_t other_pid, int which);
 extern int lxc_attach_remount_sys_proc();
 extern int lxc_attach_drop_privs(struct lxc_proc_context_info *ctx);
@@ -48,4 +44,6 @@ extern char *lxc_attach_getpwshell(uid_t uid);
 
 extern void lxc_attach_get_init_uidgid(uid_t* init_uid, gid_t* init_gid);
 
+extern int lxc_attach(const char* name, const char* lxcpath, lxc_attach_exec_t exec_function, void* exec_payload, lxc_attach_options_t* options, pid_t* attached_process);
+
 #endif
diff --git a/src/lxc/attach_options.h b/src/lxc/attach_options.h
new file mode 100644
index 0000000..daec901
--- /dev/null
+++ b/src/lxc/attach_options.h
@@ -0,0 +1,120 @@
+/*
+ * lxc: linux Container library
+ *
+ * (C) Copyright IBM Corp. 2007, 2008
+ *
+ * Authors:
+ * Daniel Lezcano <daniel.lezcano at free.fr>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef _LXC_ATTACH_OPTIONS_H
+#define _LXC_ATTACH_OPTIONS_H
+
+#include <sys/types.h>
+
+typedef enum lxc_attach_env_policy_t {	/* passed to lxc_attach_set_environment() by the attached process */
+	LXC_ATTACH_KEEP_ENV,	/* with no extra_env_vars/extra_keep_env, environment setup is a no-op */
+	LXC_ATTACH_CLEAR_ENV	/* NOTE(review): presumably clears the environment - confirm in lxc_attach_set_environment() */
+} lxc_attach_env_policy_t;
+
+enum {
+	/* the following are on by default: */
+	LXC_ATTACH_MOVE_TO_CGROUP        = 0x00000001, /* initial process calls lxc_cgroup_attach() for the attached pid */
+	LXC_ATTACH_DROP_CAPABILITIES     = 0x00000002, /* attached process calls lxc_attach_drop_privs() */
+	LXC_ATTACH_SET_PERSONALITY       = 0x00000004, /* attached process calls personality(), if compiled in */
+	LXC_ATTACH_APPARMOR              = 0x00000008, /* attach_apparmor(); only acted on when CLONE_NEWNS is attached */
+
+	/* the following are off by default */
+	LXC_ATTACH_REMOUNT_PROC_SYS      = 0x00010000, /* lxc_attach_remount_sys_proc(); only when CLONE_NEWNS is not attached */
+
+	/* we have 16 bits for things that are on by default
+	 * and 16 bits that are off by default, that should
+	 * be sufficient to keep binary compatibility for
+	 * a while
+	 */
+	LXC_ATTACH_DEFAULT               = 0x0000FFFF
+};
+
+typedef struct lxc_attach_options_t lxc_attach_options_t;
+typedef int (*lxc_attach_exec_t)(void* payload);
+
+struct lxc_attach_options_t {
+	/* any combination of the above enum */
+	int attach_flags;
+	/* the namespaces to attach to (CLONE_NEW... flags), -1 to autodetect */
+	int namespaces;
+	/* initial personality, -1 to autodetect
+	 * (may be ignored if lxc is compiled w/o personality support) */
+	long personality;
+
+	/* initial current directory, use NULL to use cwd
+	 * (might not exist in container, then / will be
+	 * used because of kernel defaults)
+	 */
+	char* initial_cwd;
+
+	/* the uid and gid to attach to,
+	 * -1 for default (init uid/gid for userns containers;
+	 * otherwise, or if detection fails, 0/0)
+	 */
+	uid_t uid;
+	gid_t gid;
+
+	/* environment handling */
+	lxc_attach_env_policy_t env_policy;
+	char** extra_env_vars;
+	char** extra_keep_env;
+
+	/* file descriptors for stdin, stdout and stderr,
+	 * dup2() will be used before calling exec_function,
+	 * (assuming not 0, 1 and 2 are specified) and the
+	 * original fds are closed before passing control
+	 * over. Any O_CLOEXEC flag will be removed after
+	 * that
+	 */
+	int stdin_fd;
+	int stdout_fd;
+	int stderr_fd;
+};
+
+#define LXC_ATTACH_OPTIONS_DEFAULT \
+	{ \
+		/* .attach_flags = */   LXC_ATTACH_DEFAULT, \
+		/* .namespaces = */     -1, \
+		/* .personality = */    -1, \
+		/* .initial_cwd = */    NULL, \
+		/* .uid = */            (uid_t)-1, \
+		/* .gid = */            (gid_t)-1, \
+		/* .env_policy = */     LXC_ATTACH_KEEP_ENV, \
+		/* .extra_env_vars = */ NULL, \
+		/* .extra_keep_env = */ NULL, \
+		/* .stdin_fd, .stdout_fd, .stderr_fd = */ 0, 1, 2 \
+	}
+
+typedef struct lxc_attach_command_t {	/* payload type for lxc_attach_run_command() */
+	char* program; /* the program to run (passed to execvp) */
+	char** argv;   /* the NULL-terminated argv of that program (passed to execvp), including the program itself in argv[0] */
+} lxc_attach_command_t;
+
+/* default execution functions:
+ *   run_command: pointer to lxc_attach_command_t
+ *   run_shell:   no payload, will be ignored
+ */
+extern int lxc_attach_run_command(void* payload);
+extern int lxc_attach_run_shell(void* payload);
+
+#endif
diff --git a/src/lxc/lxc_attach.c b/src/lxc/lxc_attach.c
index efa7d89..f7ec728 100644
--- a/src/lxc/lxc_attach.c
+++ b/src/lxc/lxc_attach.c
@@ -22,31 +22,17 @@
  */
 
 #define _GNU_SOURCE
-#include <unistd.h>
-#include <errno.h>
-#include <pwd.h>
-#include <stdlib.h>
-#include <sys/param.h>
-#include <sys/types.h>
-#include <sys/socket.h>
 #include <sys/wait.h>
+#include <sys/types.h>
 
 #include "attach.h"
-#include "commands.h"
 #include "arguments.h"
-#include "caps.h"
-#include "cgroup.h"
 #include "config.h"
 #include "confile.h"
-#include "start.h"
-#include "sync.h"
-#include "log.h"
 #include "namespace.h"
-#include "apparmor.h"
-
-#if HAVE_SYS_PERSONALITY_H
-#include <sys/personality.h>
-#endif
+#include "caps.h"
+#include "log.h"
+#include "utils.h"
 
 lxc_log_define(lxc_attach_ui, lxc);
 
@@ -140,148 +126,12 @@ Options :\n\
 	.checker  = NULL,
 };
 
-struct child_data {
-	struct lxc_proc_context_info *init_ctx;
-	struct lxc_handler *handler;
-	int ipc_socket;
-};
-
-static int child_main(void* data)
-{
-	struct child_data* child_data = data;
-	struct lxc_proc_context_info *init_ctx = child_data->init_ctx;
-	struct lxc_handler *handler = child_data->handler;
-	int ipc_socket = child_data->ipc_socket;
-	struct passwd *passwd;
-	char *user_shell;
-	uid_t uid;
-	int ret;
-
-	lxc_sync_fini_parent(handler);
-	close(ipc_socket);
-
-	if ((namespace_flags & CLONE_NEWNS)) {
-		if (attach_apparmor(init_ctx->aa_profile) < 0) {
-			ERROR("failed switching apparmor profiles");
-			return -1;
-		}
-	}
-
-	/* A description of the purpose of this functionality is
-	 * provided in the lxc-attach(1) manual page. We have to
-	 * remount here and not in the parent process, otherwise
-	 * /proc may not properly reflect the new pid namespace.
-	 */
-	if (!(namespace_flags & CLONE_NEWNS) && remount_sys_proc) {
-		ret = lxc_attach_remount_sys_proc();
-		if (ret < 0) {
-			return -1;
-		}
-	}
-
-#if HAVE_SYS_PERSONALITY_H
-	if (new_personality < 0)
-		new_personality = init_ctx->personality;
-
-	if (personality(new_personality) == -1) {
-		ERROR("could not ensure correct architecture: %s",
-		      strerror(errno));
-		return -1;
-	}
-#endif
-
-	if (!elevated_privileges && lxc_attach_drop_privs(init_ctx)) {
-		ERROR("could not drop privileges");
-		return -1;
-	}
-
-	if (lxc_attach_set_environment(env_policy, NULL, NULL)) {
-		ERROR("could not set environment");
-		return -1;
-	}
-
-	/* tell parent we are done setting up the container and wait
-	 * until we have been put in the container's cgroup, if
-	 * applicable */
-	if (lxc_sync_barrier_parent(handler, LXC_SYNC_CONFIGURE))
-		return -1;
-
-	lxc_sync_fini(handler);
-
-	if (namespace_flags & CLONE_NEWUSER) {
-		uid_t init_uid = 0;
-		gid_t init_gid = 0;
-
-		/* ignore errors, we will fall back to root in that case
-		 * (/proc was not mounted etc.)
-		 */
-		lxc_attach_get_init_uidgid(&init_uid, &init_gid);
-
-		/* try to set the uid/gid combination */
-		if (setgid(init_gid)) {
-			SYSERROR("switching to container gid");
-			return -1;
-		}
-		if (setuid(init_uid)) {
-			SYSERROR("switching to container uid");
-			return -1;
-		}
-	}
-
-	if (my_args.argc) {
-		execvp(my_args.argv[0], my_args.argv);
-		SYSERROR("failed to exec '%s'", my_args.argv[0]);
-		return -1;
-	}
-
-	uid = getuid();
-
-	passwd = getpwuid(uid);
-
-	/* this probably happens because of incompatible nss
-	 * implementations in host and container (remember, this
-	 * code is still using the host's glibc but our mount
-	 * namespace is in the container)
-	 * we may try to get the information by spawning a
-	 * [getent passwd uid] process and parsing the result
-	 */
-	if (!passwd)
-		user_shell = lxc_attach_getpwshell(uid);
-	else
-		user_shell = passwd->pw_shell;
-
-	if (user_shell) {
-		char *const args[] = {
-			user_shell,
-			NULL,
-		};
-
-		(void) execvp(args[0], args);
-	}
-
-	/* executed if either no passwd entry or execvp fails,
-	 * we will fall back on /bin/sh as a default shell
-	 */
-	{
-		char *const args[] = {
-			"/bin/sh",
-			NULL,
-		};
-
-		execvp(args[0], args);
-		SYSERROR("failed to exec '%s'", args[0]);
-		return -1;
-	}
-}
-
 int main(int argc, char *argv[])
 {
 	int ret;
-	pid_t pid, init_pid;
-	struct lxc_proc_context_info *init_ctx;
-	struct lxc_handler *handler;
-	char *curdir;
-	int cgroup_ipc_sockets[2];
+	pid_t pid;
+	lxc_attach_options_t attach_options = LXC_ATTACH_OPTIONS_DEFAULT;
+	lxc_attach_command_t command;
 
 	ret = lxc_caps_init();
 	if (ret)
@@ -296,238 +146,31 @@ int main(int argc, char *argv[])
 	if (ret)
 		return ret;
 
-	init_pid = lxc_cmd_get_init_pid(my_args.name, my_args.lxcpath[0]);
-	if (init_pid < 0) {
-		ERROR("failed to get the init pid");
-		return -1;
-	}
+	if (remount_sys_proc)
+		attach_options.attach_flags |= LXC_ATTACH_REMOUNT_PROC_SYS;
+	if (elevated_privileges)
+		attach_options.attach_flags &= ~(LXC_ATTACH_MOVE_TO_CGROUP | LXC_ATTACH_DROP_CAPABILITIES | LXC_ATTACH_APPARMOR);
+	attach_options.namespaces = namespace_flags;
+	attach_options.personality = new_personality;
+	attach_options.env_policy = env_policy;
 
-	init_ctx = lxc_proc_get_context_info(init_pid);
-	if (!init_ctx) {
-		ERROR("failed to get context of the init process, pid = %d", init_pid);
-		return -1;
-	}
-
-	curdir = getcwd(NULL, 0);
-
-	/* determine which namespaces the container was created with
-	 * by asking lxc-start
-	 */
-	if (namespace_flags == -1) {
-		namespace_flags = lxc_cmd_get_clone_flags(my_args.name, my_args.lxcpath[0]);
-		/* call failed */
-		if (namespace_flags == -1) {
-			ERROR("failed to automatically determine the "
-			      "namespaces which the container unshared");
-			return -1;
-		}
-	}
-
-	/* For the cgroup attaching logic to work in conjunction with pid and user namespaces,
-	 * we need to have the following hierarchy:
-	 *
-	 *     lxc-attach [process executed externally]
-	 *         | socketpair(cgroup_ipc_sockets)
-	 *         | fork()           -> child
-	 *         |                       | setns()
-	 *         |                       | fork()    -> grandchild
-	 *         |                       |                   | initialize
-	 *         |                       |                   | signal parent
-	 *         |                       |<------------------|----+
-	 *         |                       | signal parent     |
-	 *         |<----------------------|-----+             |
-	 *         | add to cgroups        |                   |
-	 *         | signal child -------->|                   |
-	 *         |                       | signal child ---->|
-	 *         | waitpid()             | waitpid()         | exec()
-	 *         |                       |<------------------| exit()
-	 *         |<----------------------| exit()
-	 *         | exit()
-	 *
-	 * The rationale is the following: The first parent is needed because after
-	 * setns() (mount + user namespace) we can't access the cgroup filesystem
-	 * to add the pid to the corresponding cgroup. Therefore, we need to do that
-	 * in a process executed on the host, so that's why we need to fork and wait
-	 * for it to have done some initialization (cgroups may restrict certain
-	 * operations so we have to do that in the end) and use IPC for signaling.
-	 *
-	 * Then in the child process we do the setns(). However, a process is never
-	 * really attached to a pid namespace (never changes its pid, doesn't appear
-	 * in the pid namespace /proc), only child processes of that process are
-	 * truely inside the new pid namespace. That's why we need to fork() again
-	 * after setns() before performing final initializations, then signal our
-	 * parent, which signals the primary process, which does cgroup adding,
-	 * which then signals to the grandchild that it can exec().
-	 */
-	ret = socketpair(PF_LOCAL, SOCK_STREAM, 0, cgroup_ipc_sockets);
-	if (ret < 0) {
-		SYSERROR("could not set up required IPC mechanism for attaching");
-		return -1;
-	}
-
-	pid = fork();
-	if (pid < 0) {
-		SYSERROR("failed to create first subprocess");
-		return -1;
-	}
-
-	if (pid) {
-		int status;
-		pid_t grandchild;
-
-		close(cgroup_ipc_sockets[1]);
-
-	gparent_reread:
-		ret = read(cgroup_ipc_sockets[0], &grandchild, sizeof(grandchild));
-		if (ret <= 0) {
-			if (ret < 0 && (errno == EAGAIN || errno == EINTR))
-				goto gparent_reread;
-			ERROR("failed to get pid of attached process to add to cgroup");
-			return -1;
-		}
-
-		if (!elevated_privileges) {
-			ret = lxc_cgroup_attach(grandchild, my_args.name, my_args.lxcpath[0]);
-			if (ret < 0) {
-				ERROR("failed to attach process to cgroup");
-				return -1;
-			}
-		}
-
-		status = 0;
-		ret = write(cgroup_ipc_sockets[0], &status, sizeof(status));
-		if (ret <= 0) {
-			ERROR("failed to signal child that cgroup logic has finished");
-			return -1;
-		}
-
-		close(cgroup_ipc_sockets[0]);
-
-	gparent_again:
-		ret = waitpid(pid, &status, 0);
-		if (ret < 0) {
-			if (errno == EINTR)
-				goto gparent_again;
-			SYSERROR("failed to wait for process '%d'", pid);
-			return -1;
-		}
-
-		if (WIFEXITED(status))
-			return WEXITSTATUS(status);
-
-		return -1;
-	}
-
-	/* at this point we are in the 'parent' process so we need to close the
-	 * socket reserved for the 'grandparent' process
-	 */
-	close(cgroup_ipc_sockets[0]);
-
-	/* we need to attach before we fork since certain namespaces
-	 * (such as pid namespaces) only really affect children of the
-	 * current process and not the process itself
-	 */
-	ret = lxc_attach_to_ns(init_pid, namespace_flags);
-	if (ret < 0) {
-		ERROR("failed to enter the namespace");
-		return -1;
+	if (my_args.argc) {
+		command.program = my_args.argv[0];
+		command.argv = (char**)my_args.argv;
+		ret = lxc_attach(my_args.name, my_args.lxcpath[0], lxc_attach_run_command, &command, &attach_options, &pid);
+	} else {
+		ret = lxc_attach(my_args.name, my_args.lxcpath[0], lxc_attach_run_shell, NULL, &attach_options, &pid);
 	}
 
-	if (curdir && chdir(curdir))
-		WARN("could not change directory to '%s'", curdir);
-
-	free(curdir);
-
-	/* hack: we need sync.h infrastructure - and that needs a handler
-	 * FIXME: perhaps we should also just use a very simple socketpair()
-	 * here? - like with the grandparent <-> parent communication?
-	 */
-	handler = calloc(1, sizeof(*handler));
-
-	if (lxc_sync_init(handler)) {
-		ERROR("failed to initialize synchronization socket");
+	if (ret < 0)
 		return -1;
-	}
 
-	{
-		struct child_data child_data = {
-			.init_ctx = init_ctx,
-			.handler = handler,
-			.ipc_socket = cgroup_ipc_sockets[1]
-		};
-		pid = lxc_clone(child_main, &child_data, 0);
-	}
-
-	if (pid < 0) {
-		SYSERROR("failed to fork");
+	ret = lxc_wait_for_pid_status(pid);
+	if (ret < 0)
 		return -1;
-	}
-
-	if (pid) {
-		int status;
-
-		lxc_sync_fini_child(handler);
-
-		/* wait until the child has done configuring itself before
-		 * we put it in a cgroup that potentially limits these
-		 * possibilities */
-		if (lxc_sync_wait_child(handler, LXC_SYNC_CONFIGURE))
-			return -1;
-
-		/* ask grandparent to add child to cgroups, the grandparent will
-		 * itself check whether that's actually necessary
-		 */
-		ret = write(cgroup_ipc_sockets[1], &pid, sizeof(pid));
-		if (ret != sizeof(pid)) {
-			ERROR("error using IPC to notify main process of pid to add to the cgroups of the container");
-			return -1;
-		}
-
-	parent_reread:
-		/* we need some mechanism to check whether the grandparent could
-		 * add us to the cgroups or not - so we await a dummy integer
-		 * on the same socket (that's why we don't use a pipe - we need
-		 * two-way communication). So if the parent fails and exits, that
-		 * will close the socket, which will cause a read of 0 bytes for
-		 * us, so we just terminate. If we read at least a byte, we don't
-		 * care about the contents...
-		 */
-		ret = read(cgroup_ipc_sockets[1], &status, sizeof(status));
-		if (ret <= 0) {
-			if (ret < 0 && (errno == EAGAIN || errno == EINTR))
-				goto parent_reread;
-			/* only print someting if we can't assume the parent already
-			 * gave an error message, that will reduce confusion for the
-			 * user
-			 */
-			if (ret != 0)
-				ERROR("failed to get notification that the child process was added to the container's cgroups");
-			return -1;
-		}
-
-		/* we don't need that IPC interface anymore */
-		close(cgroup_ipc_sockets[1]);
-
-		/* tell the child we are done initializing */
-		if (lxc_sync_wake_child(handler, LXC_SYNC_POST_CONFIGURE))
-			return -1;
 
-		lxc_sync_fini(handler);
-
-	again:
-		if (waitpid(pid, &status, 0) < 0) {
-			if (errno == EINTR)
-				goto again;
-			SYSERROR("failed to wait '%d'", pid);
-			return -1;
-		}
-
-		if (WIFEXITED(status))
-			return WEXITSTATUS(status);
-
-		return -1;
-	}
+	if (WIFEXITED(ret))
+		return WEXITSTATUS(ret);
 
-	/* shouldn't happen, because clone should never return 0 */
 	return -1;
 }
diff --git a/src/lxc/utils.h b/src/lxc/utils.h
index 7473b4d..455d7d2 100644
--- a/src/lxc/utils.h
+++ b/src/lxc/utils.h
@@ -25,6 +25,7 @@
 
 #include <errno.h>
 #include <sys/types.h>
+#include <unistd.h>
 #include "config.h"
 
 /* returns 1 on success, 0 if there were any failures */
-- 
1.7.10.4





More information about the lxc-devel mailing list