[lxc-devel] [lxc/stable-1.0] stable-1.0: support pre-setns() kernels

brauner on Github lxc-bot at linuxcontainers.org
Mon May 28 10:50:00 UTC 2018


A non-text attachment was scrubbed...
Name: not available
Type: text/x-mailbox
Size: 458 bytes
Desc: not available
URL: <http://lists.linuxcontainers.org/pipermail/lxc-devel/attachments/20180528/1733ca6f/attachment.bin>
-------------- next part --------------
From ac3bac8ca962984f5f362b2c004bcb30fdd411b1 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner at ubuntu.com>
Date: Mon, 28 May 2018 11:30:35 +0200
Subject: [PATCH 1/3] conf: inherit_ns_fd -> ns_share

Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
---
 src/lxc/conf.c      |  4 ++--
 src/lxc/conf.h      |  2 +-
 src/lxc/lxc_start.c |  2 +-
 src/lxc/start.c     | 10 +++++-----
 4 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/lxc/conf.c b/src/lxc/conf.c
index 5d583d5f4..5d2a7e1c5 100644
--- a/src/lxc/conf.c
+++ b/src/lxc/conf.c
@@ -3043,7 +3043,7 @@ struct lxc_conf *lxc_conf_init(void)
 	new->tmp_umount_proc = 0;
 
 	for (i = 0; i < LXC_NS_MAX; i++)
-		new->inherit_ns_fd[i] = -1;
+		new->ns_share[i] = -1;
 
 	return new;
 }
@@ -4281,7 +4281,7 @@ int lxc_setup(struct lxc_handler *handler)
 		return -1;
 	}
 
-	if (lxc_conf->inherit_ns_fd[LXC_NS_UTS] == -1) {
+	if (lxc_conf->ns_share[LXC_NS_UTS] == -1) {
 		if (setup_utsname(lxc_conf->utsname)) {
 			ERROR("failed to setup the utsname for '%s'", name);
 			return -1;
diff --git a/src/lxc/conf.h b/src/lxc/conf.h
index 8e5235121..21e9c95c7 100644
--- a/src/lxc/conf.h
+++ b/src/lxc/conf.h
@@ -332,7 +332,7 @@ struct lxc_conf {
 	char *logfile;  // the logfile as specifed in config
 	int loglevel;   // loglevel as specifed in config (if any)
 
-	int inherit_ns_fd[LXC_NS_MAX];
+	int ns_share[LXC_NS_MAX];
 
 	int start_auto;
 	int start_delay;
diff --git a/src/lxc/lxc_start.c b/src/lxc/lxc_start.c
index 29db1881a..244a9b814 100644
--- a/src/lxc/lxc_start.c
+++ b/src/lxc/lxc_start.c
@@ -310,7 +310,7 @@ int main(int argc, char *argv[])
 		int fd = open_ns(pid, ns_info[i].proc_name);
 		if (fd < 0)
 			goto out;
-		conf->inherit_ns_fd[i] = fd;
+		conf->ns_share[i] = fd;
 	}
 
 	if (!my_args.daemonize) {
diff --git a/src/lxc/start.c b/src/lxc/start.c
index 9d148dae3..c07276580 100644
--- a/src/lxc/start.c
+++ b/src/lxc/start.c
@@ -848,7 +848,7 @@ static int lxc_spawn(struct lxc_handler *handler)
 	netpipe = -1;
 
 	for (i = 0; i < LXC_NS_MAX; i++)
-		if (handler->conf->inherit_ns_fd[i] != -1)
+		if (handler->conf->ns_share[i] != -1)
 			preserve_mask |= ns_info[i].clone_flag;
 
 	if (lxc_sync_init(handler))
@@ -860,7 +860,7 @@ static int lxc_spawn(struct lxc_handler *handler)
 		handler->clone_flags |= CLONE_NEWUSER;
 	}
 
-	if (handler->conf->inherit_ns_fd[LXC_NS_NET] == -1) {
+	if (handler->conf->ns_share[LXC_NS_NET] == -1) {
 		if (!lxc_requests_empty_network(handler))
 			handler->clone_flags |= CLONE_NEWNET;
 
@@ -894,13 +894,13 @@ static int lxc_spawn(struct lxc_handler *handler)
 		INFO("Inheriting a net namespace");
 	}
 
-	if (handler->conf->inherit_ns_fd[LXC_NS_IPC] == -1) {
+	if (handler->conf->ns_share[LXC_NS_IPC] == -1) {
 		handler->clone_flags |= CLONE_NEWIPC;
 	} else {
 		INFO("Inheriting an IPC namespace");
 	}
 
-	if (handler->conf->inherit_ns_fd[LXC_NS_UTS] == -1) {
+	if (handler->conf->ns_share[LXC_NS_UTS] == -1) {
 		handler->clone_flags |= CLONE_NEWUTS;
 	} else {
 		INFO("Inheriting a UTS namespace");
@@ -937,7 +937,7 @@ static int lxc_spawn(struct lxc_handler *handler)
 		free(errmsg);
 		goto out_delete_net;
 	}
-	if (attach_ns(handler->conf->inherit_ns_fd) < 0)
+	if (attach_ns(handler->conf->ns_share) < 0)
 		goto out_delete_net;
 
 	if (am_unpriv() && (nveths = count_veths(&handler->conf->network))) {

From bd323d93a03e1a5f6e3fc42033d13b4cfef2c019 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner at ubuntu.com>
Date: Mon, 28 May 2018 11:47:39 +0200
Subject: [PATCH 2/3] namespace: backport namespace simplifications

They have been sitting in master, stable-2.0 and stable-3.0 for quite a while
now and have made things much more reliable so let's backport them to
stable-1.0 as well.

Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
---
 src/lxc/namespace.c | 152 +++++++++++++++++++++++++++++++++++++++++++---------
 src/lxc/namespace.h | 133 +++++++++++++++++++++++++++++++++++++++++++--
 src/lxc/start.c     |   9 ----
 src/lxc/start.h     |  17 ------
 src/lxc/utils.h     |  12 +++++
 5 files changed, 269 insertions(+), 54 deletions(-)

diff --git a/src/lxc/namespace.c b/src/lxc/namespace.c
index 19b6593fd..6f5ea674b 100644
--- a/src/lxc/namespace.c
+++ b/src/lxc/namespace.c
@@ -21,17 +21,20 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include <unistd.h>
 #include <alloca.h>
 #include <errno.h>
+#include <fcntl.h>
+#include <sched.h>
 #include <signal.h>
+#include <unistd.h>
 #include <sys/param.h>
-#include <sys/types.h>
 #include <sys/stat.h>
-#include <fcntl.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
 
-#include "namespace.h"
 #include "log.h"
+#include "namespace.h"
+#include "utils.h"
 
 lxc_log_define(lxc_namespace, lxc);
 
@@ -53,41 +56,140 @@ pid_t lxc_clone(int (*fn)(void *), void *arg, int flags)
 		.arg = arg,
 	};
 
-	size_t stack_size = sysconf(_SC_PAGESIZE);
+	size_t stack_size = lxc_getpagesize();
 	void *stack = alloca(stack_size);
 	pid_t ret;
 
 #ifdef __ia64__
-	ret = __clone2(do_clone, stack,
-		       stack_size, flags | SIGCHLD, &clone_arg);
+	ret = __clone2(do_clone, stack, stack_size, flags | SIGCHLD, &clone_arg);
 #else
 	ret = clone(do_clone, stack  + stack_size, flags | SIGCHLD, &clone_arg);
 #endif
 	if (ret < 0)
-		ERROR("failed to clone (%#x): %s", flags, strerror(errno));
+		ERROR("Failed to clone (%#x): %s.", flags, strerror(errno));
 
 	return ret;
 }
 
-static const char * const namespaces_list[] = {
-	"MOUNT", "PID", "UTSNAME", "IPC",
-	"USER", "NETWORK"
-};
-static const int cloneflags_list[] = {
-	CLONE_NEWNS, CLONE_NEWPID, CLONE_NEWUTS, CLONE_NEWIPC,
-	CLONE_NEWUSER, CLONE_NEWNET
+/**
+ * This is based on raw_clone in systemd but adapted to our needs. This uses
+ * copy on write semantics and doesn't pass a stack. CLONE_VM is tricky and
+ * doesn't really matter to us so disallow it.
+ *
+ * The nice thing about this is that we get fork() behavior. That is
+ * lxc_raw_clone() returns 0 in the child and the child pid in the parent.
+ */
+pid_t lxc_raw_clone(unsigned long flags)
+{
+
+	/* These flags don't interest us at all so we don't jump through any hoops
+	 * of retrieving them and passing them to the kernel.
+	 */
+	errno = EINVAL;
+	if ((flags & (CLONE_VM | CLONE_PARENT_SETTID | CLONE_CHILD_SETTID |
+		      CLONE_CHILD_CLEARTID | CLONE_SETTLS)))
+		return -EINVAL;
+
+#if defined(__s390x__) || defined(__s390__) || defined(__CRIS__)
+	/* On s390/s390x and cris the order of the first and second arguments
+	 * of the system call is reversed.
+	 */
+	return (int)syscall(__NR_clone, NULL, flags | SIGCHLD);
+#elif defined(__sparc__) && defined(__arch64__)
+	{
+		/**
+		 * sparc64 always returns the other process id in %o0, and
+		 * a boolean flag whether this is the child or the parent in
+		 * %o1. Inline assembly is needed to get the flag returned
+		 * in %o1.
+		 */
+		int in_child;
+		int child_pid;
+		asm volatile("mov %2, %%g1\n\t"
+			     "mov %3, %%o0\n\t"
+			     "mov 0 , %%o1\n\t"
+			     "t 0x6d\n\t"
+			     "mov %%o1, %0\n\t"
+			     "mov %%o0, %1"
+			     : "=r"(in_child), "=r"(child_pid)
+			     : "i"(__NR_clone), "r"(flags | SIGCHLD)
+			     : "%o1", "%o0", "%g1");
+		if (in_child)
+			return 0;
+		else
+			return child_pid;
+	}
+#elif defined(__ia64__)
+	/* On ia64 the stack and stack size are passed as separate arguments. */
+	return (int)syscall(__NR_clone, flags | SIGCHLD, NULL, 0);
+#else
+	return (int)syscall(__NR_clone, flags | SIGCHLD, NULL);
+#endif
+}
+
+pid_t lxc_raw_clone_cb(int (*fn)(void *), void *args, unsigned long flags)
+{
+	pid_t pid;
+
+	pid = lxc_raw_clone(flags);
+	if (pid < 0)
+		return -1;
+
+	/* exit() is not thread-safe and might mess with the parent's signal
+	 * handlers and other stuff when exec() fails.
+	 */
+	if (pid == 0)
+		_exit(fn(args));
+
+	return pid;
+}
+
+/* Leave the user namespace at the first position in the array of structs so
+ * that we always attach to it first when iterating over the struct and using
+ * setns() to switch namespaces. This especially affects lxc_attach(): Suppose
+ * you cloned a new user namespace and mount namespace as an unprivileged user
+ * on the host and want to setns() to the mount namespace. This requires you to
+ * attach to the user namespace first otherwise the kernel will fail this check:
+ *
+ *        if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
+ *            !ns_capable(current_user_ns(), CAP_SYS_CHROOT) ||
+ *            !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+ *            return -EPERM;
+ *
+ *    in
+ *
+ *        linux/fs/namespace.c:mntns_install().
+ */
+const struct ns_info ns_info[LXC_NS_MAX] = {
+	[LXC_NS_USER]    = { "user",   CLONE_NEWUSER,   "CLONE_NEWUSER",   "LXC_USER_NS"    },
+	[LXC_NS_MNT]    =  { "mnt",    CLONE_NEWNS,     "CLONE_NEWNS",     "LXC_MNT_NS"     },
+	[LXC_NS_PID]    =  { "pid",    CLONE_NEWPID,    "CLONE_NEWPID",    "LXC_PID_NS"     },
+	[LXC_NS_UTS]    =  { "uts",    CLONE_NEWUTS,    "CLONE_NEWUTS",    "LXC_UTS_NS"     },
+	[LXC_NS_IPC]    =  { "ipc",    CLONE_NEWIPC,    "CLONE_NEWIPC",    "LXC_IPC_NS"     },
+	[LXC_NS_NET]    =  { "net",    CLONE_NEWNET,    "CLONE_NEWNET",    "LXC_NET_NS"     },
+	[LXC_NS_CGROUP] =  { "cgroup", CLONE_NEWCGROUP, "CLONE_NEWCGROUP", "LXC_CGROUP_NS"  }
 };
 
-int lxc_namespace_2_cloneflag(char *namespace)
+int lxc_namespace_2_cloneflag(const char *namespace)
+{
+	int i;
+	for (i = 0; i < LXC_NS_MAX; i++)
+		if (!strcasecmp(ns_info[i].proc_name, namespace))
+			return ns_info[i].clone_flag;
+
+	ERROR("Invalid namespace name \"%s\"", namespace);
+	return -EINVAL;
+}
+
+int lxc_namespace_2_ns_idx(const char *namespace)
 {
-	int i, len;
-	len = sizeof(namespaces_list)/sizeof(namespaces_list[0]);
-	for (i = 0; i < len; i++)
-		if (!strcmp(namespaces_list[i], namespace))
-			return cloneflags_list[i];
-
-	ERROR("invalid namespace name %s", namespace);
-	return -1;
+	int i;
+	for (i = 0; i < LXC_NS_MAX; i++)
+		if (!strcmp(ns_info[i].proc_name, namespace))
+			return i;
+
+	ERROR("Invalid namespace name \"%s\"", namespace);
+	return -EINVAL;
 }
 
 int lxc_fill_namespace_flags(char *flaglist, int *flags)
@@ -96,7 +198,7 @@ int lxc_fill_namespace_flags(char *flaglist, int *flags)
 	int aflag;
 
 	if (!flaglist) {
-		ERROR("need at least one namespace to unshare");
+		ERROR("At least one namespace is needed.");
 		return -1;
 	}
 
diff --git a/src/lxc/namespace.h b/src/lxc/namespace.h
index 28f17e687..4bfe9c4f5 100644
--- a/src/lxc/namespace.h
+++ b/src/lxc/namespace.h
@@ -23,17 +23,53 @@
 #ifndef __LXC_NAMESPACE_H
 #define __LXC_NAMESPACE_H
 
-#include <sys/syscall.h>
 #include <sched.h>
+#include <unistd.h>
+#include <sys/syscall.h>
 
 #include "config.h"
 
+#ifndef CLONE_PARENT_SETTID
+#define CLONE_PARENT_SETTID 0x00100000
+#endif
+
+#ifndef CLONE_CHILD_CLEARTID
+#define CLONE_CHILD_CLEARTID 0x00200000
+#endif
+
+#ifndef CLONE_CHILD_SETTID
+#define CLONE_CHILD_SETTID 0x01000000
+#endif
+
+#ifndef CLONE_VFORK
+#define CLONE_VFORK 0x00004000
+#endif
+
+#ifndef CLONE_THREAD
+#define CLONE_THREAD 0x00010000
+#endif
+
+#ifndef CLONE_SETTLS
+#define CLONE_SETTLS 0x00080000
+#endif
+
+#ifndef CLONE_VM
+#define CLONE_VM 0x00000100
+#endif
+
+#ifndef CLONE_FILES
+#define CLONE_FILES 0x00000400
+#endif
+
 #ifndef CLONE_FS
 #  define CLONE_FS                0x00000200
 #endif
 #ifndef CLONE_NEWNS
 #  define CLONE_NEWNS             0x00020000
 #endif
+#ifndef CLONE_NEWCGROUP
+#  define CLONE_NEWCGROUP         0x02000000
+#endif
 #ifndef CLONE_NEWUTS
 #  define CLONE_NEWUTS            0x04000000
 #endif
@@ -50,6 +86,24 @@
 #  define CLONE_NEWNET            0x40000000
 #endif
 
+enum {
+	LXC_NS_USER,
+	LXC_NS_MNT,
+	LXC_NS_PID,
+	LXC_NS_UTS,
+	LXC_NS_IPC,
+	LXC_NS_NET,
+	LXC_NS_CGROUP,
+	LXC_NS_MAX
+};
+
+extern const struct ns_info {
+	const char *proc_name;
+	int clone_flag;
+	const char *flag_name;
+	const char *env_name;
+} ns_info[LXC_NS_MAX];
+
 #if defined(__ia64__)
 int __clone2(int (*__fn) (void *__arg), void *__child_stack_base,
              size_t __child_stack_size, int __flags, void *__arg, ...);
@@ -59,10 +113,83 @@ int clone(int (*fn)(void *), void *child_stack,
 	/* pid_t *ptid, struct user_desc *tls, pid_t *ctid */ );
 #endif
 
-
+/**
+ * lxc_clone() - create a new process
+ *
+ * - allocate stack:
+ *   This function allocates a new stack the size of a page and passes it to the
+ *   kernel.
+ *
+ * - support all CLONE_*flags:
+ *   This function supports all CLONE_* flags. If in doubt or not sufficiently
+ *   familiar with process creation in the kernel and interactions with libcs
+ *   this function should be used.
+ *
+ * - pthread_atfork() handlers depending on libc:
+ *   Whether this function runs pthread_atfork() handlers depends on the
+ *   corresponding libc wrapper. glibc currently does not run pthread_atfork()
+ *   handlers but does not guarantee that they are not. Other libcs might or
+ *   might not run pthread_atfork() handlers. If you require guarantees please
+ *   refer to the lxc_raw_clone*() functions below.
+ *
+ * - should call lxc_raw_getpid():
+ *   The child should use lxc_raw_getpid() to retrieve its pid.
+ */
 extern pid_t lxc_clone(int (*fn)(void *), void *arg, int flags);
 
-extern int lxc_namespace_2_cloneflag(char *namespace);
+/**
+ * lxc_raw_clone() - create a new process
+ *
+ * - fork() behavior:
+ *   This function returns 0 in the child and > 0 in the parent.
+ *
+ * - copy-on-write:
+ *   This function does not allocate a new stack and relies on copy-on-write
+ *   semantics.
+ *
+ * - supports subset of CLONE_* flags:
+ *   lxc_raw_clone() intentionally only supports a subset of the flags available
+ *   to the actual system call. Please refer to the implementation what flags
+ *   cannot be used. Also, please don't assume that just because a flag isn't
+ *   explicitly checked for as being unsupported that it is supported. If in
+ *   doubt or not sufficiently familiar with process creation in the kernel and
+ *   interactions with libcs lxc_clone() should be used instead.
+ *
+ * - no pthread_atfork() handlers:
+ *   This function circumvents - as much as this is possible - any libc
+ *   wrappers and thus does not run any pthread_atfork() handlers. Make sure
+ *   that this is safe to do in the context you are trying to call this
+ *   function.
+ *
+ * - must call lxc_raw_getpid():
+ *   The child must use lxc_raw_getpid() to retrieve its pid.
+ */
+extern pid_t lxc_raw_clone(unsigned long flags);
+/**
+ * lxc_raw_clone_cb() - create a new process
+ *
+ * - non-fork() behavior:
+ *   Function does return pid of the child or -1 on error. Pass in a callback
+ *   function via the "fn" argument that gets executed in the child process. The
+ *   "args" argument is passed to "fn".
+ *
+ * All other comments that apply to lxc_raw_clone() apply to lxc_raw_clone_cb()
+ * as well.
+ */
+extern pid_t lxc_raw_clone_cb(int (*fn)(void *), void *args,
+			      unsigned long flags);
+
+extern int lxc_namespace_2_cloneflag(const char *namespace);
+extern int lxc_namespace_2_ns_idx(const char *namespace);
 extern int lxc_fill_namespace_flags(char *flaglist, int *flags);
 
+/**
+ * Because of older glibc's pid cache (up to 2.25) whenever clone() is called
+ * the child must retrieve its own pid via lxc_raw_getpid().
+ */
+static inline pid_t lxc_raw_getpid(void)
+{
+	return (pid_t) syscall(SYS_getpid);
+}
+
 #endif
diff --git a/src/lxc/start.c b/src/lxc/start.c
index c07276580..226e67ed0 100644
--- a/src/lxc/start.c
+++ b/src/lxc/start.c
@@ -74,15 +74,6 @@
 
 lxc_log_define(lxc_start, lxc);
 
-const struct ns_info ns_info[LXC_NS_MAX] = {
-	[LXC_NS_MNT] = {"mnt", CLONE_NEWNS},
-	[LXC_NS_PID] = {"pid", CLONE_NEWPID},
-	[LXC_NS_UTS] = {"uts", CLONE_NEWUTS},
-	[LXC_NS_IPC] = {"ipc", CLONE_NEWIPC},
-	[LXC_NS_USER] = {"user", CLONE_NEWUSER},
-	[LXC_NS_NET] = {"net", CLONE_NEWNET}
-};
-
 static void print_top_failing_dir(const char *path)
 {
 	size_t len = strlen(path);
diff --git a/src/lxc/start.h b/src/lxc/start.h
index 7afa37a39..7f24d6060 100644
--- a/src/lxc/start.h
+++ b/src/lxc/start.h
@@ -41,23 +41,6 @@ struct lxc_operations {
 
 struct cgroup_desc;
 
-enum {
-	LXC_NS_MNT,
-	LXC_NS_PID,
-	LXC_NS_UTS,
-	LXC_NS_IPC,
-	LXC_NS_USER,
-	LXC_NS_NET,
-	LXC_NS_MAX
-};
-
-struct ns_info {
-	const char *proc_name;
-	int clone_flag;
-};
-
-extern const struct ns_info ns_info[LXC_NS_MAX];
-
 struct lxc_handler {
 	pid_t pid;
 	char *name;
diff --git a/src/lxc/utils.h b/src/lxc/utils.h
index 8859eeb74..8633680b0 100644
--- a/src/lxc/utils.h
+++ b/src/lxc/utils.h
@@ -319,4 +319,16 @@ int null_stdfds(void);
 int safe_mount(const char *src, const char *dest, const char *fstype,
 		unsigned long flags, const void *data, const char *rootfs);
 int set_stdfds(int fd);
+
+static inline uint64_t lxc_getpagesize(void)
+{
+	int64_t pgsz;
+
+	pgsz = sysconf(_SC_PAGESIZE);
+	if (pgsz <= 0)
+		pgsz = 1 << 12;
+
+	return pgsz;
+}
+
 #endif /* __LXC_UTILS_H */

From b749cb06590c19ce653371ee710c297934853317 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner at ubuntu.com>
Date: Mon, 28 May 2018 12:46:32 +0200
Subject: [PATCH 3/3] start: backport namespace preservation logic

They have been sitting in master, stable-2.0 and stable-3.0 for quite a while
now and have made things much more reliable so let's backport them to
stable-1.0 as well. This will allow users to run containers with stable-1.0 on
2.6.32 kernels, where setns() and namespace preservation are not supported.

Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
---
 src/lxc/commands.c  |   2 +-
 src/lxc/error.h     |   2 +
 src/lxc/namespace.c |   3 +-
 src/lxc/namespace.h |   3 -
 src/lxc/start.c     | 325 ++++++++++++++++++++++++++++++++--------------------
 src/lxc/start.h     |  18 ++-
 src/lxc/utils.c     |  21 ++++
 src/lxc/utils.h     |   2 +
 8 files changed, 244 insertions(+), 132 deletions(-)

diff --git a/src/lxc/commands.c b/src/lxc/commands.c
index fcba5a9f1..5585dbc40 100644
--- a/src/lxc/commands.c
+++ b/src/lxc/commands.c
@@ -434,7 +434,7 @@ int lxc_cmd_get_clone_flags(const char *name, const char *lxcpath)
 static int lxc_cmd_get_clone_flags_callback(int fd, struct lxc_cmd_req *req,
 					    struct lxc_handler *handler)
 {
-	struct lxc_cmd_rsp rsp = { .data = INT_TO_PTR(handler->clone_flags) };
+	struct lxc_cmd_rsp rsp = { .data = INT_TO_PTR(handler->ns_clone_flags) };
 
 	return lxc_cmd_rsp_send(fd, &rsp);
 }
diff --git a/src/lxc/error.h b/src/lxc/error.h
index d5d60de0f..6fe474a13 100644
--- a/src/lxc/error.h
+++ b/src/lxc/error.h
@@ -23,6 +23,8 @@
 #ifndef __LXC_ERROR_H
 #define __LXC_ERROR_H
 
+#define LXC_CLONE_ERROR "Failed to clone a new set of namespaces"
+
 extern int  lxc_error_set_and_log(int pid, int status);
 
 #endif
diff --git a/src/lxc/namespace.c b/src/lxc/namespace.c
index 6f5ea674b..b6ee3abd4 100644
--- a/src/lxc/namespace.c
+++ b/src/lxc/namespace.c
@@ -166,8 +166,7 @@ const struct ns_info ns_info[LXC_NS_MAX] = {
 	[LXC_NS_PID]    =  { "pid",    CLONE_NEWPID,    "CLONE_NEWPID",    "LXC_PID_NS"     },
 	[LXC_NS_UTS]    =  { "uts",    CLONE_NEWUTS,    "CLONE_NEWUTS",    "LXC_UTS_NS"     },
 	[LXC_NS_IPC]    =  { "ipc",    CLONE_NEWIPC,    "CLONE_NEWIPC",    "LXC_IPC_NS"     },
-	[LXC_NS_NET]    =  { "net",    CLONE_NEWNET,    "CLONE_NEWNET",    "LXC_NET_NS"     },
-	[LXC_NS_CGROUP] =  { "cgroup", CLONE_NEWCGROUP, "CLONE_NEWCGROUP", "LXC_CGROUP_NS"  }
+	[LXC_NS_NET]    =  { "net",    CLONE_NEWNET,    "CLONE_NEWNET",    "LXC_NET_NS"     }
 };
 
 int lxc_namespace_2_cloneflag(const char *namespace)
diff --git a/src/lxc/namespace.h b/src/lxc/namespace.h
index 4bfe9c4f5..e879f4dff 100644
--- a/src/lxc/namespace.h
+++ b/src/lxc/namespace.h
@@ -67,9 +67,6 @@
 #ifndef CLONE_NEWNS
 #  define CLONE_NEWNS             0x00020000
 #endif
-#ifndef CLONE_NEWCGROUP
-#  define CLONE_NEWCGROUP         0x02000000
-#endif
 #ifndef CLONE_NEWUTS
 #  define CLONE_NEWUTS            0x04000000
 #endif
diff --git a/src/lxc/start.c b/src/lxc/start.c
index 226e67ed0..1165f663d 100644
--- a/src/lxc/start.c
+++ b/src/lxc/start.c
@@ -76,100 +76,107 @@ lxc_log_define(lxc_start, lxc);
 
 static void print_top_failing_dir(const char *path)
 {
-	size_t len = strlen(path);
-	char *copy = alloca(len+1), *p, *e, saved;
-	strcpy(copy, path);
+	int ret;
+	size_t len;
+	char *copy, *e, *p, saved;
 
+	len = strlen(path);
+	copy = alloca(len + 1);
+	strcpy(copy, path);
 	p = copy;
 	e = copy + len;
 	while (p < e) {
-		while (p < e && *p == '/') p++;
-		while (p < e && *p != '/') p++;
+		while (p < e && *p == '/')
+			p++;
+
+		while (p < e && *p != '/')
+			p++;
+
 		saved = *p;
 		*p = '\0';
-		if (access(copy, X_OK)) {
-			SYSERROR("could not access %s.  Please grant it 'x' " \
-			      "access, or add an ACL for the container root.",
-			      copy);
+
+		ret = access(copy, X_OK);
+		if (ret != 0) {
+			SYSERROR("Could not access %s. Please grant it x "
+				 "access, or add an ACL for the container "
+				 "root", copy);
 			return;
 		}
 		*p = saved;
 	}
 }
 
-static void close_ns(int ns_fd[LXC_NS_MAX]) {
+static void lxc_put_nsfds(struct lxc_handler *handler)
+{
 	int i;
 
 	for (i = 0; i < LXC_NS_MAX; i++) {
-		if (ns_fd[i] > -1) {
-			close(ns_fd[i]);
-			ns_fd[i] = -1;
-		}
+		if (handler->nsfd[i] < 0)
+			continue;
+
+		close(handler->nsfd[i]);
+		handler->nsfd[i] = -EBADF;
 	}
 }
 
-/*
- * preserve_ns: open /proc/@pid/ns/@ns for each namespace specified
- * in clone_flags.
- * Return true on success, false on failure.  On failure, leave an error
- * message in *errmsg, which caller must free.
- */
-static
-bool preserve_ns(int ns_fd[LXC_NS_MAX], int clone_flags, pid_t pid, char **errmsg) {
-	int i, ret;
-	char path[MAXPATHLEN];
-
-	for (i = 0; i < LXC_NS_MAX; i++)
-		ns_fd[i] = -1;
+static int lxc_try_preserve_ns(const int pid, const char *ns)
+{
+	int fd;
 
-	snprintf(path, MAXPATHLEN, "/proc/%d/ns", pid);
-	if (access(path, X_OK)) {
-		if (asprintf(errmsg, "Kernel does not support setns.") == -1)
-			*errmsg = NULL;
-		return false;
-	}
+	fd = lxc_preserve_ns(pid, ns);
+	if (fd < 0) {
+		if (errno != ENOENT) {
+			SYSERROR("Failed to preserve %s namespace", ns);
+			return -EINVAL;
+		}
 
-	for (i = 0; i < LXC_NS_MAX; i++) {
-		if ((clone_flags & ns_info[i].clone_flag) == 0)
-			continue;
-		snprintf(path, MAXPATHLEN, "/proc/%d/ns/%s", pid,
-		         ns_info[i].proc_name);
-		ns_fd[i] = open(path, O_RDONLY | O_CLOEXEC);
-		if (ns_fd[i] < 0)
-			goto error;
+		WARN("%s - Kernel does not support preserving %s namespaces",
+		     strerror(errno), ns);
+		return -EOPNOTSUPP;
 	}
 
-	return true;
-
-error:
-	if (errno == ENOENT) {
-		ret = asprintf(errmsg, "Kernel does not support setns for %s",
-			ns_info[i].proc_name);
-	} else {
-		ret = asprintf(errmsg, "Failed to open %s: %s",
-			path, strerror(errno));
-	}
-	if (ret == -1)
-		*errmsg = NULL;
-	close_ns(ns_fd);
-	return false;
+	return fd;
 }
 
-static int attach_ns(const int ns_fd[LXC_NS_MAX]) {
+/* lxc_try_preserve_namespaces: open /proc/@pid/ns/@ns for each namespace
+ * specified in ns_clone_flags.
+ * Return true on success, false on failure.
+ */
+static bool lxc_try_preserve_namespaces(struct lxc_handler *handler,
+					int ns_clone_flags, pid_t pid)
+{
 	int i;
 
+	for (i = 0; i < LXC_NS_MAX; i++)
+		handler->nsfd[i] = -EBADF;
+
 	for (i = 0; i < LXC_NS_MAX; i++) {
-		if (ns_fd[i] < 0)
+		int fd;
+
+		if ((ns_clone_flags & ns_info[i].clone_flag) == 0)
 			continue;
 
-		if (setns(ns_fd[i], 0) != 0)
-			goto error;
+		fd = lxc_try_preserve_ns(pid, ns_info[i].proc_name);
+		if (fd < 0) {
+			handler->nsfd[i] = -EBADF;
+
+			/* Do not fail to start container on kernels that do
+			 * not support interacting with namespaces through
+			 * /proc.
+			 */
+			if (fd == -EOPNOTSUPP)
+				continue;
+
+			lxc_put_nsfds(handler);
+			return false;
+		}
+
+		handler->nsfd[i] = fd;
+		DEBUG("Preserved %s namespace via fd %d", ns_info[i].proc_name,
+		      handler->nsfd[i]);
 	}
-	return 0;
 
-error:
-	SYSERROR("failed to set namespace '%s'", ns_info[i].proc_name);
-	return -1;
+	return true;
 }
 
 static int match_fd(int fd)
@@ -481,10 +488,11 @@ static void lxc_fini(const char *name, struct lxc_handler *handler)
 	lxc_set_state(name, handler, STOPPING);
 
 	for (i = 0; i < LXC_NS_MAX; i++) {
-		if (handler->nsfd[i] != -1) {
-			close(handler->nsfd[i]);
-			handler->nsfd[i] = -1;
-		}
+		if (handler->nsfd[i] < 0)
+			continue;
+
+		close(handler->nsfd[i]);
+		handler->nsfd[i] = -EBADF;
 	}
 	lxc_set_state(name, handler, STOPPED);
 
@@ -663,15 +671,17 @@ static int do_start(void *data)
 	if (lxc_sync_wait_parent(handler, LXC_SYNC_STARTUP))
 		return -1;
 
-	/* Unshare CLONE_NEWNET after CLONE_NEWUSER  - see
-	  https://github.com/lxc/lxd/issues/1978 */
-	if ((handler->clone_flags & (CLONE_NEWNET | CLONE_NEWUSER)) ==
-			(CLONE_NEWNET | CLONE_NEWUSER)) {
+	/* Unshare CLONE_NEWNET after CLONE_NEWUSER. See
+	 * https://github.com/lxc/lxd/issues/1978.
+	 */
+	if ((handler->ns_clone_flags & (CLONE_NEWNET | CLONE_NEWUSER)) ==
+	    (CLONE_NEWNET | CLONE_NEWUSER)) {
 		ret = unshare(CLONE_NEWNET);
 		if (ret < 0) {
-			SYSERROR("Error unsharing network namespace");
+			SYSERROR("Failed to unshare CLONE_NEWNET");
 			goto out_warn_father;
 		}
+		INFO("Unshared CLONE_NEWNET");
 	}
 
 	/* Tell the parent task it can begin to configure the
@@ -826,35 +836,96 @@ static int save_phys_nics(struct lxc_conf *conf)
 	return 0;
 }
 
+int resolve_clone_flags(struct lxc_handler *handler)
+{
+	int i;
+	struct lxc_conf *conf = handler->conf;
+
+	for (i = 0; i < LXC_NS_MAX; i++) {
+
+		if (conf->ns_share[i] < 0)
+			continue;
+
+		handler->ns_clone_flags &= ~ns_info[i].clone_flag;
+		TRACE("Sharing %s namespace", ns_info[i].proc_name);
+	}
+
+	return 0;
+}
+
+/* Note that this function is used with clone(CLONE_VM). Some glibc versions
+ * used to reset the pid/tid to -1 when CLONE_VM was used without CLONE_THREAD.
+ * But since the memory between parent and child is shared on CLONE_VM this
+ * would invalidate the getpid() cache that glibc used to maintain and so
+ * getpid() in the child would return the parent's pid. This is all fixed in
+ * newer glibc versions where the getpid() cache is removed and the pid/tid is
+ * not reset anymore.
+ * However, if for whatever reason you - dear committer - somehow need to get the
+ * pid of the dummy intermediate process for do_share_ns() you need to call
+ * lxc_raw_getpid(). The next lxc_raw_clone() call does not employ CLONE_VM and
+ * will be fine.
+ */
+static inline int do_share_ns(void *arg)
+{
+	int i, flags, ret;
+	struct lxc_handler *handler = arg;
+
+	for (i = 0; i < LXC_NS_MAX; i++) {
+		if (handler->conf->ns_share[i] < 0)
+			continue;
+
+		ret = setns(handler->conf->ns_share[i], 0);
+		if (ret < 0) {
+			/*
+			 * Note that joining a user and/or mount namespace
+			 * requires the process is not multithreaded otherwise
+			 * setns() will fail here.
+			 */
+			SYSERROR("Failed to inherit %s namespace",
+				 ns_info[i].proc_name);
+			return -1;
+		}
+
+		DEBUG("Inherited %s namespace", ns_info[i].proc_name);
+	}
+
+	flags = handler->ns_on_clone_flags;
+	flags |= CLONE_PARENT;
+	handler->pid = lxc_raw_clone_cb(do_start, handler, flags);
+	if (handler->pid < 0)
+		return -1;
+
+	return 0;
+}
+
 static int lxc_spawn(struct lxc_handler *handler)
 {
+	int i, nveths, ret;
+	int netpipepair[2];
 	int failed_before_rename = 0;
 	const char *name = handler->name;
-	char *errmsg = NULL;
-	bool cgroups_connected = false;
-	int saved_ns_fd[LXC_NS_MAX];
-	int preserve_mask = 0, i, flags;
-	int netpipepair[2], nveths;
+	bool cgroups_connected = false, share_ns = false;
 
 	netpipe = -1;
 
-	for (i = 0; i < LXC_NS_MAX; i++)
-		if (handler->conf->ns_share[i] != -1)
-			preserve_mask |= ns_info[i].clone_flag;
+	for (i = 0; i < LXC_NS_MAX; i++) {
+		if (handler->conf->ns_share[i] < 0)
+			continue;
+
+		share_ns = true;
+		break;
+	}
 
 	if (lxc_sync_init(handler))
 		return -1;
 
-	handler->clone_flags = CLONE_NEWPID|CLONE_NEWNS;
-	if (!lxc_list_empty(&handler->conf->id_map)) {
-		INFO("Cloning a new user namespace");
-		handler->clone_flags |= CLONE_NEWUSER;
+	ret = resolve_clone_flags(handler);
+	if (ret < 0) {
+		lxc_sync_fini(handler);
+		return -1;
 	}
 
-	if (handler->conf->ns_share[LXC_NS_NET] == -1) {
-		if (!lxc_requests_empty_network(handler))
-			handler->clone_flags |= CLONE_NEWNET;
-
+	if (handler->ns_clone_flags & CLONE_NEWNET) {
 		if (!lxc_list_empty(&handler->conf->network)) {
 
 			/* Find gateway addresses from the link device, which is
@@ -885,19 +956,6 @@ static int lxc_spawn(struct lxc_handler *handler)
 		INFO("Inheriting a net namespace");
 	}
 
-	if (handler->conf->ns_share[LXC_NS_IPC] == -1) {
-		handler->clone_flags |= CLONE_NEWIPC;
-	} else {
-		INFO("Inheriting an IPC namespace");
-	}
-
-	if (handler->conf->ns_share[LXC_NS_UTS] == -1) {
-		handler->clone_flags |= CLONE_NEWUTS;
-	} else {
-		INFO("Inheriting a UTS namespace");
-	}
-
-
 	if (!cgroup_init(handler)) {
 		ERROR("failed initializing cgroup support");
 		goto out_delete_net;
@@ -922,15 +980,6 @@ static int lxc_spawn(struct lxc_handler *handler)
 			INFO("failed to pin the container's rootfs");
 	}
 
-	if (!preserve_ns(saved_ns_fd, preserve_mask, getpid(), &errmsg)) {
-		SYSERROR("Failed to preserve requested namespaces: %s",
-			errmsg ? errmsg : "(Out of memory)");
-		free(errmsg);
-		goto out_delete_net;
-	}
-	if (attach_ns(handler->conf->ns_share) < 0)
-		goto out_delete_net;
-
 	if (am_unpriv() && (nveths = count_veths(&handler->conf->network))) {
 		if (pipe(netpipepair) < 0) {
 			SYSERROR("Error creating pipe");
@@ -940,24 +989,50 @@ static int lxc_spawn(struct lxc_handler *handler)
 		netpipe = netpipepair[0];
 	}
 
-	/* Create a process in a new set of namespaces */
-	flags = handler->clone_flags;
-	if (handler->clone_flags & CLONE_NEWUSER)
-		flags &= ~CLONE_NEWNET;
-	handler->pid = lxc_clone(do_start, handler, handler->clone_flags);
+	/* Create a process in a new set of namespaces. */
+	handler->ns_on_clone_flags = handler->ns_clone_flags;
+	if (handler->ns_clone_flags & CLONE_NEWUSER) {
+		/* If CLONE_NEWUSER and CLONE_NEWNET was requested, we need to
+		 * clone a new user namespace first and only later unshare our
+		 * network namespace to ensure that network devices ownership is
+		 * set up correctly.
+		 */
+		handler->ns_on_clone_flags &= ~CLONE_NEWNET;
+	}
+
+	if (share_ns) {
+		pid_t attacher_pid;
+
+		attacher_pid = lxc_clone(do_share_ns, handler,
+					 CLONE_VFORK | CLONE_VM | CLONE_FILES);
+		if (attacher_pid < 0) {
+			SYSERROR(LXC_CLONE_ERROR);
+			goto out_delete_net;
+		}
+
+		ret = wait_for_pid(attacher_pid);
+		if (ret < 0) {
+			SYSERROR("Intermediate process failed");
+			goto out_delete_net;
+		}
+	} else {
+		handler->pid = lxc_raw_clone_cb(do_start, handler,
+						handler->ns_on_clone_flags);
+	}
 	if (handler->pid < 0) {
-		SYSERROR("failed to fork into a new namespace");
+		SYSERROR(LXC_CLONE_ERROR);
 		goto out_delete_net;
 	}
+	TRACE("Cloned child process %d", handler->pid);
 
-	if (!preserve_ns(handler->nsfd, handler->clone_flags | preserve_mask, handler->pid, &errmsg)) {
-		INFO("Failed to store namespace references for stop hook: %s",
-			errmsg ? errmsg : "(Out of memory)");
-		free(errmsg);
-	}
+	for (i = 0; i < LXC_NS_MAX; i++)
+		if (handler->ns_on_clone_flags & ns_info[i].clone_flag)
+			INFO("Cloned %s", ns_info[i].flag_name);
 
-	if (attach_ns(saved_ns_fd))
-		WARN("failed to restore saved namespaces");
+	if (!lxc_try_preserve_namespaces(handler, handler->ns_on_clone_flags, handler->pid)) {
+		ERROR("Failed to preserve cloned namespaces for lxc.hook.stop");
+		goto out_delete_net;
+	}
 
 	lxc_sync_fini_child(handler);
 
@@ -1000,7 +1075,7 @@ static int lxc_spawn(struct lxc_handler *handler)
 		goto out_delete_net;
 
 	/* Create the network configuration */
-	if (handler->clone_flags & CLONE_NEWNET) {
+	if (handler->ns_clone_flags & CLONE_NEWNET) {
 		if (lxc_assign_network(&handler->conf->network, handler->pid)) {
 			ERROR("failed to create the configured network");
 			goto out_delete_net;
@@ -1067,7 +1142,7 @@ static int lxc_spawn(struct lxc_handler *handler)
 out_delete_net:
 	if (cgroups_connected)
 		cgroup_disconnect();
-	if (handler->clone_flags & CLONE_NEWNET)
+	if (handler->ns_clone_flags & CLONE_NEWNET)
 		lxc_delete_network(handler);
 out_abort:
 	lxc_abort(name, handler);
diff --git a/src/lxc/start.h b/src/lxc/start.h
index 7f24d6060..3de2f75a4 100644
--- a/src/lxc/start.h
+++ b/src/lxc/start.h
@@ -42,10 +42,26 @@ struct lxc_operations {
 struct cgroup_desc;
 
 struct lxc_handler {
+        /* Record the clone flags for namespaces that the container requested.
+	 *
+	 * @ns_clone_flags
+	 * - All clone flags that were requested.
+	 *
+	 * @ns_on_clone_flags
+	 * - The clone flags for namespaces to actually use when calling
+	 *   lxc_clone(): After the container has started ns_on_clone_flags will
+	 *   list the clone flags that were unshare()ed rather than clone()ed
+	 *   because of ordering requirements (e.g. CLONE_NEWNET and
+	 *   CLONE_NEWUSER) or implementation details.
+	 */
+	struct /* lxc_ns */ {
+		int ns_clone_flags;
+		int ns_on_clone_flags;
+	};
+
 	pid_t pid;
 	char *name;
 	lxc_state_t state;
-	int clone_flags;
 	int sigfd;
 	sigset_t oldmask;
 	struct lxc_conf *conf;
diff --git a/src/lxc/utils.c b/src/lxc/utils.c
index 97892ad1e..677711818 100644
--- a/src/lxc/utils.c
+++ b/src/lxc/utils.c
@@ -1392,3 +1392,24 @@ int set_stdfds(int fd)
 
 	return 0;
 }
+
+int lxc_preserve_ns(const int pid, const char *ns)
+{
+	int ret;
+/* 5 /proc + 21 /int_as_str + 3 /ns + 20 /NS_NAME + 1 \0 */
+#define __NS_PATH_LEN 50
+	char path[__NS_PATH_LEN];
+
+	/* This way we can use this function to also check whether namespaces
+	 * are supported by the kernel by passing in NULL or the empty
+	 * string.
+	 */
+	ret = snprintf(path, __NS_PATH_LEN, "/proc/%d/ns%s%s", pid,
+		       !ns || strcmp(ns, "") == 0 ? "" : "/",
+		       !ns || strcmp(ns, "") == 0 ? "" : ns);
+	errno = EFBIG;
+	if (ret < 0 || (size_t)ret >= __NS_PATH_LEN)
+		return -EFBIG;
+
+	return open(path, O_RDONLY | O_CLOEXEC);
+}
diff --git a/src/lxc/utils.h b/src/lxc/utils.h
index 8633680b0..984a65708 100644
--- a/src/lxc/utils.h
+++ b/src/lxc/utils.h
@@ -331,4 +331,6 @@ static inline uint64_t lxc_getpagesize(void)
 	return pgsz;
 }
 
+extern int lxc_preserve_ns(const int pid, const char *ns);
+
 #endif /* __LXC_UTILS_H */


More information about the lxc-devel mailing list