[lxc-devel] [lxc/stable-1.0] stable-1.0: support pre-setns() kernels
brauner on Github
lxc-bot at linuxcontainers.org
Mon May 28 10:50:00 UTC 2018
A non-text attachment was scrubbed...
Name: not available
Type: text/x-mailbox
Size: 458 bytes
Desc: not available
URL: <http://lists.linuxcontainers.org/pipermail/lxc-devel/attachments/20180528/1733ca6f/attachment.bin>
-------------- next part --------------
From ac3bac8ca962984f5f362b2c004bcb30fdd411b1 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner at ubuntu.com>
Date: Mon, 28 May 2018 11:30:35 +0200
Subject: [PATCH 1/3] conf: inherit_ns_fd -> ns_share
Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
---
src/lxc/conf.c | 4 ++--
src/lxc/conf.h | 2 +-
src/lxc/lxc_start.c | 2 +-
src/lxc/start.c | 10 +++++-----
4 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/src/lxc/conf.c b/src/lxc/conf.c
index 5d583d5f4..5d2a7e1c5 100644
--- a/src/lxc/conf.c
+++ b/src/lxc/conf.c
@@ -3043,7 +3043,7 @@ struct lxc_conf *lxc_conf_init(void)
new->tmp_umount_proc = 0;
for (i = 0; i < LXC_NS_MAX; i++)
- new->inherit_ns_fd[i] = -1;
+ new->ns_share[i] = -1;
return new;
}
@@ -4281,7 +4281,7 @@ int lxc_setup(struct lxc_handler *handler)
return -1;
}
- if (lxc_conf->inherit_ns_fd[LXC_NS_UTS] == -1) {
+ if (lxc_conf->ns_share[LXC_NS_UTS] == -1) {
if (setup_utsname(lxc_conf->utsname)) {
ERROR("failed to setup the utsname for '%s'", name);
return -1;
diff --git a/src/lxc/conf.h b/src/lxc/conf.h
index 8e5235121..21e9c95c7 100644
--- a/src/lxc/conf.h
+++ b/src/lxc/conf.h
@@ -332,7 +332,7 @@ struct lxc_conf {
char *logfile; // the logfile as specifed in config
int loglevel; // loglevel as specifed in config (if any)
- int inherit_ns_fd[LXC_NS_MAX];
+ int ns_share[LXC_NS_MAX];
int start_auto;
int start_delay;
diff --git a/src/lxc/lxc_start.c b/src/lxc/lxc_start.c
index 29db1881a..244a9b814 100644
--- a/src/lxc/lxc_start.c
+++ b/src/lxc/lxc_start.c
@@ -310,7 +310,7 @@ int main(int argc, char *argv[])
int fd = open_ns(pid, ns_info[i].proc_name);
if (fd < 0)
goto out;
- conf->inherit_ns_fd[i] = fd;
+ conf->ns_share[i] = fd;
}
if (!my_args.daemonize) {
diff --git a/src/lxc/start.c b/src/lxc/start.c
index 9d148dae3..c07276580 100644
--- a/src/lxc/start.c
+++ b/src/lxc/start.c
@@ -848,7 +848,7 @@ static int lxc_spawn(struct lxc_handler *handler)
netpipe = -1;
for (i = 0; i < LXC_NS_MAX; i++)
- if (handler->conf->inherit_ns_fd[i] != -1)
+ if (handler->conf->ns_share[i] != -1)
preserve_mask |= ns_info[i].clone_flag;
if (lxc_sync_init(handler))
@@ -860,7 +860,7 @@ static int lxc_spawn(struct lxc_handler *handler)
handler->clone_flags |= CLONE_NEWUSER;
}
- if (handler->conf->inherit_ns_fd[LXC_NS_NET] == -1) {
+ if (handler->conf->ns_share[LXC_NS_NET] == -1) {
if (!lxc_requests_empty_network(handler))
handler->clone_flags |= CLONE_NEWNET;
@@ -894,13 +894,13 @@ static int lxc_spawn(struct lxc_handler *handler)
INFO("Inheriting a net namespace");
}
- if (handler->conf->inherit_ns_fd[LXC_NS_IPC] == -1) {
+ if (handler->conf->ns_share[LXC_NS_IPC] == -1) {
handler->clone_flags |= CLONE_NEWIPC;
} else {
INFO("Inheriting an IPC namespace");
}
- if (handler->conf->inherit_ns_fd[LXC_NS_UTS] == -1) {
+ if (handler->conf->ns_share[LXC_NS_UTS] == -1) {
handler->clone_flags |= CLONE_NEWUTS;
} else {
INFO("Inheriting a UTS namespace");
@@ -937,7 +937,7 @@ static int lxc_spawn(struct lxc_handler *handler)
free(errmsg);
goto out_delete_net;
}
- if (attach_ns(handler->conf->inherit_ns_fd) < 0)
+ if (attach_ns(handler->conf->ns_share) < 0)
goto out_delete_net;
if (am_unpriv() && (nveths = count_veths(&handler->conf->network))) {
From bd323d93a03e1a5f6e3fc42033d13b4cfef2c019 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner at ubuntu.com>
Date: Mon, 28 May 2018 11:47:39 +0200
Subject: [PATCH 2/3] namespace: backport namespace simplifications
They have been sitting in master, stable-2.0 and stable-3.0 for quite a while
now and have made things much more reliable so let's backport them to
stable-1.0 as well.
Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
---
src/lxc/namespace.c | 152 +++++++++++++++++++++++++++++++++++++++++++---------
src/lxc/namespace.h | 133 +++++++++++++++++++++++++++++++++++++++++++--
src/lxc/start.c | 9 ----
src/lxc/start.h | 17 ------
src/lxc/utils.h | 12 +++++
5 files changed, 269 insertions(+), 54 deletions(-)
diff --git a/src/lxc/namespace.c b/src/lxc/namespace.c
index 19b6593fd..6f5ea674b 100644
--- a/src/lxc/namespace.c
+++ b/src/lxc/namespace.c
@@ -21,17 +21,20 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
-#include <unistd.h>
#include <alloca.h>
#include <errno.h>
+#include <fcntl.h>
+#include <sched.h>
#include <signal.h>
+#include <unistd.h>
#include <sys/param.h>
-#include <sys/types.h>
#include <sys/stat.h>
-#include <fcntl.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
-#include "namespace.h"
#include "log.h"
+#include "namespace.h"
+#include "utils.h"
lxc_log_define(lxc_namespace, lxc);
@@ -53,41 +56,140 @@ pid_t lxc_clone(int (*fn)(void *), void *arg, int flags)
.arg = arg,
};
- size_t stack_size = sysconf(_SC_PAGESIZE);
+ size_t stack_size = lxc_getpagesize();
void *stack = alloca(stack_size);
pid_t ret;
#ifdef __ia64__
- ret = __clone2(do_clone, stack,
- stack_size, flags | SIGCHLD, &clone_arg);
+ ret = __clone2(do_clone, stack, stack_size, flags | SIGCHLD, &clone_arg);
#else
ret = clone(do_clone, stack + stack_size, flags | SIGCHLD, &clone_arg);
#endif
if (ret < 0)
- ERROR("failed to clone (%#x): %s", flags, strerror(errno));
+ ERROR("Failed to clone (%#x): %s.", flags, strerror(errno));
return ret;
}
-static const char * const namespaces_list[] = {
- "MOUNT", "PID", "UTSNAME", "IPC",
- "USER", "NETWORK"
-};
-static const int cloneflags_list[] = {
- CLONE_NEWNS, CLONE_NEWPID, CLONE_NEWUTS, CLONE_NEWIPC,
- CLONE_NEWUSER, CLONE_NEWNET
+/**
+ * This is based on raw_clone in systemd but adapted to our needs. This uses
+ * copy on write semantics and doesn't pass a stack. CLONE_VM is tricky and
+ * doesn't really matter to us so disallow it.
+ *
+ * The nice thing about this is that we get fork() behavior. That is
+ * lxc_raw_clone() returns 0 in the child and the child pid in the parent.
+ */
+pid_t lxc_raw_clone(unsigned long flags)
+{
+
+ /* These flags don't interest us at all so we don't jump through any hoops
+ * of retrieving them and passing them to the kernel.
+ */
+ errno = EINVAL;
+ if ((flags & (CLONE_VM | CLONE_PARENT_SETTID | CLONE_CHILD_SETTID |
+ CLONE_CHILD_CLEARTID | CLONE_SETTLS)))
+ return -EINVAL;
+
+#if defined(__s390x__) || defined(__s390__) || defined(__CRIS__)
+ /* On s390/s390x and cris the order of the first and second arguments
+ * of the system call is reversed.
+ */
+ return (int)syscall(__NR_clone, NULL, flags | SIGCHLD);
+#elif defined(__sparc__) && defined(__arch64__)
+ {
+ /**
+ * sparc64 always returns the other process id in %o0, and
+ * a boolean flag whether this is the child or the parent in
+ * %o1. Inline assembly is needed to get the flag returned
+ * in %o1.
+ */
+ int in_child;
+ int child_pid;
+ asm volatile("mov %2, %%g1\n\t"
+ "mov %3, %%o0\n\t"
+ "mov 0 , %%o1\n\t"
+ "t 0x6d\n\t"
+ "mov %%o1, %0\n\t"
+ "mov %%o0, %1"
+ : "=r"(in_child), "=r"(child_pid)
+ : "i"(__NR_clone), "r"(flags | SIGCHLD)
+ : "%o1", "%o0", "%g1");
+ if (in_child)
+ return 0;
+ else
+ return child_pid;
+ }
+#elif defined(__ia64__)
+ /* On ia64 the stack and stack size are passed as separate arguments. */
+ return (int)syscall(__NR_clone, flags | SIGCHLD, NULL, 0);
+#else
+ return (int)syscall(__NR_clone, flags | SIGCHLD, NULL);
+#endif
+}
+
+pid_t lxc_raw_clone_cb(int (*fn)(void *), void *args, unsigned long flags)
+{
+ pid_t pid;
+
+ pid = lxc_raw_clone(flags);
+ if (pid < 0)
+ return -1;
+
+ /* exit() is not thread-safe and might mess with the parent's signal
+ * handlers and other stuff when exec() fails.
+ */
+ if (pid == 0)
+ _exit(fn(args));
+
+ return pid;
+}
+
+/* Leave the user namespace at the first position in the array of structs so
+ * that we always attach to it first when iterating over the struct and using
+ * setns() to switch namespaces. This especially affects lxc_attach(): Suppose
+ * you cloned a new user namespace and mount namespace as an unprivileged user
+ * on the host and want to setns() to the mount namespace. This requires you to
+ * attach to the user namespace first otherwise the kernel will fail this check:
+ *
+ * if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
+ * !ns_capable(current_user_ns(), CAP_SYS_CHROOT) ||
+ * !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+ * return -EPERM;
+ *
+ * in
+ *
+ * linux/fs/namespace.c:mntns_install().
+ */
+const struct ns_info ns_info[LXC_NS_MAX] = {
+ [LXC_NS_USER] = { "user", CLONE_NEWUSER, "CLONE_NEWUSER", "LXC_USER_NS" },
+ [LXC_NS_MNT] = { "mnt", CLONE_NEWNS, "CLONE_NEWNS", "LXC_MNT_NS" },
+ [LXC_NS_PID] = { "pid", CLONE_NEWPID, "CLONE_NEWPID", "LXC_PID_NS" },
+ [LXC_NS_UTS] = { "uts", CLONE_NEWUTS, "CLONE_NEWUTS", "LXC_UTS_NS" },
+ [LXC_NS_IPC] = { "ipc", CLONE_NEWIPC, "CLONE_NEWIPC", "LXC_IPC_NS" },
+ [LXC_NS_NET] = { "net", CLONE_NEWNET, "CLONE_NEWNET", "LXC_NET_NS" },
+ [LXC_NS_CGROUP] = { "cgroup", CLONE_NEWCGROUP, "CLONE_NEWCGROUP", "LXC_CGROUP_NS" }
};
-int lxc_namespace_2_cloneflag(char *namespace)
+int lxc_namespace_2_cloneflag(const char *namespace)
+{
+ int i;
+ for (i = 0; i < LXC_NS_MAX; i++)
+ if (!strcasecmp(ns_info[i].proc_name, namespace))
+ return ns_info[i].clone_flag;
+
+ ERROR("Invalid namespace name \"%s\"", namespace);
+ return -EINVAL;
+}
+
+int lxc_namespace_2_ns_idx(const char *namespace)
{
- int i, len;
- len = sizeof(namespaces_list)/sizeof(namespaces_list[0]);
- for (i = 0; i < len; i++)
- if (!strcmp(namespaces_list[i], namespace))
- return cloneflags_list[i];
-
- ERROR("invalid namespace name %s", namespace);
- return -1;
+ int i;
+ for (i = 0; i < LXC_NS_MAX; i++)
+ if (!strcmp(ns_info[i].proc_name, namespace))
+ return i;
+
+ ERROR("Invalid namespace name \"%s\"", namespace);
+ return -EINVAL;
}
int lxc_fill_namespace_flags(char *flaglist, int *flags)
@@ -96,7 +198,7 @@ int lxc_fill_namespace_flags(char *flaglist, int *flags)
int aflag;
if (!flaglist) {
- ERROR("need at least one namespace to unshare");
+ ERROR("At least one namespace is needed.");
return -1;
}
diff --git a/src/lxc/namespace.h b/src/lxc/namespace.h
index 28f17e687..4bfe9c4f5 100644
--- a/src/lxc/namespace.h
+++ b/src/lxc/namespace.h
@@ -23,17 +23,53 @@
#ifndef __LXC_NAMESPACE_H
#define __LXC_NAMESPACE_H
-#include <sys/syscall.h>
#include <sched.h>
+#include <unistd.h>
+#include <sys/syscall.h>
#include "config.h"
+#ifndef CLONE_PARENT_SETTID
+#define CLONE_PARENT_SETTID 0x00100000
+#endif
+
+#ifndef CLONE_CHILD_CLEARTID
+#define CLONE_CHILD_CLEARTID 0x00200000
+#endif
+
+#ifndef CLONE_CHILD_SETTID
+#define CLONE_CHILD_SETTID 0x01000000
+#endif
+
+#ifndef CLONE_VFORK
+#define CLONE_VFORK 0x00004000
+#endif
+
+#ifndef CLONE_THREAD
+#define CLONE_THREAD 0x00010000
+#endif
+
+#ifndef CLONE_SETTLS
+#define CLONE_SETTLS 0x00080000
+#endif
+
+#ifndef CLONE_VM
+#define CLONE_VM 0x00000100
+#endif
+
+#ifndef CLONE_FILES
+#define CLONE_FILES 0x00000400
+#endif
+
#ifndef CLONE_FS
# define CLONE_FS 0x00000200
#endif
#ifndef CLONE_NEWNS
# define CLONE_NEWNS 0x00020000
#endif
+#ifndef CLONE_NEWCGROUP
+# define CLONE_NEWCGROUP 0x02000000
+#endif
#ifndef CLONE_NEWUTS
# define CLONE_NEWUTS 0x04000000
#endif
@@ -50,6 +86,24 @@
# define CLONE_NEWNET 0x40000000
#endif
+enum {
+ LXC_NS_USER,
+ LXC_NS_MNT,
+ LXC_NS_PID,
+ LXC_NS_UTS,
+ LXC_NS_IPC,
+ LXC_NS_NET,
+ LXC_NS_CGROUP,
+ LXC_NS_MAX
+};
+
+extern const struct ns_info {
+ const char *proc_name;
+ int clone_flag;
+ const char *flag_name;
+ const char *env_name;
+} ns_info[LXC_NS_MAX];
+
#if defined(__ia64__)
int __clone2(int (*__fn) (void *__arg), void *__child_stack_base,
size_t __child_stack_size, int __flags, void *__arg, ...);
@@ -59,10 +113,83 @@ int clone(int (*fn)(void *), void *child_stack,
/* pid_t *ptid, struct user_desc *tls, pid_t *ctid */ );
#endif
-
+/**
+ * lxc_clone() - create a new process
+ *
+ * - allocate stack:
+ * This function allocates a new stack the size of page and passes it to the
+ * kernel.
+ *
+ * - support all CLONE_*flags:
+ * This function supports all CLONE_* flags. If in doubt or not sufficiently
+ * familiar with process creation in the kernel and interactions with libcs
+ * this function should be used.
+ *
+ * - pthread_atfork() handlers depending on libc:
+ * Whether this function runs pthread_atfork() handlers depends on the
+ * corresponding libc wrapper. glibc currently does not run pthread_atfork()
+ * handlers but does not guarantee that they are not. Other libcs might or
+ * might not run pthread_atfork() handlers. If you require guarantees please
+ * refer to the lxc_raw_clone*() functions below.
+ *
+ * - should call lxc_raw_getpid():
+ * The child should use lxc_raw_getpid() to retrieve its pid.
+ */
extern pid_t lxc_clone(int (*fn)(void *), void *arg, int flags);
-extern int lxc_namespace_2_cloneflag(char *namespace);
+/**
+ * lxc_raw_clone() - create a new process
+ *
+ * - fork() behavior:
+ * This function returns 0 in the child and > 0 in the parent.
+ *
+ * - copy-on-write:
+ * This function does not allocate a new stack and relies on copy-on-write
+ * semantics.
+ *
+ * - supports subset of CLONE_* flags:
+ * lxc_raw_clone() intentionally only supports a subset of the flags available
+ * to the actual system call. Please refer to the implementation what flags
+ * cannot be used. Also, please don't assume that just because a flag isn't
+ * explicitly checked for as being unsupported that it is supported. If in
+ * doubt or not sufficiently familiar with process creation in the kernel and
+ * interactions with libcs this function should be used.
+ *
+ * - no pthread_atfork() handlers:
+ * This function circumvents - as much as this is possible - any libc
+ * wrappers and thus does not run any pthread_atfork() handlers. Make sure
+ * that this is safe to do in the context you are trying to call this
+ * function.
+ *
+ * - must call lxc_raw_getpid():
+ * The child must use lxc_raw_getpid() to retrieve its pid.
+ */
+extern pid_t lxc_raw_clone(unsigned long flags);
+/**
+ * lxc_raw_clone_cb() - create a new process
+ *
+ * - non-fork() behavior:
+ * Function does return pid of the child or -1 on error. Pass in a callback
+ * function via the "fn" argument that gets executed in the child process. The
+ * "args" argument is passed to "fn".
+ *
+ * All other comments that apply to lxc_raw_clone() apply to lxc_raw_clone_cb()
+ * as well.
+ */
+extern pid_t lxc_raw_clone_cb(int (*fn)(void *), void *args,
+ unsigned long flags);
+
+extern int lxc_namespace_2_cloneflag(const char *namespace);
+extern int lxc_namespace_2_ns_idx(const char *namespace);
extern int lxc_fill_namespace_flags(char *flaglist, int *flags);
+/**
+ * Because of older glibc's pid cache (up to 2.25) whenever clone() is called
+ * the child must retrieve its own pid via lxc_raw_getpid().
+ */
+static inline pid_t lxc_raw_getpid(void)
+{
+ return (pid_t) syscall(SYS_getpid);
+}
+
#endif
diff --git a/src/lxc/start.c b/src/lxc/start.c
index c07276580..226e67ed0 100644
--- a/src/lxc/start.c
+++ b/src/lxc/start.c
@@ -74,15 +74,6 @@
lxc_log_define(lxc_start, lxc);
-const struct ns_info ns_info[LXC_NS_MAX] = {
- [LXC_NS_MNT] = {"mnt", CLONE_NEWNS},
- [LXC_NS_PID] = {"pid", CLONE_NEWPID},
- [LXC_NS_UTS] = {"uts", CLONE_NEWUTS},
- [LXC_NS_IPC] = {"ipc", CLONE_NEWIPC},
- [LXC_NS_USER] = {"user", CLONE_NEWUSER},
- [LXC_NS_NET] = {"net", CLONE_NEWNET}
-};
-
static void print_top_failing_dir(const char *path)
{
size_t len = strlen(path);
diff --git a/src/lxc/start.h b/src/lxc/start.h
index 7afa37a39..7f24d6060 100644
--- a/src/lxc/start.h
+++ b/src/lxc/start.h
@@ -41,23 +41,6 @@ struct lxc_operations {
struct cgroup_desc;
-enum {
- LXC_NS_MNT,
- LXC_NS_PID,
- LXC_NS_UTS,
- LXC_NS_IPC,
- LXC_NS_USER,
- LXC_NS_NET,
- LXC_NS_MAX
-};
-
-struct ns_info {
- const char *proc_name;
- int clone_flag;
-};
-
-extern const struct ns_info ns_info[LXC_NS_MAX];
-
struct lxc_handler {
pid_t pid;
char *name;
diff --git a/src/lxc/utils.h b/src/lxc/utils.h
index 8859eeb74..8633680b0 100644
--- a/src/lxc/utils.h
+++ b/src/lxc/utils.h
@@ -319,4 +319,16 @@ int null_stdfds(void);
int safe_mount(const char *src, const char *dest, const char *fstype,
unsigned long flags, const void *data, const char *rootfs);
int set_stdfds(int fd);
+
+static inline uint64_t lxc_getpagesize(void)
+{
+ int64_t pgsz;
+
+ pgsz = sysconf(_SC_PAGESIZE);
+ if (pgsz <= 0)
+ pgsz = 1 << 12;
+
+ return pgsz;
+}
+
#endif /* __LXC_UTILS_H */
From b749cb06590c19ce653371ee710c297934853317 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner at ubuntu.com>
Date: Mon, 28 May 2018 12:46:32 +0200
Subject: [PATCH 3/3] start: backport namespace preservation logic
They have been sitting in master, stable-2.0 and stable-3.0 for quite a while
now and have made things much more reliable so let's backport them to
stable-1.0 as well. This will allow users to run containers on 2.6.32 kernels
with stable-1.0, where setns() and namespace preservation are not supported.
Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
---
src/lxc/commands.c | 2 +-
src/lxc/error.h | 2 +
src/lxc/namespace.c | 3 +-
src/lxc/namespace.h | 3 -
src/lxc/start.c | 325 ++++++++++++++++++++++++++++++++--------------------
src/lxc/start.h | 18 ++-
src/lxc/utils.c | 21 ++++
src/lxc/utils.h | 2 +
8 files changed, 244 insertions(+), 132 deletions(-)
diff --git a/src/lxc/commands.c b/src/lxc/commands.c
index fcba5a9f1..5585dbc40 100644
--- a/src/lxc/commands.c
+++ b/src/lxc/commands.c
@@ -434,7 +434,7 @@ int lxc_cmd_get_clone_flags(const char *name, const char *lxcpath)
static int lxc_cmd_get_clone_flags_callback(int fd, struct lxc_cmd_req *req,
struct lxc_handler *handler)
{
- struct lxc_cmd_rsp rsp = { .data = INT_TO_PTR(handler->clone_flags) };
+ struct lxc_cmd_rsp rsp = { .data = INT_TO_PTR(handler->ns_clone_flags) };
return lxc_cmd_rsp_send(fd, &rsp);
}
diff --git a/src/lxc/error.h b/src/lxc/error.h
index d5d60de0f..6fe474a13 100644
--- a/src/lxc/error.h
+++ b/src/lxc/error.h
@@ -23,6 +23,8 @@
#ifndef __LXC_ERROR_H
#define __LXC_ERROR_H
+#define LXC_CLONE_ERROR "Failed to clone a new set of namespaces"
+
extern int lxc_error_set_and_log(int pid, int status);
#endif
diff --git a/src/lxc/namespace.c b/src/lxc/namespace.c
index 6f5ea674b..b6ee3abd4 100644
--- a/src/lxc/namespace.c
+++ b/src/lxc/namespace.c
@@ -166,8 +166,7 @@ const struct ns_info ns_info[LXC_NS_MAX] = {
[LXC_NS_PID] = { "pid", CLONE_NEWPID, "CLONE_NEWPID", "LXC_PID_NS" },
[LXC_NS_UTS] = { "uts", CLONE_NEWUTS, "CLONE_NEWUTS", "LXC_UTS_NS" },
[LXC_NS_IPC] = { "ipc", CLONE_NEWIPC, "CLONE_NEWIPC", "LXC_IPC_NS" },
- [LXC_NS_NET] = { "net", CLONE_NEWNET, "CLONE_NEWNET", "LXC_NET_NS" },
- [LXC_NS_CGROUP] = { "cgroup", CLONE_NEWCGROUP, "CLONE_NEWCGROUP", "LXC_CGROUP_NS" }
+ [LXC_NS_NET] = { "net", CLONE_NEWNET, "CLONE_NEWNET", "LXC_NET_NS" }
};
int lxc_namespace_2_cloneflag(const char *namespace)
diff --git a/src/lxc/namespace.h b/src/lxc/namespace.h
index 4bfe9c4f5..e879f4dff 100644
--- a/src/lxc/namespace.h
+++ b/src/lxc/namespace.h
@@ -67,9 +67,6 @@
#ifndef CLONE_NEWNS
# define CLONE_NEWNS 0x00020000
#endif
-#ifndef CLONE_NEWCGROUP
-# define CLONE_NEWCGROUP 0x02000000
-#endif
#ifndef CLONE_NEWUTS
# define CLONE_NEWUTS 0x04000000
#endif
diff --git a/src/lxc/start.c b/src/lxc/start.c
index 226e67ed0..1165f663d 100644
--- a/src/lxc/start.c
+++ b/src/lxc/start.c
@@ -76,100 +76,107 @@ lxc_log_define(lxc_start, lxc);
static void print_top_failing_dir(const char *path)
{
- size_t len = strlen(path);
- char *copy = alloca(len+1), *p, *e, saved;
- strcpy(copy, path);
+ int ret;
+ size_t len;
+ char *copy, *e, *p, saved;
+ len = strlen(path);
+ copy = alloca(len + 1);
+ strcpy(copy, path);
p = copy;
e = copy + len;
while (p < e) {
- while (p < e && *p == '/') p++;
- while (p < e && *p != '/') p++;
+ while (p < e && *p == '/')
+ p++;
+
+ while (p < e && *p != '/')
+ p++;
+
saved = *p;
*p = '\0';
- if (access(copy, X_OK)) {
- SYSERROR("could not access %s. Please grant it 'x' " \
- "access, or add an ACL for the container root.",
- copy);
+
+ ret = access(copy, X_OK);
+ if (ret != 0) {
+ SYSERROR("Could not access %s. Please grant it x "
+ "access, or add an ACL for the container "
+ "root", copy);
return;
}
*p = saved;
}
}
-static void close_ns(int ns_fd[LXC_NS_MAX]) {
+static void lxc_put_nsfds(struct lxc_handler *handler)
+{
int i;
for (i = 0; i < LXC_NS_MAX; i++) {
- if (ns_fd[i] > -1) {
- close(ns_fd[i]);
- ns_fd[i] = -1;
- }
+ if (handler->nsfd[i] < 0)
+ continue;
+
+ close(handler->nsfd[i]);
+ handler->nsfd[i] = -EBADF;
}
}
-/*
- * preserve_ns: open /proc/@pid/ns/@ns for each namespace specified
- * in clone_flags.
- * Return true on success, false on failure. On failure, leave an error
- * message in *errmsg, which caller must free.
- */
-static
-bool preserve_ns(int ns_fd[LXC_NS_MAX], int clone_flags, pid_t pid, char **errmsg) {
- int i, ret;
- char path[MAXPATHLEN];
-
- for (i = 0; i < LXC_NS_MAX; i++)
- ns_fd[i] = -1;
+static int lxc_try_preserve_ns(const int pid, const char *ns)
+{
+ int fd;
- snprintf(path, MAXPATHLEN, "/proc/%d/ns", pid);
- if (access(path, X_OK)) {
- if (asprintf(errmsg, "Kernel does not support setns.") == -1)
- *errmsg = NULL;
- return false;
- }
+ fd = lxc_preserve_ns(pid, ns);
+ if (fd < 0) {
+ if (errno != ENOENT) {
+ SYSERROR("Failed to preserve %s namespace", ns);
+ return -EINVAL;
+ }
- for (i = 0; i < LXC_NS_MAX; i++) {
- if ((clone_flags & ns_info[i].clone_flag) == 0)
- continue;
- snprintf(path, MAXPATHLEN, "/proc/%d/ns/%s", pid,
- ns_info[i].proc_name);
- ns_fd[i] = open(path, O_RDONLY | O_CLOEXEC);
- if (ns_fd[i] < 0)
- goto error;
+ WARN("%s - Kernel does not support preserving %s namespaces",
+ strerror(errno), ns);
+ return -EOPNOTSUPP;
}
- return true;
-
-error:
- if (errno == ENOENT) {
- ret = asprintf(errmsg, "Kernel does not support setns for %s",
- ns_info[i].proc_name);
- } else {
- ret = asprintf(errmsg, "Failed to open %s: %s",
- path, strerror(errno));
- }
- if (ret == -1)
- *errmsg = NULL;
- close_ns(ns_fd);
- return false;
+ return fd;
}
-static int attach_ns(const int ns_fd[LXC_NS_MAX]) {
+/* lxc_try_preserve_namespaces: open /proc/@pid/ns/@ns for each namespace
+ * specified in ns_clone_flags.
+ * Return true on success, false on failure.
+ */
+static bool lxc_try_preserve_namespaces(struct lxc_handler *handler,
+ int ns_clone_flags, pid_t pid)
+{
int i;
+ for (i = 0; i < LXC_NS_MAX; i++)
+ handler->nsfd[i] = -EBADF;
+
for (i = 0; i < LXC_NS_MAX; i++) {
- if (ns_fd[i] < 0)
+ int fd;
+
+ if ((ns_clone_flags & ns_info[i].clone_flag) == 0)
continue;
- if (setns(ns_fd[i], 0) != 0)
- goto error;
+ fd = lxc_try_preserve_ns(pid, ns_info[i].proc_name);
+ if (fd < 0) {
+ handler->nsfd[i] = -EBADF;
+
+ /* Do not fail to start container on kernels that do
+ * not support interacting with namespaces through
+ * /proc.
+ */
+ if (fd == -EOPNOTSUPP)
+ continue;
+
+ lxc_put_nsfds(handler);
+ return false;
+ }
+
+ handler->nsfd[i] = fd;
+ DEBUG("Preserved %s namespace via fd %d", ns_info[i].proc_name,
+ handler->nsfd[i]);
}
- return 0;
-error:
- SYSERROR("failed to set namespace '%s'", ns_info[i].proc_name);
- return -1;
+ return true;
}
static int match_fd(int fd)
@@ -481,10 +488,11 @@ static void lxc_fini(const char *name, struct lxc_handler *handler)
lxc_set_state(name, handler, STOPPING);
for (i = 0; i < LXC_NS_MAX; i++) {
- if (handler->nsfd[i] != -1) {
- close(handler->nsfd[i]);
- handler->nsfd[i] = -1;
- }
+ if (handler->nsfd[i] < 0)
+ continue;
+
+ close(handler->nsfd[i]);
+ handler->nsfd[i] = -EBADF;
}
lxc_set_state(name, handler, STOPPED);
@@ -663,15 +671,17 @@ static int do_start(void *data)
if (lxc_sync_wait_parent(handler, LXC_SYNC_STARTUP))
return -1;
- /* Unshare CLONE_NEWNET after CLONE_NEWUSER - see
- https://github.com/lxc/lxd/issues/1978 */
- if ((handler->clone_flags & (CLONE_NEWNET | CLONE_NEWUSER)) ==
- (CLONE_NEWNET | CLONE_NEWUSER)) {
+ /* Unshare CLONE_NEWNET after CLONE_NEWUSER. See
+ * https://github.com/lxc/lxd/issues/1978.
+ */
+ if ((handler->ns_clone_flags & (CLONE_NEWNET | CLONE_NEWUSER)) ==
+ (CLONE_NEWNET | CLONE_NEWUSER)) {
ret = unshare(CLONE_NEWNET);
if (ret < 0) {
- SYSERROR("Error unsharing network namespace");
+ SYSERROR("Failed to unshare CLONE_NEWNET");
goto out_warn_father;
}
+ INFO("Unshared CLONE_NEWNET");
}
/* Tell the parent task it can begin to configure the
@@ -826,35 +836,96 @@ static int save_phys_nics(struct lxc_conf *conf)
return 0;
}
+int resolve_clone_flags(struct lxc_handler *handler)
+{
+ int i;
+ struct lxc_conf *conf = handler->conf;
+
+ for (i = 0; i < LXC_NS_MAX; i++) {
+
+ if (conf->ns_share[i] < 0)
+ continue;
+
+ handler->ns_clone_flags &= ~ns_info[i].clone_flag;
+ TRACE("Sharing %s namespace", ns_info[i].proc_name);
+ }
+
+ return 0;
+}
+
+/* Note that this function is used with clone(CLONE_VM). Some glibc versions
+ * used to reset the pid/tid to -1 when CLONE_VM was used without CLONE_THREAD.
+ * But since the memory between parent and child is shared on CLONE_VM this
+ * would invalidate the getpid() cache that glibc used to maintain and so
+ * getpid() in the child would return the parent's pid. This is all fixed in
+ * newer glibc versions where the getpid() cache is removed and the pid/tid is
+ * not reset anymore.
+ * However, if for whatever reason you - dear committer - somehow need to get the
+ * pid of the dummy intermediate process for do_share_ns() you need to call
+ * lxc_raw_getpid(). The next lxc_raw_clone() call does not employ CLONE_VM and
+ * will be fine.
+ */
+static inline int do_share_ns(void *arg)
+{
+ int i, flags, ret;
+ struct lxc_handler *handler = arg;
+
+ for (i = 0; i < LXC_NS_MAX; i++) {
+ if (handler->conf->ns_share[i] < 0)
+ continue;
+
+ ret = setns(handler->conf->ns_share[i], 0);
+ if (ret < 0) {
+ /*
+ * Note that joining a user and/or mount namespace
+ * requires the process is not multithreaded otherwise
+ * setns() will fail here.
+ */
+ SYSERROR("Failed to inherit %s namespace",
+ ns_info[i].proc_name);
+ return -1;
+ }
+
+ DEBUG("Inherited %s namespace", ns_info[i].proc_name);
+ }
+
+ flags = handler->ns_on_clone_flags;
+ flags |= CLONE_PARENT;
+ handler->pid = lxc_raw_clone_cb(do_start, handler, flags);
+ if (handler->pid < 0)
+ return -1;
+
+ return 0;
+}
+
static int lxc_spawn(struct lxc_handler *handler)
{
+ int i, nveths, ret;
+ int netpipepair[2];
int failed_before_rename = 0;
const char *name = handler->name;
- char *errmsg = NULL;
- bool cgroups_connected = false;
- int saved_ns_fd[LXC_NS_MAX];
- int preserve_mask = 0, i, flags;
- int netpipepair[2], nveths;
+ bool cgroups_connected = false, share_ns = false;
netpipe = -1;
- for (i = 0; i < LXC_NS_MAX; i++)
- if (handler->conf->ns_share[i] != -1)
- preserve_mask |= ns_info[i].clone_flag;
+ for (i = 0; i < LXC_NS_MAX; i++) {
+ if (handler->conf->ns_share[i] < 0)
+ continue;
+
+ share_ns = true;
+ break;
+ }
if (lxc_sync_init(handler))
return -1;
- handler->clone_flags = CLONE_NEWPID|CLONE_NEWNS;
- if (!lxc_list_empty(&handler->conf->id_map)) {
- INFO("Cloning a new user namespace");
- handler->clone_flags |= CLONE_NEWUSER;
+ ret = resolve_clone_flags(handler);
+ if (ret < 0) {
+ lxc_sync_fini(handler);
+ return -1;
}
- if (handler->conf->ns_share[LXC_NS_NET] == -1) {
- if (!lxc_requests_empty_network(handler))
- handler->clone_flags |= CLONE_NEWNET;
-
+ if (handler->ns_clone_flags & CLONE_NEWNET) {
if (!lxc_list_empty(&handler->conf->network)) {
/* Find gateway addresses from the link device, which is
@@ -885,19 +956,6 @@ static int lxc_spawn(struct lxc_handler *handler)
INFO("Inheriting a net namespace");
}
- if (handler->conf->ns_share[LXC_NS_IPC] == -1) {
- handler->clone_flags |= CLONE_NEWIPC;
- } else {
- INFO("Inheriting an IPC namespace");
- }
-
- if (handler->conf->ns_share[LXC_NS_UTS] == -1) {
- handler->clone_flags |= CLONE_NEWUTS;
- } else {
- INFO("Inheriting a UTS namespace");
- }
-
-
if (!cgroup_init(handler)) {
ERROR("failed initializing cgroup support");
goto out_delete_net;
@@ -922,15 +980,6 @@ static int lxc_spawn(struct lxc_handler *handler)
INFO("failed to pin the container's rootfs");
}
- if (!preserve_ns(saved_ns_fd, preserve_mask, getpid(), &errmsg)) {
- SYSERROR("Failed to preserve requested namespaces: %s",
- errmsg ? errmsg : "(Out of memory)");
- free(errmsg);
- goto out_delete_net;
- }
- if (attach_ns(handler->conf->ns_share) < 0)
- goto out_delete_net;
-
if (am_unpriv() && (nveths = count_veths(&handler->conf->network))) {
if (pipe(netpipepair) < 0) {
SYSERROR("Error creating pipe");
@@ -940,24 +989,50 @@ static int lxc_spawn(struct lxc_handler *handler)
netpipe = netpipepair[0];
}
- /* Create a process in a new set of namespaces */
- flags = handler->clone_flags;
- if (handler->clone_flags & CLONE_NEWUSER)
- flags &= ~CLONE_NEWNET;
- handler->pid = lxc_clone(do_start, handler, handler->clone_flags);
+ /* Create a process in a new set of namespaces. */
+ handler->ns_on_clone_flags = handler->ns_clone_flags;
+ if (handler->ns_clone_flags & CLONE_NEWUSER) {
+ /* If CLONE_NEWUSER and CLONE_NEWNET was requested, we need to
+ * clone a new user namespace first and only later unshare our
+ * network namespace to ensure that network devices ownership is
+ * set up correctly.
+ */
+ handler->ns_on_clone_flags &= ~CLONE_NEWNET;
+ }
+
+ if (share_ns) {
+ pid_t attacher_pid;
+
+ attacher_pid = lxc_clone(do_share_ns, handler,
+ CLONE_VFORK | CLONE_VM | CLONE_FILES);
+ if (attacher_pid < 0) {
+ SYSERROR(LXC_CLONE_ERROR);
+ goto out_delete_net;
+ }
+
+ ret = wait_for_pid(attacher_pid);
+ if (ret < 0) {
+ SYSERROR("Intermediate process failed");
+ goto out_delete_net;
+ }
+ } else {
+ handler->pid = lxc_raw_clone_cb(do_start, handler,
+ handler->ns_on_clone_flags);
+ }
if (handler->pid < 0) {
- SYSERROR("failed to fork into a new namespace");
+ SYSERROR(LXC_CLONE_ERROR);
goto out_delete_net;
}
+ TRACE("Cloned child process %d", handler->pid);
- if (!preserve_ns(handler->nsfd, handler->clone_flags | preserve_mask, handler->pid, &errmsg)) {
- INFO("Failed to store namespace references for stop hook: %s",
- errmsg ? errmsg : "(Out of memory)");
- free(errmsg);
- }
+ for (i = 0; i < LXC_NS_MAX; i++)
+ if (handler->ns_on_clone_flags & ns_info[i].clone_flag)
+ INFO("Cloned %s", ns_info[i].flag_name);
- if (attach_ns(saved_ns_fd))
- WARN("failed to restore saved namespaces");
+ if (!lxc_try_preserve_namespaces(handler, handler->ns_on_clone_flags, handler->pid)) {
+ ERROR("Failed to preserve cloned namespaces for lxc.hook.stop");
+ goto out_delete_net;
+ }
lxc_sync_fini_child(handler);
@@ -1000,7 +1075,7 @@ static int lxc_spawn(struct lxc_handler *handler)
goto out_delete_net;
/* Create the network configuration */
- if (handler->clone_flags & CLONE_NEWNET) {
+ if (handler->ns_clone_flags & CLONE_NEWNET) {
if (lxc_assign_network(&handler->conf->network, handler->pid)) {
ERROR("failed to create the configured network");
goto out_delete_net;
@@ -1067,7 +1142,7 @@ static int lxc_spawn(struct lxc_handler *handler)
out_delete_net:
if (cgroups_connected)
cgroup_disconnect();
- if (handler->clone_flags & CLONE_NEWNET)
+ if (handler->ns_clone_flags & CLONE_NEWNET)
lxc_delete_network(handler);
out_abort:
lxc_abort(name, handler);
diff --git a/src/lxc/start.h b/src/lxc/start.h
index 7f24d6060..3de2f75a4 100644
--- a/src/lxc/start.h
+++ b/src/lxc/start.h
@@ -42,10 +42,26 @@ struct lxc_operations {
struct cgroup_desc;
struct lxc_handler {
+ /* Record the clone for namespaces flags that the container requested.
+ *
+ * @ns_clone_flags
+ * - All clone flags that were requested.
+ *
+ * @ns_on_clone_flags
+ * - The clone flags for namespaces to actually use when calling
+ * lxc_clone(): After the container has started ns_on_clone_flags will
+ * list the clone flags that were unshare()ed rather than clone()ed
+ * because of ordering requirements (e.g. CLONE_NEWNET and
+ * CLONE_NEWUSER) or implementation details.
+ */
+ struct /* lxc_ns */ {
+ int ns_clone_flags;
+ int ns_on_clone_flags;
+ };
+
pid_t pid;
char *name;
lxc_state_t state;
- int clone_flags;
int sigfd;
sigset_t oldmask;
struct lxc_conf *conf;
diff --git a/src/lxc/utils.c b/src/lxc/utils.c
index 97892ad1e..677711818 100644
--- a/src/lxc/utils.c
+++ b/src/lxc/utils.c
@@ -1392,3 +1392,24 @@ int set_stdfds(int fd)
return 0;
}
+
+int lxc_preserve_ns(const int pid, const char *ns)
+{
+ int ret;
+/* 5 /proc + 21 /int_as_str + 3 /ns + 20 /NS_NAME + 1 \0 */
+#define __NS_PATH_LEN 50
+ char path[__NS_PATH_LEN];
+
+ /* This way we can use this function to also check whether namespaces
+ * are supported by the kernel by passing in the NULL or the empty
+ * string.
+ */
+ ret = snprintf(path, __NS_PATH_LEN, "/proc/%d/ns%s%s", pid,
+ !ns || strcmp(ns, "") == 0 ? "" : "/",
+ !ns || strcmp(ns, "") == 0 ? "" : ns);
+ errno = EFBIG;
+ if (ret < 0 || (size_t)ret >= __NS_PATH_LEN)
+ return -EFBIG;
+
+ return open(path, O_RDONLY | O_CLOEXEC);
+}
diff --git a/src/lxc/utils.h b/src/lxc/utils.h
index 8633680b0..984a65708 100644
--- a/src/lxc/utils.h
+++ b/src/lxc/utils.h
@@ -331,4 +331,6 @@ static inline uint64_t lxc_getpagesize(void)
return pgsz;
}
+extern int lxc_preserve_ns(const int pid, const char *ns);
+
#endif /* __LXC_UTILS_H */
More information about the lxc-devel
mailing list