[lxc-devel] [lxd/master] daemon: support attach to namespaces via pidfds

brauner on Github lxc-bot at linuxcontainers.org
Fri Aug 14 18:09:32 UTC 2020


A non-text attachment was scrubbed...
Name: not available
Type: text/x-mailbox
Size: 364 bytes
Desc: not available
URL: <http://lists.linuxcontainers.org/pipermail/lxc-devel/attachments/20200814/e292afe9/attachment.bin>
-------------- next part --------------
From 4b3132965cf430b0ee4e44cca998d8dbe27dd44a Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner at ubuntu.com>
Date: Fri, 14 Aug 2020 18:45:58 +0200
Subject: [PATCH 1/4] daemon: check namespace management support through pidfds

Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
---
 lxd/daemon.go            |  7 +++++++
 lxd/main_checkfeature.go | 22 ++++++++++++++++------
 lxd/sys/os.go            |  1 +
 3 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/lxd/daemon.go b/lxd/daemon.go
index 90131cdf02..cc5bcb4fad 100644
--- a/lxd/daemon.go
+++ b/lxd/daemon.go
@@ -690,6 +690,13 @@ func (d *Daemon) init() error {
 		logger.Infof(" - seccomp listener add file descriptors: no")
 	}
 
+	d.os.PidFdSetns = canUsePidFdSetns()
+	if d.os.PidFdSetns {
+		logger.Infof(" - attach to namespaces via pidfds: yes")
+	} else {
+		logger.Infof(" - attach to namespaces via pidfds: no")
+	}
+
 	if d.os.LXCFeatures["devpts_fd"] && canUseNativeTerminals() {
 		d.os.NativeTerminals = true
 		logger.Infof(" - safe native terminal allocation : yes")
diff --git a/lxd/main_checkfeature.go b/lxd/main_checkfeature.go
index a11ac9de74..14433976d7 100644
--- a/lxd/main_checkfeature.go
+++ b/lxd/main_checkfeature.go
@@ -47,6 +47,7 @@ __ro_after_init bool close_range_aware = false;
 __ro_after_init bool tiocgptpeer_aware = false;
 __ro_after_init bool netnsid_aware = false;
 __ro_after_init bool pidfd_aware = false;
+__ro_after_init bool pidfd_setns_aware = false;
 __ro_after_init bool uevent_aware = false;
 __ro_after_init int seccomp_notify_aware = 0;
 __ro_after_init char errbuf[4096];
@@ -429,14 +430,14 @@ static void is_seccomp_notify_aware(void)
 
 }
 
-static void is_pidfd_aware(void)
+static int is_pidfd_aware(void)
 {
 	__do_close int pidfd = -EBADF;
 	int ret;
 
 	pidfd = pidfd_open(getpid(), 0);
 	if (pidfd < 0)
-		return;
+		return -EBADF;
 
 	// We don't care whether or not children were in a waitable state. We
 	// just care whether waitid() recognizes P_PIDFD.
@@ -448,13 +449,14 @@ static void is_pidfd_aware(void)
 		    // What state to wait for.
 		    WEXITED | WSTOPPED | WCONTINUED);
 	if (ret < 0 && errno != ECHILD)
-		return;
+		return -errno;
 
 	ret = pidfd_send_signal(pidfd, 0, NULL, 0);
 	if (ret)
-		return;
+		return -errno;
 
 	pidfd_aware = true;
+	return move_fd(pidfd);
 }
 
 #ifndef TIOCGPTPEER
@@ -505,15 +507,18 @@ static void is_close_range_aware(void)
 
 void checkfeature(void)
 {
-	__do_close int hostnetns_fd = -EBADF, newnetns_fd = -EBADF;
+	__do_close int hostnetns_fd = -EBADF, newnetns_fd = -EBADF, pidfd = -EBADF;
 
 	is_netnsid_aware(&hostnetns_fd, &newnetns_fd);
-	is_pidfd_aware();
+	pidfd = is_pidfd_aware();
 	is_uevent_aware();
 	is_seccomp_notify_aware();
 	is_tiocgptpeer_aware();
 	is_close_range_aware();
 
+	if (pidfd >= 0)
+		pidfd_setns_aware = !setns(pidfd, CLONE_NEWNET);
+
 	if (setns(hostnetns_fd, CLONE_NEWNET) < 0)
 		(void)sprintf(errbuf, "%s", "Failed to attach to host network namespace");
 
@@ -549,6 +554,7 @@ func canUseSeccompListenerContinue() bool {
 func canUseSeccompListenerAddfd() bool {
 	return bool(C.seccomp_notify_aware == 3)
 }
+
 func canUsePidFds() bool {
 	return bool(C.pidfd_aware)
 }
@@ -597,3 +603,7 @@ func canUseNativeTerminals() bool {
 func canUseCloseRange() bool {
 	return bool(C.close_range_aware)
 }
+
+func canUsePidFdSetns() bool {
+	return bool(C.pidfd_setns_aware)
+}
diff --git a/lxd/sys/os.go b/lxd/sys/os.go
index 5dc6d1df61..c68ceb6aaf 100644
--- a/lxd/sys/os.go
+++ b/lxd/sys/os.go
@@ -71,6 +71,7 @@ type OS struct {
 	NativeTerminals         bool
 	NetnsGetifaddrs         bool
 	PidFds                  bool
+	PidFdSetns              bool
 	SeccompListener         bool
 	SeccompListenerAddfd    bool
 	SeccompListenerContinue bool

From e4cb9d3d95c0c832309a6c53855b1139e6ad9b8b Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner at ubuntu.com>
Date: Fri, 14 Aug 2020 18:47:19 +0200
Subject: [PATCH 2/4] main_nsexec: remove unused dosetns() function

Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
---
 lxd/main_nsexec.go | 19 -------------------
 1 file changed, 19 deletions(-)

diff --git a/lxd/main_nsexec.go b/lxd/main_nsexec.go
index bbd148ba5a..f336a96b46 100644
--- a/lxd/main_nsexec.go
+++ b/lxd/main_nsexec.go
@@ -108,25 +108,6 @@ void error(char *msg)
 	fprintf(stderr, "errno: %d\n", old_errno);
 }
 
-int dosetns(int pid, char *nstype) {
-	__do_close int ns_fd = -EBADF;
-	char buf[PATH_MAX];
-
-	sprintf(buf, "/proc/%d/ns/%s", pid, nstype);
-	ns_fd = open(buf, O_RDONLY);
-	if (ns_fd < 0) {
-		error("error: open namespace");
-		return -1;
-	}
-
-	if (setns(ns_fd, 0) < 0) {
-		error("error: setns");
-		return -1;
-	}
-
-	return 0;
-}
-
 int dosetns_file(char *file, char *nstype) {
 	__do_close int ns_fd = -EBADF;
 

From 41210863889b20d3562ee4bdb2911243e0ff73f3 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner at ubuntu.com>
Date: Fri, 14 Aug 2020 19:13:32 +0200
Subject: [PATCH 3/4] main_nsexec: add new change_namespace() helper

which abstracts nsfd and pidfd based namespace attaching.

Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
---
 lxd/include/macro.h      |  2 ++
 lxd/main_checkfeature.go |  2 --
 lxd/main_forkfile.go     |  9 ++++----
 lxd/main_nsexec.go       | 46 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 52 insertions(+), 7 deletions(-)

diff --git a/lxd/include/macro.h b/lxd/include/macro.h
index 87d7d41fb2..89401be780 100644
--- a/lxd/include/macro.h
+++ b/lxd/include/macro.h
@@ -272,4 +272,6 @@ enum {
 #define P_PIDFD 3
 #endif
 
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
+
 #endif /* __LXC_MACRO_H */
diff --git a/lxd/main_checkfeature.go b/lxd/main_checkfeature.go
index 14433976d7..30e352014f 100644
--- a/lxd/main_checkfeature.go
+++ b/lxd/main_checkfeature.go
@@ -152,8 +152,6 @@ static void is_uevent_aware(void)
 	uevent_aware = true;
 }
 
-#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
-
 static int user_trap_syscall(int nr, unsigned int flags)
 {
 	struct sock_filter filter[] = {
diff --git a/lxd/main_forkfile.go b/lxd/main_forkfile.go
index 3a2ab7efd1..53e07d4734 100644
--- a/lxd/main_forkfile.go
+++ b/lxd/main_forkfile.go
@@ -62,10 +62,9 @@ int copy(int target, int source, bool append)
 
 int manip_file_in_ns(char *rootfs, int ns_fd, char *host, char *container, bool is_put, char *type, uid_t uid, gid_t gid, mode_t mode, uid_t defaultUid, gid_t defaultGid, mode_t defaultMode, bool append) {
 	__do_close int host_fd = -EBADF, container_fd = -EBADF;
-	int ret = -1;
+	int exists = -1, fret = -1;
 	int container_open_flags;
 	struct stat st;
-	int exists = 1;
 	bool is_dir_manip = type != NULL && !strcmp(type, "directory");
 	bool is_symlink_manip = type != NULL && !strcmp(type, "symlink");
 	char link_target[PATH_MAX];
@@ -221,7 +220,7 @@ int manip_file_in_ns(char *rootfs, int ns_fd, char *host, char *container, bool
 			error("error: chown");
 			return -1;
 		}
-		ret = 0;
+		fret = 0;
 	} else {
 		if (fstat(container_fd, &st) < 0) {
 			error("error: stat");
@@ -266,12 +265,12 @@ int manip_file_in_ns(char *rootfs, int ns_fd, char *host, char *container, bool
 			return -1;
 		} else {
 			fprintf(stderr, "type: file\n");
-			ret = copy(host_fd, container_fd, false);
+			fret = copy(host_fd, container_fd, false);
 		}
 		fprintf(stderr, "type: %s", S_ISDIR(st.st_mode) ? "directory" : "file");
 	}
 
-	return ret;
+	return fret;
 }
 
 void forkdofile(bool is_put, char *rootfs, int ns_fd) {
diff --git a/lxd/main_nsexec.go b/lxd/main_nsexec.go
index f336a96b46..887b835536 100644
--- a/lxd/main_nsexec.go
+++ b/lxd/main_nsexec.go
@@ -59,6 +59,7 @@ extern void forkuevent();
 char *cmdline_buf = NULL;
 char *cmdline_cur = NULL;
 ssize_t cmdline_size = -1;
+extern int pidfd_setns_aware;
 
 int wait_for_pid(pid_t pid)
 {
@@ -267,6 +268,51 @@ int pidfd_nsfd(int pidfd, pid_t pid)
 	return move_fd(ns_fd);
 }
 
+static const struct ns_info {
+	const char *proc_name;
+	int clone_flag;
+} ns_info[] = {
+	{ "user",   CLONE_NEWUSER	},
+	{ "mnt",    CLONE_NEWNS		},
+	{ "pid",    CLONE_NEWPID	},
+	{ "uts",    CLONE_NEWUTS	},
+	{ "ipc",    CLONE_NEWIPC	},
+	{ "net",    CLONE_NEWNET	},
+	{ "cgroup", CLONE_NEWCGROUP	},
+	{ "time",   CLONE_NEWTIME	},
+};
+
+static inline const char *namespace_flag_into_name(unsigned int flags)
+{
+	for (int i = 0; i < ARRAY_SIZE(ns_info); i++)
+		if (ns_info[i].clone_flag == flags)
+			return ns_info[i].proc_name;
+
+	return NULL;
+}
+
+bool change_namespaces(int pidfd, int nsfd, unsigned int flags)
+{
+	__do_close int fd = -EBADF;
+	const char *ns;
+
+	if (pidfd >= 0 && nsfd >= 0)
+		return false;
+
+	if (pidfd >= 0)
+		return setns(pidfd, flags) == 0;
+
+	ns = namespace_flag_into_name(flags);
+	if (!ns)
+		return false;
+
+	fd = openat(nsfd, ns, O_RDONLY | O_CLOEXEC);
+	if (fd < 0)
+		return false;
+
+	return setns(fd, 0) == 0;
+}
+
 bool setnsat(int ns_fd, const char *ns)
 {
 	__do_close int fd = -EBADF;

From 1ccbd7067a1655feb92f755adb7ccad1d131a68a Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner at ubuntu.com>
Date: Fri, 14 Aug 2020 20:05:08 +0200
Subject: [PATCH 4/4] forksyscall: use pidfds to attach to namespaces

This is currently somewhat penalizing kernels that don't have pidfd setns
support because we're trying to setns() via the pidfd first and then fallback
to regular setns(). It's also penalizing error paths even non newer kernels
because in that case we always try both because we can't discern based on
return value from setns() when passing a pidfd if it failed because it doesn't
supports pidfd or because an actual error happened.

So in the long-term we'll want a way for the LXD daemon to communicate the
feature mask it detected at startup time to its helpers. This could probably be
a 32 bitmask that records all the kernel features via flags and that is passed
as an argument to the subprocess which can check for features it needs to know
about. Or maybe something more clever.

Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
---
 lxd/main_forksyscall.go | 27 +++++++++++++--------------
 lxd/main_nsexec.go      | 10 ++++++----
 2 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/lxd/main_forksyscall.go b/lxd/main_forksyscall.go
index bb148fa7d2..b4a62a3b92 100644
--- a/lxd/main_forksyscall.go
+++ b/lxd/main_forksyscall.go
@@ -35,8 +35,8 @@ import (
 extern char* advance_arg(bool required);
 extern void attach_userns_fd(int ns_fd);
 extern int pidfd_nsfd(int pidfd, pid_t pid);
-extern bool setnsat(int ns_fd, const char *ns);
 extern int preserve_ns(const int pid, const char *ns);
+extern bool change_namespaces(int pidfd, int nsfd, unsigned int flags);
 
 static bool chdirchroot_in_mntns(int cwd_fd, int root_fd)
 {
@@ -57,7 +57,7 @@ static bool chdirchroot_in_mntns(int cwd_fd, int root_fd)
 	return true;
 }
 
-static bool acquire_basic_creds(pid_t pid, int ns_fd)
+static bool acquire_basic_creds(pid_t pid, int pidfd, int ns_fd)
 {
 	__do_close int cwd_fd = -EBADF, root_fd = -EBADF;
 	char buf[256];
@@ -72,7 +72,7 @@ static bool acquire_basic_creds(pid_t pid, int ns_fd)
 	if (cwd_fd < 0)
 		return false;
 
-	if (!setnsat(ns_fd, "mnt"))
+	if (!change_namespaces(pidfd, ns_fd, CLONE_NEWNS))
 		return false;
 
 	return chdirchroot_in_mntns(cwd_fd, root_fd);
@@ -157,7 +157,7 @@ static void mknod_emulate(void)
 	fsuid = atoi(advance_arg(true));
 	fsgid = atoi(advance_arg(true));
 
-	if (!acquire_basic_creds(pid, ns_fd)) {
+	if (!acquire_basic_creds(pid, pidfd, ns_fd)) {
 		fprintf(stderr, "%d", ENOANO);
 		_exit(EXIT_FAILURE);
 	}
@@ -203,15 +203,15 @@ static void mknod_emulate(void)
 #define CLONE_NEWCGROUP	0x02000000
 #endif
 
-const char *ns_names[] = { "user", "pid", "uts", "ipc", "net", "cgroup", NULL };
+const static int ns_flags[] = { CLONE_NEWUSER, CLONE_NEWPID, CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWNET, CLONE_NEWCGROUP };
 
-static bool change_creds(int ns_fd, cap_t caps, uid_t nsuid, gid_t nsgid, uid_t nsfsuid, gid_t nsfsgid)
+static bool change_creds(int pidfd, int ns_fd, cap_t caps, uid_t nsuid, gid_t nsgid, uid_t nsfsuid, gid_t nsfsgid)
 {
 	if (prctl(PR_SET_KEEPCAPS, 1))
 		return false;
 
-	for (const char **ns = ns_names; ns && *ns; ns++) {
-		if (!setnsat(ns_fd, *ns))
+	for (int i = 0; ARRAY_SIZE(ns_flags); i++) {
+		if (!change_namespaces(pidfd, ns_fd, ns_flags[i]))
 			return false;
 	}
 
@@ -261,7 +261,7 @@ static void setxattr_emulate(void)
 	size = atoi(advance_arg(true));
 	data = advance_arg(true);
 
-	if (!acquire_basic_creds(pid, ns_fd)) {
+	if (!acquire_basic_creds(pid, pidfd, ns_fd)) {
 		fprintf(stderr, "%d", ENOANO);
 		_exit(EXIT_FAILURE);
 	}
@@ -296,7 +296,7 @@ static void setxattr_emulate(void)
 			_exit(EXIT_FAILURE);
 		}
 	} else {
-		if (!change_creds(ns_fd, caps, nsuid, nsgid, nsfsuid, nsfsgid)) {
+		if (!change_creds(pidfd, ns_fd, caps, nsuid, nsgid, nsfsuid, nsfsgid)) {
 			fprintf(stderr, "%d", EFAULT);
 			_exit(EXIT_FAILURE);
 		}
@@ -355,7 +355,6 @@ static void mount_emulate(void)
 	ns_fd = pidfd_nsfd(pidfd, pid);
 	if (ns_fd < 0)
 		_exit(EXIT_FAILURE);
-
 	use_fuse = (atoi(advance_arg(true)) == 1);
 	if (!use_fuse) {
 		source = advance_arg(true);
@@ -386,10 +385,10 @@ static void mount_emulate(void)
 		// Attach to pid namespace so that if we spawn a fuse daemon
 		// it'll belong to the correct pid namespace and dies with the
 		// container.
-		setnsat(ns_fd, "pid");
+		change_namespaces(pidfd, ns_fd, CLONE_NEWPID);
 	}
 
-	if (!acquire_basic_creds(pid, ns_fd))
+	if (!acquire_basic_creds(pid, pidfd, ns_fd))
 		_exit(EXIT_FAILURE);
 
 	if (!acquire_final_creds(pid, uid, gid, fsuid, fsgid))
@@ -447,7 +446,7 @@ static void mount_emulate(void)
 		}
 
 		attach_userns_fd(ns_fd);
-		if (!acquire_basic_creds(pid, ns_fd)) {
+		if (!acquire_basic_creds(pid, pidfd, ns_fd)) {
 			umount2(target, MNT_DETACH);
 			umount2(target, MNT_DETACH);
 			_exit(EXIT_FAILURE);
diff --git a/lxd/main_nsexec.go b/lxd/main_nsexec.go
index 887b835536..c3e8bd1290 100644
--- a/lxd/main_nsexec.go
+++ b/lxd/main_nsexec.go
@@ -296,11 +296,13 @@ bool change_namespaces(int pidfd, int nsfd, unsigned int flags)
 	__do_close int fd = -EBADF;
 	const char *ns;
 
-	if (pidfd >= 0 && nsfd >= 0)
-		return false;
+	if (pidfd >= 0 && setns(pidfd, flags) == 0) {
+		kill(getppid(), SIGKILL);
+		return true;
+	}
 
-	if (pidfd >= 0)
-		return setns(pidfd, flags) == 0;
+	if (nsfd < 0)
+		return false;
 
 	ns = namespace_flag_into_name(flags);
 	if (!ns)


More information about the lxc-devel mailing list