[lxc-devel] [lxcfs/master] [RFC] move lxcfs to minimal chroot

brauner on Github lxc-bot at linuxcontainers.org
Wed Jul 27 22:06:11 UTC 2016


A non-text attachment was scrubbed...
Name: not available
Type: text/x-mailbox
Size: 1142 bytes
Desc: not available
URL: <http://lists.linuxcontainers.org/pipermail/lxc-devel/attachments/20160727/774135b7/attachment.bin>
-------------- next part --------------
From 024cff5f6872839466607675c89dced9a39d17c7 Mon Sep 17 00:00:00 2001
From: Christian Brauner <cbrauner at suse.de>
Date: Wed, 27 Jul 2016 23:24:50 +0200
Subject: [PATCH 1/4] small improvements

- capitalize BASEDIR
- add missing newline to debug output

Signed-off-by: Christian Brauner <cbrauner at suse.de>
---
 bindings.c | 62 +++++++++++++++++++++++++++++++-------------------------------
 bindings.h |  2 +-
 lxcfs.c    | 12 ++++++------
 3 files changed, 38 insertions(+), 38 deletions(-)

diff --git a/bindings.c b/bindings.c
index 8186ca9..4cc3e82 100644
--- a/bindings.c
+++ b/bindings.c
@@ -384,7 +384,7 @@ static void print_subsystems(void)
 {
 	int i;
 
-	fprintf(stderr, "hierarchies:");
+	fprintf(stderr, "hierarchies:\n");
 	for (i = 0; i < num_hierarchies; i++) {
 		if (hierarchies[i])
 			fprintf(stderr, " %d: %s\n", i, hierarchies[i]);
@@ -435,10 +435,10 @@ bool cgfs_set_value(const char *controller, const char *cgroup, const char *file
 
 	if (!tmpc)
 		return false;
-	/* basedir / tmpc / cgroup / file \0 */
-	len = strlen(basedir) + strlen(tmpc) + strlen(cgroup) + strlen(file) + 4;
+	/* BASEDIR / tmpc / cgroup / file \0 */
+	len = strlen(BASEDIR) + strlen(tmpc) + strlen(cgroup) + strlen(file) + 4;
 	fnam = alloca(len);
-	snprintf(fnam, len, "%s/%s/%s/%s", basedir, tmpc, cgroup, file);
+	snprintf(fnam, len, "%s/%s/%s/%s", BASEDIR, tmpc, cgroup, file);
 
 	return write_string(fnam, value);
 }
@@ -486,10 +486,10 @@ int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
 
 	if (!tmpc)
 		return -EINVAL;
-	/* basedir / tmpc / cg \0 */
-	len = strlen(basedir) + strlen(tmpc) + strlen(cg) + 3;
+	/* BASEDIR / tmpc / cg \0 */
+	len = strlen(BASEDIR) + strlen(tmpc) + strlen(cg) + 3;
 	dirnam = alloca(len);
-	snprintf(dirnam, len, "%s/%s/%s", basedir,tmpc, cg);
+	snprintf(dirnam, len, "%s/%s/%s", BASEDIR,tmpc, cg);
 
 	if (mkdir(dirnam, 0755) < 0)
 		return -errno;
@@ -576,10 +576,10 @@ bool cgfs_remove(const char *controller, const char *cg)
 
 	if (!tmpc)
 		return false;
-	/* basedir / tmpc / cg \0 */
-	len = strlen(basedir) + strlen(tmpc) + strlen(cg) + 3;
+	/* BASEDIR / tmpc / cg \0 */
+	len = strlen(BASEDIR) + strlen(tmpc) + strlen(cg) + 3;
 	dirnam = alloca(len);
-	snprintf(dirnam, len, "%s/%s/%s", basedir,tmpc, cg);
+	snprintf(dirnam, len, "%s/%s/%s", BASEDIR,tmpc, cg);
 	return recursive_rmdir(dirnam);
 }
 
@@ -590,10 +590,10 @@ bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
 
 	if (!tmpc)
 		return false;
-	/* basedir / tmpc / file \0 */
-	len = strlen(basedir) + strlen(tmpc) + strlen(file) + 3;
+	/* BASEDIR / tmpc / file \0 */
+	len = strlen(BASEDIR) + strlen(tmpc) + strlen(file) + 3;
 	pathname = alloca(len);
-	snprintf(pathname, len, "%s/%s/%s", basedir, tmpc, file);
+	snprintf(pathname, len, "%s/%s/%s", BASEDIR, tmpc, file);
 	if (chmod(pathname, mode) < 0)
 		return false;
 	return true;
@@ -622,10 +622,10 @@ int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t g
 
 	if (!tmpc)
 		return -EINVAL;
-	/* basedir / tmpc / file \0 */
-	len = strlen(basedir) + strlen(tmpc) + strlen(file) + 3;
+	/* BASEDIR / tmpc / file \0 */
+	len = strlen(BASEDIR) + strlen(tmpc) + strlen(file) + 3;
 	pathname = alloca(len);
-	snprintf(pathname, len, "%s/%s/%s", basedir, tmpc, file);
+	snprintf(pathname, len, "%s/%s/%s", BASEDIR, tmpc, file);
 	if (chown(pathname, uid, gid) < 0)
 		return -errno;
 
@@ -643,10 +643,10 @@ FILE *open_pids_file(const char *controller, const char *cgroup)
 
 	if (!tmpc)
 		return NULL;
-	/* basedir / tmpc / cgroup / "cgroup.procs" \0 */
-	len = strlen(basedir) + strlen(tmpc) + strlen(cgroup) + 4 + strlen("cgroup.procs");
+	/* BASEDIR / tmpc / cgroup / "cgroup.procs" \0 */
+	len = strlen(BASEDIR) + strlen(tmpc) + strlen(cgroup) + 4 + strlen("cgroup.procs");
 	pathname = alloca(len);
-	snprintf(pathname, len, "%s/%s/%s/cgroup.procs", basedir, tmpc, cgroup);
+	snprintf(pathname, len, "%s/%s/%s/cgroup.procs", BASEDIR, tmpc, cgroup);
 	return fopen(pathname, "w");
 }
 
@@ -666,10 +666,10 @@ static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool
 	if (!tmpc)
 		return false;
 
-	/* basedir / tmpc / cgroup \0 */
-	len = strlen(basedir) + strlen(tmpc) + strlen(cgroup) + 3;
+	/* BASEDIR / tmpc / cgroup \0 */
+	len = strlen(BASEDIR) + strlen(tmpc) + strlen(cgroup) + 3;
 	dirname = alloca(len);
-	snprintf(dirname, len, "%s/%s/%s", basedir, tmpc, cgroup);
+	snprintf(dirname, len, "%s/%s/%s", BASEDIR, tmpc, cgroup);
 
 	dir = opendir(dirname);
 	if (!dir)
@@ -761,10 +761,10 @@ bool cgfs_get_value(const char *controller, const char *cgroup, const char *file
 
 	if (!tmpc)
 		return false;
-	/* basedir / tmpc / cgroup / file \0 */
-	len = strlen(basedir) + strlen(tmpc) + strlen(cgroup) + strlen(file) + 4;
+	/* BASEDIR / tmpc / cgroup / file \0 */
+	len = strlen(BASEDIR) + strlen(tmpc) + strlen(cgroup) + strlen(file) + 4;
 	fnam = alloca(len);
-	snprintf(fnam, len, "%s/%s/%s/%s", basedir, tmpc, cgroup, file);
+	snprintf(fnam, len, "%s/%s/%s/%s", BASEDIR, tmpc, cgroup, file);
 
 	*value = slurp_file(fnam);
 	return *value != NULL;
@@ -787,12 +787,12 @@ struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, cons
 	if (file && index(file, '/'))
 		return NULL;
 
-	/* basedir / tmpc / cgroup / file \0 */
-	len = strlen(basedir) + strlen(tmpc) + strlen(cgroup) + 3;
+	/* BASEDIR / tmpc / cgroup / file \0 */
+	len = strlen(BASEDIR) + strlen(tmpc) + strlen(cgroup) + 3;
 	if (file)
 		len += strlen(file) + 1;
 	fnam = alloca(len);
-	snprintf(fnam, len, "%s/%s/%s%s%s", basedir, tmpc, cgroup,
+	snprintf(fnam, len, "%s/%s/%s%s%s", BASEDIR, tmpc, cgroup,
 		file ? "/" : "", file ? file : "");
 
 	ret = stat(fnam, &sb);
@@ -838,10 +838,10 @@ bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
 
 	if (!tmpc)
 		return false;
-	/* basedir / tmpc / cgroup / f \0 */
-	len = strlen(basedir) + strlen(tmpc) + strlen(cgroup) + strlen(f) + 4;
+	/* BASEDIR / tmpc / cgroup / f \0 */
+	len = strlen(BASEDIR) + strlen(tmpc) + strlen(cgroup) + strlen(f) + 4;
 	fnam = alloca(len);
-	snprintf(fnam, len, "%s/%s/%s/%s", basedir, tmpc, cgroup, f);
+	snprintf(fnam, len, "%s/%s/%s/%s", BASEDIR, tmpc, cgroup, f);
 
 	ret = stat(fnam, &sb);
 	if (ret < 0 || !S_ISDIR(sb.st_mode))
diff --git a/bindings.h b/bindings.h
index 3d3bf41..5ac26fb 100644
--- a/bindings.h
+++ b/bindings.h
@@ -1,5 +1,5 @@
 /* directory under which we mount the controllers - /run/lxcfs/controllers */
-#define basedir RUNTIME_PATH "/lxcfs/controllers"
+#define BASEDIR RUNTIME_PATH "/lxcfs/controllers"
 
 extern int cg_write(const char *path, const char *buf, size_t size, off_t offset,
 	     struct fuse_file_info *fi);
diff --git a/lxcfs.c b/lxcfs.c
index 1d19a10..14c694e 100644
--- a/lxcfs.c
+++ b/lxcfs.c
@@ -789,8 +789,8 @@ static bool mkdir_p(const char *dir, mode_t mode)
 
 static bool umount_if_mounted(void)
 {
-	if (umount2(basedir, MNT_DETACH) < 0 && errno != EINVAL) {
-		fprintf(stderr, "failed to umount %s: %s\n", basedir,
+	if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
+		fprintf(stderr, "failed to umount %s: %s\n", BASEDIR,
 			strerror(errno));
 		return false;
 	}
@@ -799,7 +799,7 @@ static bool umount_if_mounted(void)
 
 static bool setup_cgfs_dir(void)
 {
-	if (!mkdir_p(basedir, 0700)) {
+	if (!mkdir_p(BASEDIR, 0700)) {
 		fprintf(stderr, "Failed to create lxcfs cgdir\n");
 		return false;
 	}
@@ -807,7 +807,7 @@ static bool setup_cgfs_dir(void)
 		fprintf(stderr, "Failed to clean up old lxcfs cgdir\n");
 		return false;
 	}
-	if (mount("tmpfs", basedir, "tmpfs", 0, "size=100000,mode=700") < 0) {
+	if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
 		fprintf(stderr, "Failed to mount tmpfs for private controllers\n");
 		return false;
 	}
@@ -820,9 +820,9 @@ static bool do_mount_cgroup(char *controller)
 	size_t len;
 	int ret;
 
-	len = strlen(basedir) + strlen(controller) + 2;
+	len = strlen(BASEDIR) + strlen(controller) + 2;
 	target = alloca(len);
-	ret = snprintf(target, len, "%s/%s", basedir, controller);
+	ret = snprintf(target, len, "%s/%s", BASEDIR, controller);
 	if (ret < 0 || ret >= len)
 		return false;
 	if (mkdir(target, 0755) < 0 && errno != EEXIST)

From 82bdc0bc5bcc20ea51a40833954724d72662e090 Mon Sep 17 00:00:00 2001
From: Christian Brauner <cbrauner at suse.de>
Date: Wed, 27 Jul 2016 23:32:31 +0200
Subject: [PATCH 2/4] order includes

Signed-off-by: Christian Brauner <cbrauner at suse.de>
---
 lxcfs.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/lxcfs.c b/lxcfs.c
index 14c694e..c33bc0d 100644
--- a/lxcfs.c
+++ b/lxcfs.c
@@ -8,28 +8,28 @@
 
 #define FUSE_USE_VERSION 26
 
-#include <stdio.h>
 #include <dirent.h>
+#include <dlfcn.h>
+#include <errno.h>
 #include <fcntl.h>
 #include <fuse.h>
-#include <unistd.h>
-#include <errno.h>
-#include <stdbool.h>
-#include <time.h>
-#include <string.h>
-#include <stdlib.h>
 #include <libgen.h>
-#include <sched.h>
 #include <pthread.h>
-#include <dlfcn.h>
+#include <sched.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+#include <wait.h>
 #include <linux/sched.h>
-#include <sys/socket.h>
-#include <sys/mount.h>
 #include <sys/epoll.h>
-#include <wait.h>
+#include <sys/mount.h>
+#include <sys/socket.h>
 
-#include "config.h" // for VERSION
 #include "bindings.h"
+#include "config.h" // for VERSION
 
 void *dlopen_handle;
 

From 6b56fdfe3357a055ddb2d28452a48cec4ef9d7be Mon Sep 17 00:00:00 2001
From: Christian Brauner <cbrauner at suse.de>
Date: Wed, 27 Jul 2016 23:43:31 +0200
Subject: [PATCH 3/4] lxcfs: add infrastructure to create minimal chroot

Signed-off-by: Christian Brauner <cbrauner at suse.de>
---
 bindings.h |   1 +
 lxcfs.c    | 147 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 148 insertions(+)

diff --git a/bindings.h b/bindings.h
index 5ac26fb..7cd75be 100644
--- a/bindings.h
+++ b/bindings.h
@@ -1,5 +1,6 @@
 /* directory under which we mount the controllers - /run/lxcfs/controllers */
 #define BASEDIR RUNTIME_PATH "/lxcfs/controllers"
+#define ROOTDIR RUNTIME_PATH "/lxcfs/root"
 
 extern int cg_write(const char *path, const char *buf, size_t size, off_t offset,
 	     struct fuse_file_info *fi);
diff --git a/lxcfs.c b/lxcfs.c
index c33bc0d..3d1b9ed 100644
--- a/lxcfs.c
+++ b/lxcfs.c
@@ -27,10 +27,26 @@
 #include <sys/epoll.h>
 #include <sys/mount.h>
 #include <sys/socket.h>
+#include <sys/syscall.h>
 
 #include "bindings.h"
 #include "config.h" // for VERSION
 
+/* Define pivot_root() if missing from the C library */
+#ifndef HAVE_PIVOT_ROOT
+static int pivot_root(const char * new_root, const char * put_old)
+{
+#ifdef __NR_pivot_root
+return syscall(__NR_pivot_root, new_root, put_old);
+#else
+errno = ENOSYS;
+return -1;
+#endif
+}
+#else
+extern int pivot_root(const char * new_root, const char * put_old);
+#endif
+
 void *dlopen_handle;
 
 /* Functions to keep track of number of threads using the library */
@@ -797,6 +813,133 @@ static bool umount_if_mounted(void)
 	return true;
 }
 
+static int pivot_enter(void)
+{
+	int oldroot = -1, newroot = -1;
+
+	oldroot = open("/", O_DIRECTORY | O_RDONLY);
+	if (oldroot < 0) {
+		fprintf(stderr, "%s: Failed to open old root for fchdir.\n", __func__);
+		return -1;
+	}
+
+	newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
+	if (newroot < 0) {
+		fprintf(stderr, "%s: Failed to open new root for fchdir.\n", __func__);
+		goto err;
+	}
+
+	/* change into new root fs */
+	if (fchdir(newroot)) {
+		fprintf(stderr, "%s: Failed to change directory to new rootfs: %s.\n", __func__, ROOTDIR);
+		goto err;
+	}
+
+	/* pivot_root into our new root fs */
+	if (pivot_root(".", ".")) {
+		fprintf(stderr, "%s: pivot_root() syscall failed: %s.\n", __func__, strerror(errno));
+		goto err;
+	}
+
+	/*
+	 * At this point the old-root is mounted on top of our new-root.
+	 * To unmount it we must not be chdir'd into it, so escape back
+	 * to the old-root.
+	 */
+	if (fchdir(oldroot) < 0) {
+		fprintf(stderr, "%s: Failed to enter old root.\n", __func__);
+		goto err;
+	}
+	if (umount2(".", MNT_DETACH) < 0) {
+		fprintf(stderr, "%s: Failed to detach old root.\n", __func__);
+		goto err;
+	}
+
+	if (fchdir(newroot) < 0) {
+		fprintf(stderr, "%s: Failed to re-enter new root.\n", __func__);
+		goto err;
+	}
+
+	close(oldroot);
+	close(newroot);
+	return 0;
+
+err:
+	if (oldroot != -1)
+		close(oldroot);
+	if (newroot != -1)
+		close(newroot);
+	return -1;
+}
+
+/*
+ * Prepare our new root: We need to mount everything that fuse needs to
+ * correctly work in our minimal chroot:
+ *	- /var/lib/lxcfs		<-- the fuse mount
+ *	- /dev				<-- because of /dev/fuse
+ * 	- /sys				<-- because we want to mount /sys/fs/fuse/connections
+ * 	- /sys/fs/fuse/connections	<-- because of fuse
+ * 	- /proc				<-- where we read info from
+ * (Is that all we need? Did we not pin any unnecessary mounts?)
+ */
+static int pivot_prepare(void)
+{
+	if (mkdir(ROOTDIR, 0755) < 0 && errno != EEXIST) {
+		fprintf(stderr, "%s: Failed to create directory for new root.\n", __func__);
+		return -1;
+	}
+
+	if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
+		fprintf(stderr, "%s: Failed to bind-mount / for new root: %s.\n", __func__, strerror(errno));
+		return -1;
+	}
+
+	if (mount("/proc", ROOTDIR "/proc", NULL, MS_REC | MS_MOVE, 0) < 0) {
+		fprintf(stderr, "%s: Failed to move /proc into new root: %s.\n", __func__, strerror(errno));
+		return -1;
+	}
+
+	if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
+		fprintf(stderr, "%s: Failed to bind-mount /run into new root: %s.\n", __func__, strerror(errno));
+		return -1;
+	}
+
+	if (mount("/dev", ROOTDIR "/dev", NULL, MS_BIND, 0) < 0) {
+		printf("%s: Failed to bind-mount /dev into new root: %s.\n", __func__, strerror(errno));
+		return -1;
+	}
+
+	if (mount("/sys", ROOTDIR "/sys", NULL, MS_BIND, 0) < 0) {
+		printf("%s: failed to bind-mount /sys into new root: %s.\n", __func__, strerror(errno));
+		return -1;
+	}
+
+	if (mount("/sys/fs/fuse/connections", ROOTDIR "/sys/fs/fuse/connections", NULL, MS_BIND, 0) < 0) {
+		printf("%s: failed to bind-mount /sys/fs/fuse/connections into " "new root: %s.\n", __func__, strerror(errno));
+		return -1;
+	}
+
+	if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
+		printf("%s: failed to move " BASEDIR " into new root: %s.\n", __func__, strerror(errno));
+		return -1;
+	}
+
+	return 0;
+}
+
+static int pivot_new_root(void)
+{
+	/* Prepare new root. */
+	if (pivot_prepare() < 0)
+		return -1;
+
+	/* Pivot into new root. */
+	if (pivot_enter() < 0)
+		return -1;
+
+	return 0;
+}
+
 static bool setup_cgfs_dir(void)
 {
 	if (!mkdir_p(BASEDIR, 0700)) {
@@ -870,6 +1013,10 @@ static bool do_mount_cgroups(void)
 		if (!do_mount_cgroup(p))
 			goto out;
 	}
+
+	if (pivot_new_root() < 0)
+		goto out;
+
 	ret = true;
 
 out:

From 5f828458c25409c43e56a381600135bcc11a403e Mon Sep 17 00:00:00 2001
From: Christian Brauner <cbrauner at suse.de>
Date: Wed, 27 Jul 2016 23:52:04 +0200
Subject: [PATCH 4/4] move lxcfs to minimal chroot

lxcfs now creates a private mount namespace (actually rather a minimal chroot)
to hide its cgroup mounts under BASEDIR/ from other processes that would get
confused by it. However, the fuse mount usually placed under /var/lib/lxcfs (or
whatever the user gives us via argv[1]) needs to be visible in the initial
namespace. Hence, we place these mounts in different namespaces. This requires
some coordination. fuse_mount() needs to be called in the initial namespace.
Afterwards we can unshare(CLONE_NEWNS), setup the cgroup mounts and create our
minimal chroot. On failure we need to switch back to the initial mount namespace
for fuse_umount() to succeed.
In addition, when we are asked to reload our dynamic library, we also need to
switch to the initial mount namespace.

Signed-off-by: Christian Brauner <cbrauner at suse.de>
---
 lxcfs.c | 241 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 230 insertions(+), 11 deletions(-)

diff --git a/lxcfs.c b/lxcfs.c
index 3d1b9ed..36c8605 100644
--- a/lxcfs.c
+++ b/lxcfs.c
@@ -47,6 +47,22 @@ return -1;
 extern int pivot_root(const char * new_root, const char * put_old);
 #endif
 
+/*
+ * ns_fd[INIT_MNTNS]	--> initial       mnt ns
+ * ns_fd[LXCFS_MNTNS]	--> private lxcfs mnt ns
+ * active_ns		--> currently active mnt ns == one of ns_fd[MNT_NS_MAX]
+ */
+#define INIT_MNTNS 0
+#define LXCFS_MNTNS 1
+#define MNT_NS_MAX 2
+static struct preserved_ns {
+	int ns_fd[MNT_NS_MAX];
+	int active_ns;
+} preserved_ns = {
+    .ns_fd = {-1, -1},
+    .active_ns = -1,
+};
+
 void *dlopen_handle;
 
 /* Functions to keep track of number of threads using the library */
@@ -83,12 +99,84 @@ static void users_unlock(void)
 	unlock_mutex(&user_count_mutex);
 }
 
+/*
+ * Simple functions to preserve and switch mount namespaces.
+ */
+
+/* Close all open file descriptors referring to a namespace. */
+static void close_ns(struct preserved_ns *ns)
+{
+	int i;
+	for (i = 0; i < MNT_NS_MAX; i++) {
+		if (ns->ns_fd[i] > -1) {
+			close(ns->ns_fd[i]);
+			ns->ns_fd[i] = -1;
+		}
+	}
+	ns->active_ns = -1;
+}
+
+/* Open /proc/PID/ns/mnt and save open fd to preserve the mount namespace.
+ * If @is_caller_pid is true, it is assumed that @pid is the caller's pid
+ * and that the caller is attached to the namespace identified by @which_ns.
+ */
+static bool preserve_ns(struct preserved_ns *ns, int which_ns, int pid,
+			bool is_caller_pid)
+{
+	int ret;
+	size_t len = 5 /* /proc */ + 21 /* /int_as_str */ + 7 /* /ns/mnt */ + 1 /* \0 */;
+	char path[len];
+
+	ret = snprintf(path, len, "/proc/%d/ns/mnt", pid);
+	if (ret < 0 || (size_t)ret >= len)
+		return false;
+
+	ns->ns_fd[which_ns] = open(path, O_RDONLY | O_CLOEXEC);
+	if (ns->ns_fd[which_ns] < 0)
+		goto error;
+
+	if (is_caller_pid)
+		ns->active_ns = ns->ns_fd[which_ns];
+
+	return true;
+
+error:
+	close_ns(ns);
+	return false;
+}
+
+/* Switch caller to namespace identified by the fd retrieved via @which_ns and
+ * set the active namespace to the switched namespace. */
+static bool switch_ns(struct preserved_ns *ns, int which_ns)
+{
+	int ret = setns(ns->ns_fd[which_ns], 0);
+	if (ret < 0)
+		ns->active_ns = ns->ns_fd[which_ns] = ret;
+	else
+		ns->active_ns = ns->ns_fd[which_ns];
+
+	return ret == 0;
+}
+
+/*
+ * Functions and types used to reload dynamic library.
+ */
 static volatile sig_atomic_t need_reload;
 
 /* do_reload - reload the dynamic library.  Done under
  * lock and when we know the user_count was 0 */
-static void do_reload(void)
+static void do_reload(struct preserved_ns *ns)
 {
+	if (ns->active_ns != -1) {
+		if (ns->active_ns == ns->ns_fd[INIT_MNTNS])
+			;
+		else
+		    /* What do we want to do if switch_ns() fails here?
+		     * Fail? */
+		    if (!switch_ns(ns, INIT_MNTNS))
+			goto bad;
+	}
+
 	if (dlopen_handle)
 		dlclose(dlopen_handle);
 
@@ -98,22 +186,27 @@ static void do_reload(void)
 		goto good;
 
 	dlopen_handle = dlopen("/usr/lib/lxcfs/liblxcfs.so", RTLD_LAZY);
-	if (!dlopen_handle) {
-		fprintf(stderr, "Failed to open liblxcfs\n");
-		_exit(1);
-	}
+	if (!dlopen_handle)
+		goto bad;
 
 good:
 	if (need_reload)
 		fprintf(stderr, "lxcfs: reloaded\n");
+	if (ns->active_ns != -1)
+		if (!switch_ns(ns, LXCFS_MNTNS))
+			goto bad;
 	need_reload = 0;
+	return;
+bad:
+	fprintf(stderr, "Failed to open liblxcfs\n");
+	_exit(1);
 }
 
 static void up_users(void)
 {
 	users_lock();
 	if (users_count == 0 && need_reload)
-		do_reload();
+		do_reload(&preserved_ns);
 	users_count++;
 	users_unlock();
 }
@@ -1081,6 +1174,129 @@ static int set_pidfile(char *pidfile)
 	return fd;
 }
 
+static struct fuse *fuse_prepare(int argc, char *argv[],
+				 const struct fuse_operations *op,
+				 size_t op_size, char **mountpoint,
+				 int *multithreaded, void *user_data)
+{
+	struct fuse_args args = FUSE_ARGS_INIT(argc, argv);
+	struct fuse_chan *ch;
+	struct fuse *fuse = NULL;
+	int foreground;
+	int res;
+
+	res = fuse_parse_cmdline(&args, mountpoint, multithreaded, &foreground);
+	if (res == -1)
+		return NULL;
+
+	ch = fuse_mount(*mountpoint, &args);
+	if (!ch) {
+		fuse_opt_free_args(&args);
+		goto err_free;
+	}
+
+	/* Switch to new mount namespace for lxcfs and setup private mounts for
+	 * fuse.
+	 */
+	if (unshare(CLONE_NEWNS) < 0) {
+		fprintf(stderr,
+			"%s: Failed to unshare the mount namespace: %s.\n",
+			__func__, strerror(errno));
+		goto err_free;
+	}
+
+	if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
+		fprintf(stderr, "%s: Failed to re-mount / private: %s.\n",
+			__func__, strerror(errno));
+		goto err_free;
+	}
+
+	/* Preserve lxcfs private mount namespace so we can switch to it when we
+	 * need to.
+	 */
+	if (!preserve_ns(&preserved_ns, LXCFS_MNTNS, getpid(), true))
+		goto err_unmount;
+
+	if (!cgfs_setup_controllers())
+		goto err_unmount;
+
+	fuse = fuse_new(ch, &args, op, op_size, user_data);
+	fuse_opt_free_args(&args);
+	if (fuse == NULL)
+		goto err_unmount;
+
+	res = fuse_daemonize(foreground);
+	if (res == -1)
+		goto err_unmount;
+
+	res = fuse_set_signal_handlers(fuse_get_session(fuse));
+	if (res == -1)
+		goto err_unmount;
+
+	return fuse;
+
+err_unmount:
+	/* fuse_unmount() should be done in the initial mount namespace because
+	 * we did our fuse_mount() there. So attach back to host ns here. We do
+	 * not check for error since we can't do anything anyway if it fails.
+	 * TODO: What should we do if we fail? (Probably nothing.(?))
+	 */
+	(void)switch_ns(&preserved_ns, INIT_MNTNS);
+
+	fuse_unmount(*mountpoint, ch);
+	if (fuse)
+		fuse_destroy(fuse);
+err_free:
+	free(*mountpoint);
+	return NULL;
+}
+
+int fuse_init(int argc, char *argv[], const struct fuse_operations *op,
+	      size_t op_size, void *user_data)
+{
+	struct fuse *fuse;
+	char *mountpoint;
+	int multithreaded;
+	int res;
+
+	/* On success fuse_prepare() leaves us in our private mount namespace. */
+	fuse = fuse_prepare(argc, argv, op, op_size, &mountpoint,
+			    &multithreaded, user_data);
+	if (fuse == NULL)
+		return -1;
+
+	if (multithreaded)
+		res = fuse_loop_mt(fuse);
+	else
+		res = fuse_loop(fuse);
+
+	/* fuse_teardown() should be done in the initial mount namespace because
+	 * we did fuse_mount() there. So attach back to the initial
+	 * mount namespace here. We do not check for error since we can't do
+	 * anything anyway if it fails.
+	 */
+	(void)switch_ns(&preserved_ns, INIT_MNTNS);
+
+	fuse_teardown(fuse, mountpoint);
+	if (res == -1)
+		return -1;
+
+	return 0;
+}
+
+/* Note that lxcfs creates a private mount namespace (actually rather a minimal
+ * chroot) to hide its cgroup mounts under BASEDIR/ from other processes that
+ * would get confused by it. However, the fuse mount usually placed under
+ * /var/lib/lxcfs (or whatever the user gives us via argv[1]) needs to be
+ * visible in the initial namespace.
+ * Hence, we place these mounts in different namespaces. This requires some
+ * coordination. fuse_mount() needs to be called in the initial namespace.
+ * Afterwards we can unshare(CLONE_NEWNS), setup the cgroup mounts and create
+ * our minimal chroot. On failure we need to switch back to the initial
+ * mount namespace for fuse_unmount() to succeed.
+ * When we are asked to reload our dynamic library, we also need to switch
+ * to the initial mount namespace.
+ */
 int main(int argc, char *argv[])
 {
 	int ret = -1, pidfd;
@@ -1114,7 +1330,7 @@ int main(int argc, char *argv[])
 	if (argc != 2 || is_help(argv[1]))
 		usage(argv[0]);
 
-	do_reload();
+	do_reload(&preserved_ns);
 	if (signal(SIGUSR1, reload_handler) == SIG_ERR) {
 		fprintf(stderr, "Error setting USR1 signal handler: %m\n");
 		exit(1);
@@ -1127,9 +1343,6 @@ int main(int argc, char *argv[])
 	newargv[cnt++] = argv[1];
 	newargv[cnt++] = NULL;
 
-	if (!cgfs_setup_controllers())
-		goto out;
-
 	if (!pidfile) {
 		pidfile_len = strlen(RUNTIME_PATH) + strlen("/lxcfs.pid") + 1;
 		pidfile = alloca(pidfile_len);
@@ -1138,7 +1351,13 @@ int main(int argc, char *argv[])
 	if ((pidfd = set_pidfile(pidfile)) < 0)
 		goto out;
 
-	ret = fuse_main(nargs, newargv, &lxcfs_ops, NULL);
+	/* Preserve initial mount namespace so we can switch to it when we need
+	 * to (for example, when we reload our dynamic library).
+	 */
+	if (!preserve_ns(&preserved_ns, INIT_MNTNS, getpid(), true))
+		goto out;
+
+	ret = fuse_init(nargs, newargv, &lxcfs_ops, sizeof(lxcfs_ops), NULL);
 
 	dlclose(dlopen_handle);
 	unlink(pidfile);


More information about the lxc-devel mailing list