[lxc-devel] [PATCH 1/2] add support for nbd (v3)

Serge Hallyn serge.hallyn at ubuntu.com
Thu May 15 14:33:18 UTC 2014


Backing stores supported by qemu-nbd can be attached to an nbd block
device using qemu-nbd.  This user-space process (pair) stays around for
the duration of the device attachment.  Obviously we want it to go away
when the container shuts down, but not before the filesystems have been
cleanly unmounted.

The device attachment is done from the task which will become the
container monitor before the container setup+init task is spawned.
That task starts in a new pid namespace to ensure that the qemu-nbd
process will be killed if need be.  It sets its parent death signal
to SIGHUP, and, on receiving SIGHUP, attempts to do a clean
nbd device detach, then exits.  This should ensure that the
device is detached if the container monitor crashes or exits.

It may be worth adding a delay before the qemu-nbd is detached, but
my brief tests haven't seen any data corruption.

Only the parts required for running a nbd-backed container are
implemented here.  Create, destroy, and clone are not.  The first
use of this that I imagine is for people to use downloaded nbd-backed
images (like ubuntu cloud images, or anything previously used with
qemu).  I imagine people will want to create/clone/destroy out of
band using qemu-img, but if I'm wrong about that we can implement
the rest later.

Because attach_block_device() is done before the bdev is initialized,
and bdev_init needs to know the nbd index so that it can mount the
filesystem, we now need to pass the lxc_conf to bdev_init().

file_exists() is moved to utils.c so we can use it from bdev.c.

The nbd attach/detach should lay the groundwork for trivial implementation
of qed and raw images.

changelog (May 12): fix idx check at detach
changelog (May 15): generalize qcow2 to nbd

Signed-off-by: Serge Hallyn <serge.hallyn at ubuntu.com>
---
 src/lxc/bdev.c         | 293 ++++++++++++++++++++++++++++++++++++++++++++++++-
 src/lxc/bdev.h         |  17 ++-
 src/lxc/conf.c         |   3 +-
 src/lxc/conf.h         |   1 +
 src/lxc/lxccontainer.c |  19 +---
 src/lxc/start.c        |  11 +-
 src/lxc/utils.c        |   7 ++
 src/lxc/utils.h        |   1 +
 8 files changed, 329 insertions(+), 23 deletions(-)

diff --git a/src/lxc/bdev.c b/src/lxc/bdev.c
index 20e9fb3..e22d83d 100644
--- a/src/lxc/bdev.c
+++ b/src/lxc/bdev.c
@@ -41,6 +41,7 @@
 #include <libgen.h>
 #include <linux/loop.h>
 #include <dirent.h>
+#include <sys/prctl.h>
 
 #include "lxc.h"
 #include "config.h"
@@ -2410,6 +2411,287 @@ static const struct bdev_ops aufs_ops = {
 	.can_snapshot = true,
 };
 
+//
+// nbd dev ops
+//
+
+static int nbd_detect(const char *path)
+{
+	if (strncmp(path, "nbd:", 4) == 0)
+		return 1;
+	return 0;
+}
+
+struct nbd_attach_data {
+	const char *nbd;
+	const char *path;
+};
+
+static void nbd_detach(const char *path)
+{
+	int ret;
+	pid_t pid = fork();
+
+	if (pid < 0) {
+		SYSERROR("Error forking to detach nbd");
+		return;
+	}
+	if (pid) {
+		ret = wait_for_pid(pid);
+		if (ret < 0)
+			ERROR("nbd disconnect returned an error");
+		return;
+	}
+	execlp("qemu-nbd", "qemu-nbd", "-d", path, NULL);
+	SYSERROR("Error executing qemu-nbd");
+	exit(1);
+}
+
+/* nbd watcher task: fork qemu-nbd to attach data->path to data->nbd, then
+ * wait for SIGHUP (the parent death signal set below) and detach the device.
+ * Never returns. */
+static int do_attach_nbd(void *d)
+{
+	struct nbd_attach_data *data = d;
+	const char *nbd = data->nbd, *path = data->path;
+	pid_t pid;
+	sigset_t mask;
+	int sfd;
+	ssize_t s;
+	struct signalfd_siginfo fdsi;
+
+	sigemptyset(&mask);
+	sigaddset(&mask, SIGHUP);
+	sigaddset(&mask, SIGCHLD);
+
+	if (sigprocmask(SIG_BLOCK, &mask, NULL) == -1) {
+		SYSERROR("Error blocking signals for nbd watcher");
+		exit(1);
+	}
+
+	sfd = signalfd(-1, &mask, 0);
+	if (sfd == -1) {
+		SYSERROR("Error opening signalfd for nbd task");
+		exit(1);
+	}
+
+	if (prctl(PR_SET_PDEATHSIG, SIGHUP, 0, 0, 0) < 0)
+		SYSERROR("Error setting parent death signal for nbd watcher");
+
+	pid = fork();
+	if (pid) {
+		for (;;) {
+			s = read(sfd, &fdsi, sizeof(struct signalfd_siginfo));
+			if (s != sizeof(struct signalfd_siginfo)) {
+				SYSERROR("Error reading from signalfd");
+				continue; /* fdsi is not valid; do not inspect it */
+			}
+			if (fdsi.ssi_signo == SIGHUP) {
+				/* container has exited */
+				nbd_detach(nbd);
+				exit(0);
+			} else if (fdsi.ssi_signo == SIGCHLD) {
+				while (waitpid(-1, NULL, WNOHANG) > 0); /* reap all exited children */
+			}
+		}
+	}
+
+	close(sfd);
+	if (sigprocmask(SIG_UNBLOCK, &mask, NULL) == -1)
+		WARN("Warning: unblocking signals for nbd watcher");
+
+	execlp("qemu-nbd", "qemu-nbd", "-c", nbd, path, NULL);
+	SYSERROR("Error executing qemu-nbd");
+	exit(1);
+}
+
+static bool clone_attach_nbd(const char *nbd, const char *path)
+{
+	pid_t pid;
+	struct nbd_attach_data data;
+
+	data.nbd = nbd;
+	data.path = path;
+
+	pid = lxc_clone(do_attach_nbd, &data, CLONE_NEWPID);
+	if (pid < 0)
+		return false;
+	return true;
+}
+
+static bool nbd_busy(int idx)
+{
+	char path[100];
+	int ret;
+
+	ret = snprintf(path, 100, "/sys/block/nbd%d/pid", idx);
+	if (ret < 0 || ret >= 100)
+		return true;
+	return file_exists(path);
+}
+
+static bool attach_nbd(char *src, struct lxc_conf *conf)
+{
+	char *orig = alloca(strlen(src)+1), *p, path[50];
+	int i = 0;
+
+	strcpy(orig, src);
+	/* if path is followed by a partition, drop that for now */
+	p = strchr(orig, ':');
+	if (p)
+		*p = '\0';
+	while (1) {
+		sprintf(path, "/dev/nbd%d", i);
+		if (!file_exists(path))
+			return false;
+		if (nbd_busy(i)) {
+			i++;
+			continue;
+		}
+		if (!clone_attach_nbd(path, orig))
+			return false;
+		conf->nbd_idx = i;
+		return true;
+	}
+}
+
+static bool requires_nbd(const char *path)
+{
+	if (strncmp(path, "nbd:", 4) == 0)
+		return true;
+	return false;
+}
+
+/*
+ * attach_block_device returns true if all went well,
+ * meaning either a block device was attached or was not
+ * needed.  It returns false if something went wrong and
+ * container startup should be stopped.
+ */
+bool attach_block_device(struct lxc_conf *conf)
+{
+	char *path;
+
+	if (!conf->rootfs.path)
+		return true;
+	path = conf->rootfs.path;
+	if (!requires_nbd(path))
+		return true;
+	path = strchr(path, ':');
+	if (!path)
+		return false;
+	path++;
+	if (!attach_nbd(path, conf))
+		return false;
+	return true;
+}
+
+void detach_nbd_idx(int idx)
+{
+	int ret;
+	char path[50];
+
+	ret = snprintf(path, 50, "/dev/nbd%d", idx);
+	if (ret < 0 || ret >= 50)
+		return;
+
+	nbd_detach(path);
+}
+
+void detach_block_device(struct lxc_conf *conf)
+{
+	if (conf->nbd_idx != -1)
+		detach_nbd_idx(conf->nbd_idx);
+}
+
+/*
+ * Pick the partition # off the end of a nbd:file:p
+ * description.  Return 1-9 for the partition id, or 0
+ * if no partition (or no valid single digit) is given.
+ */
+static int nbd_get_partition(const char *src)
+{
+	char *p = strchr(src, ':');
+	if (!p)
+		return 0;
+	p = strchr(p+1, ':');
+	if (!p)
+		return 0;
+	p++;
+	if (*p < '1' || *p > '9')
+		return 0;
+	return *p - '0';
+}
+
+/* Mount /dev/nbd<idx>[p<partition>] on bdev->dest; returns 0 on success, nonzero on error. */
+static int nbd_mount(struct bdev *bdev)
+{
+	int ret = -1, partition;
+	char path[50];
+
+	if (strcmp(bdev->type, "nbd"))
+		return -22;
+	if (!bdev->src || !bdev->dest)
+		return -22;
+
+	/* nbd_idx should have been copied by bdev_init from the lxc_conf */
+	if (bdev->nbd_idx < 0)
+		return -22;
+	partition = nbd_get_partition(bdev->src);
+	if (partition)
+		ret = snprintf(path, sizeof(path), "/dev/nbd%dp%d", bdev->nbd_idx, partition);
+	else
+		ret = snprintf(path, sizeof(path), "/dev/nbd%d", bdev->nbd_idx);
+	if (ret < 0 || ret >= (int)sizeof(path)) {
+		ERROR("Error setting up nbd device path");
+		return -1; /* a truncated length is positive; never return it as a status */
+	}
+	ret = mount_unknown_fs(path, bdev->dest, bdev->mntopts);
+	if (ret < 0)
+		ERROR("Error mounting %s", bdev->src);
+
+	return ret;
+}
+
+static int nbd_create(struct bdev *bdev, const char *dest, const char *n,
+			struct bdev_specs *specs)
+{
+	return -ENOSYS;
+}
+
+static int nbd_clonepaths(struct bdev *orig, struct bdev *new, const char *oldname,
+		const char *cname, const char *oldpath, const char *lxcpath, int snap,
+		uint64_t newsize, struct lxc_conf *conf)
+{
+	return -ENOSYS;
+}
+
+static int nbd_destroy(struct bdev *orig)
+{
+	return -ENOSYS;
+}
+
+static int nbd_umount(struct bdev *bdev)
+{
+	int ret;
+
+	if (strcmp(bdev->type, "nbd"))
+		return -22;
+	if (!bdev->src || !bdev->dest)
+		return -22;
+	ret = umount(bdev->dest);
+	return ret;
+}
+
+static const struct bdev_ops nbd_ops = {
+	.detect = &nbd_detect,
+	.mount = &nbd_mount,
+	.umount = &nbd_umount,
+	.clone_paths = &nbd_clonepaths,
+	.destroy = &nbd_destroy,
+	.create = &nbd_create,
+	.can_snapshot = true,
+};
 
 static const struct bdev_type bdevs[] = {
 	{.name = "zfs", .ops = &zfs_ops,},
@@ -2419,6 +2701,7 @@ static const struct bdev_type bdevs[] = {
 	{.name = "aufs", .ops = &aufs_ops,},
 	{.name = "overlayfs", .ops = &overlayfs_ops,},
 	{.name = "loop", .ops = &loop_ops,},
+	{.name = "nbd", .ops = &nbd_ops,},
 };
 
 static const size_t numbdevs = sizeof(bdevs) / sizeof(struct bdev_type);
@@ -2454,7 +2737,7 @@ struct bdev *bdev_get(const char *type)
 	return bdev;
 }
 
-struct bdev *bdev_init(const char *src, const char *dst, const char *mntopts)
+struct bdev *bdev_init(struct lxc_conf *conf, const char *src, const char *dst, const char *mntopts)
 {
 	int i;
 	struct bdev *bdev;
@@ -2480,6 +2763,8 @@ struct bdev *bdev_init(const char *src, const char *dst, const char *mntopts)
 		bdev->src = strdup(src);
 	if (dst)
 		bdev->dest = strdup(dst);
+	if (strcmp(bdev->type, "nbd") == 0)
+		bdev->nbd_idx = conf->nbd_idx;
 
 	return bdev;
 }
@@ -2538,9 +2823,9 @@ static int rsync_rootfs_wrapper(void *data)
 	return rsync_rootfs(arg);
 }
 
-bool bdev_is_dir(const char *path)
+bool bdev_is_dir(struct lxc_conf *conf, const char *path)
 {
-	struct bdev *orig = bdev_init(path, NULL, NULL);
+	struct bdev *orig = bdev_init(conf, path, NULL, NULL);
 	bool ret = false;
 	if (!orig)
 		return ret;
@@ -2605,7 +2890,7 @@ struct bdev *bdev_copy(struct lxc_container *c0, const char *cname,
 		return NULL;
 	}
 
-	orig = bdev_init(src, NULL, NULL);
+	orig = bdev_init(c0->lxc_conf, src, NULL, NULL);
 	if (!orig) {
 		ERROR("failed to detect blockdev type for %s", src);
 		return NULL;
diff --git a/src/lxc/bdev.h b/src/lxc/bdev.h
index cc0bf02..0893c11 100644
--- a/src/lxc/bdev.h
+++ b/src/lxc/bdev.h
@@ -24,8 +24,7 @@
 #ifndef __LXC_BDEV_H
 #define __LXC_BDEV_H
 /* blockdev operations for:
- * aufs, dir, raw, btrfs, overlayfs, aufs, lvm, loop, zfs
- * someday: qemu-nbd, qcow2, qed
+ * aufs, dir, raw, btrfs, overlayfs, lvm, loop, zfs, nbd (qcow2, raw, vdi, qed)
  */
 
 #include "config.h"
@@ -83,11 +82,13 @@ struct bdev {
 	// turn the following into a union if need be
 	// lofd is the open fd for the mounted loopback file
 	int lofd;
+	// index for the connected nbd device
+	int nbd_idx;
 };
 
 char *overlay_getlower(char *p);
 
-bool bdev_is_dir(const char *path);
+bool bdev_is_dir(struct lxc_conf *conf, const char *path);
 
 /*
  * Instantiate a bdev object.  The src is used to determine which blockdev
@@ -100,7 +101,8 @@ bool bdev_is_dir(const char *path);
  * use /var/lib/lxc/canonical/rootfs as lower dir, and /var/lib/lxc/c1/delta
  * as the upper, writeable layer.
  */
-struct bdev *bdev_init(const char *src, const char *dst, const char *data);
+struct bdev *bdev_init(struct lxc_conf *conf, const char *src, const char *dst,
+			const char *data);
 
 struct bdev *bdev_copy(struct lxc_container *c0, const char *cname,
 			const char *lxcpath, const char *bdevtype,
@@ -110,6 +112,13 @@ struct bdev *bdev_create(const char *dest, const char *type,
 			const char *cname, struct bdev_specs *specs);
 void bdev_put(struct bdev *bdev);
 
+/*
+ * these are really for qemu-nbd support, as container shutdown
+ * must explicitly request device detach.
+ */
+bool attach_block_device(struct lxc_conf *conf);
+void detach_block_device(struct lxc_conf *conf);
+
 /* define constants if the kernel/glibc headers don't define them */
 #ifndef MS_DIRSYNC
 #define MS_DIRSYNC  128
diff --git a/src/lxc/conf.c b/src/lxc/conf.c
index 78d9de2..7427a94 100644
--- a/src/lxc/conf.c
+++ b/src/lxc/conf.c
@@ -1555,7 +1555,7 @@ static int setup_rootfs(struct lxc_conf *conf)
 	}
 
 	// First try mounting rootfs using a bdev
-	struct bdev *bdev = bdev_init(rootfs->path, rootfs->mount, rootfs->options);
+	struct bdev *bdev = bdev_init(conf, rootfs->path, rootfs->mount, rootfs->options);
 	if (bdev && bdev->ops->mount(bdev) == 0) {
 		bdev_put(bdev);
 		DEBUG("mounted '%s' on '%s'", rootfs->path, rootfs->mount);
@@ -2675,6 +2675,7 @@ struct lxc_conf *lxc_conf_init(void)
 	new->console.slave = -1;
 	new->console.name[0] = '\0';
 	new->maincmd_fd = -1;
+	new->nbd_idx = -1;
 	new->rootfs.mount = strdup(default_rootfs_mount);
 	if (!new->rootfs.mount) {
 		ERROR("lxc_conf_init : %m");
diff --git a/src/lxc/conf.h b/src/lxc/conf.h
index 865b87a..3a81d0e 100644
--- a/src/lxc/conf.h
+++ b/src/lxc/conf.h
@@ -334,6 +334,7 @@ struct lxc_conf {
 	int start_delay;
 	int start_order;
 	struct lxc_list groups;
+	int nbd_idx;
 };
 
 int run_lxc_hooks(const char *name, char *hook, struct lxc_conf *conf,
diff --git a/src/lxc/lxccontainer.c b/src/lxc/lxccontainer.c
index 255fde5..fdac433 100644
--- a/src/lxc/lxccontainer.c
+++ b/src/lxc/lxccontainer.c
@@ -82,13 +82,6 @@ return -1;
 
 lxc_log_define(lxc_container, lxc);
 
-static bool file_exists(const char *f)
-{
-	struct stat statbuf;
-
-	return stat(f, &statbuf) == 0;
-}
-
 static bool config_file_exists(const char *lxcpath, const char *cname)
 {
 	/* $lxcpath + '/' + $cname + '/config' + \0 */
@@ -900,7 +893,7 @@ static bool create_run_template(struct lxc_container *c, char *tpath, bool quiet
 		if (strncmp(src, "aufs:", 5) == 0)
 			src = overlay_getlower(src+5);
 
-		bdev = bdev_init(src, c->lxc_conf->rootfs.mount, NULL);
+		bdev = bdev_init(c->lxc_conf, src, c->lxc_conf->rootfs.mount, NULL);
 		if (!bdev) {
 			ERROR("Error opening rootfs");
 			exit(1);
@@ -1992,7 +1985,7 @@ static int do_bdev_destroy(struct lxc_conf *conf)
 	struct bdev *r;
 	int ret = 0;
 
-	r = bdev_init(conf->rootfs.path, conf->rootfs.mount, NULL);
+	r = bdev_init(conf, conf->rootfs.path, conf->rootfs.mount, NULL);
 	if (!r)
 		return -1;
 
@@ -2522,7 +2515,7 @@ static int clone_update_rootfs(struct clone_update_data *data)
 
 	if (unshare(CLONE_NEWNS) < 0)
 		return -1;
-	bdev = bdev_init(c->lxc_conf->rootfs.path, c->lxc_conf->rootfs.mount, NULL);
+	bdev = bdev_init(c->lxc_conf, c->lxc_conf->rootfs.path, c->lxc_conf->rootfs.mount, NULL);
 	if (!bdev)
 		return -1;
 	if (strcmp(bdev->type, "dir") != 0) {
@@ -2787,7 +2780,7 @@ static bool lxcapi_rename(struct lxc_container *c, const char *newname)
 	if (!c || !c->name || !c->config_path || !c->lxc_conf)
 		return false;
 
-	bdev = bdev_init(c->lxc_conf->rootfs.path, c->lxc_conf->rootfs.mount, NULL);
+	bdev = bdev_init(c->lxc_conf, c->lxc_conf->rootfs.path, c->lxc_conf->rootfs.mount, NULL);
 	if (!bdev) {
 		ERROR("Failed to find original backing store type");
 		return false;
@@ -2880,7 +2873,7 @@ static int lxcapi_snapshot(struct lxc_container *c, const char *commentfile)
 	 */
 	flags = LXC_CLONE_SNAPSHOT | LXC_CLONE_KEEPMACADDR | LXC_CLONE_KEEPNAME |
 		LXC_CLONE_KEEPBDEVTYPE | LXC_CLONE_MAYBE_SNAPSHOT;
-	if (bdev_is_dir(c->lxc_conf->rootfs.path)) {
+	if (bdev_is_dir(c->lxc_conf, c->lxc_conf->rootfs.path)) {
 		ERROR("Snapshot of directory-backed container requested.");
 		ERROR("Making a copy-clone.  If you do want snapshots, then");
 		ERROR("please create an aufs or overlayfs clone first, snapshot that");
@@ -3082,7 +3075,7 @@ static bool lxcapi_snapshot_restore(struct lxc_container *c, const char *snapnam
 	if (!c || !c->name || !c->config_path)
 		return false;
 
-	bdev = bdev_init(c->lxc_conf->rootfs.path, c->lxc_conf->rootfs.mount, NULL);
+	bdev = bdev_init(c->lxc_conf, c->lxc_conf->rootfs.path, c->lxc_conf->rootfs.mount, NULL);
 	if (!bdev) {
 		ERROR("Failed to find original backing store type");
 		return false;
diff --git a/src/lxc/start.c b/src/lxc/start.c
index df1304a..a7fb1d3 100644
--- a/src/lxc/start.c
+++ b/src/lxc/start.c
@@ -69,6 +69,7 @@
 #include "namespace.h"
 #include "lxcseccomp.h"
 #include "caps.h"
+#include "bdev.h"
 #include "lsm/lsm.h"
 
 lxc_log_define(lxc_start, lxc);
@@ -1054,10 +1055,15 @@ int __lxc_start(const char *name, struct lxc_conf *conf,
 		handler->conf->need_utmp_watch = 0;
 	}
 
+	if (!attach_block_device(handler->conf)) {
+		ERROR("Failure attaching block device");
+		goto out_fini_nonet;
+	}
+
 	err = lxc_spawn(handler);
 	if (err) {
 		ERROR("failed to spawn '%s'", name);
-		goto out_fini_nonet;
+		goto out_detach_blockdev;
 	}
 
 	netnsfd = get_netns_fd(handler->pid);
@@ -1110,6 +1116,9 @@ int __lxc_start(const char *name, struct lxc_conf *conf,
 out_fini:
 	lxc_delete_network(handler);
 
+out_detach_blockdev:
+	detach_block_device(handler->conf);
+
 out_fini_nonet:
 	lxc_fini(name, handler);
 	return err;
diff --git a/src/lxc/utils.c b/src/lxc/utils.c
index efec414..b076ce7 100644
--- a/src/lxc/utils.c
+++ b/src/lxc/utils.c
@@ -1306,3 +1306,10 @@ next_loop:
 	free(path);
 	return NULL;
 }
+
+bool file_exists(const char *f)
+{
+	struct stat statbuf;
+
+	return stat(f, &statbuf) == 0;
+}
diff --git a/src/lxc/utils.h b/src/lxc/utils.h
index b5e054c..9c618b7 100644
--- a/src/lxc/utils.h
+++ b/src/lxc/utils.h
@@ -280,3 +280,4 @@ uint64_t fnv_64a_buf(void *buf, size_t len, uint64_t hval);
 int detect_shared_rootfs(void);
 int detect_ramfs_rootfs(void);
 char *on_path(char *cmd);
+bool file_exists(const char *f);
-- 
1.9.1



More information about the lxc-devel mailing list