[lxc-devel] [PATCH v2] add support for qcow2
Dwight Engen
dwight.engen at oracle.com
Wed May 14 16:36:21 UTC 2014
On Mon, 12 May 2014 18:02:28 +0000
Serge Hallyn <serge.hallyn at ubuntu.com> wrote:
> qcow2 backing stores can be attached to a nbd block device using
> qemu-nbd. This user-space process (pair) stays around for the
> duration of the device attachment. Obviously we want it to go
> away when the container shuts down, but not before the filesystems
> have been cleanly unmounted.
>
> The device attachment is done from the task which will become the
> container monitor before the container setup+init task is spawned.
> That task starts in a new pid namespace to ensure that the qemu-nbd
> process will be killed if need be. It sets its parent death signal
> to sighup, and, on receiving sighup, attempts to do a clean
> qemu-device detach, then exits. This should ensure that the
> device is detached if the container monitor crashes or exits.
>
> It may be worth adding a delay before the qemu-nbd is detached, but
> my brief tests haven't seen any data corruption.
>
> Only the parts required for running a qcow2-backed container are
> implemented here. Create, destroy, and clone are not. The first
> use of this that I imagine is for people to use downloaded
> qcow2-backed images (like ubuntu cloud images, or anything previously
> used with qemu). I imagine people will want to create/clone/destroy
> out of band using qemu-img, but if I'm wrong about that we can
> implement the rest later.
>
> Because attach_block_device() is done before the bdev is initialized,
> and bdev_init needs to know the nbd index so that it can mount the
> filesystem, we now need to pass the lxc_conf.
>
> file_exists() is moved to utils.c so we can use it from bdev.c
>
> The nbd attach/detach should lay the groundwork for trivial
> implementation of qed and raw images.
>
> changelog (may 12): qcow: fix idx check at detach
Hey Serge, I had to check the code to see how to use this, so maybe we
should document somewhere what the rootfs line needs to look like (i.e.,
lxc.rootfs = qcow2:/path/to/diskimg:1).
Also, I used this against a .vdi image just fine, so maybe we should be
more generic than just qcow2 and call it qemu? Not sure if qemu-nbd
supports all the same image formats as qemu-img.
For anyone else trying to use this, note that I had to run
'modprobe nbd max_part=16', or else I would only get the full device node
(/dev/nbd0) but not the partition nodes (/dev/nbd0p?), which caused lxc
to hang on startup.
At any rate, this works great so thanks and:
Acked-by: Dwight Engen <dwight.engen at oracle.com>
> Signed-off-by: Serge Hallyn <serge.hallyn at ubuntu.com>
> ---
> src/lxc/bdev.c | 296
> ++++++++++++++++++++++++++++++++++++++++++++++++-
> src/lxc/bdev.h | 18 ++- src/lxc/conf.c | 3 +-
> src/lxc/conf.h | 1 +
> src/lxc/lxccontainer.c | 19 +---
> src/lxc/start.c | 11 +-
> src/lxc/utils.c | 7 ++
> src/lxc/utils.h | 1 +
> 8 files changed, 333 insertions(+), 23 deletions(-)
>
> diff --git a/src/lxc/bdev.c b/src/lxc/bdev.c
> index 20e9fb3..85b95a7 100644
> --- a/src/lxc/bdev.c
> +++ b/src/lxc/bdev.c
> @@ -41,6 +41,7 @@
> #include <libgen.h>
> #include <linux/loop.h>
> #include <dirent.h>
> +#include <sys/prctl.h>
>
> #include "lxc.h"
> #include "config.h"
> @@ -2410,6 +2411,290 @@ static const struct bdev_ops aufs_ops = {
> .can_snapshot = true,
> };
>
> +//
> +// qcow2 dev ops
> +//
> +
> +static int qcow2_detect(const char *path)
> +{
> + if (strncmp(path, "qcow2:", 6) == 0)
> + return 1;
> + return 0;
> +}
> +
> +struct nbd_attach_data {
> + const char *nbd;
> + const char *path;
> +};
> +
> +static void nbd_detach(const char *path)
> +{
> + int ret;
> + pid_t pid = fork();
> +
> + if (pid < 0) {
> + SYSERROR("Error forking to detach nbd");
> + return;
> + }
> + if (pid) {
> + ret = wait_for_pid(pid);
> + if (ret < 0)
> + ERROR("nbd disconnect returned an error");
> + return;
> + }
> + execlp("qemu-nbd", "qemu-nbd", "-d", path, NULL);
> + SYSERROR("Error executing qemu-nbd");
> + exit(1);
> +}
> +
> +static int do_attach_nbd(void *d)
> +{
> + struct nbd_attach_data *data = d;
> + const char *nbd, *path;
> + pid_t pid;
> + sigset_t mask;
> + int sfd;
> + ssize_t s;
> + struct signalfd_siginfo fdsi;
> +
> + sigemptyset(&mask);
> + sigaddset(&mask, SIGHUP);
> + sigaddset(&mask, SIGCHLD);
> +
> + nbd = data->nbd;
> + path = data->path;
> +
> + if (sigprocmask(SIG_BLOCK, &mask, NULL) == -1) {
> + SYSERROR("Error blocking signals for nbd watcher");
> + exit(1);
> + }
> +
> + sfd = signalfd(-1, &mask, 0);
> + if (sfd == -1) {
> + SYSERROR("Error opening signalfd for nbd task");
> + exit(1);
> + }
> +
> + if (prctl(PR_SET_PDEATHSIG, SIGHUP, 0, 0, 0) < 0)
> + SYSERROR("Error setting parent death signal for nbd
> watcher"); +
> + pid = fork();
> + if (pid) {
> + for (;;) {
> + s = read(sfd, &fdsi, sizeof(struct
> signalfd_siginfo));
> + if (s != sizeof(struct signalfd_siginfo))
> + SYSERROR("Error reading from
> signalfd"); +
> + if (fdsi.ssi_signo == SIGHUP) {
> + /* container has exited */
> + nbd_detach(nbd);
> + exit(0);
> + } else if (fdsi.ssi_signo == SIGCHLD) {
> + int status;
> + while (waitpid(-1, &status, WNOHANG)
> > 0);
> + }
> + }
> + }
> +
> + close(sfd);
> + if (sigprocmask(SIG_UNBLOCK, &mask, NULL) == -1)
> + WARN("Warning: unblocking signals for nbd watcher");
> +
> + execlp("qemu-nbd", "qemu-nbd", "-c", nbd, path, NULL);
> + SYSERROR("Error executing qemu-nbd");
> + exit(1);
> +}
> +
> +static bool clone_attach_nbd(const char *nbd, const char *path)
> +{
> + pid_t pid;
> + struct nbd_attach_data data;
> +
> + data.nbd = nbd;
> + data.path = path;
> +
> + pid = lxc_clone(do_attach_nbd, &data, CLONE_NEWPID);
> + if (pid < 0)
> + return false;
> + return true;
> +}
> +
> +static bool nbd_busy(int idx)
> +{
> + char path[100];
> + int ret;
> +
> + ret = snprintf(path, 100, "/sys/block/nbd%d/pid", idx);
> + if (ret < 0 || ret >= 100)
> + return true;
> + return file_exists(path);
> +}
> +
> +static bool attach_nbd(char *src, struct lxc_conf *conf)
> +{
> + char *orig = alloca(strlen(src)+1), *p, path[50];
> + int i = 0;
> +
> + strcpy(orig, src);
> + /* if path is followed by a partition, drop that for now */
> + p = strchr(orig, ':');
> + if (p)
> + *p = '\0';
> + while (1) {
> + sprintf(path, "/dev/nbd%d", i);
> + if (!file_exists(path))
> + return false;
> + if (nbd_busy(i)) {
> + i++;
> + continue;
> + }
> + if (!clone_attach_nbd(path, orig))
> + return false;
> + conf->nbd_idx = i;
> + return true;
> + }
> +}
> +
> +static bool requires_nbd(const char *path)
> +{
> + if (strncmp(path, "qcow2:", 6) == 0)
> + return true;
> + if (strncmp(path, "raw:", 4) == 0)
> + return true;
> + return false;
> +}
> +
> +/*
> + * attach_block_device returns true if all went well,
> + * meaning either a block device was attached or was not
> + * needed. It returns false if something went wrong and
> + * container startup should be stopped.
> + */
> +bool attach_block_device(struct lxc_conf *conf)
> +{
> + char *path;
> +
> + if (!conf->rootfs.path)
> + return true;
> + path = conf->rootfs.path;
> + if (!requires_nbd(path))
> + return true;
> + path = strchr(path, ':');
> + if (!path)
> + return false;
> + path++;
> + if (!attach_nbd(path, conf))
> + return false;
> + return true;
> +}
> +
> +void detach_nbd_idx(int idx)
> +{
> + int ret;
> + char path[50];
> +
> + ret = snprintf(path, 50, "/dev/nbd%d", idx);
> + if (ret < 0 || ret >= 50)
> + return;
> +
> + nbd_detach(path);
> +}
> +
> +void detach_block_device(struct lxc_conf *conf)
> +{
> + if (conf->nbd_idx != -1)
> + detach_nbd_idx(conf->nbd_idx);
> +}
> +
> +/*
> + * Pick the partition # off the end of a qcow2:file:p
> + * description. Return 1-9 for the partition id, or 0
> + * for no partition.
> + */
> +static int qcow2_get_partition(const char *src)
> +{
> + char *p = strchr(src, ':');
> + if (!p)
> + return 0;
> + p = strchr(p+1, ':');
> + if (!p)
> + return 0;
> + p++;
> + if (*p < '1' && *p > '9')
> + return 0;
> + return *p - '0';
> +}
> +
> +static int qcow2_mount(struct bdev *bdev)
> +{
> + int ret = -1, partition;
> + char path[50];
> +
> + if (strcmp(bdev->type, "qcow2"))
> + return -22;
> + if (!bdev->src || !bdev->dest)
> + return -22;
> +
> + /* nbd_idx should have been copied by bdev_init from the
> lxc_conf */
> + if (bdev->nbd_idx < 0)
> + return -22;
> + partition = qcow2_get_partition(bdev->src);
> + if (partition)
> + ret = snprintf(path, 50, "/dev/nbd%dp%d",
> bdev->nbd_idx,
> + partition);
> + else
> + ret = snprintf(path, 50, "/dev/nbd%d",
> bdev->nbd_idx);
> + if (ret < 0 || ret >= 50) {
> + ERROR("Error setting up nbd device path");
> + return ret;
> + }
> + ret = mount_unknown_fs(path, bdev->dest, bdev->mntopts);
> + if (ret < 0)
> + ERROR("Error mounting %s", bdev->src);
> +
> + return ret;
> +}
> +
> +static int qcow2_create(struct bdev *bdev, const char *dest, const
> char *n,
> + struct bdev_specs *specs)
> +{
> + return -ENOSYS;
> +}
> +
> +/* qcow2 should shine here... */
> +static int qcow2_clonepaths(struct bdev *orig, struct bdev *new,
> const char *oldname,
> + const char *cname, const char *oldpath, const char
> *lxcpath, int snap,
> + uint64_t newsize, struct lxc_conf *conf)
> +{
> + return -ENOSYS;
> +}
> +
> +static int qcow2_destroy(struct bdev *orig)
> +{
> + return -ENOSYS;
> +}
> +
> +static int qcow2_umount(struct bdev *bdev)
> +{
> + int ret;
> +
> + if (strcmp(bdev->type, "qcow2"))
> + return -22;
> + if (!bdev->src || !bdev->dest)
> + return -22;
> + ret = umount(bdev->dest);
> + return ret;
> +}
> +
> +static const struct bdev_ops qcow2_ops = {
> + .detect = &qcow2_detect,
> + .mount = &qcow2_mount,
> + .umount = &qcow2_umount,
> + .clone_paths = &qcow2_clonepaths,
> + .destroy = &qcow2_destroy,
> + .create = &qcow2_create,
> + .can_snapshot = true,
> +};
>
> static const struct bdev_type bdevs[] = {
> {.name = "zfs", .ops = &zfs_ops,},
> @@ -2419,6 +2704,7 @@ static const struct bdev_type bdevs[] = {
> {.name = "aufs", .ops = &aufs_ops,},
> {.name = "overlayfs", .ops = &overlayfs_ops,},
> {.name = "loop", .ops = &loop_ops,},
> + {.name = "qcow2", .ops = &qcow2_ops,},
> };
>
> static const size_t numbdevs = sizeof(bdevs) / sizeof(struct
> bdev_type); @@ -2454,7 +2740,7 @@ struct bdev *bdev_get(const char
> *type) return bdev;
> }
>
> -struct bdev *bdev_init(const char *src, const char *dst, const char
> *mntopts) +struct bdev *bdev_init(struct lxc_conf *conf, const char
> *src, const char *dst, const char *mntopts) {
> int i;
> struct bdev *bdev;
> @@ -2480,6 +2766,8 @@ struct bdev *bdev_init(const char *src, const
> char *dst, const char *mntopts) bdev->src = strdup(src);
> if (dst)
> bdev->dest = strdup(dst);
> + if (strcmp(bdev->type, "qcow2") == 0 || strcmp(bdev->type,
> "raw") == 0)
> + bdev->nbd_idx = conf->nbd_idx;
>
> return bdev;
> }
> @@ -2538,9 +2826,9 @@ static int rsync_rootfs_wrapper(void *data)
> return rsync_rootfs(arg);
> }
>
> -bool bdev_is_dir(const char *path)
> +bool bdev_is_dir(struct lxc_conf *conf, const char *path)
> {
> - struct bdev *orig = bdev_init(path, NULL, NULL);
> + struct bdev *orig = bdev_init(conf, path, NULL, NULL);
> bool ret = false;
> if (!orig)
> return ret;
> @@ -2605,7 +2893,7 @@ struct bdev *bdev_copy(struct lxc_container
> *c0, const char *cname, return NULL;
> }
>
> - orig = bdev_init(src, NULL, NULL);
> + orig = bdev_init(c0->lxc_conf, src, NULL, NULL);
> if (!orig) {
> ERROR("failed to detect blockdev type for %s", src);
> return NULL;
> diff --git a/src/lxc/bdev.h b/src/lxc/bdev.h
> index cc0bf02..170a530 100644
> --- a/src/lxc/bdev.h
> +++ b/src/lxc/bdev.h
> @@ -24,8 +24,8 @@
> #ifndef __LXC_BDEV_H
> #define __LXC_BDEV_H
> /* blockdev operations for:
> - * aufs, dir, raw, btrfs, overlayfs, aufs, lvm, loop, zfs
> - * someday: qemu-nbd, qcow2, qed
> + * aufs, dir, raw, btrfs, overlayfs, aufs, lvm, loop, zfs, qcow2
> + * someday: qed
> */
>
> #include "config.h"
> @@ -83,11 +83,13 @@ struct bdev {
> // turn the following into a union if need be
> // lofd is the open fd for the mounted loopback file
> int lofd;
> + // index for the connected nbd device
> + int nbd_idx;
> };
>
> char *overlay_getlower(char *p);
>
> -bool bdev_is_dir(const char *path);
> +bool bdev_is_dir(struct lxc_conf *conf, const char *path);
>
> /*
> * Instantiate a bdev object. The src is used to determine which
> blockdev @@ -100,7 +102,8 @@ bool bdev_is_dir(const char *path);
> * use /var/lib/lxc/canonical/rootfs as lower dir,
> and /var/lib/lxc/c1/delta
> * as the upper, writeable layer.
> */
> -struct bdev *bdev_init(const char *src, const char *dst, const char
> *data); +struct bdev *bdev_init(struct lxc_conf *conf, const char
> *src, const char *dst,
> + const char *data);
>
> struct bdev *bdev_copy(struct lxc_container *c0, const char *cname,
> const char *lxcpath, const char *bdevtype,
> @@ -110,6 +113,13 @@ struct bdev *bdev_create(const char *dest, const
> char *type, const char *cname, struct bdev_specs *specs);
> void bdev_put(struct bdev *bdev);
>
> +/*
> + * these are really for qemu-nbd support, as container shutdown
> + * must explicitly request device detach.
> + */
> +bool attach_block_device(struct lxc_conf *conf);
> +void detach_block_device(struct lxc_conf *conf);
> +
> /* define constants if the kernel/glibc headers don't define them */
> #ifndef MS_DIRSYNC
> #define MS_DIRSYNC 128
> diff --git a/src/lxc/conf.c b/src/lxc/conf.c
> index 78d9de2..7427a94 100644
> --- a/src/lxc/conf.c
> +++ b/src/lxc/conf.c
> @@ -1555,7 +1555,7 @@ static int setup_rootfs(struct lxc_conf *conf)
> }
>
> // First try mounting rootfs using a bdev
> - struct bdev *bdev = bdev_init(rootfs->path, rootfs->mount,
> rootfs->options);
> + struct bdev *bdev = bdev_init(conf, rootfs->path,
> rootfs->mount, rootfs->options); if (bdev && bdev->ops->mount(bdev)
> == 0) { bdev_put(bdev);
> DEBUG("mounted '%s' on '%s'", rootfs->path,
> rootfs->mount); @@ -2675,6 +2675,7 @@ struct lxc_conf
> *lxc_conf_init(void) new->console.slave = -1;
> new->console.name[0] = '\0';
> new->maincmd_fd = -1;
> + new->nbd_idx = -1;
> new->rootfs.mount = strdup(default_rootfs_mount);
> if (!new->rootfs.mount) {
> ERROR("lxc_conf_init : %m");
> diff --git a/src/lxc/conf.h b/src/lxc/conf.h
> index 865b87a..3a81d0e 100644
> --- a/src/lxc/conf.h
> +++ b/src/lxc/conf.h
> @@ -334,6 +334,7 @@ struct lxc_conf {
> int start_delay;
> int start_order;
> struct lxc_list groups;
> + int nbd_idx;
> };
>
> int run_lxc_hooks(const char *name, char *hook, struct lxc_conf
> *conf, diff --git a/src/lxc/lxccontainer.c b/src/lxc/lxccontainer.c
> index 255fde5..fdac433 100644
> --- a/src/lxc/lxccontainer.c
> +++ b/src/lxc/lxccontainer.c
> @@ -82,13 +82,6 @@ return -1;
>
> lxc_log_define(lxc_container, lxc);
>
> -static bool file_exists(const char *f)
> -{
> - struct stat statbuf;
> -
> - return stat(f, &statbuf) == 0;
> -}
> -
> static bool config_file_exists(const char *lxcpath, const char
> *cname) {
> /* $lxcpath + '/' + $cname + '/config' + \0 */
> @@ -900,7 +893,7 @@ static bool create_run_template(struct
> lxc_container *c, char *tpath, bool quiet if (strncmp(src, "aufs:",
> 5) == 0) src = overlay_getlower(src+5);
>
> - bdev = bdev_init(src, c->lxc_conf->rootfs.mount,
> NULL);
> + bdev = bdev_init(c->lxc_conf, src,
> c->lxc_conf->rootfs.mount, NULL); if (!bdev) {
> ERROR("Error opening rootfs");
> exit(1);
> @@ -1992,7 +1985,7 @@ static int do_bdev_destroy(struct lxc_conf
> *conf) struct bdev *r;
> int ret = 0;
>
> - r = bdev_init(conf->rootfs.path, conf->rootfs.mount, NULL);
> + r = bdev_init(conf, conf->rootfs.path, conf->rootfs.mount,
> NULL); if (!r)
> return -1;
>
> @@ -2522,7 +2515,7 @@ static int clone_update_rootfs(struct
> clone_update_data *data)
> if (unshare(CLONE_NEWNS) < 0)
> return -1;
> - bdev = bdev_init(c->lxc_conf->rootfs.path,
> c->lxc_conf->rootfs.mount, NULL);
> + bdev = bdev_init(c->lxc_conf, c->lxc_conf->rootfs.path,
> c->lxc_conf->rootfs.mount, NULL); if (!bdev)
> return -1;
> if (strcmp(bdev->type, "dir") != 0) {
> @@ -2787,7 +2780,7 @@ static bool lxcapi_rename(struct lxc_container
> *c, const char *newname) if (!c || !c->name || !c->config_path
> || !c->lxc_conf) return false;
>
> - bdev = bdev_init(c->lxc_conf->rootfs.path,
> c->lxc_conf->rootfs.mount, NULL);
> + bdev = bdev_init(c->lxc_conf, c->lxc_conf->rootfs.path,
> c->lxc_conf->rootfs.mount, NULL); if (!bdev) {
> ERROR("Failed to find original backing store type");
> return false;
> @@ -2880,7 +2873,7 @@ static int lxcapi_snapshot(struct lxc_container
> *c, const char *commentfile) */
> flags = LXC_CLONE_SNAPSHOT | LXC_CLONE_KEEPMACADDR |
> LXC_CLONE_KEEPNAME | LXC_CLONE_KEEPBDEVTYPE |
> LXC_CLONE_MAYBE_SNAPSHOT;
> - if (bdev_is_dir(c->lxc_conf->rootfs.path)) {
> + if (bdev_is_dir(c->lxc_conf, c->lxc_conf->rootfs.path)) {
> ERROR("Snapshot of directory-backed container
> requested."); ERROR("Making a copy-clone. If you do want snapshots,
> then"); ERROR("please create an aufs or overlayfs clone first,
> snapshot that"); @@ -3082,7 +3075,7 @@ static bool
> lxcapi_snapshot_restore(struct lxc_container *c, const char *snapnam
> if (!c || !c->name || !c->config_path) return false;
>
> - bdev = bdev_init(c->lxc_conf->rootfs.path,
> c->lxc_conf->rootfs.mount, NULL);
> + bdev = bdev_init(c->lxc_conf, c->lxc_conf->rootfs.path,
> c->lxc_conf->rootfs.mount, NULL); if (!bdev) {
> ERROR("Failed to find original backing store type");
> return false;
> diff --git a/src/lxc/start.c b/src/lxc/start.c
> index df1304a..a7fb1d3 100644
> --- a/src/lxc/start.c
> +++ b/src/lxc/start.c
> @@ -69,6 +69,7 @@
> #include "namespace.h"
> #include "lxcseccomp.h"
> #include "caps.h"
> +#include "bdev.h"
> #include "lsm/lsm.h"
>
> lxc_log_define(lxc_start, lxc);
> @@ -1054,10 +1055,15 @@ int __lxc_start(const char *name, struct
> lxc_conf *conf, handler->conf->need_utmp_watch = 0;
> }
>
> + if (!attach_block_device(handler->conf)) {
> + ERROR("Failure attaching block device");
> + goto out_fini_nonet;
> + }
> +
> err = lxc_spawn(handler);
> if (err) {
> ERROR("failed to spawn '%s'", name);
> - goto out_fini_nonet;
> + goto out_detach_blockdev;
> }
>
> netnsfd = get_netns_fd(handler->pid);
> @@ -1110,6 +1116,9 @@ int __lxc_start(const char *name, struct
> lxc_conf *conf, out_fini:
> lxc_delete_network(handler);
>
> +out_detach_blockdev:
> + detach_block_device(handler->conf);
> +
> out_fini_nonet:
> lxc_fini(name, handler);
> return err;
> diff --git a/src/lxc/utils.c b/src/lxc/utils.c
> index efec414..b076ce7 100644
> --- a/src/lxc/utils.c
> +++ b/src/lxc/utils.c
> @@ -1306,3 +1306,10 @@ next_loop:
> free(path);
> return NULL;
> }
> +
> +bool file_exists(const char *f)
> +{
> + struct stat statbuf;
> +
> + return stat(f, &statbuf) == 0;
> +}
> diff --git a/src/lxc/utils.h b/src/lxc/utils.h
> index b5e054c..9c618b7 100644
> --- a/src/lxc/utils.h
> +++ b/src/lxc/utils.h
> @@ -280,3 +280,4 @@ uint64_t fnv_64a_buf(void *buf, size_t len,
> uint64_t hval); int detect_shared_rootfs(void);
> int detect_ramfs_rootfs(void);
> char *on_path(char *cmd);
> +bool file_exists(const char *f);
More information about the lxc-devel
mailing list