[lxc-devel] [PATCH 1/2] c/r: rework external mountpoint handling v3
Tycho Andersen
tycho.andersen at canonical.com
Wed Apr 15 16:12:41 UTC 2015
On Wed, Apr 15, 2015 at 03:57:32PM +0000, Serge Hallyn wrote:
> Quoting Tycho Andersen (tycho.andersen at canonical.com):
> > CRIU now supports autodetection of external mounts via the --ext-mount-map auto
> > --enable-external-sharing --enable-external-masters options, so we don't need
> > to explicitly pass the cgmanager mount or any of the mounts from the config.
> > This also means that lxcfs mounts (since they are bind mounts from outside the
> > container) are autodetected, meaning that c/r of containers using lxcfs works.
> >
> > A further advantage of this patch is that it addresses some of the ugliness
> > that was in the exec_criu() function. There are other criu options that will
> > allow us to trim this even further, though.
> >
> > Finally, with --enable-external-masters, criu understands slave mounts in the
> > container with shared mounts in the peer group that are outside the namespace.
> > This allows containers on a systemd host to be dumped and restored correctly.
> >
> > However, these options have just landed in criu trunk today, and the next
> > tagged release will be 1.6 on June 1, so we should avoid merging this into any
> > stable releases until then.
> >
> > v2: remount / as private before bind mounting the container's directory for
> > criu. The problem here is that if / is mounted as shared, even if we
> > unshare() the /var/lib/lxc/rootfs mountpoint propagates outside of our
> > mount namespace, which is bad, since we don't want to leak mounts. In
> > particular, this leak confuses criu the second time it goes to checkpoint
> > the container.
> >
> > v3: whoops, we really want / as MS_SLAVE | MS_REC here, to match what start
> > does
>
> Can you justify using MS_SLAVE? (I'm not opposed, just wondering whether
> there are specific reasons)
Because that's how setup_rootfs() does it for a regular start, so I
wanted to match that on restore:
https://github.com/lxc/lxc/blob/master/src/lxc/conf.c#L1256
(which also incedentally doesn't use MS_REMOUNT :)
> We probably do want MS_SLAVE so that any umount -l from the host is less
> likely to leave a busy mount in the container pinning a device on the host.
Yep, agreed, either private or slave makes sense, and I figured it's
probably best to have start/restore match.
> re MS_REC I was thinking you were doing this bc / was going to be bind-mounted
> into the container, but that's not the case, is it? If you're doing this
> so we can mount to our heart's content later on without worrying about the
> host, then MS_REC is needed.
Right, it's the latter. We don't want to pollute the host's mount ns
just because of sharing.
Also, I see you acked v2, if the above is ok can you ack this one
instead? :)
Tycho
> > Signed-off-by: Tycho Andersen <tycho.andersen at canonical.com>
> > ---
> > src/lxc/lxccontainer.c | 90 +++++++++++---------------------------------------
> > 1 file changed, 20 insertions(+), 70 deletions(-)
> >
> > diff --git a/src/lxc/lxccontainer.c b/src/lxc/lxccontainer.c
> > index 3c3ff33..db11947 100644
> > --- a/src/lxc/lxccontainer.c
> > +++ b/src/lxc/lxccontainer.c
> > @@ -3708,18 +3708,18 @@ struct criu_opts {
> > static void exec_criu(struct criu_opts *opts)
> > {
> > char **argv, log[PATH_MAX];
> > - int static_args = 14, argc = 0, i, ret;
> > + int static_args = 18, argc = 0, i, ret;
> > int netnr = 0;
> > struct lxc_list *it;
> >
> > - struct mntent mntent;
> > char buf[4096];
> > FILE *mnts = NULL;
> >
> > /* The command line always looks like:
> > * criu $(action) --tcp-established --file-locks --link-remap --force-irmap \
> > * --manage-cgroups action-script foo.sh -D $(directory) \
> > - * -o $(directory)/$(action).log
> > + * -o $(directory)/$(action).log --ext-mount-map auto
> > + * --enable-external-sharing --enable-external-masters
> > * +1 for final NULL */
> >
> > if (strcmp(opts->action, "dump") == 0) {
> > @@ -3746,11 +3746,6 @@ static void exec_criu(struct criu_opts *opts)
> > return;
> > }
> >
> > - // We need to tell criu where cgmanager's socket is bind mounted from
> > - // if it exists since it's external.
> > - if (cgroup_driver() == CGMANAGER)
> > - static_args+=2;
> > -
> > argv = malloc(static_args * sizeof(*argv));
> > if (!argv)
> > return;
> > @@ -3780,6 +3775,10 @@ static void exec_criu(struct criu_opts *opts)
> > DECLARE_ARG("--link-remap");
> > DECLARE_ARG("--force-irmap");
> > DECLARE_ARG("--manage-cgroups");
> > + DECLARE_ARG("--ext-mount-map");
> > + DECLARE_ARG("auto");
> > + DECLARE_ARG("--enable-external-sharing");
> > + DECLARE_ARG("--enable-external-masters");
> > DECLARE_ARG("--action-script");
> > DECLARE_ARG(DATADIR "/lxc/lxc-restore-net");
> > DECLARE_ARG("-D");
> > @@ -3790,35 +3789,9 @@ static void exec_criu(struct criu_opts *opts)
> > if (opts->verbose)
> > DECLARE_ARG("-vvvvvv");
> >
> > - /*
> > - * Note: this macro is not intended to be called unless argc is equal
> > - * to the length of the array; there is nothing that keeps track of the
> > - * length of the array besides the location in the code that this is
> > - * called. (Yes this is bad, and we should fix it.)
> > - */
> > -#define RESIZE_ARGS(additional) \
> > - do { \
> > - void *m; \
> > - if (additional < 0) { \
> > - ERROR("resizing by negative amount"); \
> > - goto err; \
> > - } else if (additional == 0) \
> > - continue; \
> > - \
> > - m = realloc(argv, (argc + additional + 1) * sizeof(*argv)); \
> > - if (!m) \
> > - goto err; \
> > - argv = m; \
> > - } while (0)
> > -
> > if (strcmp(opts->action, "dump") == 0) {
> > char pid[32];
> >
> > - if (cgroup_driver() == CGMANAGER) {
> > - DECLARE_ARG("--ext-mount-map");
> > - DECLARE_ARG("/sys/fs/cgroup/cgmanager:cgmanager");
> > - }
> > -
> > if (sprintf(pid, "%d", lxcapi_init_pid(opts->c)) < 0)
> > goto err;
> >
> > @@ -3827,11 +3800,8 @@ static void exec_criu(struct criu_opts *opts)
> > if (!opts->stop)
> > DECLARE_ARG("--leave-running");
> > } else if (strcmp(opts->action, "restore") == 0) {
> > -
> > - if (cgroup_driver() == CGMANAGER) {
> > - DECLARE_ARG("--ext-mount-map");
> > - DECLARE_ARG("cgmanager:/sys/fs/cgroup/cgmanager");
> > - }
> > + void *m;
> > + int additional;
> >
> > DECLARE_ARG("--root");
> > DECLARE_ARG(opts->c->lxc_conf->rootfs.mount);
> > @@ -3842,7 +3812,12 @@ static void exec_criu(struct criu_opts *opts)
> > DECLARE_ARG("--cgroup-root");
> > DECLARE_ARG(opts->cgroup_path);
> >
> > - RESIZE_ARGS(lxc_list_len(&opts->c->lxc_conf->network) * 2);
> > + additional = lxc_list_len(&opts->c->lxc_conf->network) * 2;
> > +
> > + m = realloc(argv, (argc + additional + 1) * sizeof(*argv)); \
> > + if (!m) \
> > + goto err; \
> > + argv = m;
> >
> > lxc_list_for_each(it, &opts->c->lxc_conf->network) {
> > char eth[128], *veth;
> > @@ -3864,37 +3839,8 @@ static void exec_criu(struct criu_opts *opts)
> > DECLARE_ARG("--veth-pair");
> > DECLARE_ARG(buf);
> > }
> > - }
> > -
> > - // CRIU wants to know about any external bind mounts the
> > - // container has.
> > - mnts = write_mount_file(&opts->c->lxc_conf->mount_list);
> > - if (!mnts)
> > - goto err;
> > -
> > - RESIZE_ARGS(lxc_list_len(&opts->c->lxc_conf->mount_list) * 2);
> > -
> > - while (getmntent_r(mnts, &mntent, buf, sizeof(buf))) {
> > - char arg[2048], *key, *val;
> > - int ret;
> > -
> > - if (strcmp(opts->action, "dump") == 0) {
> > - key = mntent.mnt_fsname;
> > - val = mntent.mnt_dir;
> > - } else {
> > - key = mntent.mnt_dir;
> > - val = mntent.mnt_fsname;
> > - }
> > -
> > - ret = snprintf(arg, sizeof(arg), "%s:%s", key, val);
> > - if (ret < 0 || ret >= sizeof(arg)) {
> > - goto err;
> > - }
> >
> > - DECLARE_ARG("--ext-mount-map");
> > - DECLARE_ARG(arg);
> > }
> > - fclose(mnts);
> >
> > argv[argc] = NULL;
> >
> > @@ -3932,7 +3878,6 @@ static void exec_criu(struct criu_opts *opts)
> > }
> >
> > #undef DECLARE_ARG
> > -#undef RESIZE_ARGS
> > execv(argv[0], argv);
> > err:
> > if (mnts)
> > @@ -4176,6 +4121,11 @@ static void do_restore(struct lxc_container *c, int pipe, char *directory, bool
> > if (mkdir(rootfs->mount, 0755) < 0 && errno != EEXIST)
> > goto out_fini_handler;
> >
> > + if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) < 0) {
> > + SYSERROR("remount / to private failed");
> > + goto out_fini_handler;
> > + }
> > +
> > if (mount(rootfs->path, rootfs->mount, NULL, MS_BIND, NULL) < 0) {
> > rmdir(rootfs->mount);
> > goto out_fini_handler;
> > --
> > 2.1.4
> >
> > _______________________________________________
> > lxc-devel mailing list
> > lxc-devel at lists.linuxcontainers.org
> > http://lists.linuxcontainers.org/listinfo/lxc-devel
> _______________________________________________
> lxc-devel mailing list
> lxc-devel at lists.linuxcontainers.org
> http://lists.linuxcontainers.org/listinfo/lxc-devel
More information about the lxc-devel
mailing list