[lxc-devel] [PATCH] Add support for checkpoint and restore via CRIU

Serge Hallyn serge.hallyn at ubuntu.com
Fri Aug 22 00:09:31 UTC 2014


Quoting Tycho Andersen (tycho.andersen at canonical.com):
> Hi Stéphane,
> 
> On Wed, Aug 20, 2014 at 05:19:14PM -0500, Stéphane Graber wrote:
> > On Wed, Aug 20, 2014 at 02:31:05PM -0500, Tycho Andersen wrote:
> > > Hi Stéphane,
> > > 
> > > On Wed, Aug 20, 2014 at 10:34:55AM -0500, Tycho Andersen wrote:
> > > >
> > > > Sounds good, I will make the changes.
> > > 
> > > Below is a revised version of the patch.
> > 
> > Some more comments below.
> 
> Here is an updated version with the comments addressed.

Thanks, Tycho.  I love how little was really needed in order
to make the monitor work.  A few questions/comments below, sorry.
But after that I think I'm done.

> Tycho
> 
> 
> 
> This patch adds support for checkpointing and restoring containers via CRIU.
> It adds two api calls, ->checkpoint and ->restore, which are wrappers around
> the CRIU CLI. CRIU has an RPC API, but reasons for preferring exec() are
> discussed in [1].
> 
> To checkpoint, users specify a directory into which the container metadata
> (CRIU dump files, plus some additional information about veth pairs and which
> bridges they are attached to) is dumped. On restore, this
> information is read out of the directory, a CRIU command line is constructed,
> and CRIU is exec()d. CRIU uses the lxc-restore-net callback (which in turn
> inspects the image directory with the NIC data) to properly restore the
> network.
> 
> This will only work with the current git master of CRIU; anything as of
> a152c843 should work. There is a known bug where containers which have been
> restored cannot be checkpointed [2].
> 
> [1]: http://lists.openvz.org/pipermail/criu/2014-July/015117.html
> [2]: http://lists.openvz.org/pipermail/criu/2014-August/015876.html
> 
> Signed-off-by: Tycho Andersen <tycho.andersen at canonical.com>
> ---
>  .gitignore                 |   1 +
>  configure.ac               |   1 +
>  doc/Makefile.am            |   1 +
>  doc/lxc-checkpoint.sgml.in | 194 +++++++++++++++++++
>  src/lxc/Makefile.am        |   4 +
>  src/lxc/lxc-restore-net    |  27 +++
>  src/lxc/lxc_checkpoint.c   | 201 ++++++++++++++++++++
>  src/lxc/lxccontainer.c     | 455 +++++++++++++++++++++++++++++++++++++++++++++
>  src/lxc/lxccontainer.h     |  25 +++
>  src/lxc/start.c            |   6 +-
>  src/lxc/start.h            |   4 +
>  src/lxc/utils.c            |  14 ++
>  src/lxc/utils.h            |   1 +
>  13 files changed, 931 insertions(+), 3 deletions(-)
>  create mode 100644 doc/lxc-checkpoint.sgml.in
>  create mode 100755 src/lxc/lxc-restore-net
>  create mode 100644 src/lxc/lxc_checkpoint.c
> 
> diff --git a/.gitignore b/.gitignore
> index e6de18f..0b6ec69 100644
> --- a/.gitignore
> +++ b/.gitignore
> @@ -49,6 +49,7 @@ src/lxc/lxc-attach
>  src/lxc/lxc-autostart
>  src/lxc/lxc-cgroup
>  src/lxc/lxc-checkconfig
> +src/lxc/lxc-checkpoint
>  src/lxc/lxc-clone
>  src/lxc/lxc-console
>  src/lxc/lxc-config
> diff --git a/configure.ac b/configure.ac
> index 462217e..2c1a90c 100644
> --- a/configure.ac
> +++ b/configure.ac
> @@ -652,6 +652,7 @@ AC_CONFIG_FILES([
>  	doc/lxc-autostart.sgml
>  	doc/lxc-cgroup.sgml
>  	doc/lxc-checkconfig.sgml
> +	doc/lxc-checkpoint.sgml
>  	doc/lxc-clone.sgml
>  	doc/lxc-config.sgml
>  	doc/lxc-console.sgml
> diff --git a/doc/Makefile.am b/doc/Makefile.am
> index bfe887e..767ee38 100644
> --- a/doc/Makefile.am
> +++ b/doc/Makefile.am
> @@ -20,6 +20,7 @@ man_MANS = \
>  	lxc-autostart.1 \
>  	lxc-cgroup.1 \
>  	lxc-checkconfig.1 \
> +	lxc-checkpoint.1 \
>  	lxc-clone.1 \
>  	lxc-config.1 \
>  	lxc-console.1 \
> diff --git a/doc/lxc-checkpoint.sgml.in b/doc/lxc-checkpoint.sgml.in
> new file mode 100644
> index 0000000..a8766c5
> --- /dev/null
> +++ b/doc/lxc-checkpoint.sgml.in
> @@ -0,0 +1,194 @@
> +<!--
> +
> +(C) Copyright Canonical Ltd. 2014
> +
> +Authors:
> +Tycho Andersen <tycho.andersen at canonical.com>
> +
> +This library is free software; you can redistribute it and/or
> +modify it under the terms of the GNU Lesser General Public
> +License as published by the Free Software Foundation; either
> +version 2.1 of the License, or (at your option) any later version.
> +
> +This library is distributed in the hope that it will be useful,
> +but WITHOUT ANY WARRANTY; without even the implied warranty of
> +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +Lesser General Public License for more details.
> +
> +You should have received a copy of the GNU Lesser General Public
> +License along with this library; if not, write to the Free Software
> +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> +
> +-->
> +
> +<!DOCTYPE refentry PUBLIC @docdtd@ [
> +
> +<!ENTITY commonoptions SYSTEM "@builddir@/common_options.sgml">
> +<!ENTITY seealso SYSTEM "@builddir@/see_also.sgml">
> +]>
> +
> +<refentry>
> +
> +  <docinfo><date>@LXC_GENERATE_DATE@</date></docinfo>
> +
> +  <refmeta>
> +    <refentrytitle>lxc-checkpoint</refentrytitle>
> +    <manvolnum>1</manvolnum>
> +  </refmeta>
> +
> +  <refnamediv>
> +    <refname>lxc-checkpoint</refname>
> +
> +    <refpurpose>
> +      checkpoint a container
> +    </refpurpose>
> +  </refnamediv>
> +
> +  <refsynopsisdiv>
> +    <cmdsynopsis>
> +      <command>lxc-checkpoint</command>
> +      <arg choice="req">-n <replaceable>name</replaceable></arg>
> +      <arg choice="req">-D <replaceable>checkpoint-dir</replaceable></arg>
> +      <arg choice="opt">-r <replaceable>restore</replaceable></arg>
> +      <arg choice="opt">-s <replaceable>stop</replaceable></arg>
> +      <arg choice="opt">-v <replaceable>verbose</replaceable></arg>
> +      <arg choice="opt">-d <replaceable>daemon</replaceable></arg>
> +      <arg choice="opt">-F <replaceable>foreground</replaceable></arg>
> +    </cmdsynopsis>
> +  </refsynopsisdiv>
> +
> +  <refsect1>
> +    <title>Description</title>
> +    <para>
> +      <command>lxc-checkpoint</command> checkpoints and restores containers.
> +    </para>
> +  </refsect1>
> +
> +  <refsect1>
> +    <title>Options</title>
> +    <variablelist>
> +
> +      <varlistentry>
> +        <term>
> +          <option>-r <replaceable>restore</replaceable></option>
> +        </term>
> +        <listitem>
> +          <para>
> +            Restore the checkpoint for the container, instead of dumping it.
> +            This option is incompatible with <option>-s</option>.
> +          </para>
> +        </listitem>
> +      </varlistentry>
> +
> +      <varlistentry>
> +        <term>
> +          <option>-D <replaceable>checkpoint-dir</replaceable></option>
> +        </term>
> +        <listitem>
> +          <para>
> +            The directory to dump the checkpoint metadata.
> +          </para>
> +        </listitem>
> +      </varlistentry>
> +
> +      <varlistentry>
> +        <term>
> +          <option><optional>-s</optional></option>
> +        </term>
> +        <listitem>
> +          <para>
> +            Optionally stop the container after dumping. This option is
> +            incompatible with <option>-r</option>.
> +          </para>
> +        </listitem>
> +      </varlistentry>
> +
> +      <varlistentry>
> +        <term>
> +          <option><optional>-v</optional></option>
> +        </term>
> +        <listitem>
> +          <para>
> +            Enable verbose criu logging.
> +          </para>
> +        </listitem>
> +      </varlistentry>
> +
> +      <varlistentry>
> +        <term>
> +          <option><optional>-d</optional></option>
> +        </term>
> +        <listitem>
> +          <para>
> +            Restore the container in the background (this is the default).
> +            Only available when providing <option>-r</option>.
> +          </para>
> +        </listitem>
> +      </varlistentry>
> +
> +      <varlistentry>
> +        <term>
> +          <option><optional>-F</optional></option>
> +        </term>
> +        <listitem>
> +          <para>
> +            Restore the container in the foreground. Only available when
> +            providing <option>-r</option>.
> +          </para>
> +        </listitem>
> +      </varlistentry>
> +
> +    </variablelist>
> +  </refsect1>
> +
> +  &commonoptions;
> +
> +  <refsect1>
> +    <title>Examples</title>
> +    <variablelist>
> +
> +      <varlistentry>
> +        <term>lxc-checkpoint -n foo -D /tmp/checkpoint</term>
> +        <listitem>
> +          <para>
> +            Checkpoint the container foo into the directory /tmp/checkpoint.
> +          </para>
> +        </listitem>
> +      </varlistentry>
> +
> +      <varlistentry>
> +        <term>lxc-checkpoint -r -n foo -D /tmp/checkpoint</term>
> +        <listitem>
> +          <para>
> +            Restore the checkpoint from the directory /tmp/checkpoint.
> +          </para>
> +        </listitem>
> +      </varlistentry>
> +
> +    </variablelist>
> +  </refsect1>
> +
> +  &seealso;
> +
> +  <refsect1>
> +    <title>Author</title>
> +    <para>Tycho Andersen <email>tycho.andersen at canonical.com</email></para>
> +  </refsect1>
> +</refentry>
> +
> +<!-- Keep this comment at the end of the file
> +Local variables:
> +mode: sgml
> +sgml-omittag:t
> +sgml-shorttag:t
> +sgml-minimize-attributes:nil
> +sgml-always-quote-attributes:t
> +sgml-indent-step:2
> +sgml-indent-data:t
> +sgml-parent-document:nil
> +sgml-default-dtd-file:nil
> +sgml-exposed-tags:nil
> +sgml-local-catalogs:nil
> +sgml-local-ecat-files:nil
> +End:
> +-->
> diff --git a/src/lxc/Makefile.am b/src/lxc/Makefile.am
> index f7bc31a..9707b56 100644
> --- a/src/lxc/Makefile.am
> +++ b/src/lxc/Makefile.am
> @@ -188,6 +188,7 @@ bin_PROGRAMS = \
>  	lxc-attach \
>  	lxc-autostart \
>  	lxc-cgroup \
> +	lxc-checkpoint \
>  	lxc-clone \
>  	lxc-config \
>  	lxc-console \
> @@ -209,6 +210,8 @@ sbin_PROGRAMS = init.lxc
>  pkglibexec_PROGRAMS = \
>  	lxc-monitord \
>  	lxc-user-nic
> +pkglibexec_SCRIPTS = \
> +	lxc-restore-net
>  
>  AM_LDFLAGS = -Wl,-E
>  if ENABLE_RPATH
> @@ -238,6 +241,7 @@ lxc_create_SOURCES = lxc_create.c
>  lxc_snapshot_SOURCES = lxc_snapshot.c
>  lxc_usernsexec_SOURCES = lxc_usernsexec.c
>  lxc_user_nic_SOURCES = lxc_user_nic.c network.c network.h
> +lxc_checkpoint_SOURCES = lxc_checkpoint.c
>  
>  if HAVE_STATIC_LIBCAP
>  sbin_PROGRAMS += init.lxc.static
> diff --git a/src/lxc/lxc-restore-net b/src/lxc/lxc-restore-net
> new file mode 100755
> index 0000000..e1055cb
> --- /dev/null
> +++ b/src/lxc/lxc-restore-net
> @@ -0,0 +1,27 @@
> +#!/bin/sh
> +
> +[ -z "$CRTOOLS_IMAGE_DIR" ] && exit 1
> +
> +set -e
> +
> +dir="$CRTOOLS_IMAGE_DIR"
> +
> +[ "network-unlock" = "$CRTOOLS_SCRIPT_ACTION" ] ||
> +[ "network-lock" = "$CRTOOLS_SCRIPT_ACTION" ] || exit 0

What exactly is your intent with the two lines above?

> +i=0
> +while [ -f "$dir/eth$i" ] && [ -f "$dir/veth$i" ] && [ -f "$dir/bridge$i" ]; do
> +	veth=$(cat "$dir/veth$i")
> +	bridge=$(cat "$dir/bridge$i")
> +
> +	if [ "$CRTOOLS_SCRIPT_ACTION" = "network-lock" ]; then
> +		brctl delif $bridge $veth
> +	fi
> +
> +	if [ "$CRTOOLS_SCRIPT_ACTION" = "network-unlock" ]; then
> +		brctl addif $bridge $veth
> +		ifconfig $veth 0.0.0.0 up
> +	fi
> +
> +	i=$((i+1))
> +done
> diff --git a/src/lxc/lxc_checkpoint.c b/src/lxc/lxc_checkpoint.c
> new file mode 100644
> index 0000000..eeb102b
> --- /dev/null
> +++ b/src/lxc/lxc_checkpoint.c
> @@ -0,0 +1,201 @@
> +/*
> + *
> + * Copyright © 2014 Tycho Andersen <tycho.andersen at canonical.com>.
> + * Copyright © 2014 Canonical Ltd.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2, as
> + * published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, write to the Free Software Foundation, Inc.,
> + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
> + */
> +
> +#include <stdio.h>
> +#include <errno.h>
> +#include <unistd.h>
> +
> +#include <lxc/lxccontainer.h>
> +
> +#include "log.h"
> +#include "config.h"
> +#include "lxc.h"
> +#include "arguments.h"
> +
> +static char *checkpoint_dir;
> +static bool stop = false;
> +static bool verbose = false;
> +static bool do_restore = false;
> +static bool daemonize_set = false;
> +
> +static const struct option my_longopts[] = {
> +	{"checkpoint-dir", required_argument, 0, 'D'},
> +	{"stop", no_argument, 0, 's'},
> +	{"verbose", no_argument, 0, 'v'},
> +	{"restore", no_argument, 0, 'r'},
> +	{"daemon", no_argument, 0, 'd'},
> +	{"foreground", no_argument, 0, 'F'},
> +	LXC_COMMON_OPTIONS
> +};
> +
> +static int my_checker(const struct lxc_arguments *args)
> +{
> +	if (do_restore && stop) {
> +		lxc_error(args, "-s not compatible with -r.");
> +		return -1;
> +
> +	} else if (!do_restore && daemonize_set) {
> +		lxc_error(args, "-d/-F not compatible with -r.");
> +		return -1;
> +	}
> +
> +	return 0;
> +}
> +
> +static int my_parser(struct lxc_arguments *args, int c, char *arg)
> +{
> +	switch (c) {
> +	case 'D':
> +		checkpoint_dir = strdup(arg);
> +		if (!checkpoint_dir)
> +			return -1;
> +		break;
> +	case 's':
> +		stop = true;
> +		break;
> +	case 'v':
> +		verbose = true;
> +		break;
> +	case 'r':
> +		do_restore = true;
> +		break;
> +	case 'd':
> +		args->daemonize = 1;
> +		daemonize_set = true;
> +		break;
> +	case 'F':
> +		args->daemonize = 0;
> +		daemonize_set = true;
> +		break;
> +	}
> +	return 0;
> +}
> +
> +static struct lxc_arguments my_args = {
> +	.progname  = "lxc-checkpoint",
> +	.help      = "\
> +--name=NAME\n\
> +\n\
> +lxc-checkpoint checkpoints and restores a container\n\
> +  Serializes a container's running state to disk to allow restoring it in\n\
> +  its running state at a later time.\n\
> +\n\
> +Options :\n\
> +  -n, --name=NAME           NAME for name of the container\n\
> +  -r, --restore             Restore container\n\
> +  -D, --checkpoint-dir=DIR  directory to save the checkpoint in\n\
> +  -v, --verbose             Enable verbose criu logs\n\
> +  Checkpoint options:\n\
> +  -s, --stop                Stop the container after checkpointing.\n\
> +  Restore options:\n\
> +  -d, --daemon              Daemonize the container (default)\n\
> +  -F, --foreground          Start with the current tty attached to /dev/console\n\
> +",
> +	.options   = my_longopts,
> +	.parser    = my_parser,
> +	.daemonize = 1,
> +	.checker   = my_checker,
> +};
> +
> +int checkpoint(struct lxc_container *c)
> +{
> +	int ret;
> +
> +	if (!c->is_running(c)) {
> +		fprintf(stderr, "%s not running, not checkpointing.\n", my_args.name);
> +		lxc_container_put(c);
> +		return 1;
> +	}
> +
> +	ret = c->checkpoint(c, checkpoint_dir, stop, verbose);
> +	lxc_container_put(c);
> +
> +	if (ret < 0) {
> +		fprintf(stderr, "Checkpointing %s failed.\n", my_args.name);
> +		if (ret == -ENOSYS)
> +			fprintf(stderr, "CRIU was not enabled at compile time.\n");
> +		return 1;
> +	}
> +
> +	return 0;
> +}
> +
> +int restore(struct lxc_container *c)
> +{
> +	pid_t pid = 0;
> +	int ret = 0;
> +
> +	if (c->is_running(c)) {
> +		fprintf(stderr, "%s is running, not restoring.\n", my_args.name);
> +		lxc_container_put(c);
> +		return 1;
> +	}
> +
> +	if (my_args.daemonize)
> +		pid = fork();
> +
> +	if (pid == 0) {
> +		ret = c->restore(c, checkpoint_dir, verbose);
> +
> +		if (ret < 0) {
> +			fprintf(stderr, "Restoring %s failed.\n", my_args.name);
> +			if (ret == -ENOSYS)
> +				fprintf(stderr, "CRIU was not enabled at compile time.\n");
> +		}
> +	}
> +
> +	lxc_container_put(c);
> +
> +	return ret;
> +}
> +
> +int main(int argc, char *argv[])
> +{
> +	struct lxc_container *c;
> +	int ret;
> +
> +	if (lxc_arguments_parse(&my_args, argc, argv))
> +		exit(1);
> +
> +	c = lxc_container_new(my_args.name, my_args.lxcpath[0]);
> +	if (!c) {
> +		fprintf(stderr, "System error loading %s\n", my_args.name);
> +		exit(1);
> +	}
> +
> +	if (!c->may_control(c)) {
> +		fprintf(stderr, "Insufficent privileges to control %s\n", my_args.name);
> +		lxc_container_put(c);
> +		exit(1);
> +	}
> +
> +	if (!c->is_defined(c)) {
> +		fprintf(stderr, "%s is not defined\n", my_args.name);
> +		lxc_container_put(c);
> +		exit(1);
> +	}
> +
> +
> +	if (do_restore)
> +		ret = restore(c);
> +	else
> +		ret = checkpoint(c);
> +
> +	return ret;
> +}
> diff --git a/src/lxc/lxccontainer.c b/src/lxc/lxccontainer.c
> index 0cf21ce..cb69b61 100644
> --- a/src/lxc/lxccontainer.c
> +++ b/src/lxc/lxccontainer.c
> @@ -55,6 +55,7 @@
>  #include "monitor.h"
>  #include "namespace.h"
>  #include "lxclock.h"
> +#include "sync.h"
>  
>  #if HAVE_IFADDRS_H
>  #include <ifaddrs.h>
> @@ -62,6 +63,10 @@
>  #include <../include/ifaddrs.h>
>  #endif
>  
> +#ifdef CRIU_PATH
> +#include <criu/criu.h>
> +#endif
> +
>  #define MAX_BUFFER 4096
>  
>  #define NOT_SUPPORTED_ERROR "the requested function %s is not currently supported with unprivileged containers"
> @@ -3476,6 +3481,454 @@ static bool lxcapi_remove_device_node(struct lxc_container *c, const char *src_p
>  	return add_remove_device_node(c, src_path, dest_path, false);
>  }
>  
> +struct criu_opts {
> +	/* The type of criu invocation, one of "dump" or "restore" */
> +	char *action;
> +
> +	/* The directory to pass to criu */
> +	char *directory;
> +
> +	/* The container to dump */
> +	struct lxc_container *c;
> +
> +	/* Enable criu verbose mode? */
> +	bool verbose;
> +
> +	/* dump: stop the container or not after dumping? */
> +	bool stop;
> +
> +	/* restore: the file to write the init process' pid into */
> +	char *pidfile;
> +};
> +
> +/*
> + * @out must be 128 bytes long
> + */
> +static int read_criu_file(const char *directory, const char *file, int netnr, char *out)
> +{
> +	char path[PATH_MAX];
> +	int ret;
> +	FILE *f;
> +
> +	ret = snprintf(path, PATH_MAX,  "%s/%s%d", directory, file, netnr);
> +	if (ret < 0 || ret >= PATH_MAX) {
> +		ERROR("%s: path too long", __func__);
> +		return -1;
> +	}
> +
> +	f = fopen(path, "r");
> +	if (!f)
> +		return -1;
> +
> +	ret = fscanf(f, "%127s", out);
> +	fclose(f);
> +	if (ret <= 0)
> +		return -1;
> +
> +	return 0;
> +}
> +
> +static void exec_criu(struct criu_opts *opts)
> +{
> +	char **argv, log[PATH_MAX];
> +	int static_args = 12, argc = 0, i, ret;
> +
> +	/* The command line always looks like:
> +	 * criu $(action) --tcp-established --file-locks --manage-cgroups \
> +	 *     --action-script foo.sh -D $(directory) -o $(directory)/$(action).log
> +	 * +1 for final NULL */
> +
> +	if (strcmp(opts->action, "dump") == 0) {
> +		/* -t pid */
> +		static_args += 2;
> +
> +		/* --leave-running */
> +		if (!opts->stop)
> +			static_args++;
> +	} else if (strcmp(opts->action, "restore") == 0) {
> +		/* --root $(lxc_mount_point) --restore-detached --pidfile $foo */
> +		static_args += 5;
> +	} else {
> +		return;
> +	}
> +
> +	if (opts->verbose)
> +		static_args++;
> +
> +	ret = snprintf(log, PATH_MAX, "%s/%s.log", opts->directory, opts->action);
> +	if (ret < 0 || ret >= PATH_MAX) {
> +		ERROR("logfile name too long\n");
> +		return;
> +	}
> +
> +	argv = malloc(static_args * sizeof(*argv));
> +	if (!argv)
> +		return;
> +
> +	memset(argv, 0, static_args * sizeof(*argv));
> +
> +#define DECLARE_ARG(arg) 			\
> +	do {					\
> +		argv[argc++] = strdup(arg);	\
> +		if (!argv[argc-1])		\
> +			goto err;		\
> +	} while (0)
> +
> +	argv[argc++] = on_path("criu", NULL);
> +	if (!argv[argc-1]) {
> +		ERROR("Couldn't find criu binary\n");
> +		goto err;
> +	}
> +
> +	DECLARE_ARG(opts->action);
> +	DECLARE_ARG("--tcp-established");
> +	DECLARE_ARG("--file-locks");
> +	DECLARE_ARG("--manage-cgroups");
> +	DECLARE_ARG("--action-script");
> +	DECLARE_ARG(LIBEXECDIR "/lxc/lxc-restore-net");
> +	DECLARE_ARG("-D");
> +	DECLARE_ARG(opts->directory);
> +	DECLARE_ARG("-o");
> +	DECLARE_ARG(log);
> +
> +	if (opts->verbose)
> +		DECLARE_ARG("-vvvvvv");
> +
> +	if (strcmp(opts->action, "dump") == 0) {
> +		char pid[32];
> +
> +		if (sprintf(pid, "%d", lxcapi_init_pid(opts->c)) < 0)
> +			goto err;
> +
> +		DECLARE_ARG("-t");
> +		DECLARE_ARG(pid);
> +		if (!opts->stop)
> +			DECLARE_ARG("--leave-running");
> +	} else if (strcmp(opts->action, "restore") == 0) {
> +		int netnr = 0;
> +		struct lxc_list *it;
> +
> +		DECLARE_ARG("--root");
> +		DECLARE_ARG(opts->c->lxc_conf->rootfs.mount);
> +		DECLARE_ARG("--restore-detached");
> +		DECLARE_ARG("--pidfile");
> +		DECLARE_ARG(opts->pidfile);
> +
> +		lxc_list_for_each(it, &opts->c->lxc_conf->network) {
> +			char eth[128], veth[128], buf[257];
> +			void *m;
> +
> +			if (read_criu_file(opts->directory, "veth", netnr, veth))
> +				goto err;
> +			if (read_criu_file(opts->directory, "eth", netnr, eth))
> +				goto err;
> +			ret = snprintf(buf, 257, "%s=%s", eth, veth);
> +			if (ret < 0 || ret >= 257)
> +				goto err;
> +
> +			/* final NULL and --veth-pair eth0:vethASDF */
> +			m = realloc(argv, (argc + 1 + 2) * sizeof(*argv));
> +			if (!m)
> +				goto err;
> +			argv = m;
> +
> +			DECLARE_ARG("--veth-pair");
> +			DECLARE_ARG(buf);
> +			argv[argc] = NULL;
> +
> +			netnr++;
> +		}
> +	}
> +
> +#undef DECLARE_ARG
> +
> +	execv(argv[0], argv);
> +err:
> +	for (i = 0; argv[i]; i++)
> +		free(argv[i]);
> +	free(argv);
> +}
> +
> +/* Check and make sure the container has a configuration that we know CRIU can
> + * dump. */
> +static bool criu_ok(struct lxc_container *c)
> +{
> +	struct lxc_list *it;
> +	bool found_deny_rule = false;
> +
> +	if (geteuid()) {
> +		ERROR("Must be root to checkpoint\n");
> +		return -1;
> +	}
> +
> +	/* We only know how to restore containers with veth networks. */
> +	lxc_list_for_each(it, &c->lxc_conf->network) {
> +		struct lxc_netdev *n = it->elem;
> +		if (n->type != LXC_NET_VETH && n->type != LXC_NET_NONE)
> +			return false;
> +	}
> +
> +	// These requirements come from http://criu.org/LXC
> +	if (strcmp(c->lxc_conf->console.path, "none") != 0)
> +		return false;
> +
> +	if (c->lxc_conf->tty != 0)
> +		return false;
> +
> +	lxc_list_for_each(it, &c->lxc_conf->cgroup) {
> +		struct lxc_cgroup *cg = it->elem;
> +		if (strcmp(cg->subsystem, "devices.deny") == 0 &&
> +				strcmp(cg->value, "c 5:1 rwm") == 0) {
> +
> +			found_deny_rule = true;
> +			break;
> +		}
> +	}
> +
> +	if (!found_deny_rule)
> +		return false;
> +
> +	return true;
> +}
> +
> +static int lxcapi_checkpoint(struct lxc_container *c, char *directory, bool stop, bool verbose)

Can we please make this return bool to match the other api functions?

(same with lxcapi_restore)

> +{
> +	int netnr, ret = 0, status;
> +	struct lxc_list *it;
> +	pid_t pid;
> +
> +	if (!criu_ok(c))
> +		return -1;
> +
> +	if (mkdir(directory, 0700) < 0 && errno != EEXIST)
> +		return -1;
> +
> +	netnr = 0;
> +	lxc_list_for_each(it, &c->lxc_conf->network) {
> +		char *veth = NULL, *bridge = NULL, veth_path[PATH_MAX], eth[128];
> +		struct lxc_netdev *n = it->elem;
> +		int pret;
> +
> +		pret = snprintf(veth_path, PATH_MAX, "lxc.network.%d.veth.pair", netnr);
> +		if (pret < 0 || pret >= PATH_MAX) {
> +			ret = -1;
> +			goto out;
> +		}
> +
> +		veth = lxcapi_get_running_config_item(c, veth_path);
> +		if (!veth) {
> +			/* criu_ok() checks that all interfaces are
> +			 * LXC_NET{VETH,NONE}, and VETHs should have this
> +			 * config */
> +			assert(n->type == LXC_NET_NONE);
> +			break;
> +		}
> +
> +		pret = snprintf(veth_path, PATH_MAX, "lxc.network.%d.link", netnr);
> +		if (pret < 0 || pret >= PATH_MAX) {
> +			ret = -1;
> +			goto out;
> +		}
> +
> +		bridge = lxcapi_get_running_config_item(c, veth_path);
> +		if (!bridge) {
> +			ret = -1;
> +			goto out;
> +		}
> +
> +		pret = snprintf(veth_path, PATH_MAX, "%s/veth%d", directory, netnr);
> +		if (pret < 0 || pret >= PATH_MAX || print_to_file(veth_path, veth) < 0) {
> +			ret = -1;
> +			goto out;
> +		}
> +
> +		pret = snprintf(veth_path, PATH_MAX, "%s/bridge%d", directory, netnr);
> +		if (pret < 0 || pret >= PATH_MAX || print_to_file(veth_path, bridge) < 0) {
> +			ret = -1;
> +			goto out;
> +		}
> +
> +		if (n->name)
> +			strncpy(eth, n->name, 128);

If n->name is 128 bytes or longer, then eth won't be null-terminated here (strncpy does not add a terminator when the source fills the buffer).

> +		else
> +			sprintf(eth, "eth%d", netnr);
> +
> +		pret = snprintf(veth_path, PATH_MAX, "%s/eth%d", directory, netnr);
> +		if (pret < 0 || pret >= PATH_MAX || print_to_file(veth_path, eth) < 0)
> +			ret = -1;
> +
> +out:
> +		free(veth);
> +		free(bridge);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	pid = fork();
> +	if (pid < 0)
> +		return -1;
> +
> +	if (pid == 0) {
> +		struct criu_opts os;
> +
> +		os.action = "dump";
> +		os.directory = directory;
> +		os.c = c;
> +		os.stop = stop;
> +		os.verbose = verbose;
> +
> +		/* exec_criu() returning is an error */
> +		exec_criu(&os);
> +		exit(1);
> +	} else {
> +		pid_t w = waitpid(pid, &status, 0);
> +		if (w == -1) {
> +			perror("waitpid");
> +			return -1;
> +		}
> +
> +		if (WIFEXITED(status)) {
> +			return -WEXITSTATUS(status);
> +		}
> +
> +		return -1;
> +	}
> +}
> +
> +static int lxcapi_restore(struct lxc_container *c, char *directory, bool verbose)
> +{
> +	pid_t pid;
> +	struct lxc_list *it;
> +	struct lxc_rootfs *rootfs;
> +	char pidfile[L_tmpnam];
> +
> +	if (!criu_ok(c))
> +		return -1;
> +
> +	if (geteuid()) {
> +		ERROR("Must be root to restore\n");
> +		return -1;
> +	}
> +
> +	if (!tmpnam(pidfile))
> +		return -1;
> +
> +	struct lxc_handler *handler;
> +
> +	handler = lxc_init(c->name, c->lxc_conf, c->config_path);
> +	if (!handler)
> +		return -1;
> +
> +	if (unshare(CLONE_NEWNS))
> +		return -1;

Ok, thinking through this.  This is an api function, so a thread
could call this.  Since you fork before restarting it seems reasonable
for the caller to expect to be able to do a restore and then continue
on to do other things from this thread.  However you've unshared a new
mounts ns in this thread.

Is there any reason not to do the unshare and the mounting after the
fork, when pid == 0 ?

> +	/* CRIU needs the lxc root bind mounted so that it is the root of some
> +	 * mount. */
> +	rootfs = &c->lxc_conf->rootfs;
> +
> +	if (rootfs_is_blockdev(c->lxc_conf)) {
> +		if (do_rootfs_setup(c->lxc_conf, c->name, c->config_path) < 0)
> +			return -1;
> +	}
> +	else {
> +		if (mkdir(rootfs->mount, 0755) < 0 && errno != EEXIST)
> +			return -1;
> +
> +		if (mount(rootfs->path, rootfs->mount, NULL, MS_BIND, NULL) < 0) {
> +			rmdir(rootfs->mount);
> +			return -1;
> +		}
> +	}
> +
> +	pid = fork();
> +	if (pid < 0)
> +		return -1;
> +
> +	if (pid == 0) {
> +		struct criu_opts os;
> +
> +		os.action = "restore";
> +		os.directory = directory;
> +		os.c = c;
> +		os.pidfile = pidfile;
> +		os.verbose = verbose;
> +
> +		/* exec_criu() returning is an error */
> +		exec_criu(&os);
> +		umount(rootfs->mount);
> +		rmdir(rootfs->mount);
> +		exit(1);
> +	} else {
> +		int status;
> +		pid_t w = waitpid(pid, &status, 0);
> +
> +		if (w == -1) {
> +			perror("waitpid");
> +			return -1;
> +		}
> +
> +		if (WIFEXITED(status)) {
> +			if (WEXITSTATUS(status)) {
> +				return -1;
> +			}
> +			else {
> +				int netnr = 0, ret;
> +				FILE *f = fopen(pidfile, "r");
> +				if (!f) {
> +					perror("reading pidfile");
> +					ERROR("couldn't read restore's init pidfile %s\n", pidfile);
> +					return -1;
> +				}
> +
> +				ret = fscanf(f, "%d", (int*) &handler->pid);
> +				fclose(f);
> +				if (ret != 1) {
> +					ERROR("reading restore pid failed");
> +					return -1;
> +				}
> +
> +				if (container_mem_lock(c))
> +					return -1;
> +
> +				ret = 0;
> +				lxc_list_for_each(it, &c->lxc_conf->network) {
> +					char eth[128], veth[128];
> +					struct lxc_netdev *netdev = it->elem;
> +
> +					if (read_criu_file(directory, "veth", netnr, veth)) {
> +						ret = -1;
> +						goto out_unlock;
> +					}
> +					if (read_criu_file(directory, "eth", netnr, eth)) {
> +						ret = -1;
> +						goto out_unlock;
> +					}
> +					netdev->priv.veth_attr.pair = strdup(veth);
> +					if (!netdev->priv.veth_attr.pair) {
> +						ret = -1;
> +						goto out_unlock;
> +					}
> +					netnr++;
> +				}
> +out_unlock:
> +				container_mem_unlock(c);
> +				if (ret)
> +					return ret;
> +
> +				if (lxc_set_state(c->name, handler, RUNNING))
> +					return -1;
> +			}
> +		}
> +
> +		if (lxc_poll(c->name, handler)) {
> +			lxc_abort(c->name, handler);
> +			return -1;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
>  static int lxcapi_attach_run_waitl(struct lxc_container *c, lxc_attach_options_t *options, const char *program, const char *arg, ...)
>  {
>  	va_list ap;
> @@ -3608,6 +4061,8 @@ struct lxc_container *lxc_container_new(const char *name, const char *configpath
>  	c->may_control = lxcapi_may_control;
>  	c->add_device_node = lxcapi_add_device_node;
>  	c->remove_device_node = lxcapi_remove_device_node;
> +	c->checkpoint = lxcapi_checkpoint;
> +	c->restore = lxcapi_restore;
>  
>  	/* we'll allow the caller to update these later */
>  	if (lxc_log_init(NULL, "none", NULL, "lxc_container", 0, c->config_path)) {
> diff --git a/src/lxc/lxccontainer.h b/src/lxc/lxccontainer.h
> index 5085c43..899354d 100644
> --- a/src/lxc/lxccontainer.h
> +++ b/src/lxc/lxccontainer.h
> @@ -760,6 +760,31 @@ struct lxc_container {
>  	 * \return \c true on success, else \c false.
>  	 */
>  	bool (*remove_device_node)(struct lxc_container *c, const char *src_path, const char *dest_path);
> +
> +	/*!
> +	 * \brief Checkpoint a container.
> +	 *
> +	 * \param c Container.
> +	 * \param directory The directory to dump the container to.
> +	 * \param stop Whether or not to stop the container after checkpointing.
> +	 * \param verbose Enable criu's verbose logs.
> +	 *
> +	 * \return \c 0 on success, \c <0 on failure (-ENOSYS if criu wasn't
> +	 * present at compile time).
> +	 */
> +	int (*checkpoint)(struct lxc_container *c, char *directory, bool stop, bool verbose);
> +
> +	/*!
> +	 * \brief Restore a container from a checkpoint.
> +	 *
> +	 * \param c Container.
> +	 * \param directory The directory to restore the container from.
> +	 * \param verbose Enable criu's verbose logs.
> +	 *
> +	 * \return \c 0 on success \c <0 on failure (-ENOSYS if criu wasn't
> +	 * present at compile time).
> +	 */
> +	int (*restore)(struct lxc_container *c, char *directory, bool verbose);
>  };
>  
>  /*!
> diff --git a/src/lxc/start.c b/src/lxc/start.c
> index f282b93..98849e1 100644
> --- a/src/lxc/start.c
> +++ b/src/lxc/start.c
> @@ -300,14 +300,14 @@ static int signal_handler(int fd, uint32_t events, void *data,
>  	return 1;
>  }
>  
> -static int lxc_set_state(const char *name, struct lxc_handler *handler, lxc_state_t state)
> +int lxc_set_state(const char *name, struct lxc_handler *handler, lxc_state_t state)
>  {
>  	handler->state = state;
>  	lxc_monitor_send_state(name, state, handler->lxcpath);
>  	return 0;
>  }
>  
> -static int lxc_poll(const char *name, struct lxc_handler *handler)
> +int lxc_poll(const char *name, struct lxc_handler *handler)
>  {
>  	int sigfd = handler->sigfd;
>  	int pid = handler->pid;
> @@ -485,7 +485,7 @@ static void lxc_fini(const char *name, struct lxc_handler *handler)
>  	free(handler);
>  }
>  
> -static void lxc_abort(const char *name, struct lxc_handler *handler)
> +void lxc_abort(const char *name, struct lxc_handler *handler)
>  {
>  	int ret, status;
>  
> diff --git a/src/lxc/start.h b/src/lxc/start.h
> index ca7891c..8af0a06 100644
> --- a/src/lxc/start.h
> +++ b/src/lxc/start.h
> @@ -74,6 +74,10 @@ struct lxc_handler {
>  	void *cgroup_data;
>  };
>  
> +
> +extern int lxc_poll(const char *name, struct lxc_handler *handler);
> +extern int lxc_set_state(const char *name, struct lxc_handler *handler, lxc_state_t state);
> +extern void lxc_abort(const char *name, struct lxc_handler *handler);
>  extern struct lxc_handler *lxc_init(const char *name, struct lxc_conf *, const char *);
>  
>  extern int lxc_check_inherited(struct lxc_conf *conf, int fd_to_ignore);
> diff --git a/src/lxc/utils.c b/src/lxc/utils.c
> index a32829d..ed34706 100644
> --- a/src/lxc/utils.c
> +++ b/src/lxc/utils.c
> @@ -1446,3 +1446,17 @@ out1:
>  	free(retv);
>  	return NULL;
>  }
> +
> +int print_to_file(const char *file, const char *content)
> +{
> +	FILE *f;
> +	int ret = 0;
> +
> +	f = fopen(file, "w");
> +	if (!f)
> +		return -1;
> +	if (fprintf(f, "%s", content) != strlen(content))
> +		ret = -1;
> +	fclose(f);
> +	return ret;
> +}
> diff --git a/src/lxc/utils.h b/src/lxc/utils.h
> index a84b489..cdfe56a 100644
> --- a/src/lxc/utils.h
> +++ b/src/lxc/utils.h
> @@ -282,3 +282,4 @@ int detect_ramfs_rootfs(void);
>  char *on_path(char *cmd, const char *rootfs);
>  bool file_exists(const char *f);
>  char *choose_init(const char *rootfs);
> +int print_to_file(const char *file, const char *content);
> -- 
> 1.9.1
> 
> _______________________________________________
> lxc-devel mailing list
> lxc-devel at lists.linuxcontainers.org
> http://lists.linuxcontainers.org/listinfo/lxc-devel


More information about the lxc-devel mailing list