[lxc-devel] [lxd/master] Resume dump on failed restore

tych0 on Github lxc-bot at linuxcontainers.org
Fri Jul 8 20:30:36 UTC 2016


A non-text attachment was scrubbed...
Name: not available
Type: text/x-mailbox
Size: 301 bytes
Desc: not available
URL: <http://lists.linuxcontainers.org/pipermail/lxc-devel/attachments/20160708/0f5b5515/attachment.bin>
-------------- next part --------------
From bde987ddbf167f5e848d001b4e28d828d6b093b9 Mon Sep 17 00:00:00 2001
From: Tycho Andersen <tycho.andersen at canonical.com>
Date: Thu, 7 Jul 2016 22:28:32 +0000
Subject: [PATCH 1/2] make client.websocket a public API

We'll use this in the next patch.

Signed-off-by: Tycho Andersen <tycho.andersen at canonical.com>
---
 client.go | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/client.go b/client.go
index 3567127..3e0d791 100644
--- a/client.go
+++ b/client.go
@@ -482,7 +482,7 @@ func (c *Client) delete(base string, args interface{}, rtype ResponseType) (*Res
 	return HoistResponse(resp, rtype)
 }
 
-func (c *Client) websocket(operation string, secret string) (*websocket.Conn, error) {
+func (c *Client) Websocket(operation string, secret string) (*websocket.Conn, error) {
 	query := url.Values{"secret": []string{secret}}
 	url := c.BaseWSURL + path.Join(operation, "websocket") + "?" + query.Encode()
 	return WebsocketDial(c.websocketDialer, url)
@@ -1513,7 +1513,7 @@ func (c *Client) Exec(name string, cmd []string, env map[string]string,
 	if controlHandler != nil {
 		var control *websocket.Conn
 		if wsControl, ok := fds["control"]; ok {
-			control, err = c.websocket(resp.Operation, wsControl.(string))
+			control, err = c.Websocket(resp.Operation, wsControl.(string))
 			if err != nil {
 				return -1, err
 			}
@@ -1522,7 +1522,7 @@ func (c *Client) Exec(name string, cmd []string, env map[string]string,
 			go controlHandler(c, control)
 		}
 
-		conn, err := c.websocket(resp.Operation, fds["0"].(string))
+		conn, err := c.Websocket(resp.Operation, fds["0"].(string))
 		if err != nil {
 			return -1, err
 		}
@@ -1535,7 +1535,7 @@ func (c *Client) Exec(name string, cmd []string, env map[string]string,
 		conns := make([]*websocket.Conn, 3)
 		dones := make([]chan bool, 3)
 
-		conns[0], err = c.websocket(resp.Operation, fds[strconv.Itoa(0)].(string))
+		conns[0], err = c.Websocket(resp.Operation, fds[strconv.Itoa(0)].(string))
 		if err != nil {
 			return -1, err
 		}
@@ -1545,7 +1545,7 @@ func (c *Client) Exec(name string, cmd []string, env map[string]string,
 
 		outputs := []io.WriteCloser{stdout, stderr}
 		for i := 1; i < 3; i++ {
-			conns[i], err = c.websocket(resp.Operation, fds[strconv.Itoa(i)].(string))
+			conns[i], err = c.Websocket(resp.Operation, fds[strconv.Itoa(i)].(string))
 			if err != nil {
 				return -1, err
 			}

From b8078bc35703bdf09062adfc3491bbc70ba2aa14 Mon Sep 17 00:00:00 2001
From: Tycho Andersen <tycho.andersen at canonical.com>
Date: Thu, 7 Jul 2016 02:35:44 +0000
Subject: [PATCH 2/2] resume dumped container on failed restore

This commit implements "freezing" the dumped container until we are sure
that the restore was successful. Because of various issues (namely, TCP
socket repair mode and windowing, but also various other problems), we
can't simply implement some kind of --leave-frozen option in CRIU.

Instead, we have CRIU do a callback when the dump is done but the container
is still frozen, so that we can then try the restore and see if it succeeds.

Signed-off-by: Tycho Andersen <tycho.andersen at canonical.com>
---
 lxd/container.go     |   7 +++-
 lxd/container_lxc.go |  14 +++++--
 lxd/main.go          |  23 +++++++++++
 lxd/migrate.go       | 115 ++++++++++++++++++++++++++++++++++++++++++++++++---
 4 files changed, 147 insertions(+), 12 deletions(-)

diff --git a/lxd/container.go b/lxd/container.go
index dd1038c..8242de7 100644
--- a/lxd/container.go
+++ b/lxd/container.go
@@ -271,7 +271,10 @@ type container interface {
 
 	// Snapshots & migration
 	Restore(sourceContainer container) error
-	Migrate(cmd uint, stateDir string, function string, stop bool) error
+	/* actionScript here is a script called action.sh in the stateDir, to
+	 * be passed to CRIU as --action-script
+	 */
+	Migrate(cmd uint, stateDir string, function string, stop bool, actionScript bool) error
 	Snapshots() ([]container, error)
 
 	// Config handling
@@ -457,7 +460,7 @@ func containerCreateAsSnapshot(d *Daemon, args containerArgs, sourceContainer co
 		 * after snapshotting will fail.
 		 */
 
-		err = sourceContainer.Migrate(lxc.MIGRATE_DUMP, stateDir, "snapshot", false)
+		err = sourceContainer.Migrate(lxc.MIGRATE_DUMP, stateDir, "snapshot", false, false)
 		if err != nil {
 			os.RemoveAll(sourceContainer.StatePath())
 			return nil, err
diff --git a/lxd/container_lxc.go b/lxd/container_lxc.go
index 8761e3e..2cb3521 100644
--- a/lxd/container_lxc.go
+++ b/lxd/container_lxc.go
@@ -1198,7 +1198,7 @@ func (c *containerLXC) Start(stateful bool) error {
 			return fmt.Errorf("Container has no existing state to restore.")
 		}
 
-		err := c.Migrate(lxc.MIGRATE_RESTORE, c.StatePath(), "snapshot", false)
+		err := c.Migrate(lxc.MIGRATE_RESTORE, c.StatePath(), "snapshot", false, false)
 		if err != nil && !c.IsRunning() {
 			return err
 		}
@@ -1364,7 +1364,7 @@ func (c *containerLXC) Stop(stateful bool) error {
 		}
 
 		// Checkpoint
-		err = c.Migrate(lxc.MIGRATE_DUMP, stateDir, "snapshot", true)
+		err = c.Migrate(lxc.MIGRATE_DUMP, stateDir, "snapshot", true, false)
 		if err != nil {
 			return err
 		}
@@ -1740,7 +1740,7 @@ func (c *containerLXC) Restore(sourceContainer container) error {
 	// If the container wasn't running but was stateful, should we restore
 	// it as running?
 	if shared.PathExists(c.StatePath()) {
-		if err := c.Migrate(lxc.MIGRATE_RESTORE, c.StatePath(), "snapshot", false); err != nil {
+		if err := c.Migrate(lxc.MIGRATE_RESTORE, c.StatePath(), "snapshot", false, false); err != nil {
 			return err
 		}
 
@@ -2661,7 +2661,7 @@ func findCriu(host string) error {
 	return nil
 }
 
-func (c *containerLXC) Migrate(cmd uint, stateDir string, function string, stop bool) error {
+func (c *containerLXC) Migrate(cmd uint, stateDir string, function string, stop bool, actionScript bool) error {
 	if err := findCriu(function); err != nil {
 		return err
 	}
@@ -2747,11 +2747,17 @@ func (c *containerLXC) Migrate(cmd uint, stateDir string, function string, stop
 			return err
 		}
 
+		script := ""
+		if actionScript {
+			script = filepath.Join(stateDir, "action.sh")
+		}
+
 		opts := lxc.MigrateOptions{
 			Stop:            stop,
 			Directory:       stateDir,
 			Verbose:         true,
 			PreservesInodes: preservesInodes,
+			ActionScript:    script,
 		}
 
 		migrateErr = c.c.Migrate(cmd, opts)
diff --git a/lxd/main.go b/lxd/main.go
index 9da7563..4eb4372 100644
--- a/lxd/main.go
+++ b/lxd/main.go
@@ -158,6 +158,8 @@ func run() error {
 		fmt.Printf("        Start a container\n")
 		fmt.Printf("    callhook\n")
 		fmt.Printf("        Call a container hook\n")
+		fmt.Printf("    migratedumpsuccess\n")
+		fmt.Printf("        Indicate that a migration dump was successful\n")
 		fmt.Printf("    netcat\n")
 		fmt.Printf("        Mirror a unix socket to stdin/stdout\n")
 	}
@@ -237,6 +239,8 @@ func run() error {
 			os.Exit(ret)
 		case "netcat":
 			return Netcat(os.Args[1:])
+		case "migratedumpsuccess":
+			return cmdMigrateDumpSuccess(os.Args[1:])
 		}
 	}
 
@@ -1050,3 +1054,22 @@ func printnet() error {
 
 	return nil
 }
+
+func cmdMigrateDumpSuccess(args []string) error {
+	if len(args) != 3 {
+		return fmt.Errorf("bad migrate dump success args %s", args)
+	}
+
+	c, err := lxd.NewClient(&lxd.DefaultConfig, "local")
+	if err != nil {
+		return err
+	}
+
+	conn, err := c.Websocket(args[1], args[2])
+	if err != nil {
+		return err
+	}
+	conn.Close()
+
+	return c.WaitForSuccess(args[1])
+}
diff --git a/lxd/migrate.go b/lxd/migrate.go
index 14e9c00..5377a61 100644
--- a/lxd/migrate.go
+++ b/lxd/migrate.go
@@ -11,6 +11,7 @@ import (
 	"net/http"
 	"net/url"
 	"os"
+	"path/filepath"
 	"strconv"
 	"strings"
 	"sync"
@@ -228,7 +229,28 @@ func (s *migrationSourceWs) Connect(op *operation, r *http.Request, w http.Respo
 	return nil
 }
 
-func (s *migrationSourceWs) Do(op *operation) error {
+func writeActionScript(directory string, operation string, secret string) error {
+	script := fmt.Sprintf(`#!/bin/sh -e
+if [ "$CRTOOLS_SCRIPT_ACTION" = "post-dump" ]; then
+	%s migratedumpsuccess %s %s
+fi
+`, execPath, operation, secret)
+
+	f, err := os.Create(filepath.Join(directory, "action.sh"))
+	if err != nil {
+		return err
+	}
+	defer f.Close()
+
+	if err := f.Chmod(0500); err != nil {
+		return err
+	}
+
+	_, err = f.WriteString(script)
+	return err
+}
+
+func (s *migrationSourceWs) Do(migrateOp *operation) error {
 	<-s.allConnected
 
 	criuType := CRIUType_CRIU_RSYNC.Enum()
@@ -326,15 +348,98 @@ func (s *migrationSourceWs) Do(op *operation) error {
 			return abort(fmt.Errorf("Formats other than criu rsync not understood"))
 		}
 
+		/* What happens below is slightly convoluted. Due to various
+		 * complications with networking, there's no easy way for criu
+		 * to exit and leave the container in a frozen state for us to
+		 * somehow resume later.
+		 *
+		 * Instead, we use what criu calls an "action-script", which is
+		 * basically a callback that lets us know when the dump is
+		 * done. (Unfortunately, we can't pass arguments, just an
+		 * executable path, so we write a custom action script with the
+		 * real command we want to run.)
+		 *
+		 * This script then hangs until the migration operation either
+		 * finishes successfully or fails, and exits 0 or 1, which
+		 * causes criu to either kill the container as we asked or
+		 * leave it running.
+		 */
+		dumpDone := make(chan bool, 1)
+		actionScriptOpSecret, err := shared.RandomCryptoString()
+		if err != nil {
+			return abort(err)
+		}
+
+		actionScriptOp, err := operationCreate(
+			operationClassWebsocket,
+			nil,
+			nil,
+			func(op *operation) error {
+				_, err := migrateOp.WaitFinal(-1)
+				if err != nil {
+					return err
+				}
+
+				if migrateOp.status != shared.Success {
+					return fmt.Errorf("restore failed: %s", op.status.String())
+				}
+				return nil
+			},
+			nil,
+			func(op *operation, r *http.Request, w http.ResponseWriter) error {
+				secret := r.FormValue("secret")
+				if secret == "" {
+					return fmt.Errorf("missing secret")
+				}
+
+				if secret != actionScriptOpSecret {
+					return os.ErrPermission
+				}
+
+				c, err := shared.WebsocketUpgrader.Upgrade(w, r, nil)
+				if err != nil {
+					return err
+				}
+
+				dumpDone <- true
+
+				closeMsg := websocket.FormatCloseMessage(websocket.CloseNormalClosure, "")
+				return c.WriteMessage(websocket.CloseMessage, closeMsg)
+			},
+		)
+		if err != nil {
+			return abort(err)
+		}
+
 		checkpointDir, err := ioutil.TempDir("", "lxd_checkpoint_")
 		if err != nil {
 			return abort(err)
 		}
-		defer os.RemoveAll(checkpointDir)
 
-		err = s.container.Migrate(lxc.MIGRATE_DUMP, checkpointDir, "migration", true)
+		if err := writeActionScript(checkpointDir, actionScriptOp.url, actionScriptOpSecret); err != nil {
+			os.RemoveAll(checkpointDir)
+			return abort(err)
+		}
+
+		_, err = actionScriptOp.Run()
 		if err != nil {
+			os.RemoveAll(checkpointDir)
+			return abort(err)
+		}
+
+		migrateDone := make(chan error, 1)
+		go func() {
+			defer os.RemoveAll(checkpointDir)
+			migrateDone <- s.container.Migrate(lxc.MIGRATE_DUMP, checkpointDir, "migration", true, true)
+		}()
+
+		select {
+		/* the checkpoint failed, let's just abort */
+		case err = <-migrateDone:
 			return abort(err)
+		/* the dump finished, let's continue on to the restore */
+		case <-dumpDone:
+			shared.Debugf("Dump finished, continuing with restore...")
 		}
 
 		/*
@@ -361,8 +466,6 @@ func (s *migrationSourceWs) Do(op *operation) error {
 		return err
 	}
 
-	// TODO: should we add some config here about automatically restarting
-	// the container migrate failure? What about the failures above?
 	if !*msg.Success {
 		return fmt.Errorf(*msg.Message)
 	}
@@ -559,7 +662,7 @@ func (c *migrationSink) do() error {
 		}
 
 		if c.live {
-			err = c.container.Migrate(lxc.MIGRATE_RESTORE, imagesDir, "migration", false)
+			err = c.container.Migrate(lxc.MIGRATE_RESTORE, imagesDir, "migration", false, false)
 			if err != nil {
 				restore <- err
 				return


More information about the lxc-devel mailing list