[lxc-devel] [lxd/master] Implement stateful container stop

stgraber on Github lxc-bot at linuxcontainers.org
Mon Feb 29 04:13:40 UTC 2016


A non-text attachment was scrubbed...
Name: not available
Type: text/x-mailbox
Size: 1346 bytes
Desc: not available
URL: <http://lists.linuxcontainers.org/pipermail/lxc-devel/attachments/20160229/c7f50ea1/attachment.bin>
-------------- next part --------------
From 14cbb2cd34d2dd27d8c0d9784720d311fab90ad3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20Graber?= <stgraber at ubuntu.com>
Date: Sat, 27 Feb 2016 01:30:02 -0500
Subject: [PATCH] Implement stateful container stop
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This makes it possible to have the container save its state at stop
time, then restore its state on restart.

The feature is mostly interesting as a way to do a kind of "suspend to
disk" equivalent where there is a guarantee that no work will be done
after the tasks are dumped to disk.

Expected use of the feature is to stop containers when more important
containers need the memory resources as well as a way to do a quick host
reboot without losing running state.

This branch requires the client to specifically ask for state to be saved
and restored at both stop and start time. The command line client is set
so that state isn't captured on stop by default (requires --stateful) but
is restored automatically on start (unless --stateless is passed).

Once checkpoint/restore has proved to be reliable, we should probably
introduce a server option, or a container option to have this be used on
host reboot in place of a standard container shutdown.

Closes #1558

Signed-off-by: Stéphane Graber <stgraber at ubuntu.com>
---
 client.go              | 14 +++++++++---
 lxc/action.go          | 18 ++++++++++++++-
 lxc/delete.go          |  2 +-
 lxc/launch.go          |  2 +-
 lxc/main.go            |  8 +++----
 lxc/publish.go         |  4 ++--
 lxd/container.go       |  4 ++--
 lxd/container_lxc.go   | 61 +++++++++++++++++++++++++++++++++++++++++++++-----
 lxd/container_state.go | 38 ++++++++++++++++++++++---------
 lxd/containers.go      |  4 ++--
 specs/rest-api.md      |  3 ++-
 11 files changed, 125 insertions(+), 33 deletions(-)

diff --git a/client.go b/client.go
index c5b1468..664efde 100644
--- a/client.go
+++ b/client.go
@@ -1435,15 +1435,23 @@ func (c *Client) Exec(name string, cmd []string, env map[string]string,
 	return op.Metadata.GetInt("return")
 }
 
-func (c *Client) Action(name string, action shared.ContainerAction, timeout int, force bool) (*Response, error) {
+func (c *Client) Action(name string, action shared.ContainerAction, timeout int, force bool, stateful bool) (*Response, error) {
+	body := shared.Jmap{
+		"action":  action,
+		"timeout": timeout,
+		"force":   force}
+
 	if action == "start" {
 		current, err := c.ContainerState(name)
 		if err == nil && current.StatusCode == shared.Frozen {
-			action = "unfreeze"
+			body["action"] = "unfreeze"
 		}
 	}
 
-	body := shared.Jmap{"action": action, "timeout": timeout, "force": force}
+	if shared.StringInSlice(string(action), []string{"start", "stop"}) {
+		body["stateful"] = stateful
+	}
+
 	return c.put(fmt.Sprintf("containers/%s/state", name), body, Async)
 }
 
diff --git a/lxc/action.go b/lxc/action.go
index f358da4..4d35266 100644
--- a/lxc/action.go
+++ b/lxc/action.go
@@ -16,6 +16,8 @@ type actionCmd struct {
 	name       string
 	timeout    int
 	force      bool
+	stateful   bool
+	stateless  bool
 }
 
 func (c *actionCmd) showByDefault() bool {
@@ -33,6 +35,8 @@ func (c *actionCmd) flags() {
 	if c.hasTimeout {
 		gnuflag.IntVar(&c.timeout, "timeout", -1, i18n.G("Time to wait for the container before killing it."))
 		gnuflag.BoolVar(&c.force, "force", false, i18n.G("Force the container to shutdown."))
+		gnuflag.BoolVar(&c.stateful, "stateful", false, i18n.G("Store the container state (only for stop)."))
+		gnuflag.BoolVar(&c.stateless, "stateless", false, i18n.G("Ignore the container state (only forstart)."))
 	}
 }
 
@@ -41,6 +45,18 @@ func (c *actionCmd) run(config *lxd.Config, args []string) error {
 		return errArgs
 	}
 
+	state := false
+
+	// Always restore state (if present) unless asked not to
+	if c.action == "start" && !c.stateless {
+		state = true
+	}
+
+	// Never store state unless asked to
+	if c.action == "stop" && c.stateful {
+		state = true
+	}
+
 	for _, nameArg := range args {
 		remote, name := config.ParseRemoteAndContainer(nameArg)
 		d, err := lxd.NewClient(config, remote)
@@ -48,7 +64,7 @@ func (c *actionCmd) run(config *lxd.Config, args []string) error {
 			return err
 		}
 
-		resp, err := d.Action(name, c.action, c.timeout, c.force)
+		resp, err := d.Action(name, c.action, c.timeout, c.force, state)
 		if err != nil {
 			return err
 		}
diff --git a/lxc/delete.go b/lxc/delete.go
index 716832b..afa3d7d 100644
--- a/lxc/delete.go
+++ b/lxc/delete.go
@@ -92,7 +92,7 @@ func (c *deleteCmd) run(config *lxd.Config, args []string) error {
 				return fmt.Errorf(i18n.G("The container is currently running, stop it first or pass --force."))
 			}
 
-			resp, err := d.Action(name, shared.Stop, -1, true)
+			resp, err := d.Action(name, shared.Stop, -1, true, false)
 			if err != nil {
 				return err
 			}
diff --git a/lxc/launch.go b/lxc/launch.go
index e2c9bd6..c065872 100644
--- a/lxc/launch.go
+++ b/lxc/launch.go
@@ -120,7 +120,7 @@ func (c *launchCmd) run(config *lxd.Config, args []string) error {
 	}
 
 	fmt.Printf(i18n.G("Starting %s")+"\n", name)
-	resp, err = d.Action(name, shared.Start, -1, false)
+	resp, err = d.Action(name, shared.Start, -1, false, false)
 	if err != nil {
 		return err
 	}
diff --git a/lxc/main.go b/lxc/main.go
index 88845f5..a25cafe 100644
--- a/lxc/main.go
+++ b/lxc/main.go
@@ -182,15 +182,15 @@ var commands = map[string]command{
 	"list":     &listCmd{},
 	"monitor":  &monitorCmd{},
 	"move":     &moveCmd{},
-	"pause":    &actionCmd{shared.Freeze, false, false, "pause", -1, false},
+	"pause":    &actionCmd{shared.Freeze, false, false, "pause", -1, false, false, false},
 	"profile":  &profileCmd{},
 	"publish":  &publishCmd{},
 	"remote":   &remoteCmd{},
-	"restart":  &actionCmd{shared.Restart, true, true, "restart", -1, false},
+	"restart":  &actionCmd{shared.Restart, true, true, "restart", -1, false, false, false},
 	"restore":  &restoreCmd{},
 	"snapshot": &snapshotCmd{},
-	"start":    &actionCmd{shared.Start, false, true, "start", -1, false},
-	"stop":     &actionCmd{shared.Stop, true, true, "stop", -1, false},
+	"start":    &actionCmd{shared.Start, false, true, "start", -1, false, false, false},
+	"stop":     &actionCmd{shared.Stop, true, true, "stop", -1, false, false, false},
 	"version":  &versionCmd{},
 }
 
diff --git a/lxc/publish.go b/lxc/publish.go
index 8ccd663..690dfdf 100644
--- a/lxc/publish.go
+++ b/lxc/publish.go
@@ -97,7 +97,7 @@ func (c *publishCmd) run(config *lxd.Config, args []string) error {
 				}
 			}
 
-			resp, err := s.Action(cName, shared.Stop, -1, true)
+			resp, err := s.Action(cName, shared.Stop, -1, true, false)
 			if err != nil {
 				return err
 			}
@@ -110,7 +110,7 @@ func (c *publishCmd) run(config *lxd.Config, args []string) error {
 			if op.StatusCode == shared.Failure {
 				return fmt.Errorf(i18n.G("Stopping container failed!"))
 			}
-			defer s.Action(cName, shared.Start, -1, true)
+			defer s.Action(cName, shared.Start, -1, true, false)
 
 			if wasEphemeral {
 				ct.Ephemeral = true
diff --git a/lxd/container.go b/lxd/container.go
index 0abfbd6..e5ab8ec 100644
--- a/lxd/container.go
+++ b/lxd/container.go
@@ -311,8 +311,8 @@ type container interface {
 	// Container actions
 	Freeze() error
 	Shutdown(timeout time.Duration) error
-	Start() error
-	Stop() error
+	Start(stateful bool) error
+	Stop(stateful bool) error
 	Unfreeze() error
 
 	// Snapshots & migration
diff --git a/lxd/container_lxc.go b/lxd/container_lxc.go
index b52738c..b5b8438 100644
--- a/lxd/container_lxc.go
+++ b/lxd/container_lxc.go
@@ -1070,7 +1070,7 @@ func (c *containerLXC) startCommon() (string, error) {
 	return configPath, nil
 }
 
-func (c *containerLXC) Start() error {
+func (c *containerLXC) Start(stateful bool) error {
 	// Wait for container tear down to finish
 	wgStopping, stopping := lxcStoppingContainers[c.id]
 	if stopping {
@@ -1083,6 +1083,25 @@ func (c *containerLXC) Start() error {
 		return err
 	}
 
+	// If stateful, restore now
+	if stateful && shared.PathExists(c.StatePath()) {
+		err := c.c.Restore(lxc.RestoreOptions{
+			Directory: c.StatePath(),
+			Verbose:   true,
+		})
+
+		err2 := os.RemoveAll(c.StatePath())
+		if err2 != nil {
+			return err2
+		}
+
+		if err != nil {
+			return err
+		}
+
+		return nil
+	}
+
 	// Start the LXC container
 	out, err := exec.Command(
 		c.daemon.execPath,
@@ -1232,7 +1251,33 @@ func (c *containerLXC) setupStopping() *sync.WaitGroup {
 }
 
 // Stop functions
-func (c *containerLXC) Stop() error {
+func (c *containerLXC) Stop(stateful bool) error {
+	// Handle stateful stop
+	if stateful {
+		// Cleanup any existing state
+		stateDir := c.StatePath()
+		os.RemoveAll(stateDir)
+
+		err := os.MkdirAll(stateDir, 0700)
+		if err != nil {
+			return err
+		}
+
+		// Checkpoint
+		opts := lxc.CheckpointOptions{Directory: stateDir, Stop: true, Verbose: true}
+		err = c.Checkpoint(opts)
+		err2 := CollectCRIULogFile(c, stateDir, "snapshot", "dump")
+		if err2 != nil {
+			shared.Log.Warn("failed to collect criu log file", log.Ctx{"error": err2})
+		}
+
+		if err != nil {
+			return err
+		}
+
+		return nil
+	}
+
 	// Load the go-lxc struct
 	err := c.initLXC()
 	if err != nil {
@@ -1351,7 +1396,7 @@ func (c *containerLXC) OnStop(target string) error {
 
 		// Reboot the container
 		if target == "reboot" {
-			c.Start()
+			c.Start(false)
 			return
 		}
 
@@ -1474,7 +1519,7 @@ func (c *containerLXC) Restore(sourceContainer container) error {
 	wasRunning := false
 	if c.IsRunning() {
 		wasRunning = true
-		if err := c.Stop(); err != nil {
+		if err := c.Stop(false); err != nil {
 			shared.Log.Error(
 				"Could not stop container",
 				log.Ctx{
@@ -1528,12 +1573,16 @@ func (c *containerLXC) Restore(sourceContainer container) error {
 			shared.Log.Error("failed to delete snapshot state", "path", c.StatePath(), "err", err2)
 		}
 
-		return err
+		if err != nil {
+			return err
+		}
+
+		return nil
 	}
 
 	// Restart the container
 	if wasRunning {
-		return c.Start()
+		return c.Start(false)
 	}
 
 	return nil
diff --git a/lxd/container_state.go b/lxd/container_state.go
index 9446617..5e4ced7 100644
--- a/lxd/container_state.go
+++ b/lxd/container_state.go
@@ -7,13 +7,15 @@ import (
 	"time"
 
 	"github.com/gorilla/mux"
+
 	"github.com/lxc/lxd/shared"
 )
 
 type containerStatePutReq struct {
-	Action  string `json:"action"`
-	Timeout int    `json:"timeout"`
-	Force   bool   `json:"force"`
+	Action   string `json:"action"`
+	Timeout  int    `json:"timeout"`
+	Force    bool   `json:"force"`
+	Stateful bool   `json:"stateful"`
 }
 
 func containerState(d *Daemon, r *http.Request) Response {
@@ -53,15 +55,25 @@ func containerStatePut(d *Daemon, r *http.Request) Response {
 	switch shared.ContainerAction(raw.Action) {
 	case shared.Start:
 		do = func(op *operation) error {
-			if err = c.Start(); err != nil {
+			if err = c.Start(raw.Stateful); err != nil {
 				return err
 			}
 			return nil
 		}
 	case shared.Stop:
-		if raw.Timeout == 0 || raw.Force {
+		if raw.Stateful {
 			do = func(op *operation) error {
-				if err = c.Stop(); err != nil {
+				err := c.Stop(raw.Stateful)
+				if err != nil {
+					return err
+				}
+
+				return nil
+			}
+		} else if raw.Timeout == 0 || raw.Force {
+			do = func(op *operation) error {
+				err = c.Stop(false)
+				if err != nil {
 					return err
 				}
 
@@ -73,30 +85,36 @@ func containerStatePut(d *Daemon, r *http.Request) Response {
 			}
 		} else {
 			do = func(op *operation) error {
-				if err = c.Shutdown(time.Duration(raw.Timeout) * time.Second); err != nil {
+				err = c.Shutdown(time.Duration(raw.Timeout) * time.Second)
+				if err != nil {
 					return err
 				}
 
 				if c.IsEphemeral() {
 					c.Delete()
 				}
+
 				return nil
 			}
 		}
 	case shared.Restart:
 		do = func(op *operation) error {
 			if raw.Timeout == 0 || raw.Force {
-				if err = c.Stop(); err != nil {
+				err = c.Stop(false)
+				if err != nil {
 					return err
 				}
 			} else {
-				if err = c.Shutdown(time.Duration(raw.Timeout) * time.Second); err != nil {
+				err = c.Shutdown(time.Duration(raw.Timeout) * time.Second)
+				if err != nil {
 					return err
 				}
 			}
-			if err = c.Start(); err != nil {
+			err = c.Start(false)
+			if err != nil {
 				return err
 			}
+
 			return nil
 		}
 	case shared.Freeze:
diff --git a/lxd/containers.go b/lxd/containers.go
index 520bb58..6a02273 100644
--- a/lxd/containers.go
+++ b/lxd/containers.go
@@ -114,7 +114,7 @@ func containersRestart(d *Daemon) error {
 				continue
 			}
 
-			c.Start()
+			c.Start(false)
 
 			autoStartDelayInt, err := strconv.Atoi(autoStartDelay)
 			if err == nil {
@@ -155,7 +155,7 @@ func containersShutdown(d *Daemon) error {
 			wg.Add(1)
 			go func() {
 				c.Shutdown(time.Second * 30)
-				c.Stop()
+				c.Stop(false)
 				wg.Done()
 			}()
 		}
diff --git a/specs/rest-api.md b/specs/rest-api.md
index 09fca34..9dbacf9 100644
--- a/specs/rest-api.md
+++ b/specs/rest-api.md
@@ -754,7 +754,8 @@ Input:
     {
         "action": "stop",       # State change action (stop, start, restart, freeze or unfreeze)
         "timeout": 30,          # A timeout after which the state change is considered as failed
-        "force": true           # Force the state change (currently only valid for stop and restart where it means killing the container)
+        "force": true,          # Force the state change (currently only valid for stop and restart where it means killing the container)
+        "stateful": true        # Whether to store or restore runtime state before stopping or starting (only valid for stop and start, defaults to false)
     }
 
 ## /1.0/containers/\<name\>/files


More information about the lxc-devel mailing list