[lxc-devel] [lxd/master] Handle unresponsive container monitors

hallyn on Github lxc-bot at linuxcontainers.org
Tue Mar 15 02:24:32 UTC 2016


A non-text attachment was scrubbed...
Name: not available
Type: text/x-mailbox
Size: 1442 bytes
Desc: not available
URL: <http://lists.linuxcontainers.org/pipermail/lxc-devel/attachments/20160315/e7c03ece/attachment.bin>
-------------- next part --------------
From 51ec4f40bb5a3cb92674c4a5895b7cd9fb4f2bf5 Mon Sep 17 00:00:00 2001
From: Serge Hallyn <serge.hallyn at ubuntu.com>
Date: Mon, 14 Mar 2016 18:52:01 -0700
Subject: [PATCH] Handle unresponsive container monitors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If a container monitor is unresponsive, we may wait forever for
responses over the lxc command socket.

An easy way to reproduce this is to pick a container monitor
process, which shows up in the process list like:
	[lxc monitor] /var/lib/lxc/containers containername
and suspend it with 'kill -STOP'.

So put a one-second timeout around calls to the go-lxc State()
function.  With a hung monitor, the lxc client then reports:

0 ✓ serge@sl ~ $ lxc list
+------+-------+------+------+------------+-----------+
| NAME | STATE | IPV4 | IPV6 |    TYPE    | SNAPSHOTS |
+------+-------+------+------+------------+-----------+
| x1   | ERROR |      |      | PERSISTENT | 0         |
+------+-------+------+------+------------+-----------+
0 ✓ serge@sl ~ $ lxc info x1
error: Monitor is hung
1 ✗ serge@sl ~ $ lxc stop x1
error: Monitor is hung

If there were thousands of containers with hung monitors, the 1s
delay for each would add up, but this is supposed to be a
mitigation for a rare case.  If we end up with a lot of hung
monitors, we should figure out why and prevent it.

Closes #1752

Signed-off-by: Serge Hallyn <serge.hallyn at ubuntu.com>
---
 lxc/list.go           | 19 +++++++++++++++----
 lxd/container_lxc.go  | 38 +++++++++++++++++++++++++++++++++++---
 lxd/containers_get.go |  8 +++++++-
 shared/status.go      |  3 +++
 4 files changed, 60 insertions(+), 8 deletions(-)

diff --git a/lxc/list.go b/lxc/list.go
index 8b24d4b..b0a7f79 100644
--- a/lxc/list.go
+++ b/lxc/list.go
@@ -230,7 +230,7 @@ func (c *listCmd) listContainers(d *lxd.Client, cinfos []shared.ContainerInfo, f
 		}
 
 		for _, column := range columns {
-			if column.NeedsState && cInfo.StatusCode != shared.Stopped {
+			if column.NeedsState && cIsActive(cInfo) {
 				_, ok := cStates[cInfo.Name]
 				if ok {
 					continue
@@ -367,8 +367,19 @@ func (c *listCmd) statusColumnData(cInfo shared.ContainerInfo, cState *shared.Co
 	return strings.ToUpper(cInfo.Status)
 }
 
+func cIsActive(cInfo shared.ContainerInfo) bool {
+	switch cInfo.StatusCode {
+	case shared.Stopped:
+		return false
+	case shared.Error:
+		return false
+	default:
+		return true
+	}
+}
+
 func (c *listCmd) IP4ColumnData(cInfo shared.ContainerInfo, cState *shared.ContainerState, cSnaps []shared.SnapshotInfo) string {
-	if cInfo.StatusCode != shared.Stopped {
+	if cIsActive(cInfo) {
 		ipv4s := []string{}
 		for netName, net := range cState.Network {
 			if net.Type == "loopback" {
@@ -392,7 +403,7 @@ func (c *listCmd) IP4ColumnData(cInfo shared.ContainerInfo, cState *shared.Conta
 }
 
 func (c *listCmd) IP6ColumnData(cInfo shared.ContainerInfo, cState *shared.ContainerState, cSnaps []shared.SnapshotInfo) string {
-	if cInfo.StatusCode != shared.Stopped {
+	if cIsActive(cInfo) {
 		ipv6s := []string{}
 		for netName, net := range cState.Network {
 			if net.Type == "loopback" {
@@ -428,7 +439,7 @@ func (c *listCmd) numberSnapshotsColumnData(cInfo shared.ContainerInfo, cState *
 }
 
 func (c *listCmd) PIDColumnData(cInfo shared.ContainerInfo, cState *shared.ContainerState, cSnaps []shared.SnapshotInfo) string {
-	if cInfo.StatusCode != shared.Stopped {
+	if cIsActive(cInfo) {
 		return fmt.Sprintf("%d", cState.Pid)
 	}
 
diff --git a/lxd/container_lxc.go b/lxd/container_lxc.go
index 3e3ca99..e63413c 100644
--- a/lxd/container_lxc.go
+++ b/lxd/container_lxc.go
@@ -1482,6 +1482,25 @@ func (c *containerLXC) Unfreeze() error {
 	return c.c.Unfreeze()
 }
 
+var LxcMonitorStateError = fmt.Errorf("Monitor is hung")
+
+// Get lxc container state, with 1 second timeout
+// If we don't get a reply, assume the lxc monitor is hung
+func (c *containerLXC) GetLxcState() (lxc.State, error) {
+	monitor := make(chan lxc.State, 1)
+
+	go func(c *lxc.Container) {
+		monitor <- c.State()
+	}(c.c)
+
+	select {
+	case state := <-monitor:
+		return state, nil
+	case <-time.After(time.Second):
+		return lxc.StateMap["FROZEN"], LxcMonitorStateError
+	}
+}
+
 func (c *containerLXC) Render() (interface{}, error) {
 	// Load the go-lxc struct
 	err := c.initLXC()
@@ -1507,7 +1526,11 @@ func (c *containerLXC) Render() (interface{}, error) {
 		}, nil
 	} else {
 		// FIXME: Render shouldn't directly access the go-lxc struct
-		statusCode := shared.FromLXCState(int(c.c.State()))
+		cState, err := c.GetLxcState()
+		if err != nil {
+			return nil, err
+		}
+		statusCode := shared.FromLXCState(int(cState))
 
 		return &shared.ContainerInfo{
 			Architecture:    architectureName,
@@ -1534,7 +1557,11 @@ func (c *containerLXC) RenderState() (*shared.ContainerState, error) {
 	}
 
 	// FIXME: RenderState shouldn't directly access the go-lxc struct
-	statusCode := shared.FromLXCState(int(c.c.State()))
+	cState, err := c.GetLxcState()
+	if err != nil {
+		return nil, err
+	}
+	statusCode := shared.FromLXCState(int(cState))
 	status := shared.ContainerState{
 		Status:     statusCode.String(),
 		StatusCode: statusCode,
@@ -4245,7 +4272,12 @@ func (c *containerLXC) State() string {
 		return "BROKEN"
 	}
 
-	return c.c.State().String()
+	cString := "Error"
+	state, err := c.GetLxcState()
+	if err == nil {
+		cString = state.String()
+	}
+	return cString
 }
 
 // Various container paths
diff --git a/lxd/containers_get.go b/lxd/containers_get.go
index fb00956..082979d 100644
--- a/lxd/containers_get.go
+++ b/lxd/containers_get.go
@@ -67,7 +67,13 @@ func doContainerGet(d *Daemon, cname string) (*shared.ContainerInfo, Response) {
 	}
 
 	cts, err := c.Render()
-	if err != nil {
+	if err == LxcMonitorStateError {
+		return &shared.ContainerInfo{
+			Name:       cname,
+			Status:     "Error",
+			StatusCode: 112,
+		}, nil
+	} else if err != nil {
 		return nil, SmartError(err)
 	}
 
diff --git a/shared/status.go b/shared/status.go
index 35bff80..96010c4 100644
--- a/shared/status.go
+++ b/shared/status.go
@@ -15,6 +15,7 @@ const (
 	Freezing         StatusCode = 109
 	Frozen           StatusCode = 110
 	Thawed           StatusCode = 111
+	Error            StatusCode = 112
 
 	Success StatusCode = 200
 
@@ -39,6 +40,7 @@ func (o StatusCode) String() string {
 		Freezing:         "Freezing",
 		Frozen:           "Frozen",
 		Thawed:           "Thawed",
+		Error:            "Error",
 	}[o]
 }
 
@@ -61,5 +63,6 @@ func FromLXCState(state int) StatusCode {
 		6: Freezing,
 		7: Frozen,
 		8: Thawed,
+		9: Error,
 	}[state]
 }


More information about the lxc-devel mailing list