[lxc-devel] [lxd/master] Handle unresponsive container monitors
hallyn on Github
lxc-bot at linuxcontainers.org
Tue Mar 15 02:24:32 UTC 2016
A non-text attachment was scrubbed...
Name: not available
Type: text/x-mailbox
Size: 1442 bytes
Desc: not available
URL: <http://lists.linuxcontainers.org/pipermail/lxc-devel/attachments/20160315/e7c03ece/attachment.bin>
-------------- next part --------------
From 51ec4f40bb5a3cb92674c4a5895b7cd9fb4f2bf5 Mon Sep 17 00:00:00 2001
From: Serge Hallyn <serge.hallyn at ubuntu.com>
Date: Mon, 14 Mar 2016 18:52:01 -0700
Subject: [PATCH] Handle unresponsive container monitors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
If a container monitor is unresponsive, we may wait forever for
responses over the lxc command socket.
An easy way to reproduce this is to pick a container monitor
process (look for a process named like:
[lxc monitor] /var/lib/lxc/containers containername
) and suspend it with 'kill -STOP <pid>'.
So put a one-second timeout around calls to the go-lxc State()
function. This leads to lxc list feedback like:
0 ✓ serge at sl ~ $ lxc list
+------+-------+------+------+------------+-----------+
| NAME | STATE | IPV4 | IPV6 | TYPE | SNAPSHOTS |
+------+-------+------+------+------------+-----------+
| x1 | ERROR | | | PERSISTENT | 0 |
+------+-------+------+------+------------+-----------+
0 ✓ serge at sl ~ $ lxc info x1
error: Monitor is hung
1 ✗ serge at sl ~ $ lxc stop x1
error: Monitor is hung
If there were thousands of containers with hung monitors, the 1s
timeout for each would add up, but this is meant as a mitigation
for a rare case. If we end up with a lot of hung monitors, we
should figure out why and prevent it.
Closes #1752
Signed-off-by: Serge Hallyn <serge.hallyn at ubuntu.com>
---
lxc/list.go | 19 +++++++++++++++----
lxd/container_lxc.go | 38 +++++++++++++++++++++++++++++++++++---
lxd/containers_get.go | 8 +++++++-
shared/status.go | 3 +++
4 files changed, 60 insertions(+), 8 deletions(-)
diff --git a/lxc/list.go b/lxc/list.go
index 8b24d4b..b0a7f79 100644
--- a/lxc/list.go
+++ b/lxc/list.go
@@ -230,7 +230,7 @@ func (c *listCmd) listContainers(d *lxd.Client, cinfos []shared.ContainerInfo, f
}
for _, column := range columns {
- if column.NeedsState && cInfo.StatusCode != shared.Stopped {
+ if column.NeedsState && cIsActive(cInfo) {
_, ok := cStates[cInfo.Name]
if ok {
continue
@@ -367,8 +367,19 @@ func (c *listCmd) statusColumnData(cInfo shared.ContainerInfo, cState *shared.Co
return strings.ToUpper(cInfo.Status)
}
+func cIsActive(cInfo shared.ContainerInfo) bool {
+ switch cInfo.StatusCode {
+ case shared.Stopped:
+ return false
+ case shared.Error:
+ return false
+ default:
+ return true
+ }
+}
+
func (c *listCmd) IP4ColumnData(cInfo shared.ContainerInfo, cState *shared.ContainerState, cSnaps []shared.SnapshotInfo) string {
- if cInfo.StatusCode != shared.Stopped {
+ if cIsActive(cInfo) {
ipv4s := []string{}
for netName, net := range cState.Network {
if net.Type == "loopback" {
@@ -392,7 +403,7 @@ func (c *listCmd) IP4ColumnData(cInfo shared.ContainerInfo, cState *shared.Conta
}
func (c *listCmd) IP6ColumnData(cInfo shared.ContainerInfo, cState *shared.ContainerState, cSnaps []shared.SnapshotInfo) string {
- if cInfo.StatusCode != shared.Stopped {
+ if cIsActive(cInfo) {
ipv6s := []string{}
for netName, net := range cState.Network {
if net.Type == "loopback" {
@@ -428,7 +439,7 @@ func (c *listCmd) numberSnapshotsColumnData(cInfo shared.ContainerInfo, cState *
}
func (c *listCmd) PIDColumnData(cInfo shared.ContainerInfo, cState *shared.ContainerState, cSnaps []shared.SnapshotInfo) string {
- if cInfo.StatusCode != shared.Stopped {
+ if cIsActive(cInfo) {
return fmt.Sprintf("%d", cState.Pid)
}
diff --git a/lxd/container_lxc.go b/lxd/container_lxc.go
index 3e3ca99..e63413c 100644
--- a/lxd/container_lxc.go
+++ b/lxd/container_lxc.go
@@ -1482,6 +1482,25 @@ func (c *containerLXC) Unfreeze() error {
return c.c.Unfreeze()
}
+var LxcMonitorStateError = fmt.Errorf("Monitor is hung")
+
+// Get lxc container state, with 1 second timeout
+// If we don't get a reply, assume the lxc monitor is hung
+func (c *containerLXC) GetLxcState() (lxc.State, error) {
+ monitor := make(chan lxc.State, 1)
+
+ go func(c *lxc.Container) {
+ monitor <- c.State()
+ }(c.c)
+
+ select {
+ case state := <-monitor:
+ return state, nil
+ case <-time.After(time.Second):
+ return lxc.StateMap["FROZEN"], LxcMonitorStateError
+ }
+}
+
func (c *containerLXC) Render() (interface{}, error) {
// Load the go-lxc struct
err := c.initLXC()
@@ -1507,7 +1526,11 @@ func (c *containerLXC) Render() (interface{}, error) {
}, nil
} else {
// FIXME: Render shouldn't directly access the go-lxc struct
- statusCode := shared.FromLXCState(int(c.c.State()))
+ cState, err := c.GetLxcState()
+ if err != nil {
+ return nil, err
+ }
+ statusCode := shared.FromLXCState(int(cState))
return &shared.ContainerInfo{
Architecture: architectureName,
@@ -1534,7 +1557,11 @@ func (c *containerLXC) RenderState() (*shared.ContainerState, error) {
}
// FIXME: RenderState shouldn't directly access the go-lxc struct
- statusCode := shared.FromLXCState(int(c.c.State()))
+ cState, err := c.GetLxcState()
+ if err != nil {
+ return nil, err
+ }
+ statusCode := shared.FromLXCState(int(cState))
status := shared.ContainerState{
Status: statusCode.String(),
StatusCode: statusCode,
@@ -4245,7 +4272,12 @@ func (c *containerLXC) State() string {
return "BROKEN"
}
- return c.c.State().String()
+ cString := "Error"
+ state, err := c.GetLxcState()
+ if err == nil {
+ cString = state.String()
+ }
+ return cString
}
// Various container paths
diff --git a/lxd/containers_get.go b/lxd/containers_get.go
index fb00956..082979d 100644
--- a/lxd/containers_get.go
+++ b/lxd/containers_get.go
@@ -67,7 +67,13 @@ func doContainerGet(d *Daemon, cname string) (*shared.ContainerInfo, Response) {
}
cts, err := c.Render()
- if err != nil {
+ if err == LxcMonitorStateError {
+ return &shared.ContainerInfo{
+ Name: cname,
+ Status: "Error",
+ StatusCode: 112,
+ }, nil
+ } else if err != nil {
return nil, SmartError(err)
}
diff --git a/shared/status.go b/shared/status.go
index 35bff80..96010c4 100644
--- a/shared/status.go
+++ b/shared/status.go
@@ -15,6 +15,7 @@ const (
Freezing StatusCode = 109
Frozen StatusCode = 110
Thawed StatusCode = 111
+ Error StatusCode = 112
Success StatusCode = 200
@@ -39,6 +40,7 @@ func (o StatusCode) String() string {
Freezing: "Freezing",
Frozen: "Frozen",
Thawed: "Thawed",
+ Error: "Error",
}[o]
}
@@ -61,5 +63,6 @@ func FromLXCState(state int) StatusCode {
6: Freezing,
7: Frozen,
8: Thawed,
+ 9: Error,
}[state]
}
More information about the lxc-devel
mailing list