[lxc-devel] [lxd/master] Bugfixes
stgraber on Github
lxc-bot at linuxcontainers.org
Mon Sep 5 22:44:23 UTC 2016
A non-text attachment was scrubbed...
Name: not available
Type: text/x-mailbox
Size: 301 bytes
Desc: not available
URL: <http://lists.linuxcontainers.org/pipermail/lxc-devel/attachments/20160905/49e9a1ff/attachment.bin>
-------------- next part --------------
From efd6bbf0d3632843c45fc4838796b4373e2e7738 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20Graber?= <stgraber at ubuntu.com>
Date: Mon, 5 Sep 2016 18:43:26 -0400
Subject: [PATCH] Rework container operation locking
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
This should fix a number of race conditions around start, stop and shutdown.
Closes #2297
Signed-off-by: Stéphane Graber <stgraber at ubuntu.com>
---
lxd/container_lxc.go | 216 +++++++++++++++++++++++++++++++++------------------
1 file changed, 139 insertions(+), 77 deletions(-)
diff --git a/lxd/container_lxc.go b/lxd/container_lxc.go
index ed9ef60..f28c99a 100644
--- a/lxd/container_lxc.go
+++ b/lxd/container_lxc.go
@@ -28,9 +28,55 @@ import (
log "gopkg.in/inconshreveable/log15.v2"
)
-// Global variables
-var lxcStoppingContainersLock sync.Mutex
-var lxcStoppingContainers map[int]*sync.WaitGroup = make(map[int]*sync.WaitGroup)
+// Operation locking
+type lxcContainerOperation struct {
+ action string
+ chanDone chan error
+ err error
+ id int
+ timeout int
+}
+
+func (op *lxcContainerOperation) Create(id int, action string, timeout int) *lxcContainerOperation {
+ op.id = id
+ op.action = action
+ op.timeout = timeout
+ op.chanDone = make(chan error, 0)
+
+ if timeout > 1 {
+ go func(op *lxcContainerOperation) {
+ time.Sleep(time.Second * time.Duration(op.timeout))
+ op.Done(fmt.Errorf("Container %s operation timed out after %d seconds", op.action, op.timeout))
+ }(op)
+ }
+
+ return op
+}
+
+func (op *lxcContainerOperation) Wait() error {
+ <-op.chanDone
+
+ return op.err
+}
+
+func (op *lxcContainerOperation) Done(err error) {
+ lxcContainerOperationsLock.Lock()
+ defer lxcContainerOperationsLock.Unlock()
+
+ // Check if already done
+ runningOp, ok := lxcContainerOperations[op.id]
+ if !ok || runningOp != op {
+ return
+ }
+
+ op.err = err
+ close(op.chanDone)
+
+ delete(lxcContainerOperations, op.id)
+}
+
+var lxcContainerOperationsLock sync.Mutex
+var lxcContainerOperations map[int]*lxcContainerOperation = make(map[int]*lxcContainerOperation)
// Helper functions
func lxcSetConfigItem(c *lxc.Container, key string, value string) error {
@@ -248,6 +294,51 @@ type containerLXC struct {
storage storage
}
+func (c *containerLXC) createOperation(action string, timeout int) (*lxcContainerOperation, error) {
+ op, _ := c.getOperation("")
+ if op != nil {
+ return nil, fmt.Errorf("Container is already running a %s operation", op.action)
+ }
+
+ lxcContainerOperationsLock.Lock()
+ defer lxcContainerOperationsLock.Unlock()
+
+ op = &lxcContainerOperation{}
+ op.Create(c.id, action, timeout)
+ lxcContainerOperations[c.id] = op
+
+ return lxcContainerOperations[c.id], nil
+}
+
+func (c *containerLXC) getOperation(action string) (*lxcContainerOperation, error) {
+ lxcContainerOperationsLock.Lock()
+ defer lxcContainerOperationsLock.Unlock()
+
+ op := lxcContainerOperations[c.id]
+
+ if op == nil {
+ return nil, fmt.Errorf("No running %s container operation", action)
+ }
+
+ if action != "" && op.action != action {
+ return nil, fmt.Errorf("Container is running a %s operation, not a %s operation", op.action, action)
+ }
+
+ return op, nil
+}
+
+func (c *containerLXC) waitOperation() error {
+ op, _ := c.getOperation("")
+ if op != nil {
+ err := op.Wait()
+ if err != nil {
+ return err
+ }
+ }
+
+ return nil
+}
+
func (c *containerLXC) init() error {
// Compute the expanded config and device list
err := c.expandConfig()
@@ -1262,15 +1353,15 @@ func (c *containerLXC) startCommon() (string, error) {
}
func (c *containerLXC) Start(stateful bool) error {
- // Wait for container tear down to finish
- lxcStoppingContainersLock.Lock()
- wgStopping, stopping := lxcStoppingContainers[c.id]
- lxcStoppingContainersLock.Unlock()
- if stopping {
- wgStopping.Wait()
+ // Setup a new operation
+ op, err := c.createOperation("start", 30)
+ if err != nil {
+ return err
}
+ defer op.Done(nil)
- if err := setupSharedMounts(); err != nil {
+ err = setupSharedMounts()
+ if err != nil {
return fmt.Errorf("Daemon failed to setup shared mounts base: %s.\nDoes security.nesting need to be turned on?", err)
}
@@ -1441,35 +1532,14 @@ func (c *containerLXC) OnStart() error {
return nil
}
-// Container shutdown locking
-func (c *containerLXC) setupStopping() *sync.WaitGroup {
- // Handle locking
- lxcStoppingContainersLock.Lock()
- defer lxcStoppingContainersLock.Unlock()
-
- // Existing entry
- wg, stopping := lxcStoppingContainers[c.id]
- if stopping {
- return wg
- }
-
- // Setup new entry
- lxcStoppingContainers[c.id] = &sync.WaitGroup{}
-
- go func(wg *sync.WaitGroup, id int) {
- wg.Wait()
-
- lxcStoppingContainersLock.Lock()
- defer lxcStoppingContainersLock.Unlock()
-
- delete(lxcStoppingContainers, id)
- }(lxcStoppingContainers[c.id], c.id)
-
- return lxcStoppingContainers[c.id]
-}
-
// Stop functions
func (c *containerLXC) Stop(stateful bool) error {
+ // Setup a new operation
+ op, err := c.createOperation("stop", 30)
+ if err != nil {
+ return err
+ }
+
// Handle stateful stop
if stateful {
// Cleanup any existing state
@@ -1478,86 +1548,75 @@ func (c *containerLXC) Stop(stateful bool) error {
err := os.MkdirAll(stateDir, 0700)
if err != nil {
+ op.Done(err)
return err
}
// Checkpoint
err = c.Migrate(lxc.MIGRATE_DUMP, stateDir, "snapshot", true, false)
if err != nil {
+ op.Done(err)
return err
}
c.stateful = true
err = dbContainerSetStateful(c.daemon.db, c.id, true)
if err != nil {
+ op.Done(err)
return err
}
+ op.Done(nil)
return nil
}
// Load the go-lxc struct
- err := c.initLXC()
+ err = c.initLXC()
if err != nil {
+ op.Done(err)
return err
}
// Attempt to freeze the container first, helps massively with fork bombs
c.Freeze()
- // Handle locking
- wg := c.setupStopping()
-
- // Stop the container
- wg.Add(1)
if err := c.c.Stop(); err != nil {
- wg.Done()
+ op.Done(err)
return err
}
- // Mark ourselves as done
- wg.Done()
-
- // Wait for any other teardown routines to finish
- wg.Wait()
-
- return nil
+ return op.Wait()
}
func (c *containerLXC) Shutdown(timeout time.Duration) error {
- // Load the go-lxc struct
- err := c.initLXC()
+ // Setup a new operation
+ op, err := c.createOperation("shutdown", 30)
if err != nil {
return err
}
- // Handle locking
- wg := c.setupStopping()
+ // Load the go-lxc struct
+ err = c.initLXC()
+ if err != nil {
+ op.Done(err)
+ return err
+ }
- // Shutdown the container
- wg.Add(1)
if err := c.c.Shutdown(timeout); err != nil {
- wg.Done()
+ op.Done(err)
return err
}
- // Mark ourselves as done
- wg.Done()
-
- // Wait for any other teardown routines to finish
- wg.Wait()
-
- return nil
+ return op.Wait()
}
func (c *containerLXC) OnStop(target string) error {
- // Get locking
- lxcStoppingContainersLock.Lock()
- wg, stopping := lxcStoppingContainers[c.id]
- lxcStoppingContainersLock.Unlock()
- if wg != nil {
- wg.Add(1)
+ // Get operation
+ op, _ := c.getOperation("")
+ if op != nil && !shared.StringInSlice(op.action, []string{"stop", "shutdown"}) {
+ return fmt.Errorf("Container is already running a %s operation", op.action)
}
+ shared.Debugf("stgraber: op = %s", op)
// Make sure we can't call go-lxc functions by mistake
c.fromHook = true
@@ -1565,7 +1624,10 @@ func (c *containerLXC) OnStop(target string) error {
// Stop the storage for this container
err := c.StorageStop()
if err != nil {
- wg.Done()
+ if op != nil {
+ op.Done(err)
+ }
+
return err
}
@@ -1573,15 +1635,15 @@ func (c *containerLXC) OnStop(target string) error {
AAUnloadProfile(c)
// FIXME: The go routine can go away once we can rely on LXC_TARGET
- go func(c *containerLXC, target string, wg *sync.WaitGroup) {
+ go func(c *containerLXC, target string, op *lxcContainerOperation) {
c.fromHook = false
// Unlock on return
- if wg != nil {
- defer wg.Done()
+ if op != nil {
+ defer op.Done(nil)
}
- if target == "unknown" && stopping {
+ if target == "unknown" && op != nil {
target = "stop"
}
@@ -1644,7 +1706,7 @@ func (c *containerLXC) OnStop(target string) error {
if c.ephemeral {
c.Delete()
}
- }(c, target, wg)
+ }(c, target, op)
return nil
}
More information about the lxc-devel
mailing list