[lxc-devel] [lxd/master] Fix some cluster reliability issues
stgraber on Github
lxc-bot at linuxcontainers.org
Wed May 8 04:00:18 UTC 2019
A non-text attachment was scrubbed...
Name: not available
Type: text/x-mailbox
Size: 301 bytes
Desc: not available
URL: <http://lists.linuxcontainers.org/pipermail/lxc-devel/attachments/20190507/5ff6cc25/attachment.bin>
-------------- next part --------------
From 549526b8ba1e60dfd204b7892c3de200b2ca3c3b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20Graber?= <stgraber at ubuntu.com>
Date: Tue, 7 May 2019 17:14:12 -0400
Subject: [PATCH 1/2] lxd/cluster: Avoid panic in Gateway
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: Stéphane Graber <stgraber at ubuntu.com>
---
lxd/cluster/gateway.go | 43 +++++++++++++++++++++++++++++++++++++++++-
1 file changed, 42 insertions(+), 1 deletion(-)
diff --git a/lxd/cluster/gateway.go b/lxd/cluster/gateway.go
index 5703da77d5..c34fdd1c0a 100644
--- a/lxd/cluster/gateway.go
+++ b/lxd/cluster/gateway.go
@@ -11,6 +11,7 @@ import (
"os"
"path/filepath"
"strconv"
+ "sync"
"time"
"github.com/CanonicalLtd/go-dqlite"
@@ -99,6 +100,8 @@ type Gateway struct {
// ServerStore wrapper.
store *dqliteServerStore
+
+ lock sync.RWMutex
}
// HandlerFuncs returns the HTTP handlers that should be added to the REST API
@@ -112,6 +115,9 @@ type Gateway struct {
// database node part of the dqlite cluster.
func (g *Gateway) HandlerFuncs() map[string]http.HandlerFunc {
database := func(w http.ResponseWriter, r *http.Request) {
+ g.lock.RLock()
+ defer g.lock.RUnlock()
+
if !tlsCheckCert(r, g.cert) {
http.Error(w, "403 invalid client certificate", http.StatusForbidden)
return
@@ -202,6 +208,9 @@ func (g *Gateway) HandlerFuncs() map[string]http.HandlerFunc {
g.acceptCh <- conn
}
raft := func(w http.ResponseWriter, r *http.Request) {
+ g.lock.RLock()
+ defer g.lock.RUnlock()
+
// If we are not part of the raft cluster, reply with a
// redirect to one of the raft nodes that we know about.
if g.raft == nil {
@@ -245,6 +254,9 @@ func (g *Gateway) HandlerFuncs() map[string]http.HandlerFunc {
// Snapshot can be used to manually trigger a RAFT snapshot
func (g *Gateway) Snapshot() error {
+ g.lock.RLock()
+ defer g.lock.RUnlock()
+
return g.raft.Snapshot()
}
@@ -257,6 +269,9 @@ func (g *Gateway) WaitUpgradeNotification() {
// IsDatabaseNode returns true if this gateway also acts as a raft database node.
func (g *Gateway) IsDatabaseNode() bool {
+ g.lock.RLock()
+ defer g.lock.RUnlock()
+
return g.raft != nil
}
@@ -264,6 +279,9 @@ func (g *Gateway) IsDatabaseNode() bool {
// dqlite nodes.
func (g *Gateway) DialFunc() dqlite.DialFunc {
return func(ctx context.Context, address string) (net.Conn, error) {
+ g.lock.RLock()
+ defer g.lock.RUnlock()
+
// Memory connection.
if g.memoryDial != nil {
return g.memoryDial(ctx, address)
@@ -301,12 +319,15 @@ func (g *Gateway) Kill() {
func (g *Gateway) Shutdown() error {
logger.Debugf("Stop database gateway")
+ g.lock.RLock()
if g.raft != nil {
err := g.raft.Shutdown()
if err != nil {
+ g.lock.RUnlock()
return errors.Wrap(err, "Failed to shutdown raft")
}
}
+ g.lock.RUnlock()
if g.server != nil {
g.Sync()
@@ -314,7 +335,9 @@ func (g *Gateway) Shutdown() error {
// Unset the memory dial, since Shutdown() is also called for
// switching between in-memory and network mode.
+ g.lock.Lock()
g.memoryDial = nil
+ g.lock.Unlock()
}
return nil
@@ -325,6 +348,9 @@ func (g *Gateway) Shutdown() error {
// it can inspect the database in order to decide whether to activate the
// daemon or not.
func (g *Gateway) Sync() {
+ g.lock.RLock()
+ defer g.lock.RUnlock()
+
if g.server == nil {
return
}
@@ -362,6 +388,9 @@ func (g *Gateway) Reset(cert *shared.CertInfo) error {
// LeaderAddress returns the address of the current raft leader.
func (g *Gateway) LeaderAddress() (string, error) {
+ g.lock.RLock()
+ defer g.lock.RUnlock()
+
// If we aren't clustered, return an error.
if g.memoryDial != nil {
return "", fmt.Errorf("Node is not clustered")
@@ -381,7 +410,6 @@ func (g *Gateway) LeaderAddress() (string, error) {
time.Sleep(time.Second)
}
return "", ctx.Err()
-
}
// If this isn't a raft node, contact a raft node and ask for the
@@ -483,15 +511,21 @@ func (g *Gateway) init() error {
return errors.Wrap(err, "Failed to create dqlite server")
}
+ g.lock.Lock()
g.server = server
g.raft = raft
+ g.lock.Unlock()
} else {
+ g.lock.Lock()
g.server = nil
g.raft = nil
g.store.inMemory = nil
+ g.lock.Unlock()
}
+ g.lock.Lock()
g.store.onDisk = dqlite.NewServerStore(g.db.DB(), "main", "raft_nodes", "address")
+ g.lock.Unlock()
return nil
}
@@ -502,9 +536,13 @@ func (g *Gateway) waitLeadership() error {
n := 80
sleep := 250 * time.Millisecond
for i := 0; i < n; i++ {
+ g.lock.RLock()
if g.raft.raft.State() == raft.Leader {
+ g.lock.RUnlock()
return nil
}
+ g.lock.RUnlock()
+
time.Sleep(sleep)
}
return fmt.Errorf("RAFT node did not self-elect within %s", time.Duration(n)*sleep)
@@ -514,6 +552,9 @@ func (g *Gateway) waitLeadership() error {
// cluster, as configured in the raft log. It returns an error if this node is
// not the leader.
func (g *Gateway) currentRaftNodes() ([]db.RaftNode, error) {
+ g.lock.RLock()
+ defer g.lock.RUnlock()
+
if g.raft == nil {
return nil, raft.ErrNotLeader
}
From ac32d0fd891ba21c0c8d6cf5597097be3ffc37c1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20Graber?= <stgraber at ubuntu.com>
Date: Tue, 7 May 2019 23:29:29 -0400
Subject: [PATCH 2/2] lxd/cluster: Use current time for heartbeat
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: Stéphane Graber <stgraber at ubuntu.com>
---
lxd/cluster/heartbeat.go | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/lxd/cluster/heartbeat.go b/lxd/cluster/heartbeat.go
index 980f8c7018..e842787b64 100644
--- a/lxd/cluster/heartbeat.go
+++ b/lxd/cluster/heartbeat.go
@@ -119,7 +119,7 @@ func Heartbeat(gateway *Gateway, cluster *db.Cluster) (task.Func, task.Schedule)
continue
}
- err := tx.NodeHeartbeat(node.Address, heartbeats[i])
+ err := tx.NodeHeartbeat(node.Address, time.Now())
if err != nil {
return err
}
More information about the lxc-devel
mailing list