[lxc-devel] [lxd/master] Spread cluster heartbeats
stgraber on Github
lxc-bot at linuxcontainers.org
Thu Apr 18 19:33:02 UTC 2019
A non-text attachment was scrubbed...
Name: not available
Type: text/x-mailbox
Size: 301 bytes
Desc: not available
URL: <http://lists.linuxcontainers.org/pipermail/lxc-devel/attachments/20190418/e89fd7ec/attachment.bin>
-------------- next part --------------
From e15d7928ce4a240ac9c2eb670f483ef8b4f58d5f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20Graber?= <stgraber at ubuntu.com>
Date: Thu, 18 Apr 2019 13:55:54 -0400
Subject: [PATCH 1/2] lxd/cluster: Bump heartbeatInterval to 10s
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: Stéphane Graber <stgraber at ubuntu.com>
---
lxd/cluster/heartbeat.go | 2 +-
test/suites/clustering.sh | 22 +++++++++++-----------
2 files changed, 12 insertions(+), 12 deletions(-)
diff --git a/lxd/cluster/heartbeat.go b/lxd/cluster/heartbeat.go
index 986505649f..78eacbba71 100644
--- a/lxd/cluster/heartbeat.go
+++ b/lxd/cluster/heartbeat.go
@@ -131,7 +131,7 @@ func Heartbeat(gateway *Gateway, cluster *db.Cluster) (task.Func, task.Schedule)
}
// Number of seconds to wait between to heartbeat rounds.
-const heartbeatInterval = 4
+const heartbeatInterval = 10
// Perform a single heartbeat request against the node with the given address.
func heartbeatNode(taskCtx context.Context, address string, cert *shared.CertInfo, raftNodes []db.RaftNode) error {
diff --git a/test/suites/clustering.sh b/test/suites/clustering.sh
index 9dc641f32d..25a5b06206 100644
--- a/test/suites/clustering.sh
+++ b/test/suites/clustering.sh
@@ -55,9 +55,9 @@ test_clustering_membership() {
spawn_lxd_and_join_cluster "${ns2}" "${bridge}" "${cert}" 2 1 "${LXD_TWO_DIR}"
# Configuration keys can be changed on any node.
- LXD_DIR="${LXD_TWO_DIR}" lxc config set cluster.offline_threshold 30
- LXD_DIR="${LXD_ONE_DIR}" lxc info | grep -q 'cluster.offline_threshold: "30"'
- LXD_DIR="${LXD_TWO_DIR}" lxc info | grep -q 'cluster.offline_threshold: "30"'
+ LXD_DIR="${LXD_TWO_DIR}" lxc config set cluster.offline_threshold 40
+ LXD_DIR="${LXD_ONE_DIR}" lxc info | grep -q 'cluster.offline_threshold: "40"'
+ LXD_DIR="${LXD_TWO_DIR}" lxc info | grep -q 'cluster.offline_threshold: "40"'
# The preseeded network bridge exists on all nodes.
ns1_pid="$(cat "${TEST_DIR}/ns/${ns1}/PID")"
@@ -115,9 +115,9 @@ test_clustering_membership() {
# Shutdown a database node, and wait a few seconds so it will be
# detected as down.
- LXD_DIR="${LXD_ONE_DIR}" lxc config set cluster.offline_threshold 5
+ LXD_DIR="${LXD_ONE_DIR}" lxc config set cluster.offline_threshold 12
LXD_DIR="${LXD_THREE_DIR}" lxd shutdown
- sleep 5
+ sleep 30
LXD_DIR="${LXD_TWO_DIR}" lxc cluster list | grep "node3" | grep -q "OFFLINE"
LXD_DIR="${LXD_TWO_DIR}" lxc config set cluster.offline_threshold 20
@@ -129,7 +129,7 @@ test_clustering_membership() {
# Sleep a bit to let a heartbeat occur and update the list of raft nodes
# everywhere, showing that node 4 has been promoted to database node.
- sleep 5
+ sleep 30
LXD_DIR="${LXD_TWO_DIR}" lxc cluster list | grep "node4" | grep -q "YES"
# Now the preseeded network can be deleted, and all nodes are
@@ -310,9 +310,9 @@ test_clustering_containers() {
# Shutdown node 2, wait for it to be considered offline, and list
# containers.
- LXD_DIR="${LXD_THREE_DIR}" lxc config set cluster.offline_threshold 5
+ LXD_DIR="${LXD_THREE_DIR}" lxc config set cluster.offline_threshold 12
LXD_DIR="${LXD_TWO_DIR}" lxd shutdown
- sleep 5
+ sleep 30
LXD_DIR="${LXD_ONE_DIR}" lxc list | grep foo | grep -q ERROR
LXD_DIR="${LXD_ONE_DIR}" lxc config set cluster.offline_threshold 20
@@ -493,9 +493,9 @@ test_clustering_storage() {
LXD_DIR="${LXD_ONE_DIR}" lxc info bar | grep -q "backup (taken at"
# Shutdown node 3, and wait for it to be considered offline.
- LXD_DIR="${LXD_THREE_DIR}" lxc config set cluster.offline_threshold 5
+ LXD_DIR="${LXD_THREE_DIR}" lxc config set cluster.offline_threshold 12
LXD_DIR="${LXD_THREE_DIR}" lxd shutdown
- sleep 5
+ sleep 30
# Move the container back to node2, even if node3 is offline
LXD_DIR="${LXD_ONE_DIR}" lxc move bar --target node2
@@ -987,7 +987,7 @@ test_clustering_shutdown_nodes() {
wait "$(cat three.pid)"
# Make sure the database is not available to the first node
- sleep 5
+ sleep 30
LXD_DIR="${LXD_ONE_DIR}" lxd shutdown
# Wait for LXD to terminate, otherwise the db will not be empty, and the
From b89414016296683d7a20e1d456f4997b9c64c96a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20Graber?= <stgraber at ubuntu.com>
Date: Thu, 18 Apr 2019 14:11:45 -0400
Subject: [PATCH 2/2] lxd/cluster: Spread heartbeats in time
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: Stéphane Graber <stgraber at ubuntu.com>
---
lxd/cluster/heartbeat.go | 39 ++++++++++++++++++++++++++++++---------
1 file changed, 30 insertions(+), 9 deletions(-)
diff --git a/lxd/cluster/heartbeat.go b/lxd/cluster/heartbeat.go
index 78eacbba71..980f8c7018 100644
--- a/lxd/cluster/heartbeat.go
+++ b/lxd/cluster/heartbeat.go
@@ -4,7 +4,9 @@ import (
"bytes"
"encoding/json"
"fmt"
+ "math/rand"
"net/http"
+ "sync"
"time"
"github.com/hashicorp/raft"
@@ -27,13 +29,12 @@ func Heartbeat(gateway *Gateway, cluster *db.Cluster) (task.Func, task.Schedule)
// We're not a raft node or we're not clustered
return
}
- logger.Debugf("Starting heartbeat round")
raftNodes, err := gateway.currentRaftNodes()
if err == raft.ErrNotLeader {
- logger.Debugf("Skipping heartbeat since we're not leader")
return
}
+ logger.Debugf("Starting heartbeat round")
if err != nil {
logger.Warnf("Failed to get current raft nodes: %v", err)
return
@@ -70,22 +71,41 @@ func Heartbeat(gateway *Gateway, cluster *db.Cluster) (task.Func, task.Schedule)
logger.Warnf("Failed to get current cluster nodes: %v", err)
return
}
+
heartbeats := make([]time.Time, len(nodes))
+ heartbeatsLock := sync.Mutex{}
+ heartbeatsWg := sync.WaitGroup{}
+
for i, node := range nodes {
- func(i int, address string) {
- var err error
- // Only send actual requests to other nodes
- if address != nodeAddress {
- err = heartbeatNode(ctx, address, gateway.cert, raftNodes)
- }
+ // Special case the local node
+ if node.Address == nodeAddress {
+ heartbeatsLock.Lock()
+ heartbeats[i] = time.Now()
+ heartbeatsLock.Unlock()
+ continue
+ }
+
+ // Parallelize the rest
+ heartbeatsWg.Add(1)
+ go func(i int, address string) {
+ defer heartbeatsWg.Done()
+
+ // Spread in time by waiting up to 3s less than the interval
+ time.Sleep(time.Duration(rand.Intn((heartbeatInterval*1000)-3000)) * time.Millisecond)
+ logger.Debugf("Sending heartbeat to %s", address)
+
+ err := heartbeatNode(ctx, address, gateway.cert, raftNodes)
if err == nil {
- logger.Debugf("Successful heartbeat for %s", address)
+ heartbeatsLock.Lock()
heartbeats[i] = time.Now()
+ heartbeatsLock.Unlock()
+ logger.Debugf("Successful heartbeat for %s", address)
} else {
logger.Debugf("Failed heartbeat for %s: %v", address, err)
}
}(i, node.Address)
}
+ heartbeatsWg.Wait()
// If the context has been cancelled, return immediately.
if ctx.Err() != nil {
@@ -98,6 +118,7 @@ func Heartbeat(gateway *Gateway, cluster *db.Cluster) (task.Func, task.Schedule)
if heartbeats[i].Equal(time.Time{}) {
continue
}
+
err := tx.NodeHeartbeat(node.Address, heartbeats[i])
if err != nil {
return err
More information about the lxc-devel
mailing list