[lxc-devel] [lxd/master] Fix demoted voters disrupting leader
freeekanayaka on Github
lxc-bot at linuxcontainers.org
Sat May 16 17:18:39 UTC 2020
A non-text attachment was scrubbed...
Name: not available
Type: text/x-mailbox
Size: 301 bytes
Desc: not available
URL: <http://lists.linuxcontainers.org/pipermail/lxc-devel/attachments/20200516/b2214c1f/attachment.bin>
-------------- next part --------------
From 3aec38499274068bf1862e0c0e4c60ae0fe3fee1 Mon Sep 17 00:00:00 2001
From: Free Ekanayaka <free.ekanayaka at canonical.com>
Date: Sat, 16 May 2020 18:15:40 +0100
Subject: [PATCH 1/2] Attempt to demote only offline nodes that are stand-by
Signed-off-by: Free Ekanayaka <free.ekanayaka at canonical.com>
---
lxd/cluster/membership.go | 48 +++++++++++++++++++--------------------
1 file changed, 23 insertions(+), 25 deletions(-)
diff --git a/lxd/cluster/membership.go b/lxd/cluster/membership.go
index 0013310b6b..a70833ae32 100644
--- a/lxd/cluster/membership.go
+++ b/lxd/cluster/membership.go
@@ -533,34 +533,32 @@ func Rebalance(state *state.State, gateway *Gateway) (string, []db.RaftNode, err
candidates := make([]string, 0)
for i, info := range currentRaftNodes {
node := nodesByAddress[info.Address]
- if node.IsOffline(offlineThreshold) && info.Role != db.RaftSpare {
- // Even the heartbeat timestamp is not recent
- // enough, let's try to connect to the node,
- // just in case the heartbeat is lagging behind
- // for some reason and the node is actually up.
- client, err := Connect(node.Address, gateway.cert, true)
- if err == nil {
- _, _, err = client.GetServer()
- }
- if err != nil {
- client, err := gateway.getClient()
- if err != nil {
- return "", nil, errors.Wrap(err, "Failed to connect to local dqlite node")
- }
- defer client.Close()
- ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
- defer cancel()
- err = client.Assign(ctx, info.ID, db.RaftSpare)
- if err != nil {
- return "", nil, errors.Wrap(err, "Failed to demote offline node")
+ if node.IsOffline(offlineThreshold) {
+ if info.Role == db.RaftStandBy {
+ // Even if the heartbeat timestamp is not recent
+ // enough, let's try to connect to the node,
+ // just in case the heartbeat is lagging behind
+ // for some reason and the node is actually up.
+ client, err := Connect(node.Address, gateway.cert, true)
+ if err == nil {
+ _, _, err = client.GetServer()
}
- err = state.Cluster.Transaction(func(tx *db.ClusterTx) error {
- return tx.RemoveNodeRole(node.ID, db.ClusterRoleDatabase)
- })
if err != nil {
- return "", nil, errors.Wrap(err, "Failed to update node role")
+ client, err := gateway.getClient()
+ if err != nil {
+ return "", nil, errors.Wrap(err, "Failed to connect to local dqlite node")
+ }
+ defer client.Close()
+ ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+ defer cancel()
+ logger.Infof(
+ "Demote offline stand-by node %s (%s) to spare", node.Name, node.Address)
+ err = client.Assign(ctx, info.ID, db.RaftSpare)
+ if err != nil {
+ return "", nil, errors.Wrap(err, "Failed to demote offline node")
+ }
+ currentRaftNodes[i].Role = db.RaftSpare
}
- currentRaftNodes[i].Role = db.RaftSpare
continue
}
}
From b64f7858c5028e40945b8955fddd28cd16299535 Mon Sep 17 00:00:00 2001
From: Free Ekanayaka <free.ekanayaka at canonical.com>
Date: Sat, 16 May 2020 18:17:12 +0100
Subject: [PATCH 2/2] When demoting a voter to spare, transition to stand-by
first
This will let the node know that it's not a voter anymore, and
avoid disrupting the cluster.
Signed-off-by: Free Ekanayaka <free.ekanayaka at canonical.com>
---
lxd/api_cluster.go | 2 +-
lxd/cluster/membership.go | 42 ++++++++++++++++++++++++++++++++++++---
2 files changed, 40 insertions(+), 4 deletions(-)
diff --git a/lxd/api_cluster.go b/lxd/api_cluster.go
index dd0095b9e8..94eac84708 100644
--- a/lxd/api_cluster.go
+++ b/lxd/api_cluster.go
@@ -634,7 +634,7 @@ func clusterPutJoin(d *Daemon, req api.ClusterPut) response.Response {
// role changes.
_, _, err = client.RawQuery("POST", "/internal/cluster/rebalance", nil, "")
if err != nil {
- return errors.Wrap(err, "Failed cluster rebalance request")
+ logger.Warnf("Failed to trigger cluster rebalance: %v", err)
}
return nil
diff --git a/lxd/cluster/membership.go b/lxd/cluster/membership.go
index a70833ae32..b6f30cdc86 100644
--- a/lxd/cluster/membership.go
+++ b/lxd/cluster/membership.go
@@ -590,7 +590,7 @@ func Rebalance(state *state.State, gateway *Gateway) (string, []db.RaftNode, err
address := ""
for _, candidate := range candidates {
node := nodesByAddress[candidate]
- logger.Debugf(
+ logger.Infof(
"Found spare node %s (%s) to be promoted to %s", node.Name, node.Address, role)
address = node.Address
break
@@ -613,8 +613,6 @@ func Rebalance(state *state.State, gateway *Gateway) (string, []db.RaftNode, err
// Assign a new role to the local dqlite node.
func Assign(state *state.State, gateway *Gateway, nodes []db.RaftNode) error {
- logger.Info("Assign new role to dqlite node")
-
// Figure out our own address.
address := ""
err := state.Cluster.Transaction(func(tx *db.ClusterTx) error {
@@ -715,6 +713,44 @@ assign:
}
defer client.Close()
+ // If we're stepping back to spare, let's first transition to stand-by
+ // and wait for the configuration change to be notified to us. This
+ // prevents us from thinking we're still voters and potentially disrupting
+ // the cluster.
+ if info.Role == db.RaftSpare {
+ err = client.Assign(ctx, info.ID, db.RaftStandBy)
+ if err != nil {
+ return errors.Wrap(err, "Failed to step back to stand-by")
+ }
+ local, err := gateway.getClient()
+ if err != nil {
+ return errors.Wrap(err, "Failed to get local dqlite client")
+ }
+ notified := false
+ for i := 0; i < 10; i++ {
+ time.Sleep(500 * time.Millisecond)
+ servers, err := local.Cluster(context.Background())
+ if err != nil {
+ return errors.Wrap(err, "Failed to get current cluster")
+ }
+ for _, server := range servers {
+ if server.Address != info.Address {
+ continue
+ }
+ if server.Role == db.RaftStandBy {
+ notified = true
+ break
+ }
+ }
+ if notified {
+ break
+ }
+ }
+ if !notified {
+ return fmt.Errorf("Timeout waiting for configuration change notification")
+ }
+ }
+
err = client.Assign(ctx, info.ID, info.Role)
if err != nil {
return errors.Wrap(err, "Failed to assign role")
More information about the lxc-devel
mailing list