[lxc-devel] [lxd/master] lxd/cluster: Transfer leadership before adjusting roles, not after
freeekanayaka on Github
lxc-bot at linuxcontainers.org
Thu May 21 18:42:06 UTC 2020
A non-text attachment was scrubbed...
Name: not available
Type: text/x-mailbox
Size: 478 bytes
Desc: not available
URL: <http://lists.linuxcontainers.org/pipermail/lxc-devel/attachments/20200521/a7cd2cbe/attachment.bin>
-------------- next part --------------
From e03bea5bb3a74101f3c5c0427b0d1e0cf5f81d5e Mon Sep 17 00:00:00 2001
From: Free Ekanayaka <free.ekanayaka at canonical.com>
Date: Thu, 21 May 2020 19:38:42 +0100
Subject: [PATCH] lxd/cluster: Transfer leadership before adjusting roles, not
after
Although it's possible to hold leadership while being demoted to spare, that
creates a few issues down the line.
Signed-off-by: Free Ekanayaka <free.ekanayaka at canonical.com>
---
lxd/api_cluster.go | 10 +++++++
lxd/cluster/gateway.go | 58 ++++++++++++++++++++++++++-------------
lxd/cluster/membership.go | 52 ++---------------------------------
lxd/daemon.go | 3 ++
4 files changed, 55 insertions(+), 68 deletions(-)
diff --git a/lxd/api_cluster.go b/lxd/api_cluster.go
index 94eac84708..31ddbb6abc 100644
--- a/lxd/api_cluster.go
+++ b/lxd/api_cluster.go
@@ -1286,6 +1286,7 @@ func handoverMemberRole(d *Daemon) error {
}
// Find the cluster leader.
+findLeader:
leader, err := d.gateway.LeaderAddress()
if err != nil {
return err
@@ -1297,6 +1298,15 @@ func handoverMemberRole(d *Daemon) error {
return nil
}
+ if leader == address {
+ logger.Info("Transfer leadership")
+ err := d.gateway.TransferLeadership()
+ if err != nil {
+ return errors.Wrapf(err, "Failed to transfer leadership")
+ }
+ goto findLeader
+ }
+
cert := d.endpoints.NetworkCert()
client, err := cluster.Connect(leader, cert, true)
if err != nil {
diff --git a/lxd/cluster/gateway.go b/lxd/cluster/gateway.go
index 2195ca6d9f..64a84265c5 100644
--- a/lxd/cluster/gateway.go
+++ b/lxd/cluster/gateway.go
@@ -428,6 +428,45 @@ func (g *Gateway) Kill() {
g.cancel()
}
+// TransferLeadership attempts to transfer leadership to another node.
+func (g *Gateway) TransferLeadership() error {
+ client, err := g.getClient()
+ if err != nil {
+ return err
+ }
+ defer client.Close()
+
+ // Try to find a voter that is also online.
+ servers, err := client.Cluster(context.Background())
+ if err != nil {
+ return err
+ }
+ var id uint64
+ for _, server := range servers {
+ if server.ID == g.info.ID || server.Role != db.RaftVoter {
+ continue
+ }
+ address, err := g.raftAddress(server.ID)
+ if err != nil {
+ return err
+ }
+ if !hasConnectivity(g.cert, address) {
+ continue
+ }
+ id = server.ID
+ break
+ }
+
+ if id == 0 {
+ return fmt.Errorf("No online voter found")
+ }
+
+ ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+ defer cancel()
+
+ return client.Transfer(ctx, id)
+}
+
// Shutdown this gateway, stopping the gRPC server and possibly the raft factory.
func (g *Gateway) Shutdown() error {
logger.Debugf("Stop database gateway")
@@ -437,25 +476,6 @@ func (g *Gateway) Shutdown() error {
g.Sync()
}
- // If this is not a standalone node and we are the cluster
- // leader, let's try to transfer leadership.
- if g.memoryDial == nil {
- isLeader, err := g.isLeader()
- if err == nil && isLeader {
- client, err := g.getClient()
- if err == nil {
- ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
- defer cancel()
- logger.Info("Transfer leadership")
- err := client.Transfer(ctx, 0)
- if err != nil {
- logger.Warnf("Failed to transfer leadership: %v", err)
- }
- client.Close()
- }
- }
- }
-
g.server.Close()
close(g.stopCh)
diff --git a/lxd/cluster/membership.go b/lxd/cluster/membership.go
index 9e28022619..62a19f4a91 100644
--- a/lxd/cluster/membership.go
+++ b/lxd/cluster/membership.go
@@ -533,20 +533,8 @@ func Rebalance(state *state.State, gateway *Gateway) (string, []db.RaftNode, err
candidates := make([]string, 0)
for i, info := range currentRaftNodes {
node := nodesByAddress[info.Address]
- if node.IsOffline(offlineThreshold) {
+ if !hasConnectivity(gateway.cert, node.Address) {
if info.Role != db.RaftSpare {
- // Even the heartbeat timestamp is not recent
- // enough, let's try to connect to the node,
- // just in case the heartbeat is lagging behind
- // for some reason and the node is actually up.
- restclient, err := Connect(node.Address, gateway.cert, true)
- if err == nil {
- _, _, err = restclient.GetServer()
- }
- if err == nil {
- // This isn't actually offline.
- goto append
- }
client, err := gateway.getClient()
if err != nil {
return "", nil, errors.Wrap(err, "Failed to connect to local dqlite node")
@@ -572,7 +560,7 @@ func Rebalance(state *state.State, gateway *Gateway) (string, []db.RaftNode, err
}
continue
}
- append:
+
switch info.Role {
case db.RaftVoter:
voters = append(voters, info.Address)
@@ -902,11 +890,7 @@ func Handover(state *state.State, gateway *Gateway, address string) (string, []d
if node.Role == db.RaftVoter || node.Address == address {
continue
}
- online, err := isMemberOnline(state, gateway.cert, node.Address)
- if err != nil {
- return "", nil, errors.Wrapf(err, "Failed to check if %s is online", node.Address)
- }
- if !online {
+ if !hasConnectivity(gateway.cert, node.Address) {
continue
}
nodes[i].Role = db.RaftVoter
@@ -916,36 +900,6 @@ func Handover(state *state.State, gateway *Gateway, address string) (string, []d
return "", nil, nil
}
-// Check if the member with the given address is one.
-func isMemberOnline(state *state.State, cert *shared.CertInfo, address string) (bool, error) {
- online := true
- err := state.Cluster.Transaction(func(tx *db.ClusterTx) error {
- offlineThreshold, err := tx.GetNodeOfflineThreshold()
- if err != nil {
- return err
- }
- node, err := tx.GetNodeByAddress(address)
- if err != nil {
- return err
- }
- if node.IsOffline(offlineThreshold) {
- online = false
- }
- return nil
- })
- if err != nil {
- return false, err
- }
- // Even if the heartbeat timestamp is not recent enough, let's try to
- // connect to the node, just in case the heartbeat is lagging behind
- // for some reason and the node is actually up.
- if !online && hasConnectivity(cert, address) {
- online = true
- }
-
- return online, nil
-}
-
// Purge removes a node entirely from the cluster database.
func Purge(cluster *db.Cluster, name string) error {
logger.Debugf("Remove node %s from the database", name)
diff --git a/lxd/daemon.go b/lxd/daemon.go
index 6a5cd40b93..26bca25d6a 100644
--- a/lxd/daemon.go
+++ b/lxd/daemon.go
@@ -1534,6 +1534,9 @@ func (d *Daemon) NodeRefreshTask(heartbeatData *cluster.APIHeartbeat) {
if isDegraded || voters < int(maxVoters) || standbys < int(maxStandBy) {
go func() {
+ // Wait a little bit, just to avoid spurious
+ // attempts due to nodes being shut down.
+ time.Sleep(5 * time.Second)
d.clusterMembershipMutex.Lock()
defer d.clusterMembershipMutex.Unlock()
err := rebalanceMemberRoles(d)
More information about the lxc-devel
mailing list