[lxc-devel] [lxd/master] cluster: Don't upgrade nodes without raft role concurrently

freeekanayaka on Github lxc-bot at linuxcontainers.org
Mon Aug 10 17:57:32 UTC 2020


A non-text attachment was scrubbed...
Name: not available
Type: text/x-mailbox
Size: 748 bytes
Desc: not available
URL: <http://lists.linuxcontainers.org/pipermail/lxc-devel/attachments/20200810/cd1be260/attachment.bin>
-------------- next part --------------
From 3f0453de9ab415417bbc9896a594880d935e5b1c Mon Sep 17 00:00:00 2001
From: Free Ekanayaka <free.ekanayaka at canonical.com>
Date: Mon, 10 Aug 2020 13:45:01 +0200
Subject: [PATCH] cluster: Don't upgrade nodes without raft role concurrently

If there are other raft-related changes in progress, trying to upgrade a node
without raft configuration (typically when upgrading a pre-3.20 cluster to
>=3.20) might race with other cluster changes (e.g. node removal).

This change makes sure that we perform such upgrades serially, by acquiring the
same Daemon.clusterMembershipMutex lock used for other membership operations.

Signed-off-by: Free Ekanayaka <free.ekanayaka at canonical.com>
---
 lxd/api_cluster.go                 | 20 ++++++++++++++++++++
 lxd/cluster/heartbeat.go           |  9 ---------
 lxd/cluster/upgrade.go             | 18 +++++++++++++-----
 lxd/cluster/upgrade_export_test.go |  7 -------
 lxd/cluster/upgrade_test.go        |  2 +-
 lxd/daemon.go                      | 14 ++++++++++++++
 6 files changed, 48 insertions(+), 22 deletions(-)
 delete mode 100644 lxd/cluster/upgrade_export_test.go

diff --git a/lxd/api_cluster.go b/lxd/api_cluster.go
index 0416cc872f..81cdd258eb 100644
--- a/lxd/api_cluster.go
+++ b/lxd/api_cluster.go
@@ -1250,6 +1250,26 @@ again:
 	goto again
 }
 
+// Check if there are nodes not part of the raft configuration and add them in
+// case.
+func upgradeNodesWithoutRaftRole(d *Daemon) error {
+	var allNodes []db.NodeInfo
+	err := d.cluster.Transaction(func(tx *db.ClusterTx) error {
+		var err error
+		allNodes, err = tx.GetNodes()
+		if err != nil {
+			return err
+		}
+
+		return nil
+	})
+	if err != nil {
+		return errors.Wrap(err, "Failed to get current cluster nodes")
+
+	}
+	return cluster.UpgradeMembersWithoutRole(d.gateway, allNodes)
+}
+
 // Post a change role request to the member with the given address. The nodes
 // slice contains details about all members, including the one being changed.
 func changeMemberRole(d *Daemon, address string, nodes []db.RaftNode) error {
diff --git a/lxd/cluster/heartbeat.go b/lxd/cluster/heartbeat.go
index 5c7474ce32..41ad468712 100644
--- a/lxd/cluster/heartbeat.go
+++ b/lxd/cluster/heartbeat.go
@@ -245,15 +245,6 @@ func (g *Gateway) heartbeat(ctx context.Context, initialHeartbeat bool) {
 		return
 	}
 
-	if len(allNodes) != len(raftNodes) {
-		logger.Infof("Upgrading %d nodes not part of raft configuration", len(allNodes)-len(raftNodes))
-		err = upgradeMembersWithoutRole(g, allNodes, raftNodes)
-		if err != nil {
-			logger.Warnf("Failed upgrade raft roles: %v", err)
-			return
-		}
-	}
-
 	// Cumulative set of node states (will be written back to database once done).
 	hbState := &APIHeartbeat{}
 
diff --git a/lxd/cluster/upgrade.go b/lxd/cluster/upgrade.go
index 4625da0174..860fbda234 100644
--- a/lxd/cluster/upgrade.go
+++ b/lxd/cluster/upgrade.go
@@ -121,10 +121,18 @@ func triggerUpdate() error {
 	return nil
 }
 
-// This function assigns the Spare raft role to all cluster members that are
-// not currently part of the raft configuration. It's used for upgrading a
-// cluster from a version without roles support.
-func upgradeMembersWithoutRole(gateway *Gateway, members []db.NodeInfo, nodes []db.RaftNode) error {
+// UpgradeMembersWithoutRole assigns the Spare raft role to all cluster members
+// that are not currently part of the raft configuration. It's used for
+// upgrading a cluster from a version without roles support.
+func UpgradeMembersWithoutRole(gateway *Gateway, members []db.NodeInfo) error {
+	nodes, err := gateway.currentRaftNodes()
+	if err == ErrNotLeader {
+		return nil
+	}
+	if err != nil {
+		return errors.Wrap(err, "Failed to get current raft nodes")
+	}
+
 	// Used raft IDs.
 	ids := map[uint64]bool{}
 	for _, node := range nodes {
@@ -138,7 +146,7 @@ func upgradeMembersWithoutRole(gateway *Gateway, members []db.NodeInfo, nodes []
 	defer client.Close()
 
 	// Check that each member is present in the raft configuration, and add
-	// if not.
+	// it if not.
 	for _, member := range members {
 		found := false
 		for _, node := range nodes {
diff --git a/lxd/cluster/upgrade_export_test.go b/lxd/cluster/upgrade_export_test.go
deleted file mode 100644
index bed9f4e2a2..0000000000
--- a/lxd/cluster/upgrade_export_test.go
+++ /dev/null
@@ -1,7 +0,0 @@
-package cluster
-
-import "github.com/lxc/lxd/lxd/db"
-
-func UpgradeMembersWithoutRole(gateway *Gateway, members []db.NodeInfo, nodes []db.RaftNode) error {
-	return upgradeMembersWithoutRole(gateway, members, nodes)
-}
diff --git a/lxd/cluster/upgrade_test.go b/lxd/cluster/upgrade_test.go
index b5500f7bbf..b00d5f7332 100644
--- a/lxd/cluster/upgrade_test.go
+++ b/lxd/cluster/upgrade_test.go
@@ -160,7 +160,7 @@ func TestUpgradeMembersWithoutRole(t *testing.T) {
 	nodes, err := gateway.RaftNodes()
 	require.NoError(t, err)
 
-	err = cluster.UpgradeMembersWithoutRole(gateway, members, nodes)
+	err = cluster.UpgradeMembersWithoutRole(gateway, members)
 	require.NoError(t, err)
 
 	// The members have been added to the raft configuration.
diff --git a/lxd/daemon.go b/lxd/daemon.go
index ff4307e974..90131cdf02 100644
--- a/lxd/daemon.go
+++ b/lxd/daemon.go
@@ -1557,6 +1557,7 @@ func (d *Daemon) NodeRefreshTask(heartbeatData *cluster.APIHeartbeat) {
 	}
 
 	isDegraded := false
+	hasNodesNotPartOfRaft := false
 	voters := 0
 	standbys := 0
 
@@ -1579,6 +1580,9 @@ func (d *Daemon) NodeRefreshTask(heartbeatData *cluster.APIHeartbeat) {
 			case db.RaftStandBy:
 				standbys++
 			}
+			if node.RaftID == 0 {
+				hasNodesNotPartOfRaft = true
+			}
 
 		}
 
@@ -1633,5 +1637,15 @@ func (d *Daemon) NodeRefreshTask(heartbeatData *cluster.APIHeartbeat) {
 				}
 			}()
 		}
+		if hasNodesNotPartOfRaft {
+			go func() {
+				d.clusterMembershipMutex.Lock()
+				defer d.clusterMembershipMutex.Unlock()
+				err := upgradeNodesWithoutRaftRole(d)
+				if err != nil && errors.Cause(err) != cluster.ErrNotLeader {
+					logger.Warnf("Failed upgrade raft roles: %v", err)
+				}
+			}()
+		}
 	}
 }


More information about the lxc-devel mailing list