[lxc-devel] [lxd/master] Failure domains support

freeekanayaka on Github lxc-bot at linuxcontainers.org
Thu Jul 2 10:41:55 UTC 2020


A non-text attachment was scrubbed...
Name: not available
Type: text/x-mailbox
Size: 301 bytes
Desc: not available
URL: <http://lists.linuxcontainers.org/pipermail/lxc-devel/attachments/20200702/50f75de5/attachment.bin>
-------------- next part --------------
From 4eece934f4cefaff341d018bf1fedfb1b289e1c8 Mon Sep 17 00:00:00 2001
From: Free Ekanayaka <free.ekanayaka at canonical.com>
Date: Thu, 2 Jul 2020 10:00:44 +0200
Subject: [PATCH 01/10] lxd/db: Add failure_domains table and nodes column
 reference

Signed-off-by: Free Ekanayaka <free.ekanayaka at canonical.com>
---
 lxd/db/cluster/schema.go |  8 +++++++-
 lxd/db/cluster/update.go | 20 ++++++++++++++++++++
 2 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/lxd/db/cluster/schema.go b/lxd/db/cluster/schema.go
index 201ed4e1eb..fb36056864 100644
--- a/lxd/db/cluster/schema.go
+++ b/lxd/db/cluster/schema.go
@@ -20,6 +20,11 @@ CREATE TABLE config (
     value TEXT,
     UNIQUE (key)
 );
+CREATE TABLE failure_domains (
+    id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
+    name TEXT NOT NULL,
+    UNIQUE (name)
+);
 CREATE TABLE "images" (
     id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
     fingerprint TEXT NOT NULL,
@@ -311,6 +316,7 @@ CREATE TABLE nodes (
     heartbeat DATETIME DEFAULT CURRENT_TIMESTAMP,
     pending INTEGER NOT NULL DEFAULT 0,
     arch INTEGER NOT NULL DEFAULT 0 CHECK (arch > 0),
+    failure_domain_id INTEGER DEFAULT NULL REFERENCES failure_domains (id) ON DELETE SET NULL,
     UNIQUE (name),
     UNIQUE (address)
 );
@@ -565,5 +571,5 @@ CREATE TABLE storage_volumes_snapshots_config (
     UNIQUE (storage_volume_snapshot_id, key)
 );
 
-INSERT INTO schema (version, updated_at) VALUES (31, strftime("%s"))
+INSERT INTO schema (version, updated_at) VALUES (32, strftime("%s"))
 `
diff --git a/lxd/db/cluster/update.go b/lxd/db/cluster/update.go
index b6d5d4b1fe..758b569a96 100644
--- a/lxd/db/cluster/update.go
+++ b/lxd/db/cluster/update.go
@@ -68,6 +68,26 @@ var updates = map[int]schema.Update{
 	29: updateFromV28,
 	30: updateFromV29,
 	31: updateFromV30,
+	32: updateFromV31,
+}
+
+// Add failure_domains table and a failure_domain_id column to the nodes table.
+func updateFromV31(tx *sql.Tx) error {
+	stmts := `
+CREATE TABLE failure_domains (
+    id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
+    name TEXT NOT NULL,
+    UNIQUE (name)
+);
+ALTER TABLE nodes
+ ADD COLUMN failure_domain_id INTEGER DEFAULT NULL REFERENCES failure_domains (id) ON DELETE SET NULL;
+`
+	_, err := tx.Exec(stmts)
+	if err != nil {
+		return err
+	}
+
+	return nil
 }
 
 // Add content type field to storage volumes

From c945c182b16941212e9de3e3b502ff0e975d78eb Mon Sep 17 00:00:00 2001
From: Free Ekanayaka <free.ekanayaka at canonical.com>
Date: Thu, 2 Jul 2020 10:35:29 +0200
Subject: [PATCH 02/10] lxd/db: Add UpdateNodeFailureDomain() and
 GetNodesFailureDomains()

Signed-off-by: Free Ekanayaka <free.ekanayaka at canonical.com>
---
 lxd/db/node.go      | 128 +++++++++++++++++++++++++++++++++++++++++++-
 lxd/db/node_test.go |  20 +++++++
 2 files changed, 147 insertions(+), 1 deletion(-)

diff --git a/lxd/db/node.go b/lxd/db/node.go
index e0a2c30f41..ec4aad41d3 100644
--- a/lxd/db/node.go
+++ b/lxd/db/node.go
@@ -3,6 +3,7 @@
 package db
 
 import (
+	"database/sql"
 	"fmt"
 	"strconv"
 	"strings"
@@ -228,7 +229,7 @@ func (c *ClusterTx) RenameNode(old, new string) error {
 // Nodes returns all LXD nodes part of the cluster.
 func (c *ClusterTx) nodes(pending bool, where string, args ...interface{}) ([]NodeInfo, error) {
 	// Get node roles
-	sql := "SELECT node_id, role FROM nodes_roles;"
+	sql := "SELECT node_id, role FROM nodes_roles"
 
 	nodeRoles := map[int64][]string{}
 	rows, err := c.tx.Query(sql)
@@ -456,6 +457,131 @@ func (c *ClusterTx) UpdateNodeRoles(id int64, roles []ClusterRole) error {
 	return nil
 }
 
+// UpdateNodeFailureDomain changes the failure domain of a node.
+func (c *ClusterTx) UpdateNodeFailureDomain(id int64, domain string) error {
+	var domainID interface{}
+
+	if domain == "" {
+		domainID = nil
+	} else {
+		row := c.tx.QueryRow("SELECT id FROM failure_domains WHERE name=?", domain)
+		err := row.Scan(&domainID)
+		if err != nil {
+			if err != sql.ErrNoRows {
+				return errors.Wrapf(err, "Load failure domain name")
+			}
+			result, err := c.tx.Exec("INSERT INTO failure_domains (name) VALUES (?)", domain)
+			if err != nil {
+				return errors.Wrapf(err, "Create new failure domain")
+			}
+			domainID, err = result.LastInsertId()
+			if err != nil {
+				return errors.Wrapf(err, "Get last inserted ID")
+			}
+		}
+	}
+
+	result, err := c.tx.Exec("UPDATE nodes SET failure_domain_id=? WHERE id=?", domainID, id)
+	if err != nil {
+		return err
+	}
+	n, err := result.RowsAffected()
+	if err != nil {
+		return err
+	}
+	if n != 1 {
+		return fmt.Errorf("Query updated %d rows instead of 1", n)
+	}
+
+	return nil
+}
+
+// GetNodeFailureDomain returns the failure domain associated with the node with the given ID.
+func (c *ClusterTx) GetNodeFailureDomain(id int64) (string, error) {
+	stmt := `
+SELECT coalesce(failure_domains.name,'')
+  FROM nodes LEFT JOIN failure_domains ON nodes.failure_domain_id = failure_domains.id
+ WHERE nodes.id=?
+`
+	var domain string
+
+	err := c.tx.QueryRow(stmt, id).Scan(&domain)
+	if err != nil {
+		return "", err
+	}
+	return domain, nil
+}
+
+// GetNodesFailureDomains returns a map associating each node address with its
+// failure domain ID (0 if the node has no failure domain set).
+func (c *ClusterTx) GetNodesFailureDomains() (map[string]uint64, error) {
+	stmt, err := c.tx.Prepare("SELECT address, coalesce(failure_domain_id, 0) FROM nodes")
+	if err != nil {
+		return nil, err
+	}
+
+	rows := []struct {
+		Address         string
+		FailureDomainID int64
+	}{}
+
+	dest := func(i int) []interface{} {
+		rows = append(rows, struct {
+			Address         string
+			FailureDomainID int64
+		}{})
+		return []interface{}{&rows[len(rows)-1].Address, &rows[len(rows)-1].FailureDomainID}
+	}
+
+	err = query.SelectObjects(stmt, dest)
+	if err != nil {
+		return nil, err
+	}
+
+	domains := map[string]uint64{}
+
+	for _, row := range rows {
+		domains[row.Address] = uint64(row.FailureDomainID)
+	}
+
+	return domains, nil
+}
+
+// GetFailureDomainsNames returns a map associating failure domain IDs with
+// their names.
+func (c *ClusterTx) GetFailureDomainsNames() (map[uint64]string, error) {
+	stmt, err := c.tx.Prepare("SELECT id, name FROM failure_domains")
+	if err != nil {
+		return nil, err
+	}
+
+	rows := []struct {
+		ID   int64
+		Name string
+	}{}
+
+	dest := func(i int) []interface{} {
+		rows = append(rows, struct {
+			ID   int64
+			Name string
+		}{})
+		return []interface{}{&rows[len(rows)-1].ID, &rows[len(rows)-1].Name}
+	}
+
+	err = query.SelectObjects(stmt, dest)
+	if err != nil {
+		return nil, err
+	}
+
+	domains := map[uint64]string{}
+
+	for _, row := range rows {
+		domains[uint64(row.ID)] = row.Name
+	}
+
+	return domains, nil
+}
+
 // RemoveNode removes the node with the given id.
 func (c *ClusterTx) RemoveNode(id int64) error {
 	result, err := c.tx.Exec("DELETE FROM nodes WHERE id=?", id)
diff --git a/lxd/db/node_test.go b/lxd/db/node_test.go
index 99c1c47f34..866053255f 100644
--- a/lxd/db/node_test.go
+++ b/lxd/db/node_test.go
@@ -391,3 +391,23 @@ INSERT INTO instances (id, node_id, name, architecture, type, project_id) VALUES
 	require.NoError(t, err)
 	assert.Equal(t, "none", name)
 }
+
+func TestUpdateNodeFailureDomain(t *testing.T) {
+	tx, cleanup := db.NewTestClusterTx(t)
+	defer cleanup()
+
+	id, err := tx.CreateNode("buzz", "1.2.3.4:666")
+	require.NoError(t, err)
+
+	assert.NoError(t, tx.UpdateNodeFailureDomain(id, "foo"))
+
+	domains, err := tx.GetNodesFailureDomains()
+	require.NoError(t, err)
+	assert.Equal(t, map[string]uint64{"0.0.0.0": 0, "1.2.3.4:666": 1}, domains)
+
+	assert.NoError(t, tx.UpdateNodeFailureDomain(id, ""))
+
+	domains, err = tx.GetNodesFailureDomains()
+	require.NoError(t, err)
+	assert.Equal(t, map[string]uint64{"0.0.0.0": 0, "1.2.3.4:666": 0}, domains)
+}

From 197a19fc1d6895a839fda8d7b9f36a55bedd12ae Mon Sep 17 00:00:00 2001
From: Free Ekanayaka <free.ekanayaka at canonical.com>
Date: Thu, 2 Jul 2020 10:39:42 +0200
Subject: [PATCH 03/10] lxd/cluster: Honor failure domains when changing roles

Signed-off-by: Free Ekanayaka <free.ekanayaka at canonical.com>
---
 lxd/cluster/membership.go | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/lxd/cluster/membership.go b/lxd/cluster/membership.go
index 978d8877d1..135f3417cb 100644
--- a/lxd/cluster/membership.go
+++ b/lxd/cluster/membership.go
@@ -824,6 +824,8 @@ func Handover(state *state.State, gateway *Gateway, address string) (string, []d
 func newRolesChanges(state *state.State, gateway *Gateway, nodes []db.RaftNode) (*app.RolesChanges, error) {
 	var maxVoters int
 	var maxStandBy int
+	var domains map[string]uint64
+
 	err := state.Cluster.Transaction(func(tx *db.ClusterTx) error {
 		config, err := ConfigLoad(tx)
 		if err != nil {
@@ -831,6 +833,12 @@ func newRolesChanges(state *state.State, gateway *Gateway, nodes []db.RaftNode)
 		}
 		maxVoters = int(config.MaxVoters())
 		maxStandBy = int(config.MaxStandBy())
+
+		domains, err = tx.GetNodesFailureDomains()
+		if err != nil {
+			return errors.Wrap(err, "Load failure domains")
+		}
+
 		return nil
 	})
 	if err != nil {
@@ -841,7 +849,9 @@ func newRolesChanges(state *state.State, gateway *Gateway, nodes []db.RaftNode)
 
 	for _, node := range nodes {
 		if HasConnectivity(gateway.cert, node.Address) {
-			cluster[node] = &client.NodeMetadata{}
+			cluster[node] = &client.NodeMetadata{
+				FailureDomain: domains[node.Address],
+			}
 		} else {
 			cluster[node] = nil
 		}

From 68e5d4006f61446d56dd4592d524296365520954 Mon Sep 17 00:00:00 2001
From: Free Ekanayaka <free.ekanayaka at canonical.com>
Date: Thu, 2 Jul 2020 12:20:58 +0200
Subject: [PATCH 04/10] shared/version: Add clustering_failure_domains
 extension

Signed-off-by: Free Ekanayaka <free.ekanayaka at canonical.com>
---
 doc/api-extensions.md | 5 +++++
 shared/version/api.go | 1 +
 2 files changed, 6 insertions(+)

diff --git a/doc/api-extensions.md b/doc/api-extensions.md
index 9a34d7871e..ea699c5d7f 100644
--- a/doc/api-extensions.md
+++ b/doc/api-extensions.md
@@ -1093,3 +1093,8 @@ The 5 entities that have UsedBy are:
 
 This adds support for creating and attaching custom block volumes to instances.
 It introduces the new `--type` flag when creating custom storage volumes, and accepts the values `fs` and `block`.
+
+## clustering\_failure\_domains
+
+This extension adds a new `failure_domain` field to the `PUT /1.0/cluster/members/<node>` API,
+which can be used to set the failure domain of a node.
diff --git a/shared/version/api.go b/shared/version/api.go
index 6b5a9909a9..52744904a4 100644
--- a/shared/version/api.go
+++ b/shared/version/api.go
@@ -216,6 +216,7 @@ var APIExtensions = []string{
 	"network_state_bond_bridge",
 	"usedby_consistency",
 	"custom_block_volumes",
+	"clustering_failure_domains",
 }
 
 // APIExtensionsCount returns the number of available API extensions.

From 3ea751dde65009cf6576b45eb70db896fa79b544 Mon Sep 17 00:00:00 2001
From: Free Ekanayaka <free.ekanayaka at canonical.com>
Date: Thu, 2 Jul 2020 12:21:47 +0200
Subject: [PATCH 05/10] shared/api: Add FailureDomain field to ClusterMemberPut

Signed-off-by: Free Ekanayaka <free.ekanayaka at canonical.com>
---
 shared/api/cluster.go | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/shared/api/cluster.go b/shared/api/cluster.go
index 42808d98c6..96c8d60909 100644
--- a/shared/api/cluster.go
+++ b/shared/api/cluster.go
@@ -75,4 +75,7 @@ type ClusterMemberPut struct {
 
 	// API extension: clustering_architecture
 	Architecture string `json:"architecture" yaml:"architecture"`
+
+	// API extension: clustering_failure_domains
+	FailureDomain string `json:"failure_domain" yaml:"failure_domain"`
 }

From cb8b33f7b5a8fdc21757e0a1cdac3a6549c6306c Mon Sep 17 00:00:00 2001
From: Free Ekanayaka <free.ekanayaka at canonical.com>
Date: Thu, 2 Jul 2020 12:22:40 +0200
Subject: [PATCH 06/10] lxd/cluster: Populate FailureDomain field when listing
 cluster members

Signed-off-by: Free Ekanayaka <free.ekanayaka at canonical.com>
---
 lxd/cluster/membership.go | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/lxd/cluster/membership.go b/lxd/cluster/membership.go
index 135f3417cb..3c475deff3 100644
--- a/lxd/cluster/membership.go
+++ b/lxd/cluster/membership.go
@@ -898,16 +898,34 @@ func List(state *state.State, gateway *Gateway) ([]api.ClusterMember, error) {
 	var err error
 	var nodes []db.NodeInfo
 	var offlineThreshold time.Duration
+	domains := map[string]string{}
 
 	err = state.Cluster.Transaction(func(tx *db.ClusterTx) error {
 		nodes, err = tx.GetNodes()
 		if err != nil {
-			return err
+			return errors.Wrap(err, "Load nodes")
 		}
 
 		offlineThreshold, err = tx.GetNodeOfflineThreshold()
 		if err != nil {
-			return err
+			return errors.Wrap(err, "Load offline threshold config")
+		}
+
+		nodesDomains, err := tx.GetNodesFailureDomains()
+		if err != nil {
+			return errors.Wrap(err, "Load nodes failure domains")
+		}
+
+		domainsNames, err := tx.GetFailureDomainsNames()
+		if err != nil {
+			return errors.Wrap(err, "Load failure domains names")
+		}
+
+		for _, node := range nodes {
+			domainID := nodesDomains[node.Address]
+			if domainID != 0 {
+				domains[node.Address] = domainsNames[domainID]
+			}
 		}
 
 		return nil
@@ -956,6 +974,7 @@ func List(state *state.State, gateway *Gateway) ([]api.ClusterMember, error) {
 		if err != nil {
 			return nil, err
 		}
+		result[i].FailureDomain = domains[node.Address]
 
 		if node.IsOffline(offlineThreshold) {
 			result[i].Status = "Offline"

From f44648765fb1271b3dc9955919473cf15af68e60 Mon Sep 17 00:00:00 2001
From: Free Ekanayaka <free.ekanayaka at canonical.com>
Date: Thu, 2 Jul 2020 12:23:22 +0200
Subject: [PATCH 07/10] lxd: Support changing failure domain in PUT
 /1.0/cluster/<node>

Signed-off-by: Free Ekanayaka <free.ekanayaka at canonical.com>
---
 client/lxd_cluster.go |  5 +++++
 lxd/api_cluster.go    | 21 ++++++++++++++++++---
 2 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/client/lxd_cluster.go b/client/lxd_cluster.go
index 7922ce1149..f67940b64f 100644
--- a/client/lxd_cluster.go
+++ b/client/lxd_cluster.go
@@ -113,6 +113,11 @@ func (r *ProtocolLXD) UpdateClusterMember(name string, member api.ClusterMemberP
 	if !r.HasExtension("clustering_edit_roles") {
 		return fmt.Errorf("The server is missing the required \"clustering_edit_roles\" API extension")
 	}
+	if member.FailureDomain != "" {
+		if !r.HasExtension("clustering_failure_domains") {
+			return fmt.Errorf("The server is missing the required \"clustering_failure_domains\" API extension")
+		}
+	}
 
 	// Send the request
 	_, _, err := r.query("PUT", fmt.Sprintf("/cluster/members/%s", name), member, ETag)
diff --git a/lxd/api_cluster.go b/lxd/api_cluster.go
index 4632cb8dbc..d8cfb0a74c 100644
--- a/lxd/api_cluster.go
+++ b/lxd/api_cluster.go
@@ -887,11 +887,17 @@ func clusterNodePut(d *Daemon, r *http.Request) response.Response {
 
 	// Find the requested one.
 	var current db.NodeInfo
+	var currentFailureDomain string
 	var err error
 	err = d.cluster.Transaction(func(tx *db.ClusterTx) error {
 		current, err = tx.GetNodeByName(name)
 		if err != nil {
-			return err
+			return errors.Wrap(err, "Load current node state")
+		}
+
+		currentFailureDomain, err = tx.GetNodeFailureDomain(current.ID)
+		if err != nil {
+			return errors.Wrap(err, "Load current failure domain")
 		}
 
 		return nil
@@ -901,7 +907,11 @@ func clusterNodePut(d *Daemon, r *http.Request) response.Response {
 	}
 
 	// Validate the request is fine
-	err = util.EtagCheck(r, current.Roles)
+	etag := []interface{}{
+		current.Roles,
+		currentFailureDomain,
+	}
+	err = util.EtagCheck(r, etag)
 	if err != nil {
 		return response.PreconditionFailed(err)
 	}
@@ -932,7 +942,12 @@ func clusterNodePut(d *Daemon, r *http.Request) response.Response {
 
 		err := tx.UpdateNodeRoles(current.ID, dbRoles)
 		if err != nil {
-			return err
+			return errors.Wrap(err, "Update roles")
+		}
+
+		err = tx.UpdateNodeFailureDomain(current.ID, req.FailureDomain)
+		if err != nil {
+			return errors.Wrap(err, "Update failure domain")
 		}
 
 		return nil

From ebd2459a5b8244546ef597496288b62b5efe0d5f Mon Sep 17 00:00:00 2001
From: Free Ekanayaka <free.ekanayaka at canonical.com>
Date: Thu, 2 Jul 2020 12:23:49 +0200
Subject: [PATCH 08/10] test: Add new clustering_failure_domains test case

Signed-off-by: Free Ekanayaka <free.ekanayaka at canonical.com>
---
 test/main.sh              |  1 +
 test/suites/clustering.sh | 91 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 92 insertions(+)

diff --git a/test/main.sh b/test/main.sh
index f51a0ee0eb..364b203af2 100755
--- a/test/main.sh
+++ b/test/main.sh
@@ -182,6 +182,7 @@ run_test test_clustering_recover "clustering recovery"
 run_test test_clustering_handover "clustering handover"
 run_test test_clustering_rebalance "clustering rebalance"
 run_test test_clustering_remove_raft_node "custering remove raft node"
+run_test test_clustering_failure_domains "failure domains"
 # run_test test_clustering_upgrade "clustering upgrade"
 run_test test_projects_default "default project"
 run_test test_projects_crud "projects CRUD operations"
diff --git a/test/suites/clustering.sh b/test/suites/clustering.sh
index 49631e8d8f..240eee66d9 100644
--- a/test/suites/clustering.sh
+++ b/test/suites/clustering.sh
@@ -1901,3 +1901,94 @@ test_clustering_remove_raft_node() {
   kill_lxd "${LXD_THREE_DIR}"
   kill_lxd "${LXD_FOUR_DIR}"
 }
+
+test_clustering_failure_domains() {
+  # shellcheck disable=2039
+  local LXD_DIR
+
+  setup_clustering_bridge
+  prefix="lxd$$"
+  bridge="${prefix}"
+
+  setup_clustering_netns 1
+  LXD_ONE_DIR=$(mktemp -d -p "${TEST_DIR}" XXX)
+  chmod +x "${LXD_ONE_DIR}"
+  ns1="${prefix}1"
+  spawn_lxd_and_bootstrap_cluster "${ns1}" "${bridge}" "${LXD_ONE_DIR}"
+
+  # Add a newline at the end of each line. YAML has weird rules.
+  cert=$(sed ':a;N;$!ba;s/\n/\n\n/g' "${LXD_ONE_DIR}/server.crt")
+
+  # Spawn a second node
+  setup_clustering_netns 2
+  LXD_TWO_DIR=$(mktemp -d -p "${TEST_DIR}" XXX)
+  chmod +x "${LXD_TWO_DIR}"
+  ns2="${prefix}2"
+  spawn_lxd_and_join_cluster "${ns2}" "${bridge}" "${cert}" 2 1 "${LXD_TWO_DIR}"
+
+  # Spawn a third node, using the non-leader node2 as join target.
+  setup_clustering_netns 3
+  LXD_THREE_DIR=$(mktemp -d -p "${TEST_DIR}" XXX)
+  chmod +x "${LXD_THREE_DIR}"
+  ns3="${prefix}3"
+  spawn_lxd_and_join_cluster "${ns3}" "${bridge}" "${cert}" 3 2 "${LXD_THREE_DIR}"
+
+  # Spawn a fourth node, this will be a non-database node.
+  setup_clustering_netns 4
+  LXD_FOUR_DIR=$(mktemp -d -p "${TEST_DIR}" XXX)
+  chmod +x "${LXD_FOUR_DIR}"
+  ns4="${prefix}4"
+  spawn_lxd_and_join_cluster "${ns4}" "${bridge}" "${cert}" 4 1 "${LXD_FOUR_DIR}"
+
+  # Spawn a fifth node, using non-database node4 as join target.
+  setup_clustering_netns 5
+  LXD_FIVE_DIR=$(mktemp -d -p "${TEST_DIR}" XXX)
+  chmod +x "${LXD_FIVE_DIR}"
+  ns5="${prefix}5"
+  spawn_lxd_and_join_cluster "${ns5}" "${bridge}" "${cert}" 5 4 "${LXD_FIVE_DIR}"
+
+  # Spawn a sixth node, using non-database node4 as join target.
+  setup_clustering_netns 6
+  LXD_SIX_DIR=$(mktemp -d -p "${TEST_DIR}" XXX)
+  chmod +x "${LXD_SIX_DIR}"
+  ns6="${prefix}6"
+  spawn_lxd_and_join_cluster "${ns6}" "${bridge}" "${cert}" 6 4 "${LXD_SIX_DIR}"
+
+  # Set failure domains
+  echo -e "roles: [\"database\"]\nfailure_domain: \"az1\"" | LXD_DIR="${LXD_THREE_DIR}" lxc cluster edit node1
+  echo -e "roles: [\"database\"]\nfailure_domain: \"az2\"" | LXD_DIR="${LXD_THREE_DIR}" lxc cluster edit node2
+  echo -e "roles: [\"database\"]\nfailure_domain: \"az3\"" | LXD_DIR="${LXD_THREE_DIR}" lxc cluster edit node3
+  echo -e "roles: []\nfailure_domain: \"az1\"" | LXD_DIR="${LXD_THREE_DIR}" lxc cluster edit node4
+  echo -e "roles: []\nfailure_domain: \"az2\"" | LXD_DIR="${LXD_THREE_DIR}" lxc cluster edit node5
+  echo -e "roles: []\nfailure_domain: \"az3\"" | LXD_DIR="${LXD_THREE_DIR}" lxc cluster edit node6
+
+  # Shut down a node in az2; its replacement is picked from az2.
+  LXD_DIR="${LXD_TWO_DIR}" lxd shutdown
+  sleep 3
+
+  LXD_DIR="${LXD_ONE_DIR}" lxc cluster show node2 | grep -q "database: false"
+  LXD_DIR="${LXD_ONE_DIR}" lxc cluster show node5 | grep -q "database: true"
+
+  LXD_DIR="${LXD_SIX_DIR}" lxd shutdown
+  LXD_DIR="${LXD_FIVE_DIR}" lxd shutdown
+  LXD_DIR="${LXD_FOUR_DIR}" lxd shutdown
+  LXD_DIR="${LXD_THREE_DIR}" lxd shutdown
+  LXD_DIR="${LXD_ONE_DIR}" lxd shutdown
+  sleep 0.5
+  rm -f "${LXD_SIX_DIR}/unix.socket"
+  rm -f "${LXD_FIVE_DIR}/unix.socket"
+  rm -f "${LXD_FOUR_DIR}/unix.socket"
+  rm -f "${LXD_THREE_DIR}/unix.socket"
+  rm -f "${LXD_TWO_DIR}/unix.socket"
+  rm -f "${LXD_ONE_DIR}/unix.socket"
+
+  teardown_clustering_netns
+  teardown_clustering_bridge
+
+  kill_lxd "${LXD_ONE_DIR}"
+  kill_lxd "${LXD_TWO_DIR}"
+  kill_lxd "${LXD_THREE_DIR}"
+  kill_lxd "${LXD_FOUR_DIR}"
+  kill_lxd "${LXD_FIVE_DIR}"
+  kill_lxd "${LXD_SIX_DIR}"
+}

From a0120ab3234bd80280356e8c6487df8b8dc026d1 Mon Sep 17 00:00:00 2001
From: Free Ekanayaka <free.ekanayaka at canonical.com>
Date: Thu, 2 Jul 2020 12:27:34 +0200
Subject: [PATCH 09/10] doc: Add documentation about failure domains

Signed-off-by: Free Ekanayaka <free.ekanayaka at canonical.com>
---
 doc/clustering.md | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/doc/clustering.md b/doc/clustering.md
index cb90f4ed55..e7137350a1 100644
--- a/doc/clustering.md
+++ b/doc/clustering.md
@@ -217,6 +217,17 @@ transition to the Blocked state, until you upgrade the very last
 one. At that point the blocked nodes will notice that there is no
 out-of-date node left and will become operational again.
 
+### Failure domains
+
+Failure domains can be used to indicate which nodes should be given preference
+when trying to assign roles to a cluster member that has been shut down or has
+crashed. For example, if a cluster member that currently has the database role
+gets shut down, LXD will try to assign its database role to another cluster
+member in the same failure domain, if one is available.
+
+To change the failure domain of a cluster member you can use the `lxc cluster
edit <member>` command line tool, or the `PUT /1.0/cluster/members/<member>` REST API.
+
 ### Recover from quorum loss
 
 Every LXD cluster has up to 3 members that serve as database nodes. If you

From b6d795a3555960e8d2871480b72485dbfa3ffa33 Mon Sep 17 00:00:00 2001
From: Free Ekanayaka <free.ekanayaka at canonical.com>
Date: Thu, 2 Jul 2020 12:33:42 +0200
Subject: [PATCH 10/10] lxc: Add failure domain column in "lxc cluster list"
 output

Signed-off-by: Free Ekanayaka <free.ekanayaka at canonical.com>
---
 lxc/cluster.go | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lxc/cluster.go b/lxc/cluster.go
index eacd4ae36f..9d605990ce 100644
--- a/lxc/cluster.go
+++ b/lxc/cluster.go
@@ -123,7 +123,7 @@ func (c *cmdClusterList) Run(cmd *cobra.Command, args []string) error {
 		if member.Database {
 			database = "YES"
 		}
-		line := []string{member.ServerName, member.URL, database, strings.ToUpper(member.Status), member.Message, member.Architecture}
+		line := []string{member.ServerName, member.URL, database, strings.ToUpper(member.Status), member.Message, member.Architecture, member.FailureDomain}
 		data = append(data, line)
 	}
 	sort.Sort(byName(data))
@@ -135,6 +135,7 @@ func (c *cmdClusterList) Run(cmd *cobra.Command, args []string) error {
 		i18n.G("STATE"),
 		i18n.G("MESSAGE"),
 		i18n.G("ARCHITECTURE"),
+		i18n.G("FAILURE DOMAIN"),
 	}
 
 	return utils.RenderTable(c.flagFormat, header, data, members)


More information about the lxc-devel mailing list