[lxc-devel] [lxd/master] Cluster recover command

freeekanayaka on Github lxc-bot at linuxcontainers.org
Mon Nov 4 11:37:18 UTC 2019


From b56db39b78f89678761b9fc43044557b38120669 Mon Sep 17 00:00:00 2001
From: Free Ekanayaka <free.ekanayaka at canonical.com>
Date: Mon, 4 Nov 2019 09:50:23 +0100
Subject: [PATCH 1/4] lxd/cluster: add Recover() and ListDatabaseNodes()
 utilities

Signed-off-by: Free Ekanayaka <free.ekanayaka at canonical.com>
---
 lxd/cluster/recover.go | 84 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 84 insertions(+)
 create mode 100644 lxd/cluster/recover.go

diff --git a/lxd/cluster/recover.go b/lxd/cluster/recover.go
new file mode 100644
index 0000000000..32d6575d63
--- /dev/null
+++ b/lxd/cluster/recover.go
@@ -0,0 +1,84 @@
+package cluster
+
+import (
+	"fmt"
+	"path/filepath"
+
+	dqlite "github.com/canonical/go-dqlite"
+	"github.com/lxc/lxd/lxd/db"
+	"github.com/lxc/lxd/lxd/node"
+	"github.com/pkg/errors"
+)
+
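+// ListDatabaseNodes returns the addresses of the cluster members that are
+// currently recorded as database (raft) nodes in the local node database.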
+func ListDatabaseNodes(database *db.Node) ([]string, error) {
+	nodes := []db.RaftNode{}
+	err := database.Transaction(func(tx *db.NodeTx) error {
+		var err error
+		nodes, err = tx.RaftNodes()
+		return err
+	})
+	if err != nil {
+		return nil, errors.Wrapf(err, "Failed to list database nodes")
+	}
+	addresses := make([]string, len(nodes))
+	for i, node := range nodes {
+		addresses[i] = node.Address
+	}
+	return addresses, nil
+}
+
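+// Recover rewrites the raft configuration of this node so that it becomes the
+// only database (raft) member of the cluster. It is meant to be run while the
+// daemon is stopped, after the other database members have been permanently lost.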
+func Recover(database *db.Node) error {
+	// Figure out if we actually act as a dqlite node.
+	var info *db.RaftNode
+	err := database.Transaction(func(tx *db.NodeTx) error {
+		var err error
+		info, err = node.DetermineRaftNode(tx)
+		return err
+	})
+	if err != nil {
+		return errors.Wrap(err, "Failed to determine node role.")
+	}
+
+	// If we're not a database node, return an error.
+	if info == nil {
+		return fmt.Errorf("This LXD instance has no database role.")
+	}
+
+	// If this is a standalone node not exposed to the network, return an
+	// error.
+	if info.Address == "" {
+		return fmt.Errorf("This LXD instance is not clustered.")
+	}
+
+	dir := filepath.Join(database.Dir(), "global")
+	server, err := dqlite.New(
+		uint64(info.ID),
+		info.Address,
+		dir,
+	)
+	if err != nil {
+		return errors.Wrap(err, "Failed to create dqlite server")
+	}
+
+	cluster := []dqlite.NodeInfo{
+		{ID: uint64(info.ID), Address: info.Address},
+	}
+
+	err = server.Recover(cluster)
+	if err != nil {
+		return errors.Wrap(err, "Failed to recover database state")
+	}
+
+	// Update the list of raft nodes.
+	err = database.Transaction(func(tx *db.NodeTx) error {
+		nodes := []db.RaftNode{
+			{ID: info.ID, Address: info.Address},
+		}
+		return tx.RaftNodesReplace(nodes)
+	})
+	if err != nil {
+		return errors.Wrap(err, "Failed to update database nodes.")
+	}
+
+	return nil
+}
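
For orientation: a minimal, hypothetical caller of the two helpers above could
look like the sketch below. It simply mirrors what the new "lxd cluster"
sub-commands in the next patch do (open the node-local database, list the
recorded raft members, then rewrite the raft configuration). It is illustrative
only, assumes the daemon is stopped, and is not part of the patch itself.

    package main

    import (
    	"fmt"
    	"log"
    	"path/filepath"

    	"github.com/lxc/lxd/lxd/cluster"
    	"github.com/lxc/lxd/lxd/db"
    	"github.com/lxc/lxd/lxd/sys"
    )

    func main() {
    	// Open the node-local database under the LXD var directory.
    	state := sys.DefaultOS()
    	nodeDB, _, err := db.OpenNode(filepath.Join(state.VarDir, "database"), nil, nil)
    	if err != nil {
    		log.Fatal(err)
    	}

    	// Print the addresses currently recorded as database (raft) members.
    	addresses, err := cluster.ListDatabaseNodes(nodeDB)
    	if err != nil {
    		log.Fatal(err)
    	}
    	for _, address := range addresses {
    		fmt.Println(address)
    	}

    	// Rewrite the raft configuration so this node is the only member.
    	// Only safe when the daemon is stopped and the other database
    	// members are permanently gone.
    	if err := cluster.Recover(nodeDB); err != nil {
    		log.Fatal(err)
    	}
    }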

From c3c7038ab12ce6037cb523bf51a52c46344020d0 Mon Sep 17 00:00:00 2001
From: Free Ekanayaka <free.ekanayaka at canonical.com>
Date: Mon, 4 Nov 2019 11:48:29 +0100
Subject: [PATCH 2/4] Add new "lxd cluster" sub-command

Signed-off-by: Free Ekanayaka <free.ekanayaka at canonical.com>
---
 lxd/main.go         |   4 ++
 lxd/main_cluster.go | 158 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 162 insertions(+)
 create mode 100644 lxd/main_cluster.go

diff --git a/lxd/main.go b/lxd/main.go
index 72531e1f22..cf6b292cbb 100644
--- a/lxd/main.go
+++ b/lxd/main.go
@@ -177,6 +177,10 @@ func main() {
 	waitreadyCmd := cmdWaitready{global: &globalCmd}
 	app.AddCommand(waitreadyCmd.Command())
 
+	// cluster sub-command
+	clusterCmd := cmdCluster{global: &globalCmd}
+	app.AddCommand(clusterCmd.Command())
+
 	// Run the main command and handle errors
 	err := app.Execute()
 	if err != nil {
diff --git a/lxd/main_cluster.go b/lxd/main_cluster.go
new file mode 100644
index 0000000000..a3914409b7
--- /dev/null
+++ b/lxd/main_cluster.go
@@ -0,0 +1,158 @@
+package main
+
+import (
+	"bufio"
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+
+	lxd "github.com/lxc/lxd/client"
+	"github.com/lxc/lxd/lxd/cluster"
+	"github.com/lxc/lxd/lxd/db"
+	"github.com/lxc/lxd/lxd/sys"
+	"github.com/lxc/lxd/shared"
+	"github.com/lxc/lxd/shared/i18n"
+	"github.com/olekukonko/tablewriter"
+	"github.com/pkg/errors"
+	"github.com/spf13/cobra"
+)
+
+type cmdCluster struct {
+	global *cmdGlobal
+}
+
+func (c *cmdCluster) Command() *cobra.Command {
+	cmd := &cobra.Command{}
+	cmd.Use = "cluster"
+	cmd.Short = "Cluster administration commands"
+	cmd.Long = `Description:
+  Low-level administration tools for LXD clusters.
+`
+	// List database nodes
+	listDatabaseNodes := cmdClusterListDatabaseNodes{global: c.global}
+	cmd.AddCommand(listDatabaseNodes.Command())
+
+	// Recover
+	recover := cmdClusterRecover{global: c.global}
+	cmd.AddCommand(recover.Command())
+
+	return cmd
+}
+
+type cmdClusterListDatabaseNodes struct {
+	global *cmdGlobal
+}
+
+func (c *cmdClusterListDatabaseNodes) Command() *cobra.Command {
+	cmd := &cobra.Command{}
+	cmd.Use = i18n.G("list-database-nodes")
+	cmd.Aliases = []string{"ls"}
+	cmd.Short = i18n.G("Print the addresses of the cluster members serving as database nodes")
+
+	cmd.RunE = c.Run
+
+	return cmd
+}
+
+func (c *cmdClusterListDatabaseNodes) Run(cmd *cobra.Command, args []string) error {
+	os := sys.DefaultOS()
+
+	db, _, err := db.OpenNode(filepath.Join(os.VarDir, "database"), nil, nil)
+	if err != nil {
+		return errors.Wrapf(err, "Failed to open local database.")
+	}
+
+	addresses, err := cluster.ListDatabaseNodes(db)
+	if err != nil {
+		return errors.Wrapf(err, "Failed to get database nodes.")
+	}
+
+	printDatabaseNodes(addresses)
+
+	return nil
+}
+
+func printDatabaseNodes(addresses []string) {
+	table := tablewriter.NewWriter(os.Stdout)
+	table.SetAlignment(tablewriter.ALIGN_LEFT)
+	table.SetAutoWrapText(false)
+	table.SetAutoFormatHeaders(false)
+	table.SetHeader([]string{"Address"})
+	for _, address := range addresses {
+		data := []string{address}
+		table.Append(data)
+	}
+	table.Render()
+}
+
+type cmdClusterRecover struct {
+	global             *cmdGlobal
+	flagNonInteractive bool
+}
+
+func (c *cmdClusterRecover) Command() *cobra.Command {
+	cmd := &cobra.Command{}
+	cmd.Use = i18n.G("recover")
+	cmd.Short = i18n.G("Recover a LXD instance whose cluster has lost quorum")
+
+	cmd.RunE = c.Run
+
+	cmd.Flags().BoolVarP(&c.flagNonInteractive, "quiet", "q", false, i18n.G("Don't require user confirmation"))
+
+	return cmd
+}
+
+func (c *cmdClusterRecover) Run(cmd *cobra.Command, args []string) error {
+	// Make sure that the daemon is not running.
+	_, err := lxd.ConnectLXDUnix("", nil)
+	if err == nil {
+		return fmt.Errorf("The LXD daemon is running, please stop it first.")
+	}
+
+	// Prompt for confirmation unless --quiet was passed.
+	if !c.flagNonInteractive {
+		err := c.promptConfirmation()
+		if err != nil {
+			return err
+		}
+	}
+
+	os := sys.DefaultOS()
+
+	db, _, err := db.OpenNode(filepath.Join(os.VarDir, "database"), nil, nil)
+	if err != nil {
+		return errors.Wrapf(err, "Failed to open local database.")
+	}
+
+	return cluster.Recover(db)
+}
+
+func (c *cmdClusterRecover) promptConfirmation() error {
+	reader := bufio.NewReader(os.Stdin)
+	fmt.Printf(i18n.G(`You should run this command only if you are *absolutely* certain that this is
+the only database node left in your cluster AND that other database nodes will
+never come back (i.e. their LXD daemon won't ever be started again).
+
+This will make this LXD instance the only member of the cluster, and it won't
+be possible to perform operations on former cluster members anymore.
+
+However, all information about former cluster members will be preserved in the
+database, so you can possibly inspect it for further recovery.
+
+You'll be able to permanently delete from the database all information about
+former cluster members by running "lxc cluster remove <member-name> --force".
+
+See https://lxd.readthedocs.io/en/latest/clustering/#disaster-recovery for more
+info.
+
+Do you want to proceed? (yes/no): `))
+	input, _ := reader.ReadString('\n')
+	input = strings.TrimSuffix(input, "\n")
+
+	if !shared.StringInSlice(strings.ToLower(input), []string{i18n.G("yes")}) {
+		return fmt.Errorf(i18n.G("User aborted recovery operation"))
+	}
+	return nil
+}

From 91fc30eaec1becde05b1f73e15b69e95a3dfc3c2 Mon Sep 17 00:00:00 2001
From: Free Ekanayaka <free.ekanayaka at canonical.com>
Date: Mon, 4 Nov 2019 11:48:50 +0100
Subject: [PATCH 3/4] Add clustering_recover integration test

Signed-off-by: Free Ekanayaka <free.ekanayaka at canonical.com>
---
 test/suites/clustering.sh | 67 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)

diff --git a/test/suites/clustering.sh b/test/suites/clustering.sh
index 02036f05c3..a3e5724397 100644
--- a/test/suites/clustering.sh
+++ b/test/suites/clustering.sh
@@ -1377,3 +1377,70 @@ test_clustering_dns() {
   ip link delete "${prefix}1"
   ip link delete "${prefix}2"
 }
+
+test_clustering_recover() {
+  # shellcheck disable=2039
+  local LXD_DIR
+
+  setup_clustering_bridge
+  prefix="lxd$$"
+  bridge="${prefix}"
+
+  setup_clustering_netns 1
+  LXD_ONE_DIR=$(mktemp -d -p "${TEST_DIR}" XXX)
+  chmod +x "${LXD_ONE_DIR}"
+  ns1="${prefix}1"
+  spawn_lxd_and_bootstrap_cluster "${ns1}" "${bridge}" "${LXD_ONE_DIR}"
+
+  # Add a newline at the end of each line. YAML has weird rules.
+  cert=$(sed ':a;N;$!ba;s/\n/\n\n/g' "${LXD_ONE_DIR}/server.crt")
+
+  # Spawn a second node
+  setup_clustering_netns 2
+  LXD_TWO_DIR=$(mktemp -d -p "${TEST_DIR}" XXX)
+  chmod +x "${LXD_TWO_DIR}"
+  ns2="${prefix}2"
+  spawn_lxd_and_join_cluster "${ns2}" "${bridge}" "${cert}" 2 1 "${LXD_TWO_DIR}"
+
+  # Check the current database nodes
+  LXD_DIR="${LXD_ONE_DIR}" lxd cluster list-database-nodes | grep -q "10.1.1.101:8443"
+  LXD_DIR="${LXD_ONE_DIR}" lxd cluster list-database-nodes | grep -q "10.1.1.102:8443"
+
+  # Create a test project, just to insert something in the database.
+  LXD_DIR="${LXD_ONE_DIR}" lxc project create p1
+
+  # Trying to recover a running daemon results in an error.
+  ! LXD_DIR="${LXD_ONE_DIR}" lxd cluster recover || false
+
+  # Shutdown both nodes.
+  LXD_DIR="${LXD_TWO_DIR}" lxd shutdown
+  LXD_DIR="${LXD_ONE_DIR}" lxd shutdown
+
+  sleep 0.5
+
+  # Now recover the first node and restart it.
+  LXD_DIR="${LXD_ONE_DIR}" lxd cluster recover -q
+  LXD_ALT_CERT=1 LXD_NETNS="${ns1}" spawn_lxd "${LXD_ONE_DIR}" false
+
+  # The project we had created is still there
+  LXD_DIR="${LXD_ONE_DIR}" lxc project list | grep -q p1
+
+  # The database nodes have been updated
+  LXD_DIR="${LXD_ONE_DIR}" lxd cluster list-database-nodes | grep -q "10.1.1.101:8443"
+  ! LXD_DIR="${LXD_ONE_DIR}" lxd cluster list-database-nodes | grep -q "10.1.1.102:8443" || false
+
+  # Cleanup the dead node.
+  LXD_DIR="${LXD_ONE_DIR}" lxc cluster remove node2 -q --force
+
+  LXD_DIR="${LXD_ONE_DIR}" lxd shutdown
+
+  teardown_clustering_netns
+  teardown_clustering_bridge
+
+  rm -f "${LXD_TWO_DIR}/unix.socket"
+  rm -f "${LXD_ONE_DIR}/unix.socket"
+
+
+  kill_lxd "${LXD_ONE_DIR}"
+  kill_lxd "${LXD_TWO_DIR}"
+}

From f3a269de3d57ba2f7203f1b108e18925fb0d17e8 Mon Sep 17 00:00:00 2001
From: Free Ekanayaka <free.ekanayaka at canonical.com>
Date: Mon, 4 Nov 2019 11:49:05 +0100
Subject: [PATCH 4/4] clustering.md: add documentation about disaster recovery

Signed-off-by: Free Ekanayaka <free.ekanayaka at canonical.com>
---
 doc/clustering.md | 44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/doc/clustering.md b/doc/clustering.md
index f8bf65c2ee..fa6c1e560a 100644
--- a/doc/clustering.md
+++ b/doc/clustering.md
@@ -173,6 +173,50 @@ transition to the Blocked state, until you upgrade the very last
 one. At that point the blocked nodes will notice that there is no
 out-of-date node left and will become operational again.
 
+### Disaster recovery
+
+Every LXD cluster has up to 3 members that serve as database nodes. If you
+permanently lose a majority of the cluster members that are serving as database
+nodes (for example, you have a 3-member cluster and you lose 2 members), the
+cluster will become unavailable. However, if at least one database node has
+survived, you will be able to recover the cluster.
+
+In order to check which cluster members are configured as database nodes, log on
+to any surviving member of your cluster and run the command:
+
+```
+lxd cluster list-database-nodes
+```
+
+This will work even if the LXD daemon is not running.
+
+Among the listed members, pick one that has survived and log into it (if it
+differs from the one you ran the command on).
+
+Now make sure the LXD daemon is not running and then issue the command:
+
+```
+lxd cluster recover
+```
+
+At this point you can restart the LXD daemon and the database should be back
+online.
+
+Note that no information has been deleted from the database; in particular, all
+information about the cluster members that you have lost is still there,
+including the metadata about their containers. This can help you with further
+recovery steps in case you need to re-create the lost containers.
+
+In order to permanently delete the cluster members that you have lost, you can
+run the command:
+
+```
+lxc cluster remove <name> --force
+```
+
+Note that this time you have to use the regular `lxc` command line tool, not
+`lxd`.
+
 ## Containers
 
 You can launch a container on any node in the cluster from any node in

