[lxc-devel] [lxd/master] Cluster recover command
freeekanayaka on GitHub
lxc-bot@linuxcontainers.org
Mon Nov 4 11:37:18 UTC 2019
From b56db39b78f89678761b9fc43044557b38120669 Mon Sep 17 00:00:00 2001
From: Free Ekanayaka <free.ekanayaka@canonical.com>
Date: Mon, 4 Nov 2019 09:50:23 +0100
Subject: [PATCH 1/4] lxd/cluster: add Recover() and ListDatabaseNodes()
utilities
Signed-off-by: Free Ekanayaka <free.ekanayaka@canonical.com>
---
lxd/cluster/recover.go | 84 ++++++++++++++++++++++++++++++++++++++++++
1 file changed, 84 insertions(+)
create mode 100644 lxd/cluster/recover.go
diff --git a/lxd/cluster/recover.go b/lxd/cluster/recover.go
new file mode 100644
index 0000000000..32d6575d63
--- /dev/null
+++ b/lxd/cluster/recover.go
@@ -0,0 +1,84 @@
+package cluster
+
+import (
+ "fmt"
+ "path/filepath"
+
+ dqlite "github.com/canonical/go-dqlite"
+ "github.com/lxc/lxd/lxd/db"
+ "github.com/lxc/lxd/lxd/node"
+ "github.com/pkg/errors"
+)
+
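+// ListDatabaseNodes returns the addresses of the current raft nodes, i.e. the
+// cluster members that hold a copy of the distributed database.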
+func ListDatabaseNodes(database *db.Node) ([]string, error) {
+ nodes := []db.RaftNode{}
+ err := database.Transaction(func(tx *db.NodeTx) error {
+ var err error
+ nodes, err = tx.RaftNodes()
+ return err
+ })
+ if err != nil {
+ return nil, errors.Wrap(err, "Failed to list database nodes")
+ }
+ addresses := make([]string, len(nodes))
+ for i, node := range nodes {
+ addresses[i] = node.Address
+ }
+ return addresses, nil
+}
+
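+// Recover attempts to recover a cluster that has permanently lost a majority
+// of its database nodes, by resetting the raft configuration so that this
+// node becomes the only database node.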
+func Recover(database *db.Node) error {
+ // Figure out if we are actually acting as a dqlite node.
+ var info *db.RaftNode
+ err := database.Transaction(func(tx *db.NodeTx) error {
+ var err error
+ info, err = node.DetermineRaftNode(tx)
+ return err
+ })
+ if err != nil {
+ return errors.Wrap(err, "Failed to determine node role.")
+ }
+
+ // If we're not a database node, return an error.
+ if info == nil {
+ return fmt.Errorf("This LXD instance has no database role.")
+ }
+
+ // If this is a standalone node not exposed to the network, return an
+ // error.
+ if info.Address == "" {
+ return fmt.Errorf("This LXD instance is not clustered.")
+ }
+
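+ // The global (cluster) database lives in the "global" subdirectory of the
+ // LXD database directory.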
+ dir := filepath.Join(database.Dir(), "global")
+ server, err := dqlite.New(
+ uint64(info.ID),
+ info.Address,
+ dir,
+ )
+ if err != nil {
+ return errors.Wrap(err, "Failed to create dqlite server")
+ }
+
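+ // Recover using a raft configuration that contains only this node.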
+ cluster := []dqlite.NodeInfo{
+ {ID: uint64(info.ID), Address: info.Address},
+ }
+
+ err = server.Recover(cluster)
+ if err != nil {
+ return errors.Wrap(err, "Failed to recover database state")
+ }
+
+ // Update the list of raft nodes.
+ err = database.Transaction(func(tx *db.NodeTx) error {
+ nodes := []db.RaftNode{
+ {ID: info.ID, Address: info.Address},
+ }
+ return tx.RaftNodesReplace(nodes)
+ })
+ if err != nil {
+ return errors.Wrap(err, "Failed to update database nodes.")
+ }
+
+ return nil
+}
From c3c7038ab12ce6037cb523bf51a52c46344020d0 Mon Sep 17 00:00:00 2001
From: Free Ekanayaka <free.ekanayaka@canonical.com>
Date: Mon, 4 Nov 2019 11:48:29 +0100
Subject: [PATCH 2/4] Add new "lxd cluster" sub-command
Signed-off-by: Free Ekanayaka <free.ekanayaka@canonical.com>
---
lxd/main.go | 4 ++
lxd/main_cluster.go | 158 ++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 162 insertions(+)
create mode 100644 lxd/main_cluster.go
diff --git a/lxd/main.go b/lxd/main.go
index 72531e1f22..cf6b292cbb 100644
--- a/lxd/main.go
+++ b/lxd/main.go
@@ -177,6 +177,10 @@ func main() {
waitreadyCmd := cmdWaitready{global: &globalCmd}
app.AddCommand(waitreadyCmd.Command())
+ // cluster sub-command
+ clusterCmd := cmdCluster{global: &globalCmd}
+ app.AddCommand(clusterCmd.Command())
+
// Run the main command and handle errors
err := app.Execute()
if err != nil {
diff --git a/lxd/main_cluster.go b/lxd/main_cluster.go
new file mode 100644
index 0000000000..a3914409b7
--- /dev/null
+++ b/lxd/main_cluster.go
@@ -0,0 +1,158 @@
+package main
+
+import (
+ "bufio"
+ "fmt"
+ "os"
+ "path/filepath"
+ "strings"
+
+ lxd "github.com/lxc/lxd/client"
+ "github.com/lxc/lxd/lxd/cluster"
+ "github.com/lxc/lxd/lxd/db"
+ "github.com/lxc/lxd/lxd/sys"
+ "github.com/lxc/lxd/shared"
+ "github.com/lxc/lxd/shared/i18n"
+ "github.com/olekukonko/tablewriter"
+ "github.com/pkg/errors"
+ "github.com/spf13/cobra"
+)
+
+type cmdCluster struct {
+ global *cmdGlobal
+}
+
+func (c *cmdCluster) Command() *cobra.Command {
+ cmd := &cobra.Command{}
+ cmd.Use = "cluster"
+ cmd.Short = "Cluster administration commands"
+ cmd.Long = `Description:
+ Low-level administration tools for LXD clusters.
+`
+ // List database nodes
+ listDatabaseNodes := cmdClusterListDatabaseNodes{global: c.global}
+ cmd.AddCommand(listDatabaseNodes.Command())
+
+ // Recover
+ recover := cmdClusterRecover{global: c.global}
+ cmd.AddCommand(recover.Command())
+
+ return cmd
+}
+
+type cmdClusterListDatabaseNodes struct {
+ global *cmdGlobal
+}
+
+func (c *cmdClusterListDatabaseNodes) Command() *cobra.Command {
+ cmd := &cobra.Command{}
+ cmd.Use = i18n.G("list-database-nodes")
+ cmd.Aliases = []string{"ls"}
+ cmd.Short = i18n.G("Print the addresses of the cluster members serving as database nodes")
+
+ cmd.RunE = c.Run
+
+ return cmd
+}
+
+func (c *cmdClusterListDatabaseNodes) Run(cmd *cobra.Command, args []string) error {
+ os := sys.DefaultOS()
+
+ db, _, err := db.OpenNode(filepath.Join(os.VarDir, "database"), nil, nil)
+ if err != nil {
+ return errors.Wrapf(err, "Failed to open local database.")
+ }
+
+ addresses, err := cluster.ListDatabaseNodes(db)
+ if err != nil {
+ return errors.Wrapf(err, "Failed to get database nodes.")
+ }
+
+ printDatabaseNodes(addresses)
+
+ return nil
+}
+
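+// printDatabaseNodes renders the given addresses as a single-column table on
+// standard output.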
+func printDatabaseNodes(addresses []string) {
+ table := tablewriter.NewWriter(os.Stdout)
+ table.SetAlignment(tablewriter.ALIGN_LEFT)
+ table.SetAutoWrapText(false)
+ table.SetAutoFormatHeaders(false)
+ table.SetHeader([]string{"Address"})
+ for _, address := range addresses {
+ data := []string{address}
+ table.Append(data)
+ }
+ table.Render()
+}
+
+type cmdClusterRecover struct {
+ global *cmdGlobal
+ flagNonInteractive bool
+}
+
+func (c *cmdClusterRecover) Command() *cobra.Command {
+ cmd := &cobra.Command{}
+ cmd.Use = i18n.G("recover")
+ cmd.Short = i18n.G("Recover a LXD instance whose cluster has lost quorum")
+
+ cmd.RunE = c.Run
+
+ cmd.Flags().BoolVarP(&c.flagNonInteractive, "quiet", "q", false, i18n.G("Don't require user confirmation"))
+
+ return cmd
+}
+
+func (c *cmdClusterRecover) Run(cmd *cobra.Command, args []string) error {
+ // Make sure that the daemon is not running.
+ _, err := lxd.ConnectLXDUnix("", nil)
+ if err == nil {
+ return fmt.Errorf("The LXD daemon is running, please stop it first.")
+ }
+
+ // Prompt for confirmation unless --quiet was passed.
+ if !c.flagNonInteractive {
+ err := c.promptConfirmation()
+ if err != nil {
+ return err
+ }
+ }
+
+ os := sys.DefaultOS()
+
+ db, _, err := db.OpenNode(filepath.Join(os.VarDir, "database"), nil, nil)
+ if err != nil {
+ return errors.Wrapf(err, "Failed to open local database.")
+ }
+
+ return cluster.Recover(db)
+}
+
+func (c *cmdClusterRecover) promptConfirmation() error {
+ reader := bufio.NewReader(os.Stdin)
+ fmt.Printf(i18n.G(`You should run this command only if you are *absolutely* certain that this is
+the only database node left in your cluster AND that other database nodes will
+never come back (i.e. their LXD daemon won't ever be started again).
+
+This will make this LXD instance the only member of the cluster, and it won't
+be possible to perform operations on former cluster members anymore.
+
+However, all information about former cluster members will be preserved in the
+database, so you can inspect it later for further recovery steps.
+
+You'll be able to permanently delete from the database all information about
+former cluster members by running "lxc cluster remove <member-name> --force".
+
+See https://lxd.readthedocs.io/en/latest/clustering/#disaster-recovery for more
+info.
+
+Do you want to proceed? (yes/no): `))
+ input, _ := reader.ReadString('\n')
+ input = strings.TrimSuffix(input, "\n")
+
+ if !shared.StringInSlice(strings.ToLower(input), []string{i18n.G("yes")}) {
+ return fmt.Errorf(i18n.G("User aborted delete operation"))
+ }
+ return nil
+}
From 91fc30eaec1becde05b1f73e15b69e95a3dfc3c2 Mon Sep 17 00:00:00 2001
From: Free Ekanayaka <free.ekanayaka@canonical.com>
Date: Mon, 4 Nov 2019 11:48:50 +0100
Subject: [PATCH 3/4] Add clustering_recover integration test
Signed-off-by: Free Ekanayaka <free.ekanayaka@canonical.com>
---
test/suites/clustering.sh | 67 +++++++++++++++++++++++++++++++++++++++
1 file changed, 67 insertions(+)
diff --git a/test/suites/clustering.sh b/test/suites/clustering.sh
index 02036f05c3..a3e5724397 100644
--- a/test/suites/clustering.sh
+++ b/test/suites/clustering.sh
@@ -1377,3 +1377,70 @@ test_clustering_dns() {
ip link delete "${prefix}1"
ip link delete "${prefix}2"
}
+
+test_clustering_recover() {
+ # shellcheck disable=2039
+ local LXD_DIR
+
+ setup_clustering_bridge
+ prefix="lxd$$"
+ bridge="${prefix}"
+
+ setup_clustering_netns 1
+ LXD_ONE_DIR=$(mktemp -d -p "${TEST_DIR}" XXX)
+ chmod +x "${LXD_ONE_DIR}"
+ ns1="${prefix}1"
+ spawn_lxd_and_bootstrap_cluster "${ns1}" "${bridge}" "${LXD_ONE_DIR}"
+
+ # Add a newline at the end of each line. YAML has weird rules.
+ cert=$(sed ':a;N;$!ba;s/\n/\n\n/g' "${LXD_ONE_DIR}/server.crt")
+
+ # Spawn a second node
+ setup_clustering_netns 2
+ LXD_TWO_DIR=$(mktemp -d -p "${TEST_DIR}" XXX)
+ chmod +x "${LXD_TWO_DIR}"
+ ns2="${prefix}2"
+ spawn_lxd_and_join_cluster "${ns2}" "${bridge}" "${cert}" 2 1 "${LXD_TWO_DIR}"
+
+ # Check the current database nodes
+ LXD_DIR="${LXD_ONE_DIR}" lxd cluster list-database-nodes | grep -q "10.1.1.101:8443"
+ LXD_DIR="${LXD_ONE_DIR}" lxd cluster list-database-nodes | grep -q "10.1.1.102:8443"
+
+ # Create a test project, just to insert something in the database.
+ LXD_DIR="${LXD_ONE_DIR}" lxc project create p1
+
+ # Trying to recover a running daemon results in an error.
+ ! LXD_DIR="${LXD_ONE_DIR}" lxd cluster recover || false
+
+ # Shutdown both nodes.
+ LXD_DIR="${LXD_TWO_DIR}" lxd shutdown
+ LXD_DIR="${LXD_ONE_DIR}" lxd shutdown
+
+ sleep 0.5
+
+ # Now recover the first node and restart it.
+ LXD_DIR="${LXD_ONE_DIR}" lxd cluster recover -q
+ LXD_ALT_CERT=1 LXD_NETNS="${ns1}" spawn_lxd "${LXD_ONE_DIR}" false
+
+ # The project we had created is still there
+ LXD_DIR="${LXD_ONE_DIR}" lxc project list | grep -q p1
+
+ # The database nodes have been updated
+ LXD_DIR="${LXD_ONE_DIR}" lxd cluster list-database-nodes | grep -q "10.1.1.101:8443"
+ ! LXD_DIR="${LXD_ONE_DIR}" lxd cluster list-database-nodes | grep -q "10.1.1.102:8443" || false
+
+ # Cleanup the dead node.
+ LXD_DIR="${LXD_ONE_DIR}" lxc cluster remove node2 -q --force
+
+ LXD_DIR="${LXD_ONE_DIR}" lxd shutdown
+
+ teardown_clustering_netns
+ teardown_clustering_bridge
+
+ rm -f "${LXD_TWO_DIR}/unix.socket"
+ rm -f "${LXD_ONE_DIR}/unix.socket"
+
+ kill_lxd "${LXD_ONE_DIR}"
+ kill_lxd "${LXD_TWO_DIR}"
+}
From f3a269de3d57ba2f7203f1b108e18925fb0d17e8 Mon Sep 17 00:00:00 2001
From: Free Ekanayaka <free.ekanayaka@canonical.com>
Date: Mon, 4 Nov 2019 11:49:05 +0100
Subject: [PATCH 4/4] clustering.md: add documentation about disaster recovery
Signed-off-by: Free Ekanayaka <free.ekanayaka@canonical.com>
---
doc/clustering.md | 44 ++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 44 insertions(+)
diff --git a/doc/clustering.md b/doc/clustering.md
index f8bf65c2ee..fa6c1e560a 100644
--- a/doc/clustering.md
+++ b/doc/clustering.md
@@ -173,6 +173,50 @@ transition to the Blocked state, until you upgrade the very last
one. At that point the blocked nodes will notice that there is no
out-of-date node left and will become operational again.
+### Disaster recovery
+
+Every LXD cluster has up to 3 members that serve as database nodes. If you
+permanently lose a majority of the cluster members that are serving as database
+nodes (for example, you have a 3-member cluster and you lose 2 members), the
+cluster will become unavailable. However, if at least one database node has
+survived, you will be able to recover the cluster.
+
+To check which cluster members are configured as database nodes, log on to any
+surviving member of your cluster and run the command:
+
+```
+lxd cluster list-database-nodes
+```
+
+This will work even if the LXD daemon is not running.
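+
+For example, on a hypothetical cluster where all three database members are
+still recorded, the output might look something like this (the addresses are
+just examples):
+
+```
++-----------------+
+|     Address     |
++-----------------+
+| 10.1.1.101:8443 |
+| 10.1.1.102:8443 |
+| 10.1.1.103:8443 |
++-----------------+
+```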
+
+Among the listed members, pick one that has survived and log into it (if it
+differs from the one you ran the command on).
+
+Now make sure the LXD daemon is not running and then issue the command:
+
+```
+lxd cluster recover
+```
+
+At this point you can restart the LXD daemon and the database should be back
+online.
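+
+For example, with a snap-based installation managed by systemd (this assumes
+the standard snap service name):
+
+```
+systemctl start snap.lxd.daemon
+lxc cluster list
+```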
+
+Note that no information has been deleted from the database. In particular, all
+information about the cluster members that you have lost is still there,
+including the metadata about their containers. This can help you with further
+recovery steps in case you need to re-create the lost containers.
+
+To permanently delete the cluster members that you have lost, run the command:
+
+```
+lxc cluster remove <name> --force
+```
+
+Note that this time you have to use the regular `lxc` command-line tool, not
+`lxd`.
+
## Containers
You can launch a container on any node in the cluster from any node in