[lxc-devel] [lxd/master] Take raft snapshots more frequently and at shutdown
freeekanayaka on Github
lxc-bot at linuxcontainers.org
Wed May 2 08:52:45 UTC 2018
A non-text attachment was scrubbed...
Name: not available
Type: text/x-mailbox
Size: 417 bytes
Desc: not available
URL: <http://lists.linuxcontainers.org/pipermail/lxc-devel/attachments/20180502/4d2536fd/attachment.bin>
-------------- next part --------------
From 270516df3c0e82adef0b86854d440da740069dd6 Mon Sep 17 00:00:00 2001
From: Free Ekanayaka <free.ekanayaka at canonical.com>
Date: Wed, 2 May 2018 08:37:40 +0000
Subject: [PATCH] Take raft snapshots more frequently and at shutdown
This should mitigate/solve #4485 and similar issues.
Signed-off-by: Free Ekanayaka <free.ekanayaka at canonical.com>
---
lxd/cluster/raft.go | 41 ++++++++++++++++++++++++++++++++++-------
1 file changed, 34 insertions(+), 7 deletions(-)
diff --git a/lxd/cluster/raft.go b/lxd/cluster/raft.go
index fa540ce35..6a8df0547 100644
--- a/lxd/cluster/raft.go
+++ b/lxd/cluster/raft.go
@@ -13,8 +13,6 @@ import (
"strings"
"time"
- "golang.org/x/net/context"
-
"github.com/CanonicalLtd/dqlite"
"github.com/CanonicalLtd/raft-http"
"github.com/CanonicalLtd/raft-membership"
@@ -270,11 +268,32 @@ func (i *raftInstance) MembershipChanger() raftmembership.Changer {
func (i *raftInstance) Shutdown() error {
logger.Info("Stop raft instance")
- // Stop raft asynchronously to allow for a timeout.
- errCh := make(chan error)
+ // Invoke raft APIs asynchronously to allow for a timeout.
timeout := 10 * time.Second
- ctx, cancel := context.WithTimeout(context.Background(), timeout)
- defer cancel()
+
+ // FIXME/TODO: We take a snapshot when shutting down the daemon so
+ // there will be no uncompacted raft logs at the next
+ // startup. This is a workaround for slow log replay when
+ // the LXD daemon starts (see #4485). A more proper fix
+ // should probably be implemented in dqlite.
+ errCh := make(chan error)
+ timer := time.After(timeout)
+ go func() {
+ errCh <- i.raft.Snapshot().Error()
+ }()
+ // In case of error we just log a warning, since this is not really
+ // fatal.
+ select {
+ case err := <-errCh:
+ if err != nil && err != raft.ErrNothingNewToSnapshot {
+ logger.Warnf("Failed to take raft snapshot: %v", err)
+ }
+ case <-timer:
+ logger.Warnf("Timeout waiting for raft to take a snapshot")
+ }
+
+ errCh = make(chan error)
+ timer = time.After(timeout)
go func() {
errCh <- i.raft.Shutdown().Error()
}()
@@ -283,7 +302,7 @@ func (i *raftInstance) Shutdown() error {
if err != nil {
return errors.Wrap(err, "failed to shutdown raft")
}
- case <-ctx.Done():
+ case <-timer:
logger.Debug("Timeout waiting for raft to shutdown")
return fmt.Errorf("raft did not shutdown within %s", timeout)
@@ -381,6 +400,14 @@ func raftConfig(latency float64) *raft.Config {
for _, duration := range durations {
scale(duration)
}
+
+ // FIXME/TODO: We increase the frequency of snapshots here to keep the
+ // number of uncompacted raft logs low, and work around slow
+ // log replay when the LXD daemon starts (see #4485). A more
+ // proper fix should probably be implemented in dqlite.
+ config.SnapshotThreshold = 64
+ config.TrailingLogs = 128
+
return config
}
More information about the lxc-devel
mailing list