[lxc-devel] [lxd/master] Make database queries timeout after 10s if cluster db is unavail
freeekanayaka on Github
lxc-bot at linuxcontainers.org
Wed Aug 29 11:28:58 UTC 2018
A non-text attachment was scrubbed...
Name: not available
Type: text/x-mailbox
Size: 1329 bytes
Desc: not available
URL: <http://lists.linuxcontainers.org/pipermail/lxc-devel/attachments/20180829/cfc2f25b/attachment.bin>
-------------- next part --------------
From 066ea3acbe4da5a7e1a292df203f933fd7a56fdd Mon Sep 17 00:00:00 2001
From: Free Ekanayaka <free.ekanayaka at canonical.com>
Date: Wed, 29 Aug 2018 12:57:54 +0200
Subject: [PATCH] Make database queries timeout after 10s if cluster db is
unavail
Signed-off-by: Free Ekanayaka <free.ekanayaka at canonical.com>
---
lxd/api.go | 4 +++-
lxd/api_cluster.go | 1 +
lxd/cluster/heartbeat_test.go | 2 +-
lxd/cluster/membership_test.go | 6 ++++--
lxd/daemon.go | 3 ++-
lxd/db/db.go | 13 ++++++-------
lxd/db/query/retry.go | 2 +-
lxd/db/testing.go | 3 ++-
lxd/response.go | 6 +++++-
9 files changed, 25 insertions(+), 15 deletions(-)
diff --git a/lxd/api.go b/lxd/api.go
index f6509de1c3..2ab2736697 100644
--- a/lxd/api.go
+++ b/lxd/api.go
@@ -63,7 +63,9 @@ func (s *lxdHttpServer) ServeHTTP(rw http.ResponseWriter, req *http.Request) {
return nil
})
if err != nil {
- http.Error(rw, err.Error(), http.StatusInternalServerError)
+ response := SmartError(err)
+ response.Render(rw)
+ return
}
}
diff --git a/lxd/api_cluster.go b/lxd/api_cluster.go
index 785100ea4a..c585bca840 100644
--- a/lxd/api_cluster.go
+++ b/lxd/api_cluster.go
@@ -477,6 +477,7 @@ func clusterPutDisable(d *Daemon) Response {
store := d.gateway.ServerStore()
d.cluster, err = db.OpenCluster(
"db.bin", store, address, "/unused/db/dir",
+ d.config.DqliteSetupTimeout,
dqlite.WithDialFunc(d.gateway.DialFunc()),
dqlite.WithContext(d.gateway.Context()),
)
diff --git a/lxd/cluster/heartbeat_test.go b/lxd/cluster/heartbeat_test.go
index b6d698ad59..68c9883518 100644
--- a/lxd/cluster/heartbeat_test.go
+++ b/lxd/cluster/heartbeat_test.go
@@ -253,7 +253,7 @@ func (f *heartbeatFixture) node() (*state.State, *cluster.Gateway, string) {
store := gateway.ServerStore()
dial := gateway.DialFunc()
state.Cluster, err = db.OpenCluster(
- "db.bin", store, address, "/unused/db/dir", dqlite.WithDialFunc(dial))
+ "db.bin", store, address, "/unused/db/dir", 5*time.Second, dqlite.WithDialFunc(dial))
require.NoError(f.t, err)
f.gateways[len(f.gateways)] = gateway
diff --git a/lxd/cluster/membership_test.go b/lxd/cluster/membership_test.go
index 85707961b6..a954381e2d 100644
--- a/lxd/cluster/membership_test.go
+++ b/lxd/cluster/membership_test.go
@@ -6,6 +6,7 @@ import (
"net/http"
"path/filepath"
"testing"
+ "time"
"github.com/CanonicalLtd/go-dqlite"
"github.com/lxc/lxd/lxd/cluster"
@@ -258,6 +259,7 @@ func TestJoin(t *testing.T) {
var err error
targetState.Cluster, err = db.OpenCluster(
"db.bin", targetStore, targetAddress, "/unused/db/dir",
+ 10*time.Second,
dqlite.WithDialFunc(targetDialFunc))
require.NoError(t, err)
@@ -294,7 +296,7 @@ func TestJoin(t *testing.T) {
dialFunc := gateway.DialFunc()
state.Cluster, err = db.OpenCluster(
- "db.bin", store, address, "/unused/db/dir", dqlite.WithDialFunc(dialFunc))
+ "db.bin", store, address, "/unused/db/dir", 5*time.Second, dqlite.WithDialFunc(dialFunc))
require.NoError(t, err)
f := &membershipFixtures{t: t, state: state}
@@ -382,7 +384,7 @@ func FLAKY_TestPromote(t *testing.T) {
store := targetGateway.ServerStore()
dialFunc := targetGateway.DialFunc()
targetState.Cluster, err = db.OpenCluster(
- "db.bin", store, targetAddress, "/unused/db/dir", dqlite.WithDialFunc(dialFunc))
+ "db.bin", store, targetAddress, "/unused/db/dir", 5*time.Second, dqlite.WithDialFunc(dialFunc))
require.NoError(t, err)
targetF := &membershipFixtures{t: t, state: targetState}
targetF.NetworkAddress(targetAddress)
diff --git a/lxd/daemon.go b/lxd/daemon.go
index 4be1835898..55b7841033 100644
--- a/lxd/daemon.go
+++ b/lxd/daemon.go
@@ -475,9 +475,10 @@ func (d *Daemon) init() error {
store := d.gateway.ServerStore()
d.cluster, err = db.OpenCluster(
"db.bin", store, address, dir,
+ d.config.DqliteSetupTimeout,
dqlite.WithDialFunc(d.gateway.DialFunc()),
dqlite.WithContext(d.gateway.Context()),
- dqlite.WithConnectionTimeout(d.config.DqliteSetupTimeout),
+ dqlite.WithConnectionTimeout(10*time.Second),
dqlite.WithLogFunc(cluster.DqliteLog),
)
if err == nil {
diff --git a/lxd/db/db.go b/lxd/db/db.go
index 6c6954191b..40e4036d65 100644
--- a/lxd/db/db.go
+++ b/lxd/db/db.go
@@ -8,7 +8,6 @@ import (
"github.com/CanonicalLtd/go-dqlite"
"github.com/pkg/errors"
- "golang.org/x/net/context"
"github.com/lxc/lxd/lxd/db/cluster"
"github.com/lxc/lxd/lxd/db/node"
@@ -158,15 +157,15 @@ type Cluster struct {
// database matches our version, and possibly trigger a schema update. If the
// schema update can't be performed right now, because some nodes are still
// behind, an Upgrading error is returned.
-func OpenCluster(name string, store dqlite.ServerStore, address, dir string, options ...dqlite.DriverOption) (*Cluster, error) {
+func OpenCluster(name string, store dqlite.ServerStore, address, dir string, timeout time.Duration, options ...dqlite.DriverOption) (*Cluster, error) {
db, err := cluster.Open(name, store, options...)
if err != nil {
return nil, errors.Wrap(err, "failed to open database")
}
- // Test that the cluster database is operational. We wait up to 10
- // minutes, in case there's no quorum of nodes online yet.
- timeout := time.After(10 * time.Minute)
+ // Test that the cluster database is operational. We wait up to the
+ // given timeout, in case there's no quorum of nodes online yet.
+ timer := time.After(timeout)
for i := 0; ; i++ {
// Log initial attempts at debug level, but use warn
// level after the 5'th attempt (about 10 seconds).
@@ -186,7 +185,7 @@ func OpenCluster(name string, store dqlite.ServerStore, address, dir string, opt
}
cause := errors.Cause(err)
- if cause != context.DeadlineExceeded {
+ if cause != dqlite.ErrNoAvailableLeader {
return nil, err
}
@@ -199,7 +198,7 @@ func OpenCluster(name string, store dqlite.ServerStore, address, dir string, opt
time.Sleep(2 * time.Second)
select {
- case <-timeout:
+ case <-timer:
return nil, fmt.Errorf("failed to connect to cluster database")
default:
}
diff --git a/lxd/db/query/retry.go b/lxd/db/query/retry.go
index 0c78a8c7f7..1b288377a4 100644
--- a/lxd/db/query/retry.go
+++ b/lxd/db/query/retry.go
@@ -16,7 +16,7 @@ import (
func Retry(f func() error) error {
// TODO: the retry loop should be configurable.
var err error
- for i := 0; i < 20; i++ {
+ for i := 0; i < 5; i++ {
err = f()
if err != nil {
logger.Debugf("Database error: %#v", err)
diff --git a/lxd/db/testing.go b/lxd/db/testing.go
index 18a59e3031..850b474a0f 100644
--- a/lxd/db/testing.go
+++ b/lxd/db/testing.go
@@ -7,6 +7,7 @@ import (
"net"
"os"
"testing"
+ "time"
"github.com/CanonicalLtd/go-dqlite"
"github.com/CanonicalLtd/raft-test"
@@ -63,7 +64,7 @@ func NewTestCluster(t *testing.T) (*Cluster, func()) {
}
cluster, err := OpenCluster(
- "test.db", store, "1", "/unused/db/dir",
+ "test.db", store, "1", "/unused/db/dir", 5*time.Second,
dqlite.WithLogFunc(log), dqlite.WithDialFunc(dial))
require.NoError(t, err)
diff --git a/lxd/response.go b/lxd/response.go
index f4feb09769..dd56c0255b 100644
--- a/lxd/response.go
+++ b/lxd/response.go
@@ -11,7 +11,9 @@ import (
"os"
"time"
+ dqlite "github.com/CanonicalLtd/go-dqlite"
"github.com/mattn/go-sqlite3"
+ "github.com/pkg/errors"
lxd "github.com/lxc/lxd/client"
"github.com/lxc/lxd/lxd/cluster"
@@ -509,7 +511,7 @@ func PreconditionFailed(err error) Response {
* SmartError returns the right error message based on err.
*/
func SmartError(err error) Response {
- switch err {
+ switch errors.Cause(err) {
case nil:
return EmptySyncResponse
case os.ErrNotExist:
@@ -524,6 +526,8 @@ func SmartError(err error) Response {
return Conflict(nil)
case sqlite3.ErrConstraintUnique:
return Conflict(nil)
+ case dqlite.ErrNoAvailableLeader:
+ return Unavailable(err)
default:
return InternalError(err)
}
More information about the lxc-devel
mailing list