[lxc-devel] [lxd/master] Make database queries timeout after 10s if cluster db is unavail

freeekanayaka on Github lxc-bot at linuxcontainers.org
Wed Aug 29 11:28:58 UTC 2018


A non-text attachment was scrubbed...
Name: not available
Type: text/x-mailbox
Size: 1329 bytes
Desc: not available
URL: <http://lists.linuxcontainers.org/pipermail/lxc-devel/attachments/20180829/cfc2f25b/attachment.bin>
-------------- next part --------------
From 066ea3acbe4da5a7e1a292df203f933fd7a56fdd Mon Sep 17 00:00:00 2001
From: Free Ekanayaka <free.ekanayaka at canonical.com>
Date: Wed, 29 Aug 2018 12:57:54 +0200
Subject: [PATCH] Make database queries timeout after 10s if cluster db is
 unavail

Signed-off-by: Free Ekanayaka <free.ekanayaka at canonical.com>
---
 lxd/api.go                     |  4 +++-
 lxd/api_cluster.go             |  1 +
 lxd/cluster/heartbeat_test.go  |  2 +-
 lxd/cluster/membership_test.go |  6 ++++--
 lxd/daemon.go                  |  3 ++-
 lxd/db/db.go                   | 13 ++++++-------
 lxd/db/query/retry.go          |  2 +-
 lxd/db/testing.go              |  3 ++-
 lxd/response.go                |  6 +++++-
 9 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/lxd/api.go b/lxd/api.go
index f6509de1c3..2ab2736697 100644
--- a/lxd/api.go
+++ b/lxd/api.go
@@ -63,7 +63,9 @@ func (s *lxdHttpServer) ServeHTTP(rw http.ResponseWriter, req *http.Request) {
 			return nil
 		})
 		if err != nil {
-			http.Error(rw, err.Error(), http.StatusInternalServerError)
+			response := SmartError(err)
+			response.Render(rw)
+			return
 		}
 	}
 
diff --git a/lxd/api_cluster.go b/lxd/api_cluster.go
index 785100ea4a..c585bca840 100644
--- a/lxd/api_cluster.go
+++ b/lxd/api_cluster.go
@@ -477,6 +477,7 @@ func clusterPutDisable(d *Daemon) Response {
 	store := d.gateway.ServerStore()
 	d.cluster, err = db.OpenCluster(
 		"db.bin", store, address, "/unused/db/dir",
+		d.config.DqliteSetupTimeout,
 		dqlite.WithDialFunc(d.gateway.DialFunc()),
 		dqlite.WithContext(d.gateway.Context()),
 	)
diff --git a/lxd/cluster/heartbeat_test.go b/lxd/cluster/heartbeat_test.go
index b6d698ad59..68c9883518 100644
--- a/lxd/cluster/heartbeat_test.go
+++ b/lxd/cluster/heartbeat_test.go
@@ -253,7 +253,7 @@ func (f *heartbeatFixture) node() (*state.State, *cluster.Gateway, string) {
 	store := gateway.ServerStore()
 	dial := gateway.DialFunc()
 	state.Cluster, err = db.OpenCluster(
-		"db.bin", store, address, "/unused/db/dir", dqlite.WithDialFunc(dial))
+		"db.bin", store, address, "/unused/db/dir", 5*time.Second, dqlite.WithDialFunc(dial))
 	require.NoError(f.t, err)
 
 	f.gateways[len(f.gateways)] = gateway
diff --git a/lxd/cluster/membership_test.go b/lxd/cluster/membership_test.go
index 85707961b6..a954381e2d 100644
--- a/lxd/cluster/membership_test.go
+++ b/lxd/cluster/membership_test.go
@@ -6,6 +6,7 @@ import (
 	"net/http"
 	"path/filepath"
 	"testing"
+	"time"
 
 	"github.com/CanonicalLtd/go-dqlite"
 	"github.com/lxc/lxd/lxd/cluster"
@@ -258,6 +259,7 @@ func TestJoin(t *testing.T) {
 	var err error
 	targetState.Cluster, err = db.OpenCluster(
 		"db.bin", targetStore, targetAddress, "/unused/db/dir",
+		10*time.Second,
 		dqlite.WithDialFunc(targetDialFunc))
 	require.NoError(t, err)
 
@@ -294,7 +296,7 @@ func TestJoin(t *testing.T) {
 	dialFunc := gateway.DialFunc()
 
 	state.Cluster, err = db.OpenCluster(
-		"db.bin", store, address, "/unused/db/dir", dqlite.WithDialFunc(dialFunc))
+		"db.bin", store, address, "/unused/db/dir", 5*time.Second, dqlite.WithDialFunc(dialFunc))
 	require.NoError(t, err)
 
 	f := &membershipFixtures{t: t, state: state}
@@ -382,7 +384,7 @@ func FLAKY_TestPromote(t *testing.T) {
 	store := targetGateway.ServerStore()
 	dialFunc := targetGateway.DialFunc()
 	targetState.Cluster, err = db.OpenCluster(
-		"db.bin", store, targetAddress, "/unused/db/dir", dqlite.WithDialFunc(dialFunc))
+		"db.bin", store, targetAddress, "/unused/db/dir", 5*time.Second, dqlite.WithDialFunc(dialFunc))
 	require.NoError(t, err)
 	targetF := &membershipFixtures{t: t, state: targetState}
 	targetF.NetworkAddress(targetAddress)
diff --git a/lxd/daemon.go b/lxd/daemon.go
index 4be1835898..55b7841033 100644
--- a/lxd/daemon.go
+++ b/lxd/daemon.go
@@ -475,9 +475,10 @@ func (d *Daemon) init() error {
 		store := d.gateway.ServerStore()
 		d.cluster, err = db.OpenCluster(
 			"db.bin", store, address, dir,
+			d.config.DqliteSetupTimeout,
 			dqlite.WithDialFunc(d.gateway.DialFunc()),
 			dqlite.WithContext(d.gateway.Context()),
-			dqlite.WithConnectionTimeout(d.config.DqliteSetupTimeout),
+			dqlite.WithConnectionTimeout(10*time.Second),
 			dqlite.WithLogFunc(cluster.DqliteLog),
 		)
 		if err == nil {
diff --git a/lxd/db/db.go b/lxd/db/db.go
index 6c6954191b..40e4036d65 100644
--- a/lxd/db/db.go
+++ b/lxd/db/db.go
@@ -8,7 +8,6 @@ import (
 
 	"github.com/CanonicalLtd/go-dqlite"
 	"github.com/pkg/errors"
-	"golang.org/x/net/context"
 
 	"github.com/lxc/lxd/lxd/db/cluster"
 	"github.com/lxc/lxd/lxd/db/node"
@@ -158,15 +157,15 @@ type Cluster struct {
 // database matches our version, and possibly trigger a schema update. If the
 // schema update can't be performed right now, because some nodes are still
 // behind, an Upgrading error is returned.
-func OpenCluster(name string, store dqlite.ServerStore, address, dir string, options ...dqlite.DriverOption) (*Cluster, error) {
+func OpenCluster(name string, store dqlite.ServerStore, address, dir string, timeout time.Duration, options ...dqlite.DriverOption) (*Cluster, error) {
 	db, err := cluster.Open(name, store, options...)
 	if err != nil {
 		return nil, errors.Wrap(err, "failed to open database")
 	}
 
-	// Test that the cluster database is operational. We wait up to 10
-	// minutes, in case there's no quorum of nodes online yet.
-	timeout := time.After(10 * time.Minute)
+	// Test that the cluster database is operational. We wait up to the
+	// given timeout, in case there's no quorum of nodes online yet.
+	timer := time.After(timeout)
 	for i := 0; ; i++ {
 		// Log initial attempts at debug level, but use warn
 		// level after the 5'th attempt (about 10 seconds).
@@ -186,7 +185,7 @@ func OpenCluster(name string, store dqlite.ServerStore, address, dir string, opt
 		}
 
 		cause := errors.Cause(err)
-		if cause != context.DeadlineExceeded {
+		if cause != dqlite.ErrNoAvailableLeader {
 			return nil, err
 		}
 
@@ -199,7 +198,7 @@ func OpenCluster(name string, store dqlite.ServerStore, address, dir string, opt
 
 		time.Sleep(2 * time.Second)
 		select {
-		case <-timeout:
+		case <-timer:
 			return nil, fmt.Errorf("failed to connect to cluster database")
 		default:
 		}
diff --git a/lxd/db/query/retry.go b/lxd/db/query/retry.go
index 0c78a8c7f7..1b288377a4 100644
--- a/lxd/db/query/retry.go
+++ b/lxd/db/query/retry.go
@@ -16,7 +16,7 @@ import (
 func Retry(f func() error) error {
 	// TODO: the retry loop should be configurable.
 	var err error
-	for i := 0; i < 20; i++ {
+	for i := 0; i < 5; i++ {
 		err = f()
 		if err != nil {
 			logger.Debugf("Database error: %#v", err)
diff --git a/lxd/db/testing.go b/lxd/db/testing.go
index 18a59e3031..850b474a0f 100644
--- a/lxd/db/testing.go
+++ b/lxd/db/testing.go
@@ -7,6 +7,7 @@ import (
 	"net"
 	"os"
 	"testing"
+	"time"
 
 	"github.com/CanonicalLtd/go-dqlite"
 	"github.com/CanonicalLtd/raft-test"
@@ -63,7 +64,7 @@ func NewTestCluster(t *testing.T) (*Cluster, func()) {
 	}
 
 	cluster, err := OpenCluster(
-		"test.db", store, "1", "/unused/db/dir",
+		"test.db", store, "1", "/unused/db/dir", 5*time.Second,
 		dqlite.WithLogFunc(log), dqlite.WithDialFunc(dial))
 	require.NoError(t, err)
 
diff --git a/lxd/response.go b/lxd/response.go
index f4feb09769..dd56c0255b 100644
--- a/lxd/response.go
+++ b/lxd/response.go
@@ -11,7 +11,9 @@ import (
 	"os"
 	"time"
 
+	dqlite "github.com/CanonicalLtd/go-dqlite"
 	"github.com/mattn/go-sqlite3"
+	"github.com/pkg/errors"
 
 	lxd "github.com/lxc/lxd/client"
 	"github.com/lxc/lxd/lxd/cluster"
@@ -509,7 +511,7 @@ func PreconditionFailed(err error) Response {
  * SmartError returns the right error message based on err.
  */
 func SmartError(err error) Response {
-	switch err {
+	switch errors.Cause(err) {
 	case nil:
 		return EmptySyncResponse
 	case os.ErrNotExist:
@@ -524,6 +526,8 @@ func SmartError(err error) Response {
 		return Conflict(nil)
 	case sqlite3.ErrConstraintUnique:
 		return Conflict(nil)
+	case dqlite.ErrNoAvailableLeader:
+		return Unavailable(err)
 	default:
 		return InternalError(err)
 	}


More information about the lxc-devel mailing list