[lxc-devel] [lxd/master] Network: Improvements to clustering node state to better handle failed startup during network create

tomponline on Github lxc-bot at linuxcontainers.org
Fri Dec 11 14:57:39 UTC 2020


A non-text attachment was scrubbed...
Name: not available
Type: text/x-mailbox
Size: 1057 bytes
Desc: not available
URL: <http://lists.linuxcontainers.org/pipermail/lxc-devel/attachments/20201211/27d84d5b/attachment.bin>
-------------- next part --------------
From bc7e0525ef8f585bbcda22d3a4bb160ae124b7e6 Mon Sep 17 00:00:00 2001
From: Thomas Parrott <thomas.parrott at canonical.com>
Date: Fri, 11 Dec 2020 14:43:56 +0000
Subject: [PATCH 1/8] lxd/network/network/interface: Adds Project function

Signed-off-by: Thomas Parrott <thomas.parrott at canonical.com>
---
 lxd/network/network_interface.go | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lxd/network/network_interface.go b/lxd/network/network_interface.go
index 021cd88198..af8c8afcbe 100644
--- a/lxd/network/network_interface.go
+++ b/lxd/network/network_interface.go
@@ -31,6 +31,7 @@ type Network interface {
 	Validate(config map[string]string) error
 	ID() int64
 	Name() string
+	Project() string
 	Description() string
 	Status() string
 	LocalStatus() string

From 4b48e17ab725d8bf598b1bcf75021e46ffeedb3e Mon Sep 17 00:00:00 2001
From: Thomas Parrott <thomas.parrott at canonical.com>
Date: Fri, 11 Dec 2020 14:44:14 +0000
Subject: [PATCH 2/8] lxd/network/driver/common: Adds Project function

Signed-off-by: Thomas Parrott <thomas.parrott at canonical.com>
---
 lxd/network/driver_common.go | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/lxd/network/driver_common.go b/lxd/network/driver_common.go
index 03f4b9ab7d..c17bf1d8d6 100644
--- a/lxd/network/driver_common.go
+++ b/lxd/network/driver_common.go
@@ -130,6 +130,11 @@ func (n *common) Name() string {
 	return n.name
 }
 
+// Project returns the network project.
+func (n *common) Project() string {
+	return n.project
+}
+
 // Description returns the network description.
 func (n *common) Description() string {
 	return n.description

From 40defa5130a86dedd14ec111ccd6bf0667519fc8 Mon Sep 17 00:00:00 2001
From: Thomas Parrott <thomas.parrott at canonical.com>
Date: Fri, 11 Dec 2020 14:44:35 +0000
Subject: [PATCH 3/8] lxd/network/driver/common: Remove cluster notification
 and DB record removal from delete() function

We need more control over when we generate notifications and remove DB records, so this is being moved into the API route handler function (networkDelete()).

This also aligns better with storage pools, where the notifications and DB record removal is also handled by API route handler function (storagePoolDelete()).

Signed-off-by: Thomas Parrott <thomas.parrott at canonical.com>
---
 lxd/network/driver_common.go | 28 +++++-----------------------
 1 file changed, 5 insertions(+), 23 deletions(-)

diff --git a/lxd/network/driver_common.go b/lxd/network/driver_common.go
index c17bf1d8d6..2c9138f1c1 100644
--- a/lxd/network/driver_common.go
+++ b/lxd/network/driver_common.go
@@ -368,34 +368,16 @@ func (n *common) rename(newName string) error {
 
 // delete the network from the database if clusterNotification is false.
 func (n *common) delete(clientType request.ClientType) error {
-	// Only delete database record if not cluster notification.
-	if clientType != request.ClientTypeNotifier {
-		// Notify all other nodes. If any node is down, an error will be returned.
-		notifier, err := cluster.NewNotifier(n.state, n.state.Endpoints.NetworkCert(), cluster.NotifyAll)
-		if err != nil {
-			return err
-		}
-		err = notifier(func(client lxd.InstanceServer) error {
-			return client.UseProject(n.project).DeleteNetwork(n.name)
-		})
-		if err != nil {
-			return err
-		}
-
-		// Remove the network from the database.
-		err = n.state.Cluster.DeleteNetwork(n.project, n.name)
-		if err != nil {
-			return err
-		}
-
-		n.lifecycle("deleted", nil)
-	}
-
 	// Cleanup storage.
 	if shared.PathExists(shared.VarPath("networks", n.name)) {
 		os.RemoveAll(shared.VarPath("networks", n.name))
 	}
 
+	// Generate lifecycle event if not notification.
+	if clientType != request.ClientTypeNotifier {
+		n.lifecycle("deleted", nil)
+	}
+
 	return nil
 }
 

From 4872518ed2b8a21197392e4445d84707aaa65484 Mon Sep 17 00:00:00 2001
From: Thomas Parrott <thomas.parrott at canonical.com>
Date: Fri, 11 Dec 2020 14:46:54 +0000
Subject: [PATCH 4/8] lxd/network/driver: Always delete when requested, ignore
 LocalStatus() pending

This is now up to caller to decide whether it is appropriate to delete or not (using n.LocalStatus()).

This is needed because if the network fails to start as part of a create operation on a local node we need to delete it, however it will still be in pending state and so would have previously been ignored.

Signed-off-by: Thomas Parrott <thomas.parrott at canonical.com>
---
 lxd/network/driver_bridge.go   | 19 ++++----
 lxd/network/driver_macvlan.go  |  1 +
 lxd/network/driver_ovn.go      | 88 +++++++++++++++++-----------------
 lxd/network/driver_physical.go |  8 ++--
 lxd/network/driver_sriov.go    |  1 +
 5 files changed, 56 insertions(+), 61 deletions(-)

diff --git a/lxd/network/driver_bridge.go b/lxd/network/driver_bridge.go
index 6f469f85e8..a733e5afc1 100644
--- a/lxd/network/driver_bridge.go
+++ b/lxd/network/driver_bridge.go
@@ -436,22 +436,19 @@ func (n *bridge) isRunning() bool {
 func (n *bridge) Delete(clientType request.ClientType) error {
 	n.logger.Debug("Delete", log.Ctx{"clientType": clientType})
 
-	// Bring the local network down if created on this node.
-	if n.LocalStatus() == api.NetworkStatusCreated || n.LocalStatus() == api.NetworkStatusUnknown {
-		if n.isRunning() {
-			err := n.Stop()
-			if err != nil {
-				return err
-			}
-		}
-
-		// Delete apparmor profiles.
-		err := apparmor.NetworkDelete(n.state, n)
+	if n.isRunning() {
+		err := n.Stop()
 		if err != nil {
 			return err
 		}
 	}
 
+	// Delete apparmor profiles.
+	err := apparmor.NetworkDelete(n.state, n)
+	if err != nil {
+		return err
+	}
+
 	return n.common.delete(clientType)
 }
 
diff --git a/lxd/network/driver_macvlan.go b/lxd/network/driver_macvlan.go
index 60eac1a092..bba6d06369 100644
--- a/lxd/network/driver_macvlan.go
+++ b/lxd/network/driver_macvlan.go
@@ -45,6 +45,7 @@ func (n *macvlan) Validate(config map[string]string) error {
 // Delete deletes a network.
 func (n *macvlan) Delete(clientType request.ClientType) error {
 	n.logger.Debug("Delete", log.Ctx{"clientType": clientType})
+
 	return n.common.delete(clientType)
 }
 
diff --git a/lxd/network/driver_ovn.go b/lxd/network/driver_ovn.go
index 7b64b778c7..4d7a6a3851 100644
--- a/lxd/network/driver_ovn.go
+++ b/lxd/network/driver_ovn.go
@@ -1808,63 +1808,61 @@ func (n *ovn) deleteChassisGroupEntry() error {
 func (n *ovn) Delete(clientType request.ClientType) error {
 	n.logger.Debug("Delete", log.Ctx{"clientType": clientType})
 
-	if n.LocalStatus() == api.NetworkStatusCreated || n.LocalStatus() == api.NetworkStatusUnknown {
-		err := n.Stop()
+	err := n.Stop()
+	if err != nil {
+		return err
+	}
+
+	if clientType == request.ClientTypeNormal {
+		client, err := n.getClient()
 		if err != nil {
 			return err
 		}
 
-		if clientType == request.ClientTypeNormal {
-			client, err := n.getClient()
-			if err != nil {
-				return err
-			}
-
-			err = client.LogicalRouterDelete(n.getRouterName())
-			if err != nil {
-				return err
-			}
+		err = client.LogicalRouterDelete(n.getRouterName())
+		if err != nil {
+			return err
+		}
 
-			err = client.LogicalSwitchDelete(n.getExtSwitchName())
-			if err != nil {
-				return err
-			}
+		err = client.LogicalSwitchDelete(n.getExtSwitchName())
+		if err != nil {
+			return err
+		}
 
-			err = client.LogicalSwitchDelete(n.getIntSwitchName())
-			if err != nil {
-				return err
-			}
+		err = client.LogicalSwitchDelete(n.getIntSwitchName())
+		if err != nil {
+			return err
+		}
 
-			err = client.LogicalRouterPortDelete(n.getRouterExtPortName())
-			if err != nil {
-				return err
-			}
+		err = client.LogicalRouterPortDelete(n.getRouterExtPortName())
+		if err != nil {
+			return err
+		}
 
-			err = client.LogicalRouterPortDelete(n.getRouterIntPortName())
-			if err != nil {
-				return err
-			}
+		err = client.LogicalRouterPortDelete(n.getRouterIntPortName())
+		if err != nil {
+			return err
+		}
 
-			err = client.LogicalSwitchPortDelete(n.getExtSwitchRouterPortName())
-			if err != nil {
-				return err
-			}
+		err = client.LogicalSwitchPortDelete(n.getExtSwitchRouterPortName())
+		if err != nil {
+			return err
+		}
 
-			err = client.LogicalSwitchPortDelete(n.getExtSwitchProviderPortName())
-			if err != nil {
-				return err
-			}
+		err = client.LogicalSwitchPortDelete(n.getExtSwitchProviderPortName())
+		if err != nil {
+			return err
+		}
 
-			err = client.LogicalSwitchPortDelete(n.getIntSwitchRouterPortName())
-			if err != nil {
-				return err
-			}
+		err = client.LogicalSwitchPortDelete(n.getIntSwitchRouterPortName())
+		if err != nil {
+			return err
+		}
 
-			// Must be done after logical router removal.
-			err = client.ChassisGroupDelete(n.getChassisGroupName())
-			if err != nil {
-				return err
-			}
+		// Must be done after logical router removal.
+		err = client.ChassisGroupDelete(n.getChassisGroupName())
+		if err != nil {
+			return err
 		}
 	}
 
diff --git a/lxd/network/driver_physical.go b/lxd/network/driver_physical.go
index 6cf8bd31e1..18fcdc3c8f 100644
--- a/lxd/network/driver_physical.go
+++ b/lxd/network/driver_physical.go
@@ -123,11 +123,9 @@ func (n *physical) Create(clientType request.ClientType) error {
 func (n *physical) Delete(clientType request.ClientType) error {
 	n.logger.Debug("Delete", log.Ctx{"clientType": clientType})
 
-	if n.LocalStatus() == api.NetworkStatusCreated || n.LocalStatus() == api.NetworkStatusUnknown {
-		err := n.Stop()
-		if err != nil {
-			return err
-		}
+	err := n.Stop()
+	if err != nil {
+		return err
 	}
 
 	return n.common.delete(clientType)
diff --git a/lxd/network/driver_sriov.go b/lxd/network/driver_sriov.go
index 988e1ea1cc..2300a573c5 100644
--- a/lxd/network/driver_sriov.go
+++ b/lxd/network/driver_sriov.go
@@ -45,6 +45,7 @@ func (n *sriov) Validate(config map[string]string) error {
 // Delete deletes a network.
 func (n *sriov) Delete(clientType request.ClientType) error {
 	n.logger.Debug("Delete", log.Ctx{"clientType": clientType})
+
 	return n.common.delete(clientType)
 }
 

From 8485fc2937ded3135e4272df869d4c7f3eb203db Mon Sep 17 00:00:00 2001
From: Thomas Parrott <thomas.parrott at canonical.com>
Date: Fri, 11 Dec 2020 14:49:28 +0000
Subject: [PATCH 5/8] lxc/networks: Remove revert removal on failure of
 clustered network in networksPost

We need to leave network around in pending state so errors can be corrected.

Signed-off-by: Thomas Parrott <thomas.parrott at canonical.com>
---
 lxd/networks.go | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/lxd/networks.go b/lxd/networks.go
index c4e2e82b66..2f4a0b92c1 100644
--- a/lxd/networks.go
+++ b/lxd/networks.go
@@ -207,9 +207,6 @@ func networksPost(d *Daemon, r *http.Request) response.Response {
 		return resp
 	}
 
-	revert := revert.New()
-	defer revert.Fail()
-
 	targetNode := queryParam(r, "target")
 	if targetNode != "" {
 		if !netTypeInfo.NodeSpecificConfig {
@@ -246,10 +243,6 @@ func networksPost(d *Daemon, r *http.Request) response.Response {
 	if count > 1 {
 		// Simulate adding pending node network config when the driver doesn't support per-node config.
 		if !netTypeInfo.NodeSpecificConfig && clientType != request.ClientTypeJoiner {
-			revert.Add(func() {
-				d.cluster.DeleteNetwork(projectName, req.Name)
-			})
-
 			// Create pending entry for each node.
 			err = d.cluster.Transaction(func(tx *db.ClusterTx) error {
 				nodes, err := tx.GetNodes()
@@ -276,11 +269,13 @@ func networksPost(d *Daemon, r *http.Request) response.Response {
 			return response.SmartError(err)
 		}
 
-		revert.Success()
 		return resp
 	}
 
 	// Non-clustered network creation.
+	revert := revert.New()
+	defer revert.Fail()
+
 	networks, err := d.cluster.GetNetworks(projectName)
 	if err != nil {
 		return response.InternalError(err)
@@ -301,9 +296,7 @@ func networksPost(d *Daemon, r *http.Request) response.Response {
 	if err != nil {
 		return response.SmartError(errors.Wrapf(err, "Error inserting %q into database", req.Name))
 	}
-	revert.Add(func() {
-		d.cluster.DeleteNetwork(projectName, req.Name)
-	})
+	revert.Add(func() { d.cluster.DeleteNetwork(projectName, req.Name) })
 
 	err = doNetworksCreate(d, projectName, req, clientType)
 	if err != nil {

From 0de956cd8e7ace5af4b454d8210c62ad5d25d2e6 Mon Sep 17 00:00:00 2001
From: Thomas Parrott <thomas.parrott at canonical.com>
Date: Fri, 11 Dec 2020 14:50:48 +0000
Subject: [PATCH 6/8] lxd/networks: Allow re-create of pending network when
 pending nodes already exist in networksPost

This is needed as some network types (ovn) auto create the pending node records for each node without using the --target argument,

Signed-off-by: Thomas Parrott <thomas.parrott at canonical.com>
---
 lxd/networks.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lxd/networks.go b/lxd/networks.go
index 2f4a0b92c1..03b068fad8 100644
--- a/lxd/networks.go
+++ b/lxd/networks.go
@@ -252,7 +252,7 @@ func networksPost(d *Daemon, r *http.Request) response.Response {
 
 				for _, node := range nodes {
 					err = tx.CreatePendingNetwork(node.Name, projectName, req.Name, netType.DBType(), req.Config)
-					if err != nil {
+					if err != nil && errors.Cause(err) != db.ErrAlreadyDefined {
 						return errors.Wrapf(err, "Failed creating pending network for node %q", node.Name)
 					}
 				}

From 5b23efe39ab897bcbea474d3abbb5623d0f25236 Mon Sep 17 00:00:00 2001
From: Thomas Parrott <thomas.parrott at canonical.com>
Date: Fri, 11 Dec 2020 14:51:42 +0000
Subject: [PATCH 7/8] lxd/networks: Adds revert to doNetworksCreate

Brings into line with storagePoolCreateLocal.

Signed-off-by: Thomas Parrott <thomas.parrott at canonical.com>
---
 lxd/networks.go | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/lxd/networks.go b/lxd/networks.go
index 03b068fad8..2426e00d2d 100644
--- a/lxd/networks.go
+++ b/lxd/networks.go
@@ -429,6 +429,9 @@ func networksPostCluster(d *Daemon, projectName string, req api.NetworksPost, cl
 // Create the network on the system. The clusterNotification flag is used to indicate whether creation request
 // is coming from a cluster notification (and if so we should not delete the database record on error).
 func doNetworksCreate(d *Daemon, projectName string, req api.NetworksPost, clientType request.ClientType) error {
+	revert := revert.New()
+	defer revert.Fail()
+
 	// Start the network.
 	n, err := network.LoadByName(d.State(), projectName, req.Name)
 	if err != nil {
@@ -452,16 +455,13 @@ func doNetworksCreate(d *Daemon, projectName string, req api.NetworksPost, clien
 		return err
 	}
 
+	revert.Add(func() { n.Delete(clientType) })
+
 	// Only start networks when not doing a cluster pre-join phase (this ensures that networks are only started
 	// once the node has fully joined the clustered database and has consistent config with rest of the nodes).
 	if clientType != request.ClientTypeJoiner {
 		err = n.Start()
 		if err != nil {
-			delErr := n.Delete(clientType)
-			if delErr != nil {
-				logger.Error("Failed clearing up network after failed create", log.Ctx{"project": projectName, "network": n.Name(), "err": delErr})
-			}
-
 			return err
 		}
 	}
@@ -471,14 +471,11 @@ func doNetworksCreate(d *Daemon, projectName string, req api.NetworksPost, clien
 		return tx.NetworkNodeCreated(n.ID())
 	})
 	if err != nil {
-		delErr := n.Delete(clientType)
-		if delErr != nil {
-			logger.Error("Failed clearing up network after failed local status update", log.Ctx{"project": projectName, "network": n.Name(), "err": delErr})
-		}
 		return err
 	}
 	logger.Debug("Marked network local status as created", log.Ctx{"project": projectName, "network": req.Name})
 
+	revert.Success()
 	return nil
 }
 

From 0d413c57563eb91ac002458dc2b536dd0d5cded3 Mon Sep 17 00:00:00 2001
From: Thomas Parrott <thomas.parrott at canonical.com>
Date: Fri, 11 Dec 2020 14:52:25 +0000
Subject: [PATCH 8/8] lxd/networks: Moves cluster notification an DB record
 removal into networkDelete

This is so that only specific requests to delete the network trigger full removal, whereas using n.Delete() in a revert during create only deletes the local network setup, not the DB record.

Signed-off-by: Thomas Parrott <thomas.parrott at canonical.com>
---
 lxd/networks.go | 35 +++++++++++++++++++++++++++++++++--
 1 file changed, 33 insertions(+), 2 deletions(-)

diff --git a/lxd/networks.go b/lxd/networks.go
index 2426e00d2d..daf238facf 100644
--- a/lxd/networks.go
+++ b/lxd/networks.go
@@ -616,8 +616,39 @@ func networkDelete(d *Daemon, r *http.Request) response.Response {
 		}
 	}
 
-	// Delete the network from each member.
-	err = n.Delete(clientType)
+	if n.LocalStatus() != api.NetworkStatusPending {
+		err = n.Delete(clientType)
+		if err != nil {
+			return response.InternalError(err)
+		}
+	}
+
+	// If this is a cluster notification, we're done, any database work will be done by the node that is
+	// originally serving the request.
+	if clusterNotification {
+		return response.EmptySyncResponse
+	}
+
+	// If we are clustered, also notify all other nodes, if any.
+	clustered, err := cluster.Enabled(d.db)
+	if err != nil {
+		return response.SmartError(err)
+	}
+	if clustered {
+		notifier, err := cluster.NewNotifier(d.State(), d.endpoints.NetworkCert(), cluster.NotifyAll)
+		if err != nil {
+			return response.SmartError(err)
+		}
+		err = notifier(func(client lxd.InstanceServer) error {
+			return client.UseProject(n.Project()).DeleteNetwork(n.Name())
+		})
+		if err != nil {
+			return response.SmartError(err)
+		}
+	}
+
+	// Remove the network from the database.
+	err = d.State().Cluster.DeleteNetwork(n.Project(), n.Name())
 	if err != nil {
 		return response.SmartError(err)
 	}


More information about the lxc-devel mailing list