[lxc-devel] [lxd/master] VM: Adds SR-IOV NIC support

tomponline on Github lxc-bot at linuxcontainers.org
Mon Jan 27 17:55:50 UTC 2020


A non-text attachment was scrubbed...
Name: not available
Type: text/x-mailbox
Size: 346 bytes
Desc: not available
URL: <http://lists.linuxcontainers.org/pipermail/lxc-devel/attachments/20200127/b5b9a261/attachment-0001.bin>
-------------- next part --------------
From c04da5d4aac94bc859d76a8b355cb09a0d4f3603 Mon Sep 17 00:00:00 2001
From: Thomas Parrott <thomas.parrott at canonical.com>
Date: Thu, 23 Jan 2020 17:20:06 +0000
Subject: [PATCH 01/15] lxd/container/lxc: Removes VM specific NIC config
 ignoring

The devName key is no longer returned for containers, so it no longer needs to be skipped.

Signed-off-by: Thomas Parrott <thomas.parrott at canonical.com>
---
 lxd/container_lxc.go | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/lxd/container_lxc.go b/lxd/container_lxc.go
index 895ca69ee9..0a4497f47e 100644
--- a/lxd/container_lxc.go
+++ b/lxd/container_lxc.go
@@ -2171,10 +2171,6 @@ func (c *containerLXC) startCommon() (string, []func() error, error) {
 			}
 
 			for _, nicItem := range runConf.NetworkInterface {
-				if nicItem.Key == "devName" {
-					// Skip internal device name key, not used by liblxc.
-					continue
-				}
 				err = lxcSetConfigItem(c.c, fmt.Sprintf("%s.%d.%s", networkKeyPrefix, nicID, nicItem.Key), nicItem.Value)
 				if err != nil {
 					return "", postStartHooks, errors.Wrapf(err, "Failed to setup device network interface '%s'", dev.Name)

From ac6520f5ad03e4a2a89309060bb8de8cf0eccf0d Mon Sep 17 00:00:00 2001
From: Thomas Parrott <thomas.parrott at canonical.com>
Date: Thu, 23 Jan 2020 17:20:56 +0000
Subject: [PATCH 02/15] lxd/device: Only return devName NIC config item for VMs

Signed-off-by: Thomas Parrott <thomas.parrott at canonical.com>
---
 lxd/device/infiniband_physical.go | 8 +++++++-
 lxd/device/infiniband_sriov.go    | 1 -
 lxd/device/nic_bridged.go         | 7 ++++---
 lxd/device/nic_macvlan.go         | 7 ++++---
 lxd/device/nic_p2p.go             | 7 ++++---
 lxd/device/nic_physical.go        | 8 +++++++-
 lxd/device/nic_sriov.go           | 1 -
 7 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/lxd/device/infiniband_physical.go b/lxd/device/infiniband_physical.go
index 6c6947b5bc..482f39d817 100644
--- a/lxd/device/infiniband_physical.go
+++ b/lxd/device/infiniband_physical.go
@@ -116,13 +116,19 @@ func (d *infinibandPhysical) Start() (*deviceConfig.RunConfig, error) {
 	}
 
 	runConf.NetworkInterface = []deviceConfig.RunConfigItem{
-		{Key: "devName", Value: d.name},
 		{Key: "name", Value: d.config["name"]},
 		{Key: "type", Value: "phys"},
 		{Key: "flags", Value: "up"},
 		{Key: "link", Value: saveData["host_name"]},
 	}
 
+	if d.inst.Type() == instancetype.VM {
+		runConf.NetworkInterface = append(runConf.NetworkInterface,
+			[]deviceConfig.RunConfigItem{
+				{Key: "devName", Value: d.name},
+			}...)
+	}
+
 	return &runConf, nil
 }
 
diff --git a/lxd/device/infiniband_sriov.go b/lxd/device/infiniband_sriov.go
index cb673e5ee5..b6932e9753 100644
--- a/lxd/device/infiniband_sriov.go
+++ b/lxd/device/infiniband_sriov.go
@@ -138,7 +138,6 @@ func (d *infinibandSRIOV) Start() (*deviceConfig.RunConfig, error) {
 	}
 
 	runConf.NetworkInterface = []deviceConfig.RunConfigItem{
-		{Key: "devName", Value: d.name},
 		{Key: "name", Value: d.config["name"]},
 		{Key: "type", Value: "phys"},
 		{Key: "flags", Value: "up"},
diff --git a/lxd/device/nic_bridged.go b/lxd/device/nic_bridged.go
index dd1e534799..7633dac620 100644
--- a/lxd/device/nic_bridged.go
+++ b/lxd/device/nic_bridged.go
@@ -176,7 +176,6 @@ func (d *nicBridged) Start() (*deviceConfig.RunConfig, error) {
 
 	runConf := deviceConfig.RunConfig{}
 	runConf.NetworkInterface = []deviceConfig.RunConfigItem{
-		{Key: "devName", Value: d.name},
 		{Key: "name", Value: d.config["name"]},
 		{Key: "type", Value: "phys"},
 		{Key: "flags", Value: "up"},
@@ -185,8 +184,10 @@ func (d *nicBridged) Start() (*deviceConfig.RunConfig, error) {
 
 	if d.inst.Type() == instancetype.VM {
 		runConf.NetworkInterface = append(runConf.NetworkInterface,
-			deviceConfig.RunConfigItem{Key: "hwaddr", Value: d.config["hwaddr"]},
-		)
+			[]deviceConfig.RunConfigItem{
+				{Key: "devName", Value: d.name},
+				{Key: "hwaddr", Value: d.config["hwaddr"]},
+			}...)
 	}
 
 	return &runConf, nil
diff --git a/lxd/device/nic_macvlan.go b/lxd/device/nic_macvlan.go
index 43cb93a651..131f5d8bdc 100644
--- a/lxd/device/nic_macvlan.go
+++ b/lxd/device/nic_macvlan.go
@@ -134,7 +134,6 @@ func (d *nicMACVLAN) Start() (*deviceConfig.RunConfig, error) {
 
 	runConf := deviceConfig.RunConfig{}
 	runConf.NetworkInterface = []deviceConfig.RunConfigItem{
-		{Key: "devName", Value: d.name},
 		{Key: "name", Value: d.config["name"]},
 		{Key: "type", Value: "phys"},
 		{Key: "flags", Value: "up"},
@@ -143,8 +142,10 @@ func (d *nicMACVLAN) Start() (*deviceConfig.RunConfig, error) {
 
 	if d.inst.Type() == instancetype.VM {
 		runConf.NetworkInterface = append(runConf.NetworkInterface,
-			deviceConfig.RunConfigItem{Key: "hwaddr", Value: d.config["hwaddr"]},
-		)
+			[]deviceConfig.RunConfigItem{
+				{Key: "devName", Value: d.name},
+				{Key: "hwaddr", Value: d.config["hwaddr"]},
+			}...)
 	}
 
 	revert.Success()
diff --git a/lxd/device/nic_p2p.go b/lxd/device/nic_p2p.go
index 0bfa97462f..2043597110 100644
--- a/lxd/device/nic_p2p.go
+++ b/lxd/device/nic_p2p.go
@@ -97,7 +97,6 @@ func (d *nicP2P) Start() (*deviceConfig.RunConfig, error) {
 
 	runConf := deviceConfig.RunConfig{}
 	runConf.NetworkInterface = []deviceConfig.RunConfigItem{
-		{Key: "devName", Value: d.name},
 		{Key: "name", Value: d.config["name"]},
 		{Key: "type", Value: "phys"},
 		{Key: "flags", Value: "up"},
@@ -106,8 +105,10 @@ func (d *nicP2P) Start() (*deviceConfig.RunConfig, error) {
 
 	if d.inst.Type() == instancetype.VM {
 		runConf.NetworkInterface = append(runConf.NetworkInterface,
-			deviceConfig.RunConfigItem{Key: "hwaddr", Value: d.config["hwaddr"]},
-		)
+			[]deviceConfig.RunConfigItem{
+				{Key: "devName", Value: d.name},
+				{Key: "hwaddr", Value: d.config["hwaddr"]},
+			}...)
 	}
 
 	return &runConf, nil
diff --git a/lxd/device/nic_physical.go b/lxd/device/nic_physical.go
index 54d98c6a0d..5a043db9e0 100644
--- a/lxd/device/nic_physical.go
+++ b/lxd/device/nic_physical.go
@@ -111,13 +111,19 @@ func (d *nicPhysical) Start() (*deviceConfig.RunConfig, error) {
 
 	runConf := deviceConfig.RunConfig{}
 	runConf.NetworkInterface = []deviceConfig.RunConfigItem{
-		{Key: "devName", Value: d.name},
 		{Key: "name", Value: d.config["name"]},
 		{Key: "type", Value: "phys"},
 		{Key: "flags", Value: "up"},
 		{Key: "link", Value: saveData["host_name"]},
 	}
 
+	if d.inst.Type() == instancetype.VM {
+		runConf.NetworkInterface = append(runConf.NetworkInterface,
+			[]deviceConfig.RunConfigItem{
+				{Key: "devName", Value: d.name},
+			}...)
+	}
+
 	return &runConf, nil
 }
 
diff --git a/lxd/device/nic_sriov.go b/lxd/device/nic_sriov.go
index b9fb4a8f30..d57c52604c 100644
--- a/lxd/device/nic_sriov.go
+++ b/lxd/device/nic_sriov.go
@@ -113,7 +113,6 @@ func (d *nicSRIOV) Start() (*deviceConfig.RunConfig, error) {
 
 	runConf := deviceConfig.RunConfig{}
 	runConf.NetworkInterface = []deviceConfig.RunConfigItem{
-		{Key: "devName", Value: d.name},
 		{Key: "name", Value: d.config["name"]},
 		{Key: "type", Value: "phys"},
 		{Key: "flags", Value: "up"},

From 92208250966e5d8ce179799a190ec97010d8c1b6 Mon Sep 17 00:00:00 2001
From: Thomas Parrott <thomas.parrott at canonical.com>
Date: Thu, 23 Jan 2020 17:50:22 +0000
Subject: [PATCH 03/15] lxd/device/nic/physical: Improves revert and deletion
 of created VLAN devices

Signed-off-by: Thomas Parrott <thomas.parrott at canonical.com>
---
 lxd/device/nic_physical.go | 37 +++++++++++++++++++++++++------------
 1 file changed, 25 insertions(+), 12 deletions(-)

diff --git a/lxd/device/nic_physical.go b/lxd/device/nic_physical.go
index 5a043db9e0..e8470de7ef 100644
--- a/lxd/device/nic_physical.go
+++ b/lxd/device/nic_physical.go
@@ -5,6 +5,7 @@ import (
 
 	deviceConfig "github.com/lxc/lxd/lxd/device/config"
 	"github.com/lxc/lxd/lxd/instance/instancetype"
+	"github.com/lxc/lxd/lxd/revert"
 	"github.com/lxc/lxd/shared"
 )
 
@@ -62,6 +63,9 @@ func (d *nicPhysical) Start() (*deviceConfig.RunConfig, error) {
 
 	saveData := make(map[string]string)
 
+	revert := revert.New()
+	defer revert.Fail()
+
 	// Record the host_name device used for restoration later.
 	saveData["host_name"] = NetworkGetHostDevice(d.config["parent"], d.config["vlan"])
 	statusDev, err := NetworkCreateVlanDeviceIfNeeded(d.state, d.config["parent"], saveData["host_name"], d.config["vlan"])
@@ -72,16 +76,15 @@ func (d *nicPhysical) Start() (*deviceConfig.RunConfig, error) {
 	// Record whether we created this device or not so it can be removed on stop.
 	saveData["last_state.created"] = fmt.Sprintf("%t", statusDev != "existing")
 
-	// If we return from this function with an error, ensure we clean up created device.
-	defer func() {
-		if err != nil && statusDev == "created" {
-			NetworkRemoveInterface(saveData["host_name"])
-		}
-	}()
+	if shared.IsTrue(saveData["last_state.created"]) {
+		revert.Add(func() {
+			NetworkRemoveInterfaceIfNeeded(d.state, saveData["host_name"], d.inst, d.config["parent"], d.config["vlan"])
+		})
+	}
 
-	// If we didn't create the device we should track various properties so we can
-	// restore them when the instance is stopped or the device is detached.
-	if statusDev == "existing" {
+	// If we didn't create the device we should track various properties so we can restore them when the
+	// instance is stopped or the device is detached.
+	if !shared.IsTrue(saveData["last_state.created"]) {
 		err = networkSnapshotPhysicalNic(saveData["host_name"], saveData)
 		if err != nil {
 			return nil, err
@@ -124,6 +127,7 @@ func (d *nicPhysical) Start() (*deviceConfig.RunConfig, error) {
 			}...)
 	}
 
+	revert.Success()
 	return &runConf, nil
 }
 
@@ -151,9 +155,18 @@ func (d *nicPhysical) postStop() error {
 
 	v := d.volatileGet()
 	hostName := NetworkGetHostDevice(d.config["parent"], d.config["vlan"])
-	err := networkRestorePhysicalNic(hostName, v)
-	if err != nil {
-		return err
+
+	// This will delete the parent interface if we created it for VLAN parent.
+	if shared.IsTrue(v["last_state.created"]) {
+		err := NetworkRemoveInterfaceIfNeeded(d.state, hostName, d.inst, d.config["parent"], d.config["vlan"])
+		if err != nil {
+			return err
+		}
+	} else {
+		err := networkRestorePhysicalNic(hostName, v)
+		if err != nil {
+			return err
+		}
 	}
 
 	return nil

From 07883a1918ce975b857b6c5ddc72efb59c8bb72d Mon Sep 17 00:00:00 2001
From: Thomas Parrott <thomas.parrott at canonical.com>
Date: Thu, 23 Jan 2020 17:51:13 +0000
Subject: [PATCH 04/15] lxd/instance/drivers/driver/qemu/templates: Clarifies
 qemuNetdevPhysical variables

Signed-off-by: Thomas Parrott <thomas.parrott at canonical.com>
---
 lxd/instance/drivers/driver_qemu_templates.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lxd/instance/drivers/driver_qemu_templates.go b/lxd/instance/drivers/driver_qemu_templates.go
index af15de4f6a..a557dc127a 100644
--- a/lxd/instance/drivers/driver_qemu_templates.go
+++ b/lxd/instance/drivers/driver_qemu_templates.go
@@ -220,6 +220,6 @@ var qemuNetdevPhysical = template.Must(template.New("qemuNetdevPhysical").Parse(
 # Network card ("{{.devName}}" device)
 [device "dev-lxd_{{.devName}}"]
 driver = "vfio-pci"
-host = "{{.host}}"
+host = "{{.pciSlotName}}"
 bootindex = "{{.bootIndex}}"
 `))

From 3a7188c004499cc087af9bd3939be215fb61e18a Mon Sep 17 00:00:00 2001
From: Thomas Parrott <thomas.parrott at canonical.com>
Date: Fri, 24 Jan 2020 09:30:12 +0000
Subject: [PATCH 05/15] lxd/device/nic/macvlan: Differentiates config parent
 from actual parent

Signed-off-by: Thomas Parrott <thomas.parrott at canonical.com>
---
 lxd/device/nic_macvlan.go | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/lxd/device/nic_macvlan.go b/lxd/device/nic_macvlan.go
index 131f5d8bdc..65401ebcc9 100644
--- a/lxd/device/nic_macvlan.go
+++ b/lxd/device/nic_macvlan.go
@@ -67,13 +67,13 @@ func (d *nicMACVLAN) Start() (*deviceConfig.RunConfig, error) {
 	saveData := make(map[string]string)
 
 	// Decide which parent we should use based on VLAN setting.
-	parentName := NetworkGetHostDevice(d.config["parent"], d.config["vlan"])
+	actualParentName := NetworkGetHostDevice(d.config["parent"], d.config["vlan"])
 
 	// Record the temporary device name used for deletion later.
 	saveData["host_name"] = NetworkRandomDevName("mac")
 
 	// Create VLAN parent device if needed.
-	statusDev, err := NetworkCreateVlanDeviceIfNeeded(d.state, d.config["parent"], parentName, d.config["vlan"])
+	statusDev, err := NetworkCreateVlanDeviceIfNeeded(d.state, d.config["parent"], actualParentName, d.config["vlan"])
 	if err != nil {
 		return nil, err
 	}
@@ -83,19 +83,19 @@ func (d *nicMACVLAN) Start() (*deviceConfig.RunConfig, error) {
 
 	if shared.IsTrue(saveData["last_state.created"]) {
 		revert.Add(func() {
-			NetworkRemoveInterfaceIfNeeded(d.state, parentName, d.inst, d.config["parent"], d.config["vlan"])
+			NetworkRemoveInterfaceIfNeeded(d.state, actualParentName, d.inst, d.config["parent"], d.config["vlan"])
 		})
 	}
 
 	if d.inst.Type() == instancetype.Container {
 		// Create MACVLAN interface.
-		_, err = shared.RunCommand("ip", "link", "add", "dev", saveData["host_name"], "link", parentName, "type", "macvlan", "mode", "bridge")
+		_, err = shared.RunCommand("ip", "link", "add", "dev", saveData["host_name"], "link", actualParentName, "type", "macvlan", "mode", "bridge")
 		if err != nil {
 			return nil, err
 		}
 	} else if d.inst.Type() == instancetype.VM {
 		// Create MACVTAP interface.
-		_, err = shared.RunCommand("ip", "link", "add", "dev", saveData["host_name"], "link", parentName, "type", "macvtap", "mode", "bridge")
+		_, err = shared.RunCommand("ip", "link", "add", "dev", saveData["host_name"], "link", actualParentName, "type", "macvtap", "mode", "bridge")
 		if err != nil {
 			return nil, err
 		}
@@ -187,8 +187,8 @@ func (d *nicMACVLAN) postStop() error {
 
 	// This will delete the parent interface if we created it for VLAN parent.
 	if shared.IsTrue(v["last_state.created"]) {
-		parentName := NetworkGetHostDevice(d.config["parent"], d.config["vlan"])
-		err := NetworkRemoveInterfaceIfNeeded(d.state, parentName, d.inst, d.config["parent"], d.config["vlan"])
+		actualParentName := NetworkGetHostDevice(d.config["parent"], d.config["vlan"])
+		err := NetworkRemoveInterfaceIfNeeded(d.state, actualParentName, d.inst, d.config["parent"], d.config["vlan"])
 		if err != nil {
 			errs = append(errs, err)
 		}

From 4f4d0837f2b1264b6957809e2afd17d128999406 Mon Sep 17 00:00:00 2001
From: Thomas Parrott <thomas.parrott at canonical.com>
Date: Fri, 24 Jan 2020 09:31:09 +0000
Subject: [PATCH 06/15] lxd/device/device/utils/network: Adds
 networkGetDevicePCIDevice function

Signed-off-by: Thomas Parrott <thomas.parrott at canonical.com>
---
 lxd/device/device_utils_network.go | 44 ++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/lxd/device/device_utils_network.go b/lxd/device/device_utils_network.go
index b2fc04fbd0..a37b69aaef 100644
--- a/lxd/device/device_utils_network.go
+++ b/lxd/device/device_utils_network.go
@@ -783,3 +783,47 @@ func networkParsePortRange(r string) (int64, int64, error) {
 
 	return base, size, nil
 }
+
+// pciDevice represents info about a PCI uevent device.
+type pciDevice struct {
+	ID       string
+	SlotName string
+	Driver   string
+}
+
+// networkGetDevicePCIDevice returns the PCI device info for a given uevent file.
+func networkGetDevicePCIDevice(ueventFilePath string) (pciDevice, error) {
+	dev := pciDevice{}
+
+	file, err := os.Open(ueventFilePath)
+	if err != nil {
+		return dev, err
+	}
+	defer file.Close()
+
+	scanner := bufio.NewScanner(file)
+	for scanner.Scan() {
+		// Looking for something like this "PCI_SLOT_NAME=0000:05:10.0"
+		fields := strings.SplitN(scanner.Text(), "=", 2)
+		if len(fields) == 2 {
+			if fields[0] == "PCI_SLOT_NAME" {
+				dev.SlotName = fields[1]
+			} else if fields[0] == "PCI_ID" {
+				dev.ID = fields[1]
+			} else if fields[0] == "DRIVER" {
+				dev.Driver = fields[1]
+			}
+		}
+	}
+
+	err = scanner.Err()
+	if err != nil {
+		return dev, err
+	}
+
+	if dev.SlotName == "" {
+		return dev, fmt.Errorf("Device uevent file could not be parsed")
+	}
+
+	return dev, nil
+}

From 5c97f7bd686ff02afd33d7862a9731e6b8d52eca Mon Sep 17 00:00:00 2001
From: Thomas Parrott <thomas.parrott at canonical.com>
Date: Fri, 24 Jan 2020 09:31:47 +0000
Subject: [PATCH 07/15] lxd/device/nic/sriov: Updates networkGetVFDevicePCISlot
 to use networkGetDevicePCIDevice

Signed-off-by: Thomas Parrott <thomas.parrott at canonical.com>
---
 lxd/device/nic_sriov.go | 21 +++------------------
 1 file changed, 3 insertions(+), 18 deletions(-)

diff --git a/lxd/device/nic_sriov.go b/lxd/device/nic_sriov.go
index d57c52604c..03e2a63ccb 100644
--- a/lxd/device/nic_sriov.go
+++ b/lxd/device/nic_sriov.go
@@ -5,7 +5,6 @@ import (
 	"bytes"
 	"fmt"
 	"io/ioutil"
-	"os"
 	"os/exec"
 	"path/filepath"
 	"regexp"
@@ -483,27 +482,13 @@ func (d *nicSRIOV) networkGetVirtFuncInfo(devName string, vfID int) (vf virtFunc
 
 // networkGetVFDevicePCISlot returns the PCI slot name for a network virtual function device.
 func (d *nicSRIOV) networkGetVFDevicePCISlot(vfID string) (string, error) {
-	file, err := os.Open(fmt.Sprintf("/sys/class/net/%s/device/virtfn%s/uevent", d.config["parent"], vfID))
+	ueventFile := fmt.Sprintf("/sys/class/net/%s/device/virtfn%s/uevent", d.config["parent"], vfID)
+	pciDev, err := networkGetDevicePCIDevice(ueventFile)
 	if err != nil {
 		return "", err
 	}
-	defer file.Close()
 
-	scanner := bufio.NewScanner(file)
-	for scanner.Scan() {
-		// Looking for something like this "PCI_SLOT_NAME=0000:05:10.0"
-		fields := strings.SplitN(scanner.Text(), "=", 2)
-		if len(fields) == 2 && fields[0] == "PCI_SLOT_NAME" {
-			return fields[1], nil
-		}
-	}
-
-	err = scanner.Err()
-	if err != nil {
-		return "", err
-	}
-
-	return "", fmt.Errorf("PCI_SLOT_NAME not found")
+	return pciDev.SlotName, nil
 }
 
 // networkGetVFDeviceDriverPath returns the path to the network virtual function device driver in /sys.

From bc95615a83476260c594db61660ed686c7e5e038 Mon Sep 17 00:00:00 2001
From: Thomas Parrott <thomas.parrott at canonical.com>
Date: Fri, 24 Jan 2020 09:36:00 +0000
Subject: [PATCH 08/15] lxd/instance/drivers/driver/qemu: Adds physical NIC
 passthrough support

Signed-off-by: Thomas Parrott <thomas.parrott at canonical.com>
---
 lxd/instance/drivers/driver_qemu.go | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/lxd/instance/drivers/driver_qemu.go b/lxd/instance/drivers/driver_qemu.go
index ef0f5dc99b..3495aed123 100644
--- a/lxd/instance/drivers/driver_qemu.go
+++ b/lxd/instance/drivers/driver_qemu.go
@@ -1457,7 +1457,7 @@ func (vm *qemu) addDriveConfig(sb *strings.Builder, bootIndexes map[string]int,
 
 // addNetDevConfig adds the qemu config required for adding a network device.
 func (vm *qemu) addNetDevConfig(sb *strings.Builder, nicIndex int, bootIndexes map[string]int, nicConfig []deviceConfig.RunConfigItem, fdFiles *[]string) error {
-	var devName, nicName, devHwaddr string
+	var devName, nicName, devHwaddr, pciSlotName string
 	for _, nicItem := range nicConfig {
 		if nicItem.Key == "devName" {
 			devName = nicItem.Value
@@ -1465,6 +1465,8 @@ func (vm *qemu) addNetDevConfig(sb *strings.Builder, nicIndex int, bootIndexes m
 			nicName = nicItem.Value
 		} else if nicItem.Key == "hwaddr" {
 			devHwaddr = nicItem.Value
+		} else if nicItem.Key == "pciSlotName" {
+			pciSlotName = nicItem.Value
 		}
 	}
 
@@ -1499,6 +1501,10 @@ func (vm *qemu) addNetDevConfig(sb *strings.Builder, nicIndex int, bootIndexes m
 		// Detect TAP (via TUN driver) device.
 		tplFields["ifName"] = nicName
 		tpl = qemuNetDevTapTun
+	} else if pciSlotName != "" {
+		// Detect physical passthrough device.
+		tplFields["pciSlotName"] = pciSlotName
+		tpl = qemuNetdevPhysical
 	}
 
 	if tpl != nil {

From 8065e5061685cfbe10386f968faa13e33aa3be1a Mon Sep 17 00:00:00 2001
From: Thomas Parrott <thomas.parrott at canonical.com>
Date: Fri, 24 Jan 2020 12:01:48 +0000
Subject: [PATCH 09/15] shared/instance: Updates config key checker to allow
 ".driver" keys

Signed-off-by: Thomas Parrott <thomas.parrott at canonical.com>
---
 shared/instance.go | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/shared/instance.go b/shared/instance.go
index 1264e991c7..7830c0e806 100644
--- a/shared/instance.go
+++ b/shared/instance.go
@@ -418,6 +418,10 @@ func ConfigKeyChecker(key string) (func(value string) error, error) {
 		if strings.HasSuffix(key, ".ceph_rbd") {
 			return IsAny, nil
 		}
+
+		if strings.HasSuffix(key, ".driver") {
+			return IsAny, nil
+		}
 	}
 
 	if strings.HasPrefix(key, "environment.") {

From 7efa6875cd943c7f1fcd4040611c926167d8a5de Mon Sep 17 00:00:00 2001
From: Thomas Parrott <thomas.parrott at canonical.com>
Date: Mon, 27 Jan 2020 16:09:13 +0000
Subject: [PATCH 10/15] lxd/device/device/utils/network: Adds generic PCI
 device bind/unbind functions

To be used with both physical VM NICs and sriov NICs.

Signed-off-by: Thomas Parrott <thomas.parrott at canonical.com>
---
 lxd/device/device_utils_network.go | 51 ++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

diff --git a/lxd/device/device_utils_network.go b/lxd/device/device_utils_network.go
index a37b69aaef..0d419a1588 100644
--- a/lxd/device/device_utils_network.go
+++ b/lxd/device/device_utils_network.go
@@ -12,6 +12,7 @@ import (
 	"strconv"
 	"strings"
 	"sync"
+	"time"
 
 	"github.com/pkg/errors"
 
@@ -827,3 +828,53 @@ func networkGetDevicePCIDevice(ueventFilePath string) (pciDevice, error) {
 
 	return dev, nil
 }
+
+// networkDeviceUnbind unbinds a network device from the OS using its PCI Slot Name and driver name.
+func networkDeviceUnbind(pciDev pciDevice) error {
+	driverUnbindPath := fmt.Sprintf("/sys/bus/pci/drivers/%s/unbind", pciDev.Driver)
+	err := ioutil.WriteFile(driverUnbindPath, []byte(pciDev.SlotName), 0600)
+	if err != nil {
+		return errors.Wrapf(err, "Failed unbinding device %q via %q", pciDev.SlotName, driverUnbindPath)
+	}
+
+	return nil
+}
+
+// networkDeviceBind binds a network device to the OS using its PCI Slot Name and driver name.
+func networkDeviceBind(pciDev pciDevice) error {
+	driverBindPath := fmt.Sprintf("/sys/bus/pci/drivers/%s/bind", pciDev.Driver)
+	err := ioutil.WriteFile(driverBindPath, []byte(pciDev.SlotName), 0600)
+	if err != nil {
+		return errors.Wrapf(err, "Failed binding device %q via %q", pciDev.SlotName, driverBindPath)
+	}
+
+	return nil
+}
+
+// networkDeviceBindWait waits for a network device to appear after being bound to a driver.
+func networkDeviceBindWait(pciDev pciDevice) error {
+	devicePath := fmt.Sprintf("/sys/bus/pci/drivers/%s/%s", pciDev.Driver, pciDev.SlotName)
+
+	for i := 0; i < 10; i++ {
+		if shared.PathExists(devicePath) {
+			return nil
+		}
+
+		time.Sleep(50 * time.Millisecond)
+	}
+
+	return fmt.Errorf("Bind of device %q took too long", devicePath)
+}
+
+// networkInterfaceBindWait waits for a network interface to appear after being bound to a driver.
+func networkInterfaceBindWait(ifName string) error {
+	for i := 0; i < 10; i++ {
+		if shared.PathExists(fmt.Sprintf("/sys/class/net/%s", ifName)) {
+			return nil
+		}
+
+		time.Sleep(50 * time.Millisecond)
+	}
+
+	return fmt.Errorf("Bind of interface %q took too long", ifName)
+}

From 829ce0264eef50f267fcba31848ccdd0f596d2d1 Mon Sep 17 00:00:00 2001
From: Thomas Parrott <thomas.parrott at canonical.com>
Date: Mon, 27 Jan 2020 17:07:15 +0000
Subject: [PATCH 11/15] lxd/device/device/utils/network: Adds
 networkVFIOPCIRegister

Allows a PCI device to be registered with the vfio-pci driver.

Signed-off-by: Thomas Parrott <thomas.parrott at canonical.com>
---
 lxd/device/device_utils_network.go | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/lxd/device/device_utils_network.go b/lxd/device/device_utils_network.go
index 0d419a1588..289bb8efe3 100644
--- a/lxd/device/device_utils_network.go
+++ b/lxd/device/device_utils_network.go
@@ -878,3 +878,21 @@ func networkInterfaceBindWait(ifName string) error {
 
 	return fmt.Errorf("Bind of interface %q took too long", ifName)
 }
+
+// networkVFIOPCIRegister registers the PCI device with the VFIO-PCI driver.
+// Should also bind the device to the vfio-pci driver if it is present. Requires that the vfio-pci module is loaded.
+func networkVFIOPCIRegister(pciDev pciDevice) error {
+	// vfio-pci module takes device IDs as "n n" but networkGetDevicePCIDevice returns them as "n:n".
+	devIDParts := strings.SplitN(pciDev.ID, ":", 2)
+	if len(devIDParts) < 2 {
+		return fmt.Errorf("Invalid device ID from %q", pciDev.ID)
+	}
+
+	vfioPCINewIDPath := "/sys/bus/pci/drivers/vfio-pci/new_id"
+	err := ioutil.WriteFile(vfioPCINewIDPath, []byte(fmt.Sprintf("%s %s", devIDParts[0], devIDParts[1])), 0600)
+	if err != nil {
+		return errors.Wrapf(err, "Failed registering PCI device ID %q to %q", pciDev.ID, vfioPCINewIDPath)
+	}
+
+	return nil
+}

From 3a610429a8f8d37f98162aabb0dd820844cac6db Mon Sep 17 00:00:00 2001
From: Thomas Parrott <thomas.parrott at canonical.com>
Date: Mon, 27 Jan 2020 16:10:51 +0000
Subject: [PATCH 12/15] lxd/device/nic/sriov: Switches PCI device bind/unbind
 to generic functions

Signed-off-by: Thomas Parrott <thomas.parrott at canonical.com>
---
 lxd/device/nic_sriov.go | 70 ++++++++---------------------------------
 1 file changed, 13 insertions(+), 57 deletions(-)

diff --git a/lxd/device/nic_sriov.go b/lxd/device/nic_sriov.go
index 03e2a63ccb..10a111ef6f 100644
--- a/lxd/device/nic_sriov.go
+++ b/lxd/device/nic_sriov.go
@@ -6,11 +6,9 @@ import (
 	"fmt"
 	"io/ioutil"
 	"os/exec"
-	"path/filepath"
 	"regexp"
 	"strconv"
 	"strings"
-	"time"
 
 	deviceConfig "github.com/lxc/lxd/lxd/device/config"
 	"github.com/lxc/lxd/lxd/instance/instancetype"
@@ -336,27 +334,20 @@ func (d *nicSRIOV) setupSriovParent(vfDevice string, vfID int, volatile map[stri
 	}
 
 	// Get VF device's PCI Slot Name so we can unbind and rebind it from the host.
-	vfPCISlot, err := d.networkGetVFDevicePCISlot(volatile["last_state.vf.id"])
-	if err != nil {
-		return err
-	}
-
-	// Get the path to the VF device's driver now, as once it is unbound we won't be able to
-	// determine its driver path in order to rebind it.
-	vfDriverPath, err := d.networkGetVFDeviceDriverPath(volatile["last_state.vf.id"])
+	vfPCIDev, err := d.networkGetVFDevicePCISlot(volatile["last_state.vf.id"])
 	if err != nil {
 		return err
 	}
 
 	// Unbind VF device from the host so that the settings will take effect when we rebind it.
-	err = d.networkDeviceUnbind(vfPCISlot, vfDriverPath)
+	err = networkDeviceUnbind(vfPCIDev)
 	if err != nil {
 		return err
 	}
 
 	// However we return from this function, we must try to rebind the VF so its not orphaned.
 	// The OS won't let an already bound device be bound again so is safe to call twice.
-	defer d.networkDeviceBind(vfPCISlot, vfDriverPath)
+	defer networkDeviceBind(vfPCIDev)
 
 	// Setup VF VLAN if specified.
 	if d.config["vlan"] != "" {
@@ -402,7 +393,7 @@ func (d *nicSRIOV) setupSriovParent(vfDevice string, vfID int, volatile map[stri
 	}
 
 	// Bind VF device onto the host so that the settings will take effect.
-	err = d.networkDeviceBind(vfPCISlot, vfDriverPath)
+	err = networkDeviceBind(vfPCIDev)
 	if err != nil {
 		return err
 	}
@@ -411,7 +402,7 @@ func (d *nicSRIOV) setupSriovParent(vfDevice string, vfID int, volatile map[stri
 	// it will re-appear shortly after. Unfortunately the time between sending the bind event
 	// to the nic and it actually appearing on the host is non-zero, so we need to watch and wait,
 	// otherwise next steps of applying settings to interface will fail.
-	err = d.networkDeviceBindWait(volatile["host_name"])
+	err = networkInterfaceBindWait(volatile["host_name"])
 	if err != nil {
 		return err
 	}
@@ -481,42 +472,14 @@ func (d *nicSRIOV) networkGetVirtFuncInfo(devName string, vfID int) (vf virtFunc
 }
 
 // networkGetVFDevicePCISlot returns the PCI slot name for a network virtual function device.
-func (d *nicSRIOV) networkGetVFDevicePCISlot(vfID string) (string, error) {
+func (d *nicSRIOV) networkGetVFDevicePCISlot(vfID string) (pciDevice, error) {
 	ueventFile := fmt.Sprintf("/sys/class/net/%s/device/virtfn%s/uevent", d.config["parent"], vfID)
 	pciDev, err := networkGetDevicePCIDevice(ueventFile)
 	if err != nil {
-		return "", err
+		return pciDev, err
 	}
 
-	return pciDev.SlotName, nil
-}
-
-// networkGetVFDeviceDriverPath returns the path to the network virtual function device driver in /sys.
-func (d *nicSRIOV) networkGetVFDeviceDriverPath(vfID string) (string, error) {
-	return filepath.EvalSymlinks(fmt.Sprintf("/sys/class/net/%s/device/virtfn%s/driver", d.config["parent"], vfID))
-}
-
-// networkDeviceUnbind unbinds a network device from the OS using its PCI Slot Name and driver path.
-func (d *nicSRIOV) networkDeviceUnbind(pciSlotName string, driverPath string) error {
-	return ioutil.WriteFile(fmt.Sprintf("%s/unbind", driverPath), []byte(pciSlotName), 0600)
-}
-
-// networkDeviceUnbind binds a network device to the OS using its PCI Slot Name and driver path.
-func (d *nicSRIOV) networkDeviceBind(pciSlotName string, driverPath string) error {
-	return ioutil.WriteFile(fmt.Sprintf("%s/bind", driverPath), []byte(pciSlotName), 0600)
-}
-
-// networkDeviceBindWait waits for network interface to appear after being binded.
-func (d *nicSRIOV) networkDeviceBindWait(devName string) error {
-	for i := 0; i < 10; i++ {
-		if shared.PathExists(fmt.Sprintf("/sys/class/net/%s", devName)) {
-			return nil
-		}
-
-		time.Sleep(50 * time.Millisecond)
-	}
-
-	return fmt.Errorf("Bind of interface \"%s\" took too long", devName)
+	return pciDev, nil
 }
 
 // restoreSriovParent restores SR-IOV parent device settings when removed from an instance using the
@@ -528,27 +491,20 @@ func (d *nicSRIOV) restoreSriovParent(volatile map[string]string) error {
 	}
 
 	// Get VF device's PCI Slot Name so we can unbind and rebind it from the host.
-	vfPCISlot, err := d.networkGetVFDevicePCISlot(volatile["last_state.vf.id"])
-	if err != nil {
-		return err
-	}
-
-	// Get the path to the VF device's driver now, as once it is unbound we won't be able to
-	// determine its driver path in order to rebind it.
-	vfDriverPath, err := d.networkGetVFDeviceDriverPath(volatile["last_state.vf.id"])
+	vfPCIDev, err := d.networkGetVFDevicePCISlot(volatile["last_state.vf.id"])
 	if err != nil {
 		return err
 	}
 
 	// Unbind VF device from the host so that the settings will take effect when we rebind it.
-	err = d.networkDeviceUnbind(vfPCISlot, vfDriverPath)
+	err = networkDeviceUnbind(vfPCIDev)
 	if err != nil {
 		return err
 	}
 
 	// However we return from this function, we must try to rebind the VF so its not orphaned.
 	// The OS won't let an already bound device be bound again so is safe to call twice.
-	defer d.networkDeviceBind(vfPCISlot, vfDriverPath)
+	defer networkDeviceBind(vfPCIDev)
 
 	// Reset VF VLAN if specified
 	if volatile["last_state.vf.vlan"] != "" {
@@ -581,7 +537,7 @@ func (d *nicSRIOV) restoreSriovParent(volatile map[string]string) error {
 	}
 
 	// Bind VF device onto the host so that the settings will take effect.
-	err = d.networkDeviceBind(vfPCISlot, vfDriverPath)
+	err = networkDeviceBind(vfPCIDev)
 	if err != nil {
 		return err
 	}
@@ -590,7 +546,7 @@ func (d *nicSRIOV) restoreSriovParent(volatile map[string]string) error {
 	// and it will re-appear on the host. Unfortunately the time between sending the bind event
 	// to the nic and it actually appearing on the host is non-zero, so we need to watch and wait,
 	// otherwise next step of restoring MAC and MTU settings in restorePhysicalNic will fail.
-	err = d.networkDeviceBindWait(volatile["host_name"])
+	err = networkInterfaceBindWait(volatile["host_name"])
 	if err != nil {
 		return err
 	}

From 5e3807da38b747ec3c67e40f33f54e36020dbe6a Mon Sep 17 00:00:00 2001
From: Thomas Parrott <thomas.parrott at canonical.com>
Date: Fri, 24 Jan 2020 12:03:03 +0000
Subject: [PATCH 13/15] lxd/device/nic/physical: Adds VM PCI passthrough
 support

Signed-off-by: Thomas Parrott <thomas.parrott at canonical.com>
---
 lxd/device/nic_physical.go | 162 ++++++++++++++++++++++++++++---------
 1 file changed, 123 insertions(+), 39 deletions(-)

diff --git a/lxd/device/nic_physical.go b/lxd/device/nic_physical.go
index e8470de7ef..25e36a7cc8 100644
--- a/lxd/device/nic_physical.go
+++ b/lxd/device/nic_physical.go
@@ -3,9 +3,12 @@ package device
 import (
 	"fmt"
 
+	"github.com/pkg/errors"
+
 	deviceConfig "github.com/lxc/lxd/lxd/device/config"
 	"github.com/lxc/lxd/lxd/instance/instancetype"
 	"github.com/lxc/lxd/lxd/revert"
+	"github.com/lxc/lxd/lxd/util"
 	"github.com/lxc/lxd/shared"
 )
 
@@ -15,20 +18,22 @@ type nicPhysical struct {
 
 // validateConfig checks the supplied config for correctness.
 func (d *nicPhysical) validateConfig() error {
-	if d.inst.Type() != instancetype.Container {
+	if d.inst.Type() != instancetype.Container && d.inst.Type() != instancetype.VM {
 		return ErrUnsupportedDevType
 	}
 
 	requiredFields := []string{"parent"}
 	optionalFields := []string{
 		"name",
-		"mtu",
-		"hwaddr",
-		"vlan",
 		"maas.subnet.ipv4",
 		"maas.subnet.ipv6",
 		"boot.priority",
 	}
+
+	if d.inst.Type() == instancetype.Container {
+		optionalFields = append(optionalFields, "mtu", "hwaddr", "vlan")
+	}
+
 	err := d.config.Validate(nicValidationRules(requiredFields, optionalFields))
 	if err != nil {
 		return err
@@ -66,45 +71,97 @@ func (d *nicPhysical) Start() (*deviceConfig.RunConfig, error) {
 	revert := revert.New()
 	defer revert.Fail()
 
+	// pciSlotName, used for VM physical passthrough.
+	var pciSlotName string
+
+	// If VM, then try and load the vfio-pci module first.
+	if d.inst.Type() == instancetype.VM {
+		err = util.LoadModule("vfio-pci")
+		if err != nil {
+			return nil, errors.Wrapf(err, "Error loading %q module", "vfio-pci")
+		}
+	}
+
 	// Record the host_name device used for restoration later.
 	saveData["host_name"] = NetworkGetHostDevice(d.config["parent"], d.config["vlan"])
-	statusDev, err := NetworkCreateVlanDeviceIfNeeded(d.state, d.config["parent"], saveData["host_name"], d.config["vlan"])
-	if err != nil {
-		return nil, err
-	}
 
-	// Record whether we created this device or not so it can be removed on stop.
-	saveData["last_state.created"] = fmt.Sprintf("%t", statusDev != "existing")
+	if d.inst.Type() == instancetype.Container {
+		statusDev, err := NetworkCreateVlanDeviceIfNeeded(d.state, d.config["parent"], saveData["host_name"], d.config["vlan"])
+		if err != nil {
+			return nil, err
+		}
 
-	if shared.IsTrue(saveData["last_state.created"]) {
-		revert.Add(func() {
-			NetworkRemoveInterfaceIfNeeded(d.state, saveData["host_name"], d.inst, d.config["parent"], d.config["vlan"])
-		})
-	}
+		// Record whether we created this device or not so it can be removed on stop.
+		saveData["last_state.created"] = fmt.Sprintf("%t", statusDev != "existing")
+
+		if shared.IsTrue(saveData["last_state.created"]) {
+			revert.Add(func() {
+				NetworkRemoveInterfaceIfNeeded(d.state, saveData["host_name"], d.inst, d.config["parent"], d.config["vlan"])
+			})
+		}
+
+		// If we didn't create the device we should track various properties so we can restore them when the
+		// instance is stopped or the device is detached.
+		if !shared.IsTrue(saveData["last_state.created"]) {
+			err = networkSnapshotPhysicalNic(saveData["host_name"], saveData)
+			if err != nil {
+				return nil, err
+			}
+		}
+
+		// Set the MAC address.
+		if d.config["hwaddr"] != "" {
+			_, err := shared.RunCommand("ip", "link", "set", "dev", saveData["host_name"], "address", d.config["hwaddr"])
+			if err != nil {
+				return nil, fmt.Errorf("Failed to set the MAC address: %s", err)
+			}
+		}
+
+		// Set the MTU.
+		if d.config["mtu"] != "" {
+			_, err := shared.RunCommand("ip", "link", "set", "dev", saveData["host_name"], "mtu", d.config["mtu"])
+			if err != nil {
+				return nil, fmt.Errorf("Failed to set the MTU: %s", err)
+			}
+		}
+	} else if d.inst.Type() == instancetype.VM {
+		// Get PCI information about the network interface.
+		ueventPath := fmt.Sprintf("/sys/class/net/%s/device/uevent", saveData["host_name"])
+		pciDev, err := networkGetDevicePCIDevice(ueventPath)
+		if err != nil {
+			return nil, errors.Wrapf(err, "Failed to get PCI device info for %q", saveData["host_name"])
+		}
+
+		saveData["last_state.pci.slot.name"] = pciDev.SlotName
+		saveData["last_state.pci.driver"] = pciDev.Driver
 
-	// If we didn't create the device we should track various properties so we can restore them when the
-	// instance is stopped or the device is detached.
-	if !shared.IsTrue(saveData["last_state.created"]) {
-		err = networkSnapshotPhysicalNic(saveData["host_name"], saveData)
+		// Unbind the interface from the host.
+		err = networkDeviceUnbind(pciDev)
 		if err != nil {
 			return nil, err
 		}
-	}
 
-	// Set the MAC address.
-	if d.config["hwaddr"] != "" {
-		_, err := shared.RunCommand("ip", "link", "set", "dev", saveData["host_name"], "address", d.config["hwaddr"])
+		revert.Add(func() { networkDeviceBind(pciDev) })
+
+		// Register the device with the vfio-pci module.
+		err = networkVFIOPCIRegister(pciDev)
 		if err != nil {
-			return nil, fmt.Errorf("Failed to set the MAC address: %s", err)
+			return nil, err
 		}
-	}
 
-	// Set the MTU.
-	if d.config["mtu"] != "" {
-		_, err := shared.RunCommand("ip", "link", "set", "dev", saveData["host_name"], "mtu", d.config["mtu"])
+		vfioDev := pciDevice{
+			Driver:   "vfio-pci",
+			SlotName: pciDev.SlotName,
+		}
+
+		revert.Add(func() { networkDeviceUnbind(vfioDev) })
+
+		err = networkDeviceBindWait(vfioDev)
 		if err != nil {
-			return nil, fmt.Errorf("Failed to set the MTU: %s", err)
+			return nil, err
 		}
+
+		pciSlotName = saveData["last_state.pci.slot.name"]
 	}
 
 	err = d.volatileSet(saveData)
@@ -124,6 +181,7 @@ func (d *nicPhysical) Start() (*deviceConfig.RunConfig, error) {
 		runConf.NetworkInterface = append(runConf.NetworkInterface,
 			[]deviceConfig.RunConfigItem{
 				{Key: "devName", Value: d.name},
+				{Key: "pciSlotName", Value: pciSlotName},
 			}...)
 	}
 
@@ -147,26 +205,52 @@ func (d *nicPhysical) Stop() (*deviceConfig.RunConfig, error) {
 // postStop is run after the device is removed from the instance.
 func (d *nicPhysical) postStop() error {
 	defer d.volatileSet(map[string]string{
-		"host_name":          "",
-		"last_state.hwaddr":  "",
-		"last_state.mtu":     "",
-		"last_state.created": "",
+		"host_name":                "",
+		"last_state.hwaddr":        "",
+		"last_state.mtu":           "",
+		"last_state.created":       "",
+		"last_state.pci.slot.name": "",
+		"last_state.pci.driver":    "",
 	})
 
 	v := d.volatileGet()
-	hostName := NetworkGetHostDevice(d.config["parent"], d.config["vlan"])
 
-	// This will delete the parent interface if we created it for VLAN parent.
-	if shared.IsTrue(v["last_state.created"]) {
-		err := NetworkRemoveInterfaceIfNeeded(d.state, hostName, d.inst, d.config["parent"], d.config["vlan"])
+	// If VM physical passthrough, unbind from vfio-pci and bind back to host driver.
+	if d.inst.Type() == instancetype.VM && v["last_state.pci.slot.name"] != "" {
+		vfioDev := pciDevice{
+			Driver:   "vfio-pci",
+			SlotName: v["last_state.pci.slot.name"],
+		}
+
+		err := networkDeviceUnbind(vfioDev)
 		if err != nil {
 			return err
 		}
-	} else {
-		err := networkRestorePhysicalNic(hostName, v)
+
+		hostDev := pciDevice{
+			Driver:   v["last_state.pci.driver"],
+			SlotName: v["last_state.pci.slot.name"],
+		}
+
+		err = networkDeviceBind(hostDev)
 		if err != nil {
 			return err
 		}
+	} else if d.inst.Type() == instancetype.Container {
+		hostName := NetworkGetHostDevice(d.config["parent"], d.config["vlan"])
+
+		// This will delete the parent interface if we created it for VLAN parent.
+		if shared.IsTrue(v["last_state.created"]) {
+			err := NetworkRemoveInterfaceIfNeeded(d.state, hostName, d.inst, d.config["parent"], d.config["vlan"])
+			if err != nil {
+				return err
+			}
+		} else if v["last_state.pci.slot.name"] == "" {
+			err := networkRestorePhysicalNic(hostName, v)
+			if err != nil {
+				return err
+			}
+		}
 	}
 
 	return nil

From ba93cc69ec37eab0b73ab7316dca2b14d563a894 Mon Sep 17 00:00:00 2001
From: Thomas Parrott <thomas.parrott at canonical.com>
Date: Mon, 27 Jan 2020 16:20:03 +0000
Subject: [PATCH 14/15] lxd/device: Unexports NetworkRemoveInterfaceIfNeeded

Signed-off-by: Thomas Parrott <thomas.parrott at canonical.com>
---
 lxd/device/device_utils_network.go | 4 ++--
 lxd/device/nic_ipvlan.go           | 2 +-
 lxd/device/nic_macvlan.go          | 4 ++--
 lxd/device/nic_physical.go         | 4 ++--
 lxd/device/nic_routed.go           | 2 +-
 5 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/lxd/device/device_utils_network.go b/lxd/device/device_utils_network.go
index 289bb8efe3..9abd4babc1 100644
--- a/lxd/device/device_utils_network.go
+++ b/lxd/device/device_utils_network.go
@@ -139,8 +139,8 @@ func NetworkRemoveInterface(nic string) error {
 	return err
 }
 
-// NetworkRemoveInterfaceIfNeeded removes a network interface by name but only if no other instance is using it.
-func NetworkRemoveInterfaceIfNeeded(state *state.State, nic string, current instance.Instance, parent string, vlanID string) error {
+// networkRemoveInterfaceIfNeeded removes a network interface by name but only if no other instance is using it.
+func networkRemoveInterfaceIfNeeded(state *state.State, nic string, current instance.Instance, parent string, vlanID string) error {
 	// Check if it's used by another instance.
 	instances, err := InstanceLoadNodeAll(state)
 	if err != nil {
diff --git a/lxd/device/nic_ipvlan.go b/lxd/device/nic_ipvlan.go
index ee61a8353e..75c8227f8f 100644
--- a/lxd/device/nic_ipvlan.go
+++ b/lxd/device/nic_ipvlan.go
@@ -232,7 +232,7 @@ func (d *nicIPVLAN) postStop() error {
 	// This will delete the parent interface if we created it for VLAN parent.
 	if shared.IsTrue(v["last_state.created"]) {
 		parentName := NetworkGetHostDevice(d.config["parent"], d.config["vlan"])
-		err := NetworkRemoveInterfaceIfNeeded(d.state, parentName, d.inst, d.config["parent"], d.config["vlan"])
+		err := networkRemoveInterfaceIfNeeded(d.state, parentName, d.inst, d.config["parent"], d.config["vlan"])
 		if err != nil {
 			return err
 		}
diff --git a/lxd/device/nic_macvlan.go b/lxd/device/nic_macvlan.go
index 65401ebcc9..03d34a9e47 100644
--- a/lxd/device/nic_macvlan.go
+++ b/lxd/device/nic_macvlan.go
@@ -83,7 +83,7 @@ func (d *nicMACVLAN) Start() (*deviceConfig.RunConfig, error) {
 
 	if shared.IsTrue(saveData["last_state.created"]) {
 		revert.Add(func() {
-			NetworkRemoveInterfaceIfNeeded(d.state, actualParentName, d.inst, d.config["parent"], d.config["vlan"])
+			networkRemoveInterfaceIfNeeded(d.state, actualParentName, d.inst, d.config["parent"], d.config["vlan"])
 		})
 	}
 
@@ -188,7 +188,7 @@ func (d *nicMACVLAN) postStop() error {
 	// This will delete the parent interface if we created it for VLAN parent.
 	if shared.IsTrue(v["last_state.created"]) {
 		actualParentName := NetworkGetHostDevice(d.config["parent"], d.config["vlan"])
-		err := NetworkRemoveInterfaceIfNeeded(d.state, actualParentName, d.inst, d.config["parent"], d.config["vlan"])
+		err := networkRemoveInterfaceIfNeeded(d.state, actualParentName, d.inst, d.config["parent"], d.config["vlan"])
 		if err != nil {
 			errs = append(errs, err)
 		}
diff --git a/lxd/device/nic_physical.go b/lxd/device/nic_physical.go
index 25e36a7cc8..b8d4662b72 100644
--- a/lxd/device/nic_physical.go
+++ b/lxd/device/nic_physical.go
@@ -96,7 +96,7 @@ func (d *nicPhysical) Start() (*deviceConfig.RunConfig, error) {
 
 		if shared.IsTrue(saveData["last_state.created"]) {
 			revert.Add(func() {
-				NetworkRemoveInterfaceIfNeeded(d.state, saveData["host_name"], d.inst, d.config["parent"], d.config["vlan"])
+				networkRemoveInterfaceIfNeeded(d.state, saveData["host_name"], d.inst, d.config["parent"], d.config["vlan"])
 			})
 		}
 
@@ -241,7 +241,7 @@ func (d *nicPhysical) postStop() error {
 
 		// This will delete the parent interface if we created it for VLAN parent.
 		if shared.IsTrue(v["last_state.created"]) {
-			err := NetworkRemoveInterfaceIfNeeded(d.state, hostName, d.inst, d.config["parent"], d.config["vlan"])
+			err := networkRemoveInterfaceIfNeeded(d.state, hostName, d.inst, d.config["parent"], d.config["vlan"])
 			if err != nil {
 				return err
 			}
diff --git a/lxd/device/nic_routed.go b/lxd/device/nic_routed.go
index 3f6d0cb66f..6077ce41bc 100644
--- a/lxd/device/nic_routed.go
+++ b/lxd/device/nic_routed.go
@@ -310,7 +310,7 @@ func (d *nicRouted) postStop() error {
 	// This will delete the parent interface if we created it for VLAN parent.
 	if shared.IsTrue(v["last_state.created"]) {
 		parentName := NetworkGetHostDevice(d.config["parent"], d.config["vlan"])
-		err := NetworkRemoveInterfaceIfNeeded(d.state, parentName, d.inst, d.config["parent"], d.config["vlan"])
+		err := networkRemoveInterfaceIfNeeded(d.state, parentName, d.inst, d.config["parent"], d.config["vlan"])
 		if err != nil {
 			return err
 		}

From 47bd67bb1ff0a4207ac6abdb13505cca77a5aabb Mon Sep 17 00:00:00 2001
From: Thomas Parrott <thomas.parrott at canonical.com>
Date: Mon, 27 Jan 2020 17:54:39 +0000
Subject: [PATCH 15/15] lxd/device/nic/sriov: Adds VM support

Signed-off-by: Thomas Parrott <thomas.parrott at canonical.com>
---
 lxd/device/nic_sriov.go | 181 ++++++++++++++++++++++++++++------------
 1 file changed, 128 insertions(+), 53 deletions(-)

diff --git a/lxd/device/nic_sriov.go b/lxd/device/nic_sriov.go
index 10a111ef6f..9269604381 100644
--- a/lxd/device/nic_sriov.go
+++ b/lxd/device/nic_sriov.go
@@ -12,6 +12,7 @@ import (
 
 	deviceConfig "github.com/lxc/lxd/lxd/device/config"
 	"github.com/lxc/lxd/lxd/instance/instancetype"
+	"github.com/lxc/lxd/lxd/revert"
 	"github.com/lxc/lxd/shared"
 )
 
@@ -21,14 +22,13 @@ type nicSRIOV struct {
 
 // validateConfig checks the supplied config for correctness.
 func (d *nicSRIOV) validateConfig() error {
-	if d.inst.Type() != instancetype.Container {
+	if d.inst.Type() != instancetype.Container && d.inst.Type() != instancetype.VM {
 		return ErrUnsupportedDevType
 	}
 
 	requiredFields := []string{"parent"}
 	optionalFields := []string{
 		"name",
-		"mtu",
 		"hwaddr",
 		"vlan",
 		"security.mac_filtering",
@@ -36,6 +36,12 @@ func (d *nicSRIOV) validateConfig() error {
 		"maas.subnet.ipv6",
 		"boot.priority",
 	}
+
+	// For VMs only NIC properties that can be specified on the parent's VF settings are controllable.
+	if d.inst.Type() == instancetype.Container {
+		optionalFields = append(optionalFields, "mtu")
+	}
+
 	err := d.config.Validate(nicValidationRules(requiredFields, optionalFields))
 	if err != nil {
 		return err
@@ -76,31 +82,33 @@ func (d *nicSRIOV) Start() (*deviceConfig.RunConfig, error) {
 		return nil, err
 	}
 
-	err = d.setupSriovParent(vfDev, vfID, saveData)
+	vfPCIDev, err := d.setupSriovParent(vfDev, vfID, saveData)
 	if err != nil {
 		return nil, err
 	}
 
-	// Set the MAC address.
-	if d.config["hwaddr"] != "" {
-		_, err := shared.RunCommand("ip", "link", "set", "dev", saveData["host_name"], "address", d.config["hwaddr"])
-		if err != nil {
-			return nil, fmt.Errorf("Failed to set the MAC address: %s", err)
+	if d.inst.Type() == instancetype.Container {
+		// Set the MAC address.
+		if d.config["hwaddr"] != "" {
+			_, err := shared.RunCommand("ip", "link", "set", "dev", saveData["host_name"], "address", d.config["hwaddr"])
+			if err != nil {
+				return nil, fmt.Errorf("Failed to set the MAC address: %s", err)
+			}
 		}
-	}
 
-	// Set the MTU.
-	if d.config["mtu"] != "" {
-		_, err := shared.RunCommand("ip", "link", "set", "dev", saveData["host_name"], "mtu", d.config["mtu"])
-		if err != nil {
-			return nil, fmt.Errorf("Failed to set the MTU: %s", err)
+		// Set the MTU.
+		if d.config["mtu"] != "" {
+			_, err := shared.RunCommand("ip", "link", "set", "dev", saveData["host_name"], "mtu", d.config["mtu"])
+			if err != nil {
+				return nil, fmt.Errorf("Failed to set the MTU: %s", err)
+			}
 		}
-	}
 
-	// Bring the interface up.
-	_, err = shared.RunCommand("ip", "link", "set", "dev", saveData["host_name"], "up")
-	if err != nil {
-		return nil, fmt.Errorf("Failed to bring up the interface: %v", err)
+		// Bring the interface up.
+		_, err = shared.RunCommand("ip", "link", "set", "dev", saveData["host_name"], "up")
+		if err != nil {
+			return nil, fmt.Errorf("Failed to bring up the interface: %v", err)
+		}
 	}
 
 	err = d.volatileSet(saveData)
@@ -116,6 +124,14 @@ func (d *nicSRIOV) Start() (*deviceConfig.RunConfig, error) {
 		{Key: "link", Value: saveData["host_name"]},
 	}
 
+	if d.inst.Type() == instancetype.VM {
+		runConf.NetworkInterface = append(runConf.NetworkInterface,
+			[]deviceConfig.RunConfigItem{
+				{Key: "devName", Value: d.name},
+				{Key: "pciSlotName", Value: vfPCIDev.SlotName},
+			}...)
+	}
+
 	return &runConf, nil
 }
 
@@ -143,6 +159,7 @@ func (d *nicSRIOV) postStop() error {
 		"last_state.vf.hwaddr":     "",
 		"last_state.vf.vlan":       "",
 		"last_state.vf.spoofcheck": "",
+		"last_state.pci.driver":    "",
 	})
 
 	v := d.volatileGet()
@@ -308,15 +325,20 @@ func (d *nicSRIOV) getFreeVFInterface(reservedDevices map[string]struct{}, vfLis
 	return "", nil
 }
 
-// setupSriovParent configures a SR-IOV virtual function (VF) device on parent and stores original
-// properties of the physical device into voltatile for restoration on detach.
-func (d *nicSRIOV) setupSriovParent(vfDevice string, vfID int, volatile map[string]string) error {
+// setupSriovParent configures a SR-IOV virtual function (VF) device on parent and stores original properties of
+// the physical device into volatile for restoration on detach. Returns VF PCI device info.
+func (d *nicSRIOV) setupSriovParent(vfDevice string, vfID int, volatile map[string]string) (pciDevice, error) {
+	var vfPCIDev pciDevice
+
 	// Retrieve VF settings from parent device.
 	vfInfo, err := d.networkGetVirtFuncInfo(d.config["parent"], vfID)
 	if err != nil {
-		return err
+		return vfPCIDev, err
 	}
 
+	revert := revert.New()
+	defer revert.Fail()
+
 	// Record properties of VF settings on the parent device.
 	volatile["last_state.vf.hwaddr"] = vfInfo.mac
 	volatile["last_state.vf.id"] = fmt.Sprintf("%d", vfID)
@@ -330,30 +352,28 @@ func (d *nicSRIOV) setupSriovParent(vfDevice string, vfID int, volatile map[stri
 	// Record properties of VF device.
 	err = networkSnapshotPhysicalNic(volatile["host_name"], volatile)
 	if err != nil {
-		return err
+		return vfPCIDev, err
 	}
 
 	// Get VF device's PCI Slot Name so we can unbind and rebind it from the host.
-	vfPCIDev, err := d.networkGetVFDevicePCISlot(volatile["last_state.vf.id"])
+	vfPCIDev, err = d.networkGetVFDevicePCISlot(volatile["last_state.vf.id"])
 	if err != nil {
-		return err
+		return vfPCIDev, err
 	}
 
 	// Unbind VF device from the host so that the settings will take effect when we rebind it.
 	err = networkDeviceUnbind(vfPCIDev)
 	if err != nil {
-		return err
+		return vfPCIDev, err
 	}
 
-	// However we return from this function, we must try to rebind the VF so its not orphaned.
-	// The OS won't let an already bound device be bound again so is safe to call twice.
-	defer networkDeviceBind(vfPCIDev)
+	revert.Add(func() { networkDeviceBind(vfPCIDev) })
 
 	// Setup VF VLAN if specified.
 	if d.config["vlan"] != "" {
 		_, err := shared.RunCommand("ip", "link", "set", "dev", d.config["parent"], "vf", volatile["last_state.vf.id"], "vlan", d.config["vlan"])
 		if err != nil {
-			return err
+			return vfPCIDev, err
 		}
 	}
 
@@ -370,44 +390,82 @@ func (d *nicSRIOV) setupSriovParent(vfDevice string, vfID int, volatile map[stri
 		// Set MAC on VF (this combined with spoof checking prevents any other MAC being used).
 		_, err = shared.RunCommand("ip", "link", "set", "dev", d.config["parent"], "vf", volatile["last_state.vf.id"], "mac", mac)
 		if err != nil {
-			return err
+			return vfPCIDev, err
 		}
 
 		// Now that MAC is set on VF, we can enable spoof checking.
 		_, err = shared.RunCommand("ip", "link", "set", "dev", d.config["parent"], "vf", volatile["last_state.vf.id"], "spoofchk", "on")
 		if err != nil {
-			return err
+			return vfPCIDev, err
 		}
 	} else {
 		// Reset VF to ensure no previous MAC restriction exists.
 		_, err := shared.RunCommand("ip", "link", "set", "dev", d.config["parent"], "vf", volatile["last_state.vf.id"], "mac", "00:00:00:00:00:00")
 		if err != nil {
-			return err
+			return vfPCIDev, err
 		}
 
 		// Ensure spoof checking is disabled if not enabled in instance.
 		_, err = shared.RunCommand("ip", "link", "set", "dev", d.config["parent"], "vf", volatile["last_state.vf.id"], "spoofchk", "off")
 		if err != nil {
-			return err
+			return vfPCIDev, err
 		}
-	}
 
-	// Bind VF device onto the host so that the settings will take effect.
-	err = networkDeviceBind(vfPCIDev)
-	if err != nil {
-		return err
+		// Set MAC on VF if specified (this should be passed through into VM when it is bound to vfio-pci).
+		if d.inst.Type() == instancetype.VM {
+			// If no MAC specified in config, use current VF interface MAC.
+			mac := d.config["hwaddr"]
+			if mac == "" {
+				mac = volatile["last_state.hwaddr"]
+			}
+
+			_, err = shared.RunCommand("ip", "link", "set", "dev", d.config["parent"], "vf", volatile["last_state.vf.id"], "mac", mac)
+			if err != nil {
+				return vfPCIDev, err
+			}
+		}
 	}
 
-	// Wait for VF driver to be reloaded, this will remove the VF interface temporarily, and
-	// it will re-appear shortly after. Unfortunately the time between sending the bind event
-	// to the nic and it actually appearing on the host is non-zero, so we need to watch and wait,
-	// otherwise next steps of applying settings to interface will fail.
-	err = networkInterfaceBindWait(volatile["host_name"])
-	if err != nil {
-		return err
+	if d.inst.Type() == instancetype.Container {
+		// Bind VF device onto the host so that the settings will take effect.
+		err = networkDeviceBind(vfPCIDev)
+		if err != nil {
+			return vfPCIDev, err
+		}
+
+		// Wait for VF driver to be reloaded, this will remove the VF interface temporarily, and
+		// it will re-appear shortly after. Unfortunately the time between sending the bind event
+		// to the nic and it actually appearing on the host is non-zero, so we need to watch and wait,
+		// otherwise next steps of applying settings to interface will fail.
+		err = networkInterfaceBindWait(volatile["host_name"])
+		if err != nil {
+			return vfPCIDev, err
+		}
+	} else if d.inst.Type() == instancetype.VM {
+		// Register VF device with vfio-pci driver so it can be passed to VM.
+		err = networkVFIOPCIRegister(vfPCIDev)
+		if err != nil {
+			return vfPCIDev, err
+		}
+
+		vfioDev := pciDevice{
+			Driver:   "vfio-pci",
+			SlotName: vfPCIDev.SlotName,
+		}
+
+		revert.Add(func() { networkDeviceUnbind(vfioDev) })
+
+		err = networkDeviceBindWait(vfioDev)
+		if err != nil {
+			return vfPCIDev, err
+		}
+
+		// Record original driver used by VF device for restore.
+		volatile["last_state.pci.driver"] = vfPCIDev.Driver
 	}
 
-	return nil
+	revert.Success()
+	return vfPCIDev, nil
 }
 
 // virtFuncInfo holds information about SR-IOV virtual functions.
@@ -490,16 +548,33 @@ func (d *nicSRIOV) restoreSriovParent(volatile map[string]string) error {
 		return nil
 	}
 
-	// Get VF device's PCI Slot Name so we can unbind and rebind it from the host.
+	// Get VF device's PCI info so we can unbind and rebind it from the host.
 	vfPCIDev, err := d.networkGetVFDevicePCISlot(volatile["last_state.vf.id"])
 	if err != nil {
 		return err
 	}
 
-	// Unbind VF device from the host so that the settings will take effect when we rebind it.
-	err = networkDeviceUnbind(vfPCIDev)
-	if err != nil {
-		return err
+	if d.inst.Type() == instancetype.Container {
+		// Unbind VF device from the host so that the settings will take effect when we rebind it.
+		err = networkDeviceUnbind(vfPCIDev)
+		if err != nil {
+			return err
+		}
+	} else if d.inst.Type() == instancetype.VM {
+		// Unbind VF device from vfio-pci driver so that we can rebind it on host.
+		vfioDev := pciDevice{
+			Driver:   "vfio-pci",
+			SlotName: vfPCIDev.SlotName,
+		}
+
+		err := networkDeviceUnbind(vfioDev)
+		if err != nil {
+			return err
+		}
+
+		// Before we bind the device back to the host, ensure we restore the original driver info as it
+		// should be currently set to vfio-pci.
+		vfPCIDev.Driver = volatile["last_state.pci.driver"]
 	}
 
 	// However we return from this function, we must try to rebind the VF so its not orphaned.


More information about the lxc-devel mailing list