[lxc-devel] [lxd/master] api: add support for SR-IOV enabled network devices

brauner on Github lxc-bot at linuxcontainers.org
Wed Oct 18 10:12:35 UTC 2017


A non-text attachment was scrubbed...
Name: not available
Type: text/x-mailbox
Size: 1731 bytes
Desc: not available
URL: <http://lists.linuxcontainers.org/pipermail/lxc-devel/attachments/20171018/7ddcf3a7/attachment.bin>
-------------- next part --------------
From fcb83715682766d813d5dbd8b403d90511b3f1d7 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner at ubuntu.com>
Date: Tue, 17 Oct 2017 14:26:16 +0200
Subject: [PATCH 1/3] container: add nictype "vfio"

Closes #3941.

Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
---
 lxd/container.go      |   6 +--
 lxd/container_lxc.go  | 111 +++++++++++++++++++++++++++++++++++++++++++++++---
 lxd/networks_utils.go |   2 +-
 3 files changed, 109 insertions(+), 10 deletions(-)

diff --git a/lxd/container.go b/lxd/container.go
index d53d91e19..bfe0f2349 100644
--- a/lxd/container.go
+++ b/lxd/container.go
@@ -306,12 +306,12 @@ func containerValidDevices(dbObj *sql.DB, devices types.Devices, profile bool, e
 				return fmt.Errorf("Missing nic type")
 			}
 
-			if !shared.StringInSlice(m["nictype"], []string{"bridged", "physical", "p2p", "macvlan"}) {
+			if !shared.StringInSlice(m["nictype"], []string{"bridged", "macvlan", "p2p", "physical", "vfio"}) {
 				return fmt.Errorf("Bad nic type: %s", m["nictype"])
 			}
 
-			if shared.StringInSlice(m["nictype"], []string{"bridged", "physical", "macvlan"}) && m["parent"] == "" {
-				return fmt.Errorf("Missing parent for %s type nic.", m["nictype"])
+			if shared.StringInSlice(m["nictype"], []string{"bridged", "macvlan", "physical", "vfio"}) && m["parent"] == "" {
+				return fmt.Errorf("Missing parent for %s type nic", m["nictype"])
 			}
 		} else if m["type"] == "disk" {
 			if !expanded && !shared.StringInSlice(m["path"], diskDevicePaths) {
diff --git a/lxd/container_lxc.go b/lxd/container_lxc.go
index fc5ed8cb0..45a33066e 100644
--- a/lxd/container_lxc.go
+++ b/lxd/container_lxc.go
@@ -1350,7 +1350,7 @@ func (c *containerLXC) initLXC() error {
 				if err != nil {
 					return err
 				}
-			} else if m["nictype"] == "physical" {
+			} else if m["nictype"] == "physical" || m["nictype"] == "vfio" {
 				err = lxcSetConfigItem(cc, fmt.Sprintf("%s.%d.type", networkKeyPrefix, networkidx), "phys")
 				if err != nil {
 					return err
@@ -1377,6 +1377,11 @@ func (c *containerLXC) initLXC() error {
 				if err != nil {
 					return err
 				}
+			} else if m["nictype"] == "vfio" {
+				err = lxcSetConfigItem(cc, fmt.Sprintf("%s.%d.link", networkKeyPrefix, networkidx), m["host_name"])
+				if err != nil {
+					return err
+				}
 			} else if shared.StringInSlice(m["nictype"], []string{"macvlan", "physical"}) {
 				err = lxcSetConfigItem(cc, fmt.Sprintf("%s.%d.link", networkKeyPrefix, networkidx), networkGetHostDevice(m["parent"], m["vlan"]))
 				if err != nil {
@@ -1386,7 +1391,7 @@ func (c *containerLXC) initLXC() error {
 
 			// Host Virtual NIC name
 			vethName := ""
-			if m["host_name"] != "" {
+			if m["host_name"] != "" && m["nictype"] != "vfio" {
 				vethName = m["host_name"]
 			} else if shared.IsTrue(m["security.mac_filtering"]) {
 				// We need a known device name for MAC filtering
@@ -5885,6 +5890,10 @@ func (c *containerLXC) createNetworkDevice(name string, m types.Device) (string,
 		}
 	}
 
+	if m["nictype"] == "vfio" {
+		dev = m["host_name"]
+	}
+
 	// Handle bridged and p2p
 	if shared.StringInSlice(m["nictype"], []string{"bridged", "p2p"}) {
 		n2 := deviceNextVeth()
@@ -5914,7 +5923,7 @@ func (c *containerLXC) createNetworkDevice(name string, m types.Device) (string,
 	}
 
 	// Handle physical and macvlan
-	if shared.StringInSlice(m["nictype"], []string{"physical", "macvlan"}) {
+	if shared.StringInSlice(m["nictype"], []string{"macvlan", "physical"}) {
 		// Deal with VLAN
 		device := m["parent"]
 		if m["vlan"] != "" {
@@ -6118,11 +6127,99 @@ func (c *containerLXC) fillNetworkDevice(name string, m types.Device) (types.Dev
 	}
 
 	// Fill in the host name (but don't generate a static one ourselves)
-	if m["host_name"] == "" && shared.StringInSlice(m["nictype"], []string{"bridged", "p2p"}) {
-		configKey := fmt.Sprintf("volatile.%s.host_name", name)
+	configKey := fmt.Sprintf("volatile.%s.host_name", name)
+	if m["host_name"] == "" && shared.StringInSlice(m["nictype"], []string{"bridged", "p2p", "vfio"}) {
 		newDevice["host_name"] = c.localConfig[configKey]
 	}
 
+	if m["nictype"] == "vfio" && m["parent"] != "" {
+		if !shared.PathExists(fmt.Sprintf("/sys/class/net/%s", m["parent"])) {
+			return nil, fmt.Errorf("Parent device '%s' doesn't exist", m["parent"])
+		}
+
+		if newDevice["host_name"] == "" {
+			sriovNumVFs := fmt.Sprintf("/sys/class/net/%s/device/sriov_numvfs", m["parent"])
+			sriovTotalVFs := fmt.Sprintf("/sys/class/net/%s/device/sriov_totalvfs", m["parent"])
+
+			// verify that this is indeed a SR-IOV enabled device
+			if !shared.PathExists(sriovTotalVFs) {
+				return nil, fmt.Errorf("Parent device '%s' doesn't support SR-IOV", m["parent"])
+			}
+
+			// get number of currently enabled VFs
+			sriovNumVfsBuf, err := ioutil.ReadFile(sriovNumVFs)
+			if err != nil {
+				return nil, err
+			}
+			sriovNumVfsStr := strings.TrimSpace(string(sriovNumVfsBuf))
+			sriovNum, err := strconv.Atoi(sriovNumVfsStr)
+			if err != nil {
+				return nil, err
+			}
+
+			// get number of possible VFs
+			sriovTotalVfsBuf, err := ioutil.ReadFile(sriovTotalVFs)
+			if err != nil {
+				return nil, err
+			}
+			sriovTotalVfsStr := strings.TrimSpace(string(sriovTotalVfsBuf))
+			sriovTotal, err := strconv.Atoi(sriovTotalVfsStr)
+			if err != nil {
+				return nil, err
+			}
+
+			// Check if any VFs are already enabled
+			vf := ""
+			for i := 0; i < sriovNum; i++ {
+				vf = fmt.Sprintf("virtfn%d", i)
+				if !shared.PathExists(fmt.Sprintf("/sys/class/net/%s/device/%s/net", m["parent"], vf)) {
+					vf = ""
+					continue
+				}
+
+				// Check if VF is already in use
+				empty, err := shared.PathIsEmpty(fmt.Sprintf("/sys/class/net/%s/device/%s/net", m["parent"], vf))
+				if err != nil {
+					return nil, err
+				}
+				if empty {
+					vf = ""
+					continue
+				}
+
+				// found free VF
+				break
+			}
+
+			if vf == "" {
+				if sriovNum == sriovTotal {
+					return nil, fmt.Errorf("All virtual functions of vfio device '%s' seem to be in use", m["parent"])
+				}
+
+				// bump the number of VFs to the maximum
+				err := ioutil.WriteFile(sriovNumVFs, []byte(sriovTotalVfsStr), 0644)
+				if err != nil {
+					return nil, err
+				}
+
+				// use next free VF index
+				vf = fmt.Sprintf("virtfn%d", sriovNum+1)
+			}
+
+			vf = fmt.Sprintf("/sys/class/net/%s/device/%s/net", m["parent"], vf)
+			ents, err := ioutil.ReadDir(vf)
+			if err != nil {
+				return nil, err
+			}
+			if len(ents) == 0 || len(ents) > 1 {
+				return nil, fmt.Errorf("Failed to determine unique device name")
+			}
+
+			newDevice["host_name"] = ents[0].Name()
+			c.localConfig[configKey] = ents[0].Name()
+		}
+	}
+
 	return newDevice, nil
 }
 
@@ -6249,6 +6346,8 @@ func (c *containerLXC) removeNetworkDevice(name string, m types.Device) error {
 	var hostName string
 	if m["nictype"] == "physical" {
 		hostName = m["parent"]
+	} else if m["nictype"] == "vfio" {
+		hostName = m["host_name"]
 	} else {
 		hostName = deviceNextVeth()
 	}
@@ -6266,7 +6365,7 @@ func (c *containerLXC) removeNetworkDevice(name string, m types.Device) error {
 	}
 
 	// If a veth, destroy it
-	if m["nictype"] != "physical" {
+	if m["nictype"] != "physical" && m["nictype"] != "vfio" {
 		deviceRemoveInterface(hostName)
 	}
 
diff --git a/lxd/networks_utils.go b/lxd/networks_utils.go
index 4cf455a84..95656ba53 100644
--- a/lxd/networks_utils.go
+++ b/lxd/networks_utils.go
@@ -104,7 +104,7 @@ func networkIsInUse(c container, name string) bool {
 			continue
 		}
 
-		if !shared.StringInSlice(d["nictype"], []string{"bridged", "macvlan", "physical"}) {
+		if !shared.StringInSlice(d["nictype"], []string{"bridged", "macvlan", "physical", "vfio"}) {
 			continue
 		}
 

From 817fc4cacb2acb4513cb48c41ad7330147226490 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner at ubuntu.com>
Date: Wed, 18 Oct 2017 11:33:35 +0200
Subject: [PATCH 2/3] doc/containers: add nictype=vfio

Closes #3941.

Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
---
 doc/containers.md | 58 ++++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 42 insertions(+), 16 deletions(-)

diff --git a/doc/containers.md b/doc/containers.md
index c5a72bea9..c8a0f8f14 100644
--- a/doc/containers.md
+++ b/doc/containers.md
@@ -62,7 +62,7 @@ Key                             | Type      | Default       | Description
 :--                             | :---      | :------       | :----------
 volatile.\<name\>.hwaddr        | string    | -             | Network device MAC address (when no hwaddr property is set on the device itself)
 volatile.\<name\>.name          | string    | -             | Network device name (when no name propery is set on the device itself)
-volatile.\<name\>.host\_name    | string    | -             | Network device name on the host (for nictype=bridged or nictype=p2p)
+volatile.\<name\>.host\_name    | string    | -             | Network device name on the host (for nictype=bridged, nictype=p2p or nictype=vfio)
 volatile.apply\_quota           | string    | -             | Disk quota to be applied on next container start
 volatile.apply\_template        | string    | -             | The name of a template hook which should be triggered upon next startup
 volatile.base\_image            | string    | -             | The hash of the image the container was created from, if any.
@@ -170,24 +170,25 @@ LXD supports different kind of network devices:
  - `bridged`: Uses an existing bridge on the host and creates a virtual device pair to connect the host bridge to the container.
  - `macvlan`: Sets up a new network device based on an existing one but using a different MAC address.
  - `p2p`: Creates a virtual device pair, putting one side in the container and leaving the other side on the host.
+ - `vfio`: Passes a virtual function of an SR-IOV enabled physical network device into the container.
 
 Different network interface types have different additional properties, the current list is:
 
-Key                     | Type      | Default           | Required  | Used by                       | API extension                          | Description
-:--                     | :--       | :--               | :--       | :--                           | :--                                    | :--
-nictype                 | string    | -                 | yes       | all                           | -                                      | The device type, one of "physical", "bridged", "macvlan" or "p2p"
-limits.ingress          | string    | -                 | no        | bridged, p2p                  | -                                      | I/O limit in bit/s (supports kbit, Mbit, Gbit suffixes)
-limits.egress           | string    | -                 | no        | bridged, p2p                  | -                                      | I/O limit in bit/s (supports kbit, Mbit, Gbit suffixes)
-limits.max              | string    | -                 | no        | bridged, p2p                  | -                                      | Same as modifying both limits.read and limits.write
-name                    | string    | kernel assigned   | no        | all                           | -                                      | The name of the interface inside the container
-host\_name              | string    | randomly assigned | no        | bridged, p2p, macvlan         | -                                      | The name of the interface inside the host
-hwaddr                  | string    | randomly assigned | no        | all                           | -                                      | The MAC address of the new interface
-mtu                     | integer   | parent MTU        | no        | all                           | -                                      | The MTU of the new interface
-parent                  | string    | -                 | yes       | physical, bridged, macvlan    | -                                      | The name of the host device or bridge
-vlan                    | integer   | -                 | no        | macvlan, physical             | network\_vlan, network\_vlan\_physical | The VLAN ID to attach to
-ipv4.address            | string    | -                 | no        | bridged                       | network                                | An IPv4 address to assign to the container through DHCP
-ipv6.address            | string    | -                 | no        | bridged                       | network                                | An IPv6 address to assign to the container through DHCP
-security.mac\_filtering | boolean   | false             | no        | bridged                       | network                                | Prevent the container from spoofing another's MAC address
+Key                     | Type      | Default           | Required  | Used by                          | API extension                          | Description
+:--                     | :--       | :--               | :--       | :--                              | :--                                    | :--
+nictype                 | string    | -                 | yes       | all                              | -                                      | The device type, one of "bridged", "macvlan", "p2p", "physical", or "vfio"
+limits.ingress          | string    | -                 | no        | bridged, p2p                     | -                                      | I/O limit in bit/s (supports kbit, Mbit, Gbit suffixes)
+limits.egress           | string    | -                 | no        | bridged, p2p                     | -                                      | I/O limit in bit/s (supports kbit, Mbit, Gbit suffixes)
+limits.max              | string    | -                 | no        | bridged, p2p                     | -                                      | Same as modifying both limits.read and limits.write
+name                    | string    | kernel assigned   | no        | all                              | -                                      | The name of the interface inside the container
+host\_name              | string    | randomly assigned | no        | bridged, macvlan, p2p, vfio      | -                                      | The name of the interface inside the host
+hwaddr                  | string    | randomly assigned | no        | all                              | -                                      | The MAC address of the new interface
+mtu                     | integer   | parent MTU        | no        | all                              | -                                      | The MTU of the new interface
+parent                  | string    | -                 | yes       | bridged, macvlan, physical, vfio | -                                      | The name of the host device or bridge
+vlan                    | integer   | -                 | no        | macvlan, physical                | network\_vlan, network\_vlan\_physical | The VLAN ID to attach to
+ipv4.address            | string    | -                 | no        | bridged                          | network                                | An IPv4 address to assign to the container through DHCP
+ipv6.address            | string    | -                 | no        | bridged                          | network                                | An IPv6 address to assign to the container through DHCP
+security.mac\_filtering | boolean   | false             | no        | bridged                          | network                                | Prevent the container from spoofing another's MAC address
 
 #### bridged or macvlan for connection to physical network
 The `bridged` and `macvlan` interface types can both be used to connect
@@ -206,6 +207,31 @@ your containers to talk to the host itself.
 In such case, a bridge is preferable. A bridge will also let you use mac
 filtering and I/O limits which cannot be applied to a macvlan device.
 
+#### vfio
+The `vfio` interface type supports SR-IOV enabled network devices. These
+devices associate a set of virtual functions (VFs) with the single physical
+function (PF) of the network device. PFs are standard PCIe functions. VFs on
+the other hand are very lightweight PCIe functions that are optimized for data
+movement. They come with a limited set of configuration capabilities to prevent
+changing properties of the PF. Given that VFs appear as regular PCIe devices to
+the system they can be passed to containers just like a regular physical
+device. The `vfio` interface type expects to be passed the name of an SR-IOV
+enabled network device on the system via the `parent` property. LXD will then
+check for any available VFs on the system. By default LXD will allocate the
+first free VF it finds. If it detects that either none are enabled or all
+currently enabled VFs are in use it will bump the number of supported VFs to
+the maximum value and use the first free VF. If all possible VFs are in use or
+the kernel or card doesn't support incrementing the number of VFs LXD will
+return an error. To create a `vfio` network device use:
+
+```
+lxc config device add <container> <device-name> nic nictype=vfio parent=<sriov-enabled-device>
+```
+
+To tell LXD to use a specific unused VF add the `host_name` property and pass
+it the name of the enabled VF.
+
+
 ### Type: disk
 Disk entries are essentially mountpoints inside the container. They can
 either be a bind-mount of an existing file or directory on the host, or

From d0aea5b2360c8ee551c352fb4610cf153027884d Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner at ubuntu.com>
Date: Wed, 18 Oct 2017 12:07:52 +0200
Subject: [PATCH 3/3] api extension: add "network_vfio" extension

Closes #3941.

This adds support for SR-IOV enabled network devices.

Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
---
 doc/api-extensions.md | 3 +++
 lxd/api_1.0.go        | 1 +
 2 files changed, 4 insertions(+)

diff --git a/doc/api-extensions.md b/doc/api-extensions.md
index 04695ed6d..cbb607418 100644
--- a/doc/api-extensions.md
+++ b/doc/api-extensions.md
@@ -346,3 +346,6 @@ This adds support for querying an LXD daemon for the system resources it has
 ## kernel\_limits
 This adds support for setting process limits such as maximum number of open
 files for the container via `nofile`. The format is `limits.kernel.[limit name]`.
+
+## network\_vfio
+This adds support for SR-IOV enabled network devices via the `vfio` nictype.
diff --git a/lxd/api_1.0.go b/lxd/api_1.0.go
index d05dc407d..4ac0634ce 100644
--- a/lxd/api_1.0.go
+++ b/lxd/api_1.0.go
@@ -130,6 +130,7 @@ func api10Get(d *Daemon, r *http.Request) Response {
 			"storage_block_filesystem_btrfs",
 			"resources",
 			"kernel_limits",
+			"network_vfio",
 		},
 		APIStatus:  "stable",
 		APIVersion: version.APIVersion,


More information about the lxc-devel mailing list