[lxc-devel] [lxd/master] api: add support for SR-IOV enabled network devices
brauner on Github
lxc-bot at linuxcontainers.org
Wed Oct 18 10:12:35 UTC 2017
A non-text attachment was scrubbed...
Name: not available
Type: text/x-mailbox
Size: 1731 bytes
Desc: not available
URL: <http://lists.linuxcontainers.org/pipermail/lxc-devel/attachments/20171018/7ddcf3a7/attachment.bin>
-------------- next part --------------
From fcb83715682766d813d5dbd8b403d90511b3f1d7 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner at ubuntu.com>
Date: Tue, 17 Oct 2017 14:26:16 +0200
Subject: [PATCH 1/3] container: add nictype "vfio"
Closes #3941.
Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
---
lxd/container.go | 6 +--
lxd/container_lxc.go | 111 +++++++++++++++++++++++++++++++++++++++++++++++---
lxd/networks_utils.go | 2 +-
3 files changed, 109 insertions(+), 10 deletions(-)
diff --git a/lxd/container.go b/lxd/container.go
index d53d91e19..bfe0f2349 100644
--- a/lxd/container.go
+++ b/lxd/container.go
@@ -306,12 +306,12 @@ func containerValidDevices(dbObj *sql.DB, devices types.Devices, profile bool, e
return fmt.Errorf("Missing nic type")
}
- if !shared.StringInSlice(m["nictype"], []string{"bridged", "physical", "p2p", "macvlan"}) {
+ if !shared.StringInSlice(m["nictype"], []string{"bridged", "macvlan", "p2p", "physical", "vfio"}) {
return fmt.Errorf("Bad nic type: %s", m["nictype"])
}
- if shared.StringInSlice(m["nictype"], []string{"bridged", "physical", "macvlan"}) && m["parent"] == "" {
- return fmt.Errorf("Missing parent for %s type nic.", m["nictype"])
+ if shared.StringInSlice(m["nictype"], []string{"bridged", "macvlan", "physical", "vfio"}) && m["parent"] == "" {
+ return fmt.Errorf("Missing parent for %s type nic", m["nictype"])
}
} else if m["type"] == "disk" {
if !expanded && !shared.StringInSlice(m["path"], diskDevicePaths) {
diff --git a/lxd/container_lxc.go b/lxd/container_lxc.go
index fc5ed8cb0..45a33066e 100644
--- a/lxd/container_lxc.go
+++ b/lxd/container_lxc.go
@@ -1350,7 +1350,7 @@ func (c *containerLXC) initLXC() error {
if err != nil {
return err
}
- } else if m["nictype"] == "physical" {
+ } else if m["nictype"] == "physical" || m["nictype"] == "vfio" {
err = lxcSetConfigItem(cc, fmt.Sprintf("%s.%d.type", networkKeyPrefix, networkidx), "phys")
if err != nil {
return err
@@ -1377,6 +1377,11 @@ func (c *containerLXC) initLXC() error {
if err != nil {
return err
}
+ } else if m["nictype"] == "vfio" {
+ err = lxcSetConfigItem(cc, fmt.Sprintf("%s.%d.link", networkKeyPrefix, networkidx), m["host_name"])
+ if err != nil {
+ return err
+ }
} else if shared.StringInSlice(m["nictype"], []string{"macvlan", "physical"}) {
err = lxcSetConfigItem(cc, fmt.Sprintf("%s.%d.link", networkKeyPrefix, networkidx), networkGetHostDevice(m["parent"], m["vlan"]))
if err != nil {
@@ -1386,7 +1391,7 @@ func (c *containerLXC) initLXC() error {
// Host Virtual NIC name
vethName := ""
- if m["host_name"] != "" {
+ if m["host_name"] != "" && m["nictype"] != "vfio" {
vethName = m["host_name"]
} else if shared.IsTrue(m["security.mac_filtering"]) {
// We need a known device name for MAC filtering
@@ -5885,6 +5890,10 @@ func (c *containerLXC) createNetworkDevice(name string, m types.Device) (string,
}
}
+ if m["nictype"] == "vfio" {
+ dev = m["host_name"]
+ }
+
// Handle bridged and p2p
if shared.StringInSlice(m["nictype"], []string{"bridged", "p2p"}) {
n2 := deviceNextVeth()
@@ -5914,7 +5923,7 @@ func (c *containerLXC) createNetworkDevice(name string, m types.Device) (string,
}
// Handle physical and macvlan
- if shared.StringInSlice(m["nictype"], []string{"physical", "macvlan"}) {
+ if shared.StringInSlice(m["nictype"], []string{"macvlan", "physical"}) {
// Deal with VLAN
device := m["parent"]
if m["vlan"] != "" {
@@ -6118,11 +6127,99 @@ func (c *containerLXC) fillNetworkDevice(name string, m types.Device) (types.Dev
}
// Fill in the host name (but don't generate a static one ourselves)
- if m["host_name"] == "" && shared.StringInSlice(m["nictype"], []string{"bridged", "p2p"}) {
- configKey := fmt.Sprintf("volatile.%s.host_name", name)
+ configKey := fmt.Sprintf("volatile.%s.host_name", name)
+ if m["host_name"] == "" && shared.StringInSlice(m["nictype"], []string{"bridged", "p2p", "vfio"}) {
newDevice["host_name"] = c.localConfig[configKey]
}
+ if m["nictype"] == "vfio" && m["parent"] != "" {
+ if !shared.PathExists(fmt.Sprintf("/sys/class/net/%s", m["parent"])) {
+ return nil, fmt.Errorf("Parent device '%s' doesn't exist", m["parent"])
+ }
+
+ if newDevice["host_name"] == "" {
+ sriovNumVFs := fmt.Sprintf("/sys/class/net/%s/device/sriov_numvfs", m["parent"])
+ sriovTotalVFs := fmt.Sprintf("/sys/class/net/%s/device/sriov_totalvfs", m["parent"])
+
+ // verify that this is indeed a SR-IOV enabled device
+ if !shared.PathExists(sriovTotalVFs) {
+ return nil, fmt.Errorf("Parent device '%s' doesn't support SR-IOV", m["parent"])
+ }
+
+ // get number of currently enabled VFs
+ sriovNumVfsBuf, err := ioutil.ReadFile(sriovNumVFs)
+ if err != nil {
+ return nil, err
+ }
+ sriovNumVfsStr := strings.TrimSpace(string(sriovNumVfsBuf))
+ sriovNum, err := strconv.Atoi(sriovNumVfsStr)
+ if err != nil {
+ return nil, err
+ }
+
+ // get number of possible VFs
+ sriovTotalVfsBuf, err := ioutil.ReadFile(sriovTotalVFs)
+ if err != nil {
+ return nil, err
+ }
+ sriovTotalVfsStr := strings.TrimSpace(string(sriovTotalVfsBuf))
+ sriovTotal, err := strconv.Atoi(sriovTotalVfsStr)
+ if err != nil {
+ return nil, err
+ }
+
+ // Check if any VFs are already enabled
+ vf := ""
+ for i := 0; i < sriovNum; i++ {
+ vf = fmt.Sprintf("virtfn%d", i)
+ if !shared.PathExists(fmt.Sprintf("/sys/class/net/%s/device/%s/net", m["parent"], vf)) {
+ vf = ""
+ continue
+ }
+
+ // Check if VF is already in use
+ empty, err := shared.PathIsEmpty(fmt.Sprintf("/sys/class/net/%s/device/%s/net", m["parent"], vf))
+ if err != nil {
+ return nil, err
+ }
+ if empty {
+ vf = ""
+ continue
+ }
+
+ // found free VF
+ break
+ }
+
+ if vf == "" {
+ if sriovNum == sriovTotal {
+ return nil, fmt.Errorf("All virtual functions of vfio device '%s' seem to be in use", m["parent"])
+ }
+
+ // bump the number of VFs to the maximum
+ err := ioutil.WriteFile(sriovNumVFs, []byte(sriovTotalVfsStr), 0644)
+ if err != nil {
+ return nil, err
+ }
+
+ // use next free VF index
+ vf = fmt.Sprintf("virtfn%d", sriovNum+1)
+ }
+
+ vf = fmt.Sprintf("/sys/class/net/%s/device/%s/net", m["parent"], vf)
+ ents, err := ioutil.ReadDir(vf)
+ if err != nil {
+ return nil, err
+ }
+ if len(ents) == 0 || len(ents) > 1 {
+ return nil, fmt.Errorf("Failed to determine unique device name")
+ }
+
+ newDevice["host_name"] = ents[0].Name()
+ c.localConfig[configKey] = ents[0].Name()
+ }
+ }
+
return newDevice, nil
}
@@ -6249,6 +6346,8 @@ func (c *containerLXC) removeNetworkDevice(name string, m types.Device) error {
var hostName string
if m["nictype"] == "physical" {
hostName = m["parent"]
+ } else if m["nictype"] == "vfio" {
+ hostName = m["host_name"]
} else {
hostName = deviceNextVeth()
}
@@ -6266,7 +6365,7 @@ func (c *containerLXC) removeNetworkDevice(name string, m types.Device) error {
}
// If a veth, destroy it
- if m["nictype"] != "physical" {
+ if m["nictype"] != "physical" && m["nictype"] != "vfio" {
deviceRemoveInterface(hostName)
}
diff --git a/lxd/networks_utils.go b/lxd/networks_utils.go
index 4cf455a84..95656ba53 100644
--- a/lxd/networks_utils.go
+++ b/lxd/networks_utils.go
@@ -104,7 +104,7 @@ func networkIsInUse(c container, name string) bool {
continue
}
- if !shared.StringInSlice(d["nictype"], []string{"bridged", "macvlan", "physical"}) {
+ if !shared.StringInSlice(d["nictype"], []string{"bridged", "macvlan", "physical", "vfio"}) {
continue
}
From 817fc4cacb2acb4513cb48c41ad7330147226490 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner at ubuntu.com>
Date: Wed, 18 Oct 2017 11:33:35 +0200
Subject: [PATCH 2/3] doc/containers: add nictype=vfio
Closes #3941.
Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
---
doc/containers.md | 58 ++++++++++++++++++++++++++++++++++++++++---------------
1 file changed, 42 insertions(+), 16 deletions(-)
diff --git a/doc/containers.md b/doc/containers.md
index c5a72bea9..c8a0f8f14 100644
--- a/doc/containers.md
+++ b/doc/containers.md
@@ -62,7 +62,7 @@ Key | Type | Default | Description
:-- | :--- | :------ | :----------
volatile.\<name\>.hwaddr | string | - | Network device MAC address (when no hwaddr property is set on the device itself)
volatile.\<name\>.name | string | - | Network device name (when no name propery is set on the device itself)
-volatile.\<name\>.host\_name | string | - | Network device name on the host (for nictype=bridged or nictype=p2p)
+volatile.\<name\>.host\_name | string | - | Network device name on the host (for nictype=bridged, nictype=p2p, or nictype=vfio)
volatile.apply\_quota | string | - | Disk quota to be applied on next container start
volatile.apply\_template | string | - | The name of a template hook which should be triggered upon next startup
volatile.base\_image | string | - | The hash of the image the container was created from, if any.
@@ -170,24 +170,25 @@ LXD supports different kind of network devices:
- `bridged`: Uses an existing bridge on the host and creates a virtual device pair to connect the host bridge to the container.
- `macvlan`: Sets up a new network device based on an existing one but using a different MAC address.
- `p2p`: Creates a virtual device pair, putting one side in the container and leaving the other side on the host.
+ - `vfio`: Passes a virtual function of an SR-IOV enabled physical network device into the container.
Different network interface types have different additional properties, the current list is:
-Key | Type | Default | Required | Used by | API extension | Description
-:-- | :-- | :-- | :-- | :-- | :-- | :--
-nictype | string | - | yes | all | - | The device type, one of "physical", "bridged", "macvlan" or "p2p"
-limits.ingress | string | - | no | bridged, p2p | - | I/O limit in bit/s (supports kbit, Mbit, Gbit suffixes)
-limits.egress | string | - | no | bridged, p2p | - | I/O limit in bit/s (supports kbit, Mbit, Gbit suffixes)
-limits.max | string | - | no | bridged, p2p | - | Same as modifying both limits.read and limits.write
-name | string | kernel assigned | no | all | - | The name of the interface inside the container
-host\_name | string | randomly assigned | no | bridged, p2p, macvlan | - | The name of the interface inside the host
-hwaddr | string | randomly assigned | no | all | - | The MAC address of the new interface
-mtu | integer | parent MTU | no | all | - | The MTU of the new interface
-parent | string | - | yes | physical, bridged, macvlan | - | The name of the host device or bridge
-vlan | integer | - | no | macvlan, physical | network\_vlan, network\_vlan\_physical | The VLAN ID to attach to
-ipv4.address | string | - | no | bridged | network | An IPv4 address to assign to the container through DHCP
-ipv6.address | string | - | no | bridged | network | An IPv6 address to assign to the container through DHCP
-security.mac\_filtering | boolean | false | no | bridged | network | Prevent the container from spoofing another's MAC address
+Key | Type | Default | Required | Used by | API extension | Description
+:-- | :-- | :-- | :-- | :-- | :-- | :--
+nictype | string | - | yes | all | - | The device type, one of "bridged", "macvlan", "p2p", "physical", or "vfio"
+limits.ingress | string | - | no | bridged, p2p | - | I/O limit in bit/s (supports kbit, Mbit, Gbit suffixes)
+limits.egress | string | - | no | bridged, p2p | - | I/O limit in bit/s (supports kbit, Mbit, Gbit suffixes)
+limits.max | string | - | no | bridged, p2p | - | Same as modifying both limits.read and limits.write
+name | string | kernel assigned | no | all | - | The name of the interface inside the container
+host\_name | string | randomly assigned | no | bridged, macvlan, p2p, vfio | - | The name of the interface inside the host
+hwaddr | string | randomly assigned | no | all | - | The MAC address of the new interface
+mtu | integer | parent MTU | no | all | - | The MTU of the new interface
+parent | string | - | yes | bridged, macvlan, physical, vfio | - | The name of the host device or bridge
+vlan | integer | - | no | macvlan, physical | network\_vlan, network\_vlan\_physical | The VLAN ID to attach to
+ipv4.address | string | - | no | bridged | network | An IPv4 address to assign to the container through DHCP
+ipv6.address | string | - | no | bridged | network | An IPv6 address to assign to the container through DHCP
+security.mac\_filtering | boolean | false | no | bridged | network | Prevent the container from spoofing another's MAC address
#### bridged or macvlan for connection to physical network
The `bridged` and `macvlan` interface types can both be used to connect
@@ -206,6 +207,31 @@ your containers to talk to the host itself.
In such case, a bridge is preferable. A bridge will also let you use mac
filtering and I/O limits which cannot be applied to a macvlan device.
+#### vfio
+The `vfio` interface type supports SR-IOV enabled network devices. These
+devices associate a set of virtual functions (VFs) with the single physical
+function (PF) of the network device. PFs are standard PCIe functions. VFs on
+the other hand are very lightweight PCIe functions that are optimized for data
+movement. They come with a limited set of configuration capabilities to prevent
+changing properties of the PF. Given that VFs appear as regular PCIe devices to
+the system they can be passed to containers just like a regular physical
+device. The `vfio` interface type expects to be passed the name of an SR-IOV
+enabled network device on the system via the `parent` property. LXD will then
+check for any available VFs on the system. By default, LXD will allocate the
+first free VF it finds. If it detects that either none are enabled or all
+currently enabled VFs are in use, it will bump the number of enabled VFs to
+the maximum value and use the first free VF. If all possible VFs are in use, or
+the kernel or card doesn't support increasing the number of VFs, LXD will
+return an error. To create a `vfio` network device, use:
+
+```
+lxc config device add <container> <device-name> nic nictype=vfio parent=<sriov-enabled-device>
+```
+
+To tell LXD to use a specific unused VF, add the `host_name` property and pass
+it the name of the enabled VF.
+
+
### Type: disk
Disk entries are essentially mountpoints inside the container. They can
either be a bind-mount of an existing file or directory on the host, or
From d0aea5b2360c8ee551c352fb4610cf153027884d Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner at ubuntu.com>
Date: Wed, 18 Oct 2017 12:07:52 +0200
Subject: [PATCH 3/3] api extension: add "network_vfio" extension
Closes #3941.
This adds support for SR-IOV enabled network devices.
Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
---
doc/api-extensions.md | 3 +++
lxd/api_1.0.go | 1 +
2 files changed, 4 insertions(+)
diff --git a/doc/api-extensions.md b/doc/api-extensions.md
index 04695ed6d..cbb607418 100644
--- a/doc/api-extensions.md
+++ b/doc/api-extensions.md
@@ -346,3 +346,6 @@ This adds support for querying an LXD daemon for the system resources it has
## kernel\_limits
This adds support for setting process limits such as maximum number of open
files for the container via `nofile`. The format is `limits.kernel.[limit name]`.
+
+## network\_vfio
+This adds support for SR-IOV enabled network devices via the `vfio` nictype.
diff --git a/lxd/api_1.0.go b/lxd/api_1.0.go
index d05dc407d..4ac0634ce 100644
--- a/lxd/api_1.0.go
+++ b/lxd/api_1.0.go
@@ -130,6 +130,7 @@ func api10Get(d *Daemon, r *http.Request) Response {
"storage_block_filesystem_btrfs",
"resources",
"kernel_limits",
+ "network_vfio",
},
APIStatus: "stable",
APIVersion: version.APIVersion,
More information about the lxc-devel
mailing list