[lxc-devel] [lxd/master] GPU device port to use resources package
tomponline on Github
lxc-bot at linuxcontainers.org
Mon Aug 19 14:12:02 UTC 2019
A non-text attachment was scrubbed...
Name: not available
Type: text/x-mailbox
Size: 449 bytes
Desc: not available
URL: <http://lists.linuxcontainers.org/pipermail/lxc-devel/attachments/20190819/44d5034c/attachment-0001.bin>
-------------- next part --------------
From 18e893168eca7721d06d49987f40b88cb867909b Mon Sep 17 00:00:00 2001
From: Thomas Parrott <thomas.parrott at canonical.com>
Date: Mon, 19 Aug 2019 14:39:36 +0100
Subject: [PATCH 1/3] device/utils/unix: Fix double device name encoding in
file name
Signed-off-by: Thomas Parrott <thomas.parrott at canonical.com>
---
lxd/device/device_utils_unix.go | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/lxd/device/device_utils_unix.go b/lxd/device/device_utils_unix.go
index d97c6f3984..de2cf57644 100644
--- a/lxd/device/device_utils_unix.go
+++ b/lxd/device/device_utils_unix.go
@@ -325,7 +325,7 @@ func unixDeviceSetup(s *state.State, devicesPath string, typePrefix string, devi
}
// Create the device on the host.
- ourPrefix := unixDeviceEncode(unixDeviceJoinPath(typePrefix, deviceName))
+ ourPrefix := unixDeviceJoinPath(typePrefix, deviceName)
d, err := UnixDeviceCreate(s, nil, devicesPath, ourPrefix, m, defaultMode)
if err != nil {
return err
From 3045f0a4704ab97b002f316d4cca71338ed3f82f Mon Sep 17 00:00:00 2001
From: Thomas Parrott <thomas.parrott at canonical.com>
Date: Mon, 19 Aug 2019 14:39:56 +0100
Subject: [PATCH 2/3] test: Adds GPU tests
Signed-off-by: Thomas Parrott <thomas.parrott at canonical.com>
---
test/main.sh | 3 +-
test/suites/container_devices_gpu.sh | 88 ++++++++++++++++++++++++++++
2 files changed, 90 insertions(+), 1 deletion(-)
create mode 100644 test/suites/container_devices_gpu.sh
diff --git a/test/main.sh b/test/main.sh
index 53cbc6a94c..a207dcb28f 100755
--- a/test/main.sh
+++ b/test/main.sh
@@ -198,6 +198,8 @@ run_test test_container_devices_nic_ipvlan "container devices - nic - ipvlan"
run_test test_container_devices_nic_sriov "container devices - nic - sriov"
run_test test_container_devices_ib_physical "container devices - infiniband - physical"
run_test test_container_devices_ib_sriov "container devices - infiniband - sriov"
+run_test test_container_devices_proxy "container devices - proxy"
+run_test test_container_devices_gpu "container devices - gpu"
run_test test_security "security features"
run_test test_security_protection "container protection"
run_test test_image_expiry "image expiry"
@@ -240,7 +242,6 @@ run_test test_kernel_limits "kernel limits"
run_test test_macaroon_auth "macaroon authentication"
run_test test_console "console"
run_test test_query "query"
-run_test test_container_devices_proxy "container devices - proxy"
run_test test_storage_local_volume_handling "storage local volume handling"
run_test test_backup_import "backup import"
run_test test_backup_export "backup export"
diff --git a/test/suites/container_devices_gpu.sh b/test/suites/container_devices_gpu.sh
new file mode 100644
index 0000000000..fb03f71686
--- /dev/null
+++ b/test/suites/container_devices_gpu.sh
@@ -0,0 +1,88 @@
+test_container_devices_gpu() {
+ ensure_import_testimage
+ ensure_has_localhost_remote "${LXD_ADDR}"
+
+ if [ ! -c /dev/dri/card0 ]; then
+ echo "==> SKIP: No /dev/dri/card0 device found"
+ return
+ fi
+
+ ctName="ct$$"
+ lxc launch testimage "${ctName}"
+
+ # Check adding all cards creates the correct device mounts and cleans up on removal.
+ startMountCount=$(lxc exec "${ctName}" -- mount | wc -l)
+ startDevCount=$(find "${LXD_DIR}"/devices/"${ctName}" -type c | wc -l)
+ lxc config device add "${ctName}" gpu-all gpu mode=0600
+ lxc exec "${ctName}" -- mount | grep "tmpfs on /dev/dri/card0 type tmpfs"
+ lxc exec "${ctName}" -- stat -c '%a' /dev/dri/card0 | grep 600
+ stat -c '%a' "${LXD_DIR}"/devices/"${ctName}"/unix.gpu--all.dev-dri-card0 | grep 600
+ lxc config device remove "${ctName}" gpu-all
+ endMountCount=$(lxc exec "${ctName}" -- mount | wc -l)
+ endDevCount=$(find "${LXD_DIR}"/devices/"${ctName}" -type c | wc -l)
+
+ if [ "$startMountCount" != "$endMountCount" ]; then
+ echo "leftover container mounts detected"
+ false
+ fi
+
+ if [ "$startDevCount" != "$endDevCount" ]; then
+ echo "leftover host devices detected"
+ false
+ fi
+
+ # Check adding non-existent card fails.
+ ! lxc config device add "${ctName}" gpu-missing gpu id=9999
+
+ # Check default create mode is 0660.
+ lxc config device add "${ctName}" gpu-default gpu id=0
+ lxc exec "${ctName}" -- stat -c '%a' /dev/dri/card0 | grep 660
+ lxc config device remove "${ctName}" gpu-default
+
+ # Check Nvidia devices if card0 is an Nvidia GPU.
+ card0Minor=$(stat -c %T /dev/dri/card0)
+ if [ ! -c /dev/nvidia"${card0Minor}" ]; then
+ echo "==> SKIP: /dev/dri/card0 is not Nvidia card, skipping Nvidia tests"
+ lxc delete -f "${ctName}"
+ return
+ fi
+
+ # Check the Nvidia specific devices are mounted correctly.
+ lxc config device add "${ctName}" gpu-nvidia gpu id=0 mode=0600
+ lxc exec "${ctName}" -- mount | grep "tmpfs on /dev/dri/card0 type tmpfs"
+
+ lxc exec "${ctName}" -- mount | grep /dev/nvidia0
+ stat -c '%a' "${LXD_DIR}"/devices/"${ctName}"/unix.gpu--nvidia.dev-dri-card0 | grep 600
+
+ lxc exec "${ctName}" -- mount | grep /dev/nvidia-modeset
+ stat -c '%a' "${LXD_DIR}"/devices/"${ctName}"/unix.gpu--nvidia.dev-nvidia--modeset | grep 600
+
+ lxc exec "${ctName}" -- mount | grep /dev/nvidia-uvm
+ stat -c '%a' "${LXD_DIR}"/devices/"${ctName}"/unix.gpu--nvidia.dev-nvidia--uvm | grep 600
+
+ lxc exec "${ctName}" -- mount | grep /dev/nvidia-uvm-tools
+ stat -c '%a' "${LXD_DIR}"/devices/"${ctName}"/unix.gpu--nvidia.dev-nvidia--uvm--tools | grep 600
+
+ lxc exec "${ctName}" -- mount | grep /dev/nvidiactl
+ stat -c '%a' "${LXD_DIR}"/devices/"${ctName}"/unix.gpu--nvidia.dev-nvidiactl | grep 600
+
+ lxc config device remove "${ctName}" gpu-nvidia
+
+ # Check support for nvidia runtime (requires libnvidia-container-tools to be installed).
+ if [ ! -f /usr/bin/nvidia-container-cli ]; then
+ echo "==> SKIP: /usr/bin/nvidia-container-cli not available (please install libnvidia-container-tools)"
+ lxc delete -f "${ctName}"
+ return
+ fi
+
+ lxc stop -f "${ctName}"
+ lxc config set "${ctName}" nvidia.runtime true
+ lxc start "${ctName}"
+ nvidiaMountCount=$(lxc exec "${ctName}" -- mount | grep -c nvidia)
+ if [ "$nvidiaMountCount" != "16" ]; then
+ echo "nvidia runtime mounts invalid"
+ false
+ fi
+
+ lxc delete -f "${ctName}"
+}
From 62cc64260dd812231e7adf8eb00055932246bf41 Mon Sep 17 00:00:00 2001
From: Thomas Parrott <thomas.parrott at canonical.com>
Date: Mon, 19 Aug 2019 11:33:10 +0100
Subject: [PATCH 3/3] device/gpu: Moves nvidia device loading to use resources
package
Signed-off-by: Thomas Parrott <thomas.parrott at canonical.com>
---
lxd/device/gpu.go | 591 ++++++++--------------------------------------
1 file changed, 105 insertions(+), 486 deletions(-)
diff --git a/lxd/device/gpu.go b/lxd/device/gpu.go
index 0f7deea459..dca055ad29 100644
--- a/lxd/device/gpu.go
+++ b/lxd/device/gpu.go
@@ -1,105 +1,33 @@
package device
import (
- "encoding/csv"
"fmt"
- "io"
"io/ioutil"
"os"
- "os/exec"
"path/filepath"
"regexp"
"strconv"
"strings"
- "github.com/jaypipes/pcidb"
"golang.org/x/sys/unix"
"github.com/lxc/lxd/lxd/device/config"
"github.com/lxc/lxd/lxd/instance"
+ "github.com/lxc/lxd/lxd/resources"
"github.com/lxc/lxd/shared"
)
-type gpu struct {
- deviceCommon
-}
-
-// /dev/dri/card0. If we detect that vendor == nvidia, then nvidia will contain
-// the corresponding nvidia car, e.g. {/dev/dri/card1 to /dev/nvidia1}.
-type gpuDevice struct {
- // DRM node information
- id string
- path string
- major uint32
- minor uint32
-
- // Device information
- vendorID string
- vendorName string
- productID string
- productName string
- numaNode uint64
-
- // If related devices have the same PCI address as the GPU we should
- // mount them all. Meaning if we detect /dev/dri/card0,
- // /dev/dri/controlD64, and /dev/dri/renderD128 with the same PCI
- // address, then they should all be made available in the container.
- pci string
- driver string
- driverVersion string
-
- // NVIDIA specific handling
- isNvidia bool
- nvidia nvidiaGpuCard
-}
+const gpuDRIDevPath = "/dev/dri"
-func (g *gpuDevice) isNvidiaGpu() bool {
- return strings.EqualFold(g.vendorID, "10de")
-}
-
-// /dev/nvidia[0-9]+
-type nvidiaGpuCard struct {
+// Non-card devices such as {/dev/nvidiactl, /dev/nvidia-uvm, ...}
+type nvidiaNonCardDevice struct {
path string
major uint32
minor uint32
- id string
-
- nvrmVersion string
- cudaVersion string
- model string
- brand string
- uuid string
- architecture string
}
-// {/dev/nvidiactl, /dev/nvidia-uvm, ...}
-type nvidiaGpuDevice struct {
- isCard bool
- path string
- major uint32
- minor uint32
-}
-
-// Nvidia container info
-type nvidiaContainerInfo struct {
- Cards map[string]*nvidiaContainerCardInfo
- NVRMVersion string
- CUDAVersion string
-}
-
-type nvidiaContainerCardInfo struct {
- DeviceIndex string
- DeviceMinor string
- Model string
- Brand string
- UUID string
- PCIAddress string
- Architecture string
-}
-
-type cardIds struct {
- id string
- pci string
+type gpu struct {
+ deviceCommon
}
// validateConfig checks the supplied config for correctness.
@@ -151,66 +79,91 @@ func (d *gpu) Start() (*RunConfig, error) {
}
runConf := RunConfig{}
-
- allGpus := d.deviceWantsAllGPUs(d.config)
- gpus, nvidiaDevices, err := d.deviceLoadGpu(allGpus)
+ gpus, err := resources.GetGPU()
if err != nil {
return nil, err
}
sawNvidia := false
found := false
- for _, gpu := range gpus {
- if (d.config["vendorid"] != "" && gpu.vendorID != d.config["vendorid"]) ||
- (d.config["pci"] != "" && gpu.pci != d.config["pci"]) ||
- (d.config["productid"] != "" && gpu.productID != d.config["productid"]) ||
- (d.config["id"] != "" && gpu.id != d.config["id"]) {
+ for _, gpu := range gpus.Cards {
+ if (d.config["vendorid"] != "" && gpu.VendorID != d.config["vendorid"]) ||
+ (d.config["pci"] != "" && gpu.PCIAddress != d.config["pci"]) ||
+ (d.config["productid"] != "" && gpu.ProductID != d.config["productid"]) {
continue
}
- found = true
- err := unixDeviceSetupCharNum(d.state, d.instance.DevicesPath(), "unix", d.name, d.config, gpu.major, gpu.minor, gpu.path, false, &runConf)
- if err != nil {
- return nil, err
- }
+ // Handle DRM devices if present and matches criteria.
+ if gpu.DRM != nil && (d.config["id"] == "" || fmt.Sprintf("%d", gpu.DRM.ID) == d.config["id"]) {
+ found = true
- if !gpu.isNvidia {
- continue
- }
+ // DRM &{ID:0 CardName:card0 CardDevice:226:0 ControlName: ControlDevice: RenderName:renderD128 RenderDevice:226:128}
+ // TODO: Need to also support ControlName & ControlDevice?
- if gpu.nvidia.path != "" {
- err = unixDeviceSetupCharNum(d.state, d.instance.DevicesPath(), "unix", d.name, d.config, gpu.nvidia.major, gpu.nvidia.minor, gpu.nvidia.path, false, &runConf)
- if err != nil {
- return nil, err
- }
- } else if !allGpus {
- return nil, fmt.Errorf("Failed to detect correct \"/dev/nvidia\" path")
- }
+ if gpu.DRM.CardName != "" && gpu.DRM.CardDevice != "" {
+ path := filepath.Join(gpuDRIDevPath, gpu.DRM.CardName)
+ major, minor, err := d.deviceNumStringToUint32(gpu.DRM.CardDevice)
+ if err != nil {
+ return nil, err
+ }
- sawNvidia = true
- }
+ err = unixDeviceSetupCharNum(d.state, d.instance.DevicesPath(), "unix", d.name, d.config, major, minor, path, false, &runConf)
+ if err != nil {
+ return nil, err
+ }
+ }
- if sawNvidia {
- for _, gpu := range nvidiaDevices {
- instanceConfig := d.instance.ExpandedConfig()
+ if gpu.DRM.RenderName != "" && gpu.DRM.RenderDevice != "" {
+ path := filepath.Join(gpuDRIDevPath, gpu.DRM.RenderName)
+ major, minor, err := d.deviceNumStringToUint32(gpu.DRM.RenderDevice)
+ if err != nil {
+ return nil, err
+ }
- // No need to mount additional nvidia non-card devices as the nvidia.runtime
- // setting will do this for us.
- if shared.IsTrue(instanceConfig["nvidia.runtime"]) {
- if !gpu.isCard {
- continue
+ err = unixDeviceSetupCharNum(d.state, d.instance.DevicesPath(), "unix", d.name, d.config, major, minor, path, false, &runConf)
+ if err != nil {
+ return nil, err
}
}
- prefix := unixDeviceJoinPath("unix", d.name)
- if UnixDeviceExists(d.instance.DevicesPath(), prefix, gpu.path) {
- continue
+ // Add Nvidia device if present.
+ if gpu.Nvidia != nil && gpu.Nvidia.CardName != "" && gpu.Nvidia.CardDevice != "" {
+ sawNvidia = true
+ path := filepath.Join("/dev", gpu.Nvidia.CardName)
+ major, minor, err := d.deviceNumStringToUint32(gpu.Nvidia.CardDevice)
+ if err != nil {
+ return nil, err
+ }
+
+ err = unixDeviceSetupCharNum(d.state, d.instance.DevicesPath(), "unix", d.name, d.config, major, minor, path, false, &runConf)
+ if err != nil {
+ return nil, err
+ }
}
+ }
+ }
- err = unixDeviceSetupCharNum(d.state, d.instance.DevicesPath(), "unix", d.name, d.config, gpu.major, gpu.minor, gpu.path, false, &runConf)
+ if sawNvidia {
+ // No need to mount additional nvidia non-card devices as the nvidia.runtime
+ // setting will do this for us.
+ instanceConfig := d.instance.ExpandedConfig()
+ if !shared.IsTrue(instanceConfig["nvidia.runtime"]) {
+ nvidiaDevices, err := d.getNvidiaNonCardDevices()
if err != nil {
return nil, err
}
+
+ for _, dev := range nvidiaDevices {
+ prefix := unixDeviceJoinPath("unix", d.name)
+ if UnixDeviceExists(d.instance.DevicesPath(), prefix, dev.path) {
+ continue
+ }
+
+ err = unixDeviceSetupCharNum(d.state, d.instance.DevicesPath(), "unix", d.name, d.config, dev.major, dev.minor, dev.path, false, &runConf)
+ if err != nil {
+ return nil, err
+ }
+ }
}
}
@@ -246,399 +199,65 @@ func (d *gpu) postStop() error {
return nil
}
-// deviceWantsAllGPUs whether the LXD device wants to passthrough all GPUs on the host.
-func (d *gpu) deviceWantsAllGPUs(m map[string]string) bool {
- return m["vendorid"] == "" && m["productid"] == "" && m["id"] == "" && m["pci"] == ""
-}
-
-// deviceLoadGpu probes the system for information about the available GPUs.
-func (d *gpu) deviceLoadGpu(all bool) ([]gpuDevice, []nvidiaGpuDevice, error) {
- const drmPath = "/sys/class/drm/"
- var gpus []gpuDevice
- var nvidiaDevices []nvidiaGpuDevice
- var cards []cardIds
-
- // Load NVIDIA information (if available)
- var nvidiaContainer *nvidiaContainerInfo
-
- _, err := exec.LookPath("nvidia-container-cli")
- if err == nil {
- out, err := shared.RunCommand("nvidia-container-cli", "info", "--csv")
- if err == nil {
- r := csv.NewReader(strings.NewReader(out))
- r.FieldsPerRecord = -1
-
- nvidiaContainer = &nvidiaContainerInfo{}
- nvidiaContainer.Cards = map[string]*nvidiaContainerCardInfo{}
- line := 0
- for {
- record, err := r.Read()
- if err == io.EOF {
- break
- }
- line++
-
- if err != nil {
- continue
- }
-
- if line == 2 && len(record) >= 2 {
- nvidiaContainer.NVRMVersion = record[0]
- nvidiaContainer.CUDAVersion = record[1]
- } else if line >= 4 {
- nvidiaContainer.Cards[record[5]] = &nvidiaContainerCardInfo{
- DeviceIndex: record[0],
- DeviceMinor: record[1],
- Model: record[2],
- Brand: record[3],
- UUID: record[4],
- PCIAddress: record[5],
- Architecture: record[6],
- }
- }
- }
- }
+// deviceNumStringToUint32 converts a device number string (major:minor) into separate major and
+// minor uint32s.
+func (d *gpu) deviceNumStringToUint32(devNum string) (uint32, uint32, error) {
+ devParts := strings.SplitN(devNum, ":", 2)
+ tmp, err := strconv.ParseUint(devParts[0], 10, 32)
+ if err != nil {
+ return 0, 0, err
}
+ major := uint32(tmp)
- // Load PCI database
- pciDB, err := pcidb.New()
+ tmp, err = strconv.ParseUint(devParts[1], 10, 32)
if err != nil {
- pciDB = nil
+ return 0, 0, err
}
+ minor := uint32(tmp)
- // Get the list of DRM devices
- ents, err := ioutil.ReadDir(drmPath)
+ return major, minor, nil
+}
+
+// getNvidiaNonCardDevices returns device information about Nvidia non-card devices.
+func (d *gpu) getNvidiaNonCardDevices() ([]nvidiaNonCardDevice, error) {
+ nvidiaEnts, err := ioutil.ReadDir("/dev")
if err != nil {
- // No GPUs
if os.IsNotExist(err) {
- return nil, nil, nil
+ return nil, err
}
-
- return nil, nil, err
}
- // Get the list of cards
- devices := []string{}
- for _, ent := range ents {
- dev, err := filepath.EvalSymlinks(fmt.Sprintf("%s/%s/device", drmPath, ent.Name()))
- if err != nil {
- continue
- }
-
- if !shared.StringInSlice(dev, devices) {
- devices = append(devices, dev)
- }
+ regexNvidiaCard, err := regexp.Compile(`^nvidia[0-9]+`)
+ if err != nil {
+ return nil, err
}
- isNvidia := false
- for _, device := range devices {
- // The pci address == the name of the directory. So let's use
- // this cheap way of retrieving it.
- pciAddr := filepath.Base(device)
-
- // Make sure that we are dealing with a GPU by looking whether
- // the "drm" subfolder exists.
- drm := filepath.Join(device, "drm")
- drmEnts, err := ioutil.ReadDir(drm)
- if err != nil {
- if os.IsNotExist(err) {
- continue
- }
- }
-
- // Retrieve vendor ID.
- vendorIDPath := filepath.Join(device, "vendor")
- vendorID, err := ioutil.ReadFile(vendorIDPath)
- if err != nil {
- if os.IsNotExist(err) {
- continue
- }
- }
-
- // Retrieve device ID.
- productIDPath := filepath.Join(device, "device")
- productID, err := ioutil.ReadFile(productIDPath)
- if err != nil {
- if os.IsNotExist(err) {
- continue
- }
- }
-
- // Retrieve node ID
- numaPath := fmt.Sprintf(filepath.Join(device, "numa_node"))
- numaNode := uint64(0)
- if shared.PathExists(numaPath) {
- numaID, err := shared.ParseNumberFromFile(numaPath)
- if err != nil {
- continue
- }
-
- if numaID > 0 {
- numaNode = uint64(numaID)
- }
- }
-
- // Retrieve driver
- driver := ""
- driverVersion := ""
- driverPath := filepath.Join(device, "driver")
- if shared.PathExists(driverPath) {
- target, err := os.Readlink(driverPath)
- if err != nil {
- continue
- }
-
- driver = filepath.Base(target)
-
- out, err := ioutil.ReadFile(filepath.Join(driverPath, "module", "version"))
- if err == nil {
- driverVersion = strings.TrimSpace(string(out))
- } else {
- uname, err := shared.Uname()
- if err != nil {
- continue
- }
- driverVersion = uname.Release
- }
- }
-
- // Store all associated subdevices, e.g. controlD64, renderD128.
- // The name of the directory == the last part of the
- // /dev/dri/controlD64 path. So drmEnt.Name() will give us
- // controlD64.
- for _, drmEnt := range drmEnts {
- vendorTmp := strings.TrimSpace(string(vendorID))
- productTmp := strings.TrimSpace(string(productID))
- vendorTmp = strings.TrimPrefix(vendorTmp, "0x")
- productTmp = strings.TrimPrefix(productTmp, "0x")
- tmpGpu := gpuDevice{
- pci: pciAddr,
- vendorID: vendorTmp,
- productID: productTmp,
- numaNode: numaNode,
- driver: driver,
- driverVersion: driverVersion,
- path: filepath.Join("/dev/dri", drmEnt.Name()),
- }
-
- // Fill vendor and product names
- if pciDB != nil {
- vendor, ok := pciDB.Vendors[tmpGpu.vendorID]
- if ok {
- tmpGpu.vendorName = vendor.Name
-
- for _, product := range vendor.Products {
- if product.ID == tmpGpu.productID {
- tmpGpu.productName = product.Name
- break
- }
- }
- }
- }
-
- majMinPath := filepath.Join(drm, drmEnt.Name(), "dev")
- majMinByte, err := ioutil.ReadFile(majMinPath)
- if err != nil {
- if os.IsNotExist(err) {
- continue
- }
- }
-
- majMin := strings.TrimSpace(string(majMinByte))
- majMinSlice := strings.Split(string(majMin), ":")
- if len(majMinSlice) != 2 {
- continue
- }
-
- majorInt, err := strconv.ParseUint(majMinSlice[0], 10, 32)
- if err != nil {
- continue
- }
-
- minorInt, err := strconv.ParseUint(majMinSlice[1], 10, 32)
- if err != nil {
- continue
- }
-
- tmpGpu.major = uint32(majorInt)
- tmpGpu.minor = uint32(minorInt)
-
- isCard, err := regexp.MatchString("^card[0-9]+", drmEnt.Name())
- if err != nil {
- continue
- }
-
- // Find matching /dev/nvidia* entry for /dev/dri/card*
- if tmpGpu.isNvidiaGpu() && isCard {
- if !isNvidia {
- isNvidia = true
- }
- tmpGpu.isNvidia = true
-
- if !all {
- minor, err := d.findNvidiaMinor(tmpGpu.pci)
- if err == nil {
- nvidiaPath := "/dev/nvidia" + minor
- stat := unix.Stat_t{}
- err = unix.Stat(nvidiaPath, &stat)
- if err != nil {
- if os.IsNotExist(err) {
- continue
- }
-
- return nil, nil, err
- }
-
- tmpGpu.nvidia.path = nvidiaPath
- tmpGpu.nvidia.major = unix.Major(stat.Rdev)
- tmpGpu.nvidia.minor = unix.Minor(stat.Rdev)
- tmpGpu.nvidia.id = strconv.FormatInt(int64(tmpGpu.nvidia.minor), 10)
-
- if nvidiaContainer != nil {
- tmpGpu.nvidia.nvrmVersion = nvidiaContainer.NVRMVersion
- tmpGpu.nvidia.cudaVersion = nvidiaContainer.CUDAVersion
- nvidiaInfo, ok := nvidiaContainer.Cards[tmpGpu.pci]
- if !ok {
- nvidiaInfo, ok = nvidiaContainer.Cards[fmt.Sprintf("0000%v", tmpGpu.pci)]
- }
- if ok {
- tmpGpu.nvidia.brand = nvidiaInfo.Brand
- tmpGpu.nvidia.model = nvidiaInfo.Model
- tmpGpu.nvidia.uuid = nvidiaInfo.UUID
- tmpGpu.nvidia.architecture = nvidiaInfo.Architecture
- }
- }
- }
- }
- }
-
- if isCard {
- // If it is a card it's minor number will be its id.
- tmpGpu.id = strconv.FormatInt(int64(minorInt), 10)
- tmp := cardIds{
- id: tmpGpu.id,
- pci: tmpGpu.pci,
- }
+ nvidiaDevices := []nvidiaNonCardDevice{}
- cards = append(cards, tmp)
- }
-
- gpus = append(gpus, tmpGpu)
+ for _, nvidiaEnt := range nvidiaEnts {
+ if !strings.HasPrefix(nvidiaEnt.Name(), "nvidia") {
+ continue
}
- }
- // We detected a Nvidia card, so let's collect all other nvidia devices
- // that are not /dev/nvidia[0-9]+.
- if isNvidia {
- nvidiaEnts, err := ioutil.ReadDir("/dev")
- if err != nil {
- if os.IsNotExist(err) {
- return nil, nil, err
- }
+ if regexNvidiaCard.MatchString(nvidiaEnt.Name()) {
+ continue
}
- validNvidia, err := regexp.Compile(`^nvidia[^0-9]+`)
+ nvidiaPath := filepath.Join("/dev", nvidiaEnt.Name())
+ stat := unix.Stat_t{}
+ err = unix.Stat(nvidiaPath, &stat)
if err != nil {
- return nil, nil, err
- }
-
- for _, nvidiaEnt := range nvidiaEnts {
- if all {
- if !strings.HasPrefix(nvidiaEnt.Name(), "nvidia") {
- continue
- }
- } else {
- if !validNvidia.MatchString(nvidiaEnt.Name()) {
- continue
- }
- }
-
- nvidiaPath := filepath.Join("/dev", nvidiaEnt.Name())
- stat := unix.Stat_t{}
- err = unix.Stat(nvidiaPath, &stat)
- if err != nil {
- continue
- }
-
- tmpNividiaGpu := nvidiaGpuDevice{
- isCard: !validNvidia.MatchString(nvidiaEnt.Name()),
- path: nvidiaPath,
- major: unix.Major(stat.Rdev),
- minor: unix.Minor(stat.Rdev),
- }
-
- nvidiaDevices = append(nvidiaDevices, tmpNividiaGpu)
- }
- }
-
- // Since we'll give users to ability to specify and id we need to group
- // devices on the same PCI that belong to the same card by id.
- for _, card := range cards {
- for i := 0; i < len(gpus); i++ {
- if gpus[i].pci == card.pci {
- gpus[i].id = card.id
- }
- }
- }
-
- return gpus, nvidiaDevices, nil
-}
-
-// findNvidiaMinorOld fallback for old drivers which don't provide "Device Minor:".
-func (d *gpu) findNvidiaMinorOld() (string, error) {
- var minor string
-
- // For now, just handle most common case (single nvidia card)
- ents, err := ioutil.ReadDir("/dev")
- if err != nil {
- return "", err
- }
-
- rp := regexp.MustCompile("^nvidia([0-9]+)$")
- for _, ent := range ents {
- matches := rp.FindStringSubmatch(ent.Name())
- if matches == nil {
continue
}
- if minor != "" {
- return "", fmt.Errorf("No device minor index detected, and more than one NVIDIA card present")
+ tmpNividiaGpu := nvidiaNonCardDevice{
+ path: nvidiaPath,
+ major: unix.Major(stat.Rdev),
+ minor: unix.Minor(stat.Rdev),
}
- minor = matches[1]
- }
-
- if minor == "" {
- return "", fmt.Errorf("No device minor index detected, and no NVIDIA card present")
- }
-
- return minor, nil
-}
-
-// findNvidiaMinor returns minor number of nvidia device corresponding to the given pci id.
-func (d *gpu) findNvidiaMinor(pci string) (string, error) {
- nvidiaPath := fmt.Sprintf("/proc/driver/nvidia/gpus/%s/information", pci)
- buf, err := ioutil.ReadFile(nvidiaPath)
- if err != nil {
- return "", err
- }
-
- strBuf := strings.TrimSpace(string(buf))
- idx := strings.Index(strBuf, "Device Minor:")
- if idx != -1 {
- idx += len("Device Minor:")
- strBuf = strBuf[idx:]
- strBuf = strings.TrimSpace(strBuf)
- parts := strings.SplitN(strBuf, "\n", 2)
- _, err = strconv.Atoi(parts[0])
- if err == nil {
- return parts[0], nil
- }
- }
- minor, err := d.findNvidiaMinorOld()
- if err == nil {
- return minor, nil
+ nvidiaDevices = append(nvidiaDevices, tmpNividiaGpu)
}
- return "", err
+ return nvidiaDevices, nil
}
More information about the lxc-devel
mailing list