[lxc-devel] [lxd/master] GPU device port to use resources package

tomponline on Github lxc-bot at linuxcontainers.org
Mon Aug 19 14:12:02 UTC 2019


A non-text attachment was scrubbed...
Name: not available
Type: text/x-mailbox
Size: 449 bytes
Desc: not available
URL: <http://lists.linuxcontainers.org/pipermail/lxc-devel/attachments/20190819/44d5034c/attachment-0001.bin>
-------------- next part --------------
From 18e893168eca7721d06d49987f40b88cb867909b Mon Sep 17 00:00:00 2001
From: Thomas Parrott <thomas.parrott at canonical.com>
Date: Mon, 19 Aug 2019 14:39:36 +0100
Subject: [PATCH 1/3] device/utils/unix: Fix double device name encoding in
 file name

Signed-off-by: Thomas Parrott <thomas.parrott at canonical.com>
---
 lxd/device/device_utils_unix.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lxd/device/device_utils_unix.go b/lxd/device/device_utils_unix.go
index d97c6f3984..de2cf57644 100644
--- a/lxd/device/device_utils_unix.go
+++ b/lxd/device/device_utils_unix.go
@@ -325,7 +325,7 @@ func unixDeviceSetup(s *state.State, devicesPath string, typePrefix string, devi
 	}
 
 	// Create the device on the host.
-	ourPrefix := unixDeviceEncode(unixDeviceJoinPath(typePrefix, deviceName))
+	ourPrefix := unixDeviceJoinPath(typePrefix, deviceName)
 	d, err := UnixDeviceCreate(s, nil, devicesPath, ourPrefix, m, defaultMode)
 	if err != nil {
 		return err

From 3045f0a4704ab97b002f316d4cca71338ed3f82f Mon Sep 17 00:00:00 2001
From: Thomas Parrott <thomas.parrott at canonical.com>
Date: Mon, 19 Aug 2019 14:39:56 +0100
Subject: [PATCH 2/3] test: Adds GPU tests

Signed-off-by: Thomas Parrott <thomas.parrott at canonical.com>
---
 test/main.sh                         |  3 +-
 test/suites/container_devices_gpu.sh | 88 ++++++++++++++++++++++++++++
 2 files changed, 90 insertions(+), 1 deletion(-)
 create mode 100644 test/suites/container_devices_gpu.sh

diff --git a/test/main.sh b/test/main.sh
index 53cbc6a94c..a207dcb28f 100755
--- a/test/main.sh
+++ b/test/main.sh
@@ -198,6 +198,8 @@ run_test test_container_devices_nic_ipvlan "container devices - nic - ipvlan"
 run_test test_container_devices_nic_sriov "container devices - nic - sriov"
 run_test test_container_devices_ib_physical "container devices - infiniband - physical"
 run_test test_container_devices_ib_sriov "container devices - infiniband - sriov"
+run_test test_container_devices_proxy "container devices - proxy"
+run_test test_container_devices_gpu "container devices - gpu"
 run_test test_security "security features"
 run_test test_security_protection "container protection"
 run_test test_image_expiry "image expiry"
@@ -240,7 +242,6 @@ run_test test_kernel_limits "kernel limits"
 run_test test_macaroon_auth "macaroon authentication"
 run_test test_console "console"
 run_test test_query "query"
-run_test test_container_devices_proxy "container devices - proxy"
 run_test test_storage_local_volume_handling "storage local volume handling"
 run_test test_backup_import "backup import"
 run_test test_backup_export "backup export"
diff --git a/test/suites/container_devices_gpu.sh b/test/suites/container_devices_gpu.sh
new file mode 100644
index 0000000000..fb03f71686
--- /dev/null
+++ b/test/suites/container_devices_gpu.sh
@@ -0,0 +1,88 @@
+test_container_devices_gpu() {
+  ensure_import_testimage
+  ensure_has_localhost_remote "${LXD_ADDR}"
+  # Skip the whole test when the host has no DRM card character device to pass through.
+  if [ ! -c /dev/dri/card0 ]; then
+    echo "==> SKIP: No /dev/dri/card0 device found"
+    return
+  fi
+
+  ctName="ct$$"
+  lxc launch testimage "${ctName}"
+
+  # Check adding all cards creates the correct device mounts and cleans up on removal.
+  startMountCount=$(lxc exec "${ctName}" -- mount | wc -l)
+  startDevCount=$(find "${LXD_DIR}"/devices/"${ctName}" -type c | wc -l)
+  lxc config device add "${ctName}" gpu-all gpu mode=0600
+  lxc exec "${ctName}" -- mount | grep "tmpfs on /dev/dri/card0 type tmpfs"
+  lxc exec "${ctName}" -- stat -c '%a' /dev/dri/card0 | grep 600
+  stat -c '%a' "${LXD_DIR}"/devices/"${ctName}"/unix.gpu--all.dev-dri-card0 | grep 600
+  lxc config device remove "${ctName}" gpu-all
+  endMountCount=$(lxc exec "${ctName}" -- mount | wc -l)
+  endDevCount=$(find "${LXD_DIR}"/devices/"${ctName}" -type c | wc -l)
+
+  if [ "$startMountCount" != "$endMountCount" ]; then
+    echo "leftover container mounts detected"
+    false
+  fi
+
+  if [ "$startDevCount" != "$endDevCount" ]; then
+    echo "leftover host devices detected"
+    false
+  fi
+
+  # Check adding non-existent card fails ("|| false" is required: "set -e" ignores the status of "!" commands).
+  ! lxc config device add "${ctName}" gpu-missing gpu id=9999 || false
+
+  # Check default create mode is 0660.
+  lxc config device add "${ctName}" gpu-default gpu id=0
+  lxc exec "${ctName}" -- stat -c '%a' /dev/dri/card0 | grep 660
+  lxc config device remove "${ctName}" gpu-default
+
+  # Check Nvidia devices if card0 is an Nvidia GPU (a matching /dev/nvidiaN node exists for card0's minor).
+  card0Minor=$(stat -c %T /dev/dri/card0)
+  if [ ! -c /dev/nvidia"${card0Minor}" ]; then
+    echo "==> SKIP: /dev/dri/card0 is not Nvidia card, skipping Nvidia tests"
+    lxc delete -f "${ctName}"
+    return
+  fi
+
+  # Check the Nvidia specific devices are mounted correctly.
+  lxc config device add "${ctName}" gpu-nvidia gpu id=0 mode=0600
+  lxc exec "${ctName}" -- mount | grep "tmpfs on /dev/dri/card0 type tmpfs"
+
+  lxc exec "${ctName}" -- mount | grep /dev/nvidia0
+  stat -c '%a' "${LXD_DIR}"/devices/"${ctName}"/unix.gpu--nvidia.dev-dri-card0 | grep 600
+
+  lxc exec "${ctName}" -- mount | grep /dev/nvidia-modeset
+  stat -c '%a' "${LXD_DIR}"/devices/"${ctName}"/unix.gpu--nvidia.dev-nvidia--modeset | grep 600
+
+  lxc exec "${ctName}" -- mount | grep /dev/nvidia-uvm
+  stat -c '%a' "${LXD_DIR}"/devices/"${ctName}"/unix.gpu--nvidia.dev-nvidia--uvm | grep 600
+
+  lxc exec "${ctName}" -- mount | grep /dev/nvidia-uvm-tools
+  stat -c '%a' "${LXD_DIR}"/devices/"${ctName}"/unix.gpu--nvidia.dev-nvidia--uvm--tools | grep 600
+
+  lxc exec "${ctName}" -- mount | grep /dev/nvidiactl
+  stat -c '%a' "${LXD_DIR}"/devices/"${ctName}"/unix.gpu--nvidia.dev-nvidiactl | grep 600
+
+  lxc config device remove "${ctName}" gpu-nvidia
+
+  # Check support for nvidia runtime (requires libnvidia-container-tools to be installed).
+  if [ ! -f /usr/bin/nvidia-container-cli ]; then
+    echo "==> SKIP: /usr/bin/nvidia-container-cli not available (please install libnvidia-container-tools)"
+    lxc delete -f "${ctName}"
+    return
+  fi
+
+  lxc stop -f "${ctName}"
+  lxc config set "${ctName}" nvidia.runtime true
+  lxc start "${ctName}"
+  nvidiaMountCount=$(lxc exec "${ctName}" -- mount | grep -c nvidia)
+  if [ "$nvidiaMountCount" != "16" ]; then
+    echo "nvidia runtime mounts invalid"
+    false
+  fi
+
+  lxc delete -f "${ctName}"
+}

From 62cc64260dd812231e7adf8eb00055932246bf41 Mon Sep 17 00:00:00 2001
From: Thomas Parrott <thomas.parrott at canonical.com>
Date: Mon, 19 Aug 2019 11:33:10 +0100
Subject: [PATCH 3/3] device/gpu: Moves nvidia device loading to use resources
 package

Signed-off-by: Thomas Parrott <thomas.parrott at canonical.com>
---
 lxd/device/gpu.go | 591 ++++++++--------------------------------------
 1 file changed, 105 insertions(+), 486 deletions(-)

diff --git a/lxd/device/gpu.go b/lxd/device/gpu.go
index 0f7deea459..dca055ad29 100644
--- a/lxd/device/gpu.go
+++ b/lxd/device/gpu.go
@@ -1,105 +1,33 @@
 package device
 
 import (
-	"encoding/csv"
 	"fmt"
-	"io"
 	"io/ioutil"
 	"os"
-	"os/exec"
 	"path/filepath"
 	"regexp"
 	"strconv"
 	"strings"
 
-	"github.com/jaypipes/pcidb"
 	"golang.org/x/sys/unix"
 
 	"github.com/lxc/lxd/lxd/device/config"
 	"github.com/lxc/lxd/lxd/instance"
+	"github.com/lxc/lxd/lxd/resources"
 	"github.com/lxc/lxd/shared"
 )
 
-type gpu struct {
-	deviceCommon
-}
-
-// /dev/dri/card0. If we detect that vendor == nvidia, then nvidia will contain
-// the corresponding nvidia car, e.g. {/dev/dri/card1 to /dev/nvidia1}.
-type gpuDevice struct {
-	// DRM node information
-	id    string
-	path  string
-	major uint32
-	minor uint32
-
-	// Device information
-	vendorID    string
-	vendorName  string
-	productID   string
-	productName string
-	numaNode    uint64
-
-	// If related devices have the same PCI address as the GPU we should
-	// mount them all. Meaning if we detect /dev/dri/card0,
-	// /dev/dri/controlD64, and /dev/dri/renderD128 with the same PCI
-	// address, then they should all be made available in the container.
-	pci           string
-	driver        string
-	driverVersion string
-
-	// NVIDIA specific handling
-	isNvidia bool
-	nvidia   nvidiaGpuCard
-}
+const gpuDRIDevPath = "/dev/dri"
 
-func (g *gpuDevice) isNvidiaGpu() bool {
-	return strings.EqualFold(g.vendorID, "10de")
-}
-
-// /dev/nvidia[0-9]+
-type nvidiaGpuCard struct {
+// Non-card devices such as {/dev/nvidiactl, /dev/nvidia-uvm, ...}
+type nvidiaNonCardDevice struct {
 	path  string
 	major uint32
 	minor uint32
-	id    string
-
-	nvrmVersion  string
-	cudaVersion  string
-	model        string
-	brand        string
-	uuid         string
-	architecture string
 }
 
-// {/dev/nvidiactl, /dev/nvidia-uvm, ...}
-type nvidiaGpuDevice struct {
-	isCard bool
-	path   string
-	major  uint32
-	minor  uint32
-}
-
-// Nvidia container info
-type nvidiaContainerInfo struct {
-	Cards       map[string]*nvidiaContainerCardInfo
-	NVRMVersion string
-	CUDAVersion string
-}
-
-type nvidiaContainerCardInfo struct {
-	DeviceIndex  string
-	DeviceMinor  string
-	Model        string
-	Brand        string
-	UUID         string
-	PCIAddress   string
-	Architecture string
-}
-
-type cardIds struct {
-	id  string
-	pci string
+type gpu struct {
+	deviceCommon
 }
 
 // validateConfig checks the supplied config for correctness.
@@ -151,66 +79,91 @@ func (d *gpu) Start() (*RunConfig, error) {
 	}
 
 	runConf := RunConfig{}
-
-	allGpus := d.deviceWantsAllGPUs(d.config)
-	gpus, nvidiaDevices, err := d.deviceLoadGpu(allGpus)
+	gpus, err := resources.GetGPU()
 	if err != nil {
 		return nil, err
 	}
 
 	sawNvidia := false
 	found := false
-	for _, gpu := range gpus {
-		if (d.config["vendorid"] != "" && gpu.vendorID != d.config["vendorid"]) ||
-			(d.config["pci"] != "" && gpu.pci != d.config["pci"]) ||
-			(d.config["productid"] != "" && gpu.productID != d.config["productid"]) ||
-			(d.config["id"] != "" && gpu.id != d.config["id"]) {
+	for _, gpu := range gpus.Cards {
+		if (d.config["vendorid"] != "" && gpu.VendorID != d.config["vendorid"]) ||
+			(d.config["pci"] != "" && gpu.PCIAddress != d.config["pci"]) ||
+			(d.config["productid"] != "" && gpu.ProductID != d.config["productid"]) {
 			continue
 		}
 
-		found = true
-		err := unixDeviceSetupCharNum(d.state, d.instance.DevicesPath(), "unix", d.name, d.config, gpu.major, gpu.minor, gpu.path, false, &runConf)
-		if err != nil {
-			return nil, err
-		}
+		// Handle DRM devices if present and matches criteria.
+		if gpu.DRM != nil && (d.config["id"] == "" || fmt.Sprintf("%d", gpu.DRM.ID) == d.config["id"]) {
+			found = true
 
-		if !gpu.isNvidia {
-			continue
-		}
+			// DRM &{ID:0 CardName:card0 CardDevice:226:0 ControlName: ControlDevice: RenderName:renderD128 RenderDevice:226:128}
+			// TODO: Need to also support ControlName & ControlDevice?
 
-		if gpu.nvidia.path != "" {
-			err = unixDeviceSetupCharNum(d.state, d.instance.DevicesPath(), "unix", d.name, d.config, gpu.nvidia.major, gpu.nvidia.minor, gpu.nvidia.path, false, &runConf)
-			if err != nil {
-				return nil, err
-			}
-		} else if !allGpus {
-			return nil, fmt.Errorf("Failed to detect correct \"/dev/nvidia\" path")
-		}
+			if gpu.DRM.CardName != "" && gpu.DRM.CardDevice != "" {
+				path := filepath.Join(gpuDRIDevPath, gpu.DRM.CardName)
+				major, minor, err := d.deviceNumStringToUint32(gpu.DRM.CardDevice)
+				if err != nil {
+					return nil, err
+				}
 
-		sawNvidia = true
-	}
+				err = unixDeviceSetupCharNum(d.state, d.instance.DevicesPath(), "unix", d.name, d.config, major, minor, path, false, &runConf)
+				if err != nil {
+					return nil, err
+				}
+			}
 
-	if sawNvidia {
-		for _, gpu := range nvidiaDevices {
-			instanceConfig := d.instance.ExpandedConfig()
+			if gpu.DRM.RenderName != "" && gpu.DRM.RenderDevice != "" {
+				path := filepath.Join(gpuDRIDevPath, gpu.DRM.RenderName)
+				major, minor, err := d.deviceNumStringToUint32(gpu.DRM.RenderDevice)
+				if err != nil {
+					return nil, err
+				}
 
-			// No need to mount additional nvidia non-card devices as the nvidia.runtime
-			// setting will do this for us.
-			if shared.IsTrue(instanceConfig["nvidia.runtime"]) {
-				if !gpu.isCard {
-					continue
+				err = unixDeviceSetupCharNum(d.state, d.instance.DevicesPath(), "unix", d.name, d.config, major, minor, path, false, &runConf)
+				if err != nil {
+					return nil, err
 				}
 			}
 
-			prefix := unixDeviceJoinPath("unix", d.name)
-			if UnixDeviceExists(d.instance.DevicesPath(), prefix, gpu.path) {
-				continue
+			// Add Nvidia device if present.
+			if gpu.Nvidia != nil && gpu.Nvidia.CardName != "" && gpu.Nvidia.CardDevice != "" {
+				sawNvidia = true
+				path := filepath.Join("/dev", gpu.Nvidia.CardName)
+				major, minor, err := d.deviceNumStringToUint32(gpu.Nvidia.CardDevice)
+				if err != nil {
+					return nil, err
+				}
+
+				err = unixDeviceSetupCharNum(d.state, d.instance.DevicesPath(), "unix", d.name, d.config, major, minor, path, false, &runConf)
+				if err != nil {
+					return nil, err
+				}
 			}
+		}
+	}
 
-			err = unixDeviceSetupCharNum(d.state, d.instance.DevicesPath(), "unix", d.name, d.config, gpu.major, gpu.minor, gpu.path, false, &runConf)
+	if sawNvidia {
+		// No need to mount additional nvidia non-card devices as the nvidia.runtime
+		// setting will do this for us.
+		instanceConfig := d.instance.ExpandedConfig()
+		if !shared.IsTrue(instanceConfig["nvidia.runtime"]) {
+			nvidiaDevices, err := d.getNvidiaNonCardDevices()
 			if err != nil {
 				return nil, err
 			}
+
+			for _, dev := range nvidiaDevices {
+				prefix := unixDeviceJoinPath("unix", d.name)
+				if UnixDeviceExists(d.instance.DevicesPath(), prefix, dev.path) {
+					continue
+				}
+
+				err = unixDeviceSetupCharNum(d.state, d.instance.DevicesPath(), "unix", d.name, d.config, dev.major, dev.minor, dev.path, false, &runConf)
+				if err != nil {
+					return nil, err
+				}
+			}
 		}
 	}
 
@@ -246,399 +199,65 @@ func (d *gpu) postStop() error {
 	return nil
 }
 
-// deviceWantsAllGPUs whether the LXD device wants to passthrough all GPUs on the host.
-func (d *gpu) deviceWantsAllGPUs(m map[string]string) bool {
-	return m["vendorid"] == "" && m["productid"] == "" && m["id"] == "" && m["pci"] == ""
-}
-
-// deviceLoadGpu probes the system for information about the available GPUs.
-func (d *gpu) deviceLoadGpu(all bool) ([]gpuDevice, []nvidiaGpuDevice, error) {
-	const drmPath = "/sys/class/drm/"
-	var gpus []gpuDevice
-	var nvidiaDevices []nvidiaGpuDevice
-	var cards []cardIds
-
-	// Load NVIDIA information (if available)
-	var nvidiaContainer *nvidiaContainerInfo
-
-	_, err := exec.LookPath("nvidia-container-cli")
-	if err == nil {
-		out, err := shared.RunCommand("nvidia-container-cli", "info", "--csv")
-		if err == nil {
-			r := csv.NewReader(strings.NewReader(out))
-			r.FieldsPerRecord = -1
-
-			nvidiaContainer = &nvidiaContainerInfo{}
-			nvidiaContainer.Cards = map[string]*nvidiaContainerCardInfo{}
-			line := 0
-			for {
-				record, err := r.Read()
-				if err == io.EOF {
-					break
-				}
-				line++
-
-				if err != nil {
-					continue
-				}
-
-				if line == 2 && len(record) >= 2 {
-					nvidiaContainer.NVRMVersion = record[0]
-					nvidiaContainer.CUDAVersion = record[1]
-				} else if line >= 4 {
-					nvidiaContainer.Cards[record[5]] = &nvidiaContainerCardInfo{
-						DeviceIndex:  record[0],
-						DeviceMinor:  record[1],
-						Model:        record[2],
-						Brand:        record[3],
-						UUID:         record[4],
-						PCIAddress:   record[5],
-						Architecture: record[6],
-					}
-				}
-			}
-		}
+// deviceNumStringToUint32 converts a device number string (major:minor) into separate major and
+// minor uint32s.
+func (d *gpu) deviceNumStringToUint32(devNum string) (uint32, uint32, error) {
+	devParts := strings.SplitN(devNum, ":", 2)
+	tmp, err := strconv.ParseUint(devParts[0], 10, 32)
+	if err != nil {
+		return 0, 0, err
 	}
+	major := uint32(tmp)
 
-	// Load PCI database
-	pciDB, err := pcidb.New()
+	tmp, err = strconv.ParseUint(devParts[1], 10, 32)
 	if err != nil {
-		pciDB = nil
+		return 0, 0, err
 	}
+	minor := uint32(tmp)
 
-	// Get the list of DRM devices
-	ents, err := ioutil.ReadDir(drmPath)
+	return major, minor, nil
+}
+
+// getNvidiaNonCardDevices returns device information about Nvidia non-card devices.
+func (d *gpu) getNvidiaNonCardDevices() ([]nvidiaNonCardDevice, error) {
+	nvidiaEnts, err := ioutil.ReadDir("/dev")
 	if err != nil {
-		// No GPUs
 		if os.IsNotExist(err) {
-			return nil, nil, nil
+			return nil, err
 		}
-
-		return nil, nil, err
 	}
 
-	// Get the list of cards
-	devices := []string{}
-	for _, ent := range ents {
-		dev, err := filepath.EvalSymlinks(fmt.Sprintf("%s/%s/device", drmPath, ent.Name()))
-		if err != nil {
-			continue
-		}
-
-		if !shared.StringInSlice(dev, devices) {
-			devices = append(devices, dev)
-		}
+	regexNvidiaCard, err := regexp.Compile(`^nvidia[0-9]+`)
+	if err != nil {
+		return nil, err
 	}
 
-	isNvidia := false
-	for _, device := range devices {
-		// The pci address == the name of the directory. So let's use
-		// this cheap way of retrieving it.
-		pciAddr := filepath.Base(device)
-
-		// Make sure that we are dealing with a GPU by looking whether
-		// the "drm" subfolder exists.
-		drm := filepath.Join(device, "drm")
-		drmEnts, err := ioutil.ReadDir(drm)
-		if err != nil {
-			if os.IsNotExist(err) {
-				continue
-			}
-		}
-
-		// Retrieve vendor ID.
-		vendorIDPath := filepath.Join(device, "vendor")
-		vendorID, err := ioutil.ReadFile(vendorIDPath)
-		if err != nil {
-			if os.IsNotExist(err) {
-				continue
-			}
-		}
-
-		// Retrieve device ID.
-		productIDPath := filepath.Join(device, "device")
-		productID, err := ioutil.ReadFile(productIDPath)
-		if err != nil {
-			if os.IsNotExist(err) {
-				continue
-			}
-		}
-
-		// Retrieve node ID
-		numaPath := fmt.Sprintf(filepath.Join(device, "numa_node"))
-		numaNode := uint64(0)
-		if shared.PathExists(numaPath) {
-			numaID, err := shared.ParseNumberFromFile(numaPath)
-			if err != nil {
-				continue
-			}
-
-			if numaID > 0 {
-				numaNode = uint64(numaID)
-			}
-		}
-
-		// Retrieve driver
-		driver := ""
-		driverVersion := ""
-		driverPath := filepath.Join(device, "driver")
-		if shared.PathExists(driverPath) {
-			target, err := os.Readlink(driverPath)
-			if err != nil {
-				continue
-			}
-
-			driver = filepath.Base(target)
-
-			out, err := ioutil.ReadFile(filepath.Join(driverPath, "module", "version"))
-			if err == nil {
-				driverVersion = strings.TrimSpace(string(out))
-			} else {
-				uname, err := shared.Uname()
-				if err != nil {
-					continue
-				}
-				driverVersion = uname.Release
-			}
-		}
-
-		// Store all associated subdevices, e.g. controlD64, renderD128.
-		// The name of the directory == the last part of the
-		// /dev/dri/controlD64 path. So drmEnt.Name() will give us
-		// controlD64.
-		for _, drmEnt := range drmEnts {
-			vendorTmp := strings.TrimSpace(string(vendorID))
-			productTmp := strings.TrimSpace(string(productID))
-			vendorTmp = strings.TrimPrefix(vendorTmp, "0x")
-			productTmp = strings.TrimPrefix(productTmp, "0x")
-			tmpGpu := gpuDevice{
-				pci:           pciAddr,
-				vendorID:      vendorTmp,
-				productID:     productTmp,
-				numaNode:      numaNode,
-				driver:        driver,
-				driverVersion: driverVersion,
-				path:          filepath.Join("/dev/dri", drmEnt.Name()),
-			}
-
-			// Fill vendor and product names
-			if pciDB != nil {
-				vendor, ok := pciDB.Vendors[tmpGpu.vendorID]
-				if ok {
-					tmpGpu.vendorName = vendor.Name
-
-					for _, product := range vendor.Products {
-						if product.ID == tmpGpu.productID {
-							tmpGpu.productName = product.Name
-							break
-						}
-					}
-				}
-			}
-
-			majMinPath := filepath.Join(drm, drmEnt.Name(), "dev")
-			majMinByte, err := ioutil.ReadFile(majMinPath)
-			if err != nil {
-				if os.IsNotExist(err) {
-					continue
-				}
-			}
-
-			majMin := strings.TrimSpace(string(majMinByte))
-			majMinSlice := strings.Split(string(majMin), ":")
-			if len(majMinSlice) != 2 {
-				continue
-			}
-
-			majorInt, err := strconv.ParseUint(majMinSlice[0], 10, 32)
-			if err != nil {
-				continue
-			}
-
-			minorInt, err := strconv.ParseUint(majMinSlice[1], 10, 32)
-			if err != nil {
-				continue
-			}
-
-			tmpGpu.major = uint32(majorInt)
-			tmpGpu.minor = uint32(minorInt)
-
-			isCard, err := regexp.MatchString("^card[0-9]+", drmEnt.Name())
-			if err != nil {
-				continue
-			}
-
-			// Find matching /dev/nvidia* entry for /dev/dri/card*
-			if tmpGpu.isNvidiaGpu() && isCard {
-				if !isNvidia {
-					isNvidia = true
-				}
-				tmpGpu.isNvidia = true
-
-				if !all {
-					minor, err := d.findNvidiaMinor(tmpGpu.pci)
-					if err == nil {
-						nvidiaPath := "/dev/nvidia" + minor
-						stat := unix.Stat_t{}
-						err = unix.Stat(nvidiaPath, &stat)
-						if err != nil {
-							if os.IsNotExist(err) {
-								continue
-							}
-
-							return nil, nil, err
-						}
-
-						tmpGpu.nvidia.path = nvidiaPath
-						tmpGpu.nvidia.major = unix.Major(stat.Rdev)
-						tmpGpu.nvidia.minor = unix.Minor(stat.Rdev)
-						tmpGpu.nvidia.id = strconv.FormatInt(int64(tmpGpu.nvidia.minor), 10)
-
-						if nvidiaContainer != nil {
-							tmpGpu.nvidia.nvrmVersion = nvidiaContainer.NVRMVersion
-							tmpGpu.nvidia.cudaVersion = nvidiaContainer.CUDAVersion
-							nvidiaInfo, ok := nvidiaContainer.Cards[tmpGpu.pci]
-							if !ok {
-								nvidiaInfo, ok = nvidiaContainer.Cards[fmt.Sprintf("0000%v", tmpGpu.pci)]
-							}
-							if ok {
-								tmpGpu.nvidia.brand = nvidiaInfo.Brand
-								tmpGpu.nvidia.model = nvidiaInfo.Model
-								tmpGpu.nvidia.uuid = nvidiaInfo.UUID
-								tmpGpu.nvidia.architecture = nvidiaInfo.Architecture
-							}
-						}
-					}
-				}
-			}
-
-			if isCard {
-				// If it is a card it's minor number will be its id.
-				tmpGpu.id = strconv.FormatInt(int64(minorInt), 10)
-				tmp := cardIds{
-					id:  tmpGpu.id,
-					pci: tmpGpu.pci,
-				}
+	nvidiaDevices := []nvidiaNonCardDevice{}
 
-				cards = append(cards, tmp)
-			}
-
-			gpus = append(gpus, tmpGpu)
+	for _, nvidiaEnt := range nvidiaEnts {
+		if !strings.HasPrefix(nvidiaEnt.Name(), "nvidia") {
+			continue
 		}
-	}
 
-	// We detected a Nvidia card, so let's collect all other nvidia devices
-	// that are not /dev/nvidia[0-9]+.
-	if isNvidia {
-		nvidiaEnts, err := ioutil.ReadDir("/dev")
-		if err != nil {
-			if os.IsNotExist(err) {
-				return nil, nil, err
-			}
+		if regexNvidiaCard.MatchString(nvidiaEnt.Name()) {
+			continue
 		}
 
-		validNvidia, err := regexp.Compile(`^nvidia[^0-9]+`)
+		nvidiaPath := filepath.Join("/dev", nvidiaEnt.Name())
+		stat := unix.Stat_t{}
+		err = unix.Stat(nvidiaPath, &stat)
 		if err != nil {
-			return nil, nil, err
-		}
-
-		for _, nvidiaEnt := range nvidiaEnts {
-			if all {
-				if !strings.HasPrefix(nvidiaEnt.Name(), "nvidia") {
-					continue
-				}
-			} else {
-				if !validNvidia.MatchString(nvidiaEnt.Name()) {
-					continue
-				}
-			}
-
-			nvidiaPath := filepath.Join("/dev", nvidiaEnt.Name())
-			stat := unix.Stat_t{}
-			err = unix.Stat(nvidiaPath, &stat)
-			if err != nil {
-				continue
-			}
-
-			tmpNividiaGpu := nvidiaGpuDevice{
-				isCard: !validNvidia.MatchString(nvidiaEnt.Name()),
-				path:   nvidiaPath,
-				major:  unix.Major(stat.Rdev),
-				minor:  unix.Minor(stat.Rdev),
-			}
-
-			nvidiaDevices = append(nvidiaDevices, tmpNividiaGpu)
-		}
-	}
-
-	// Since we'll give users to ability to specify and id we need to group
-	// devices on the same PCI that belong to the same card by id.
-	for _, card := range cards {
-		for i := 0; i < len(gpus); i++ {
-			if gpus[i].pci == card.pci {
-				gpus[i].id = card.id
-			}
-		}
-	}
-
-	return gpus, nvidiaDevices, nil
-}
-
-// findNvidiaMinorOld fallback for old drivers which don't provide "Device Minor:".
-func (d *gpu) findNvidiaMinorOld() (string, error) {
-	var minor string
-
-	// For now, just handle most common case (single nvidia card)
-	ents, err := ioutil.ReadDir("/dev")
-	if err != nil {
-		return "", err
-	}
-
-	rp := regexp.MustCompile("^nvidia([0-9]+)$")
-	for _, ent := range ents {
-		matches := rp.FindStringSubmatch(ent.Name())
-		if matches == nil {
 			continue
 		}
 
-		if minor != "" {
-			return "", fmt.Errorf("No device minor index detected, and more than one NVIDIA card present")
+		tmpNividiaGpu := nvidiaNonCardDevice{
+			path:  nvidiaPath,
+			major: unix.Major(stat.Rdev),
+			minor: unix.Minor(stat.Rdev),
 		}
-		minor = matches[1]
-	}
-
-	if minor == "" {
-		return "", fmt.Errorf("No device minor index detected, and no NVIDIA card present")
-	}
-
-	return minor, nil
-}
-
-// findNvidiaMinor returns minor number of nvidia device corresponding to the given pci id.
-func (d *gpu) findNvidiaMinor(pci string) (string, error) {
-	nvidiaPath := fmt.Sprintf("/proc/driver/nvidia/gpus/%s/information", pci)
-	buf, err := ioutil.ReadFile(nvidiaPath)
-	if err != nil {
-		return "", err
-	}
-
-	strBuf := strings.TrimSpace(string(buf))
-	idx := strings.Index(strBuf, "Device Minor:")
-	if idx != -1 {
-		idx += len("Device Minor:")
-		strBuf = strBuf[idx:]
-		strBuf = strings.TrimSpace(strBuf)
-		parts := strings.SplitN(strBuf, "\n", 2)
-		_, err = strconv.Atoi(parts[0])
-		if err == nil {
-			return parts[0], nil
-		}
-	}
 
-	minor, err := d.findNvidiaMinorOld()
-	if err == nil {
-		return minor, nil
+		nvidiaDevices = append(nvidiaDevices, tmpNividiaGpu)
 	}
 
-	return "", err
+	return nvidiaDevices, nil
 }


More information about the lxc-devel mailing list