[lxc-devel] [lxd/master] gpu: fix gpu attach

brauner on Github lxc-bot at linuxcontainers.org
Thu Aug 10 18:27:12 UTC 2017


A non-text attachment was scrubbed...
Name: not available
Type: text/x-mailbox
Size: 748 bytes
Desc: not available
URL: <http://lists.linuxcontainers.org/pipermail/lxc-devel/attachments/20170810/0f9af7e1/attachment.bin>
-------------- next part --------------
From f6595d97c95e0eaeaf0e390d40da378ce9f1f539 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner at ubuntu.com>
Date: Thu, 10 Aug 2017 20:17:16 +0200
Subject: [PATCH] gpu: fix gpu attach

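The previous code assumed that the nvidia card index and the dri card index are
identical, i.e. for a given pair
{/dev/dri/card<card-idx>, /dev/nvidia<nvidia-idx>} it assumed that
<card-idx> == <nvidia-idx>, but it is entirely possible that
<card-idx> != <nvidia-idx>. Instead, read the "Device Minor" field from
/proc/driver/nvidia/gpus/<pci>/information and use that value to build the
/dev/nvidia<nvidia-idx> path.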

Also, report an error when the GPU device that the user requested cannot be
found.

Closes #3642.

Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
---
 lxd/container_lxc.go | 18 ++++++++++++++++++
 lxd/devices.go       | 34 +++++++++++++++++++++++++++++++---
 2 files changed, 49 insertions(+), 3 deletions(-)

diff --git a/lxd/container_lxc.go b/lxd/container_lxc.go
index 62356543d..8d26217f6 100644
--- a/lxd/container_lxc.go
+++ b/lxd/container_lxc.go
@@ -1812,6 +1812,7 @@ func (c *containerLXC) startCommon() (string, error) {
 			}
 
 			sawNvidia := false
+			found := false
 			for _, gpu := range gpus {
 				if (m["vendorid"] != "" && gpu.vendorid != m["vendorid"]) ||
 					(m["pci"] != "" && gpu.pci != m["pci"]) ||
@@ -1820,6 +1821,8 @@ func (c *containerLXC) startCommon() (string, error) {
 					continue
 				}
 
+				found = true
+
 				err := c.setupUnixDevice(k, m, gpu.major, gpu.minor, gpu.path, true)
 				if err != nil {
 					return "", err
@@ -1845,6 +1848,12 @@ func (c *containerLXC) startCommon() (string, error) {
 					}
 				}
 			}
+
+			if !found {
+				msg := "Failed to detect requested GPU device"
+				logger.Error(msg)
+				return "", fmt.Errorf(msg)
+			}
 		} else if m["type"] == "disk" {
 			if m["path"] != "/" {
 				diskDevices[k] = m
@@ -3859,6 +3868,7 @@ func (c *containerLXC) Update(args containerArgs, userRequested bool) error {
 				}
 
 				sawNvidia := false
+				found := false
 				for _, gpu := range gpus {
 					if (m["vendorid"] != "" && gpu.vendorid != m["vendorid"]) ||
 						(m["pci"] != "" && gpu.pci != m["pci"]) ||
@@ -3867,6 +3877,8 @@ func (c *containerLXC) Update(args containerArgs, userRequested bool) error {
 						continue
 					}
 
+					found = true
+
 					err = c.insertUnixDeviceNum(m, gpu.major, gpu.minor, gpu.path)
 					if err != nil {
 						logger.Error("Failed to insert GPU device.", log.Ctx{"err": err, "gpu": gpu, "container": c.Name()})
@@ -3898,6 +3910,12 @@ func (c *containerLXC) Update(args containerArgs, userRequested bool) error {
 						}
 					}
 				}
+
+				if !found {
+					msg := "Failed to detect requested GPU device"
+					logger.Error(msg)
+					return fmt.Errorf(msg)
+				}
 			}
 		}
 
diff --git a/lxd/devices.go b/lxd/devices.go
index 771094b5e..2fa749a1f 100644
--- a/lxd/devices.go
+++ b/lxd/devices.go
@@ -205,11 +205,39 @@ func deviceLoadGpu() ([]gpuDevice, []nvidiaGpuDevices, error) {
 				if !isNvidia {
 					isNvidia = true
 				}
-				nvidiaPath := "/dev/nvidia" + strconv.Itoa(tmpGpu.minor)
+
+				nvidiaPath := fmt.Sprintf("/proc/driver/nvidia/gpus/%s/information", tmpGpu.pci)
+				buf, err := ioutil.ReadFile(nvidiaPath)
+				if err != nil {
+					return nil, nil, err
+				}
+				strBuf := strings.TrimSpace(string(buf))
+				idx := strings.Index(strBuf, "Device Minor:")
+				idx += len("Device Minor:")
+				strBuf = strBuf[idx:]
+				strBuf = strings.TrimSpace(strBuf)
+				idx = strings.Index(strBuf, " ")
+				if idx == -1 {
+					idx = strings.Index(strBuf, "\t")
+				}
+				if idx >= 1 {
+					strBuf = strBuf[:idx]
+				}
+
+				if strBuf == "" {
+					return nil, nil, fmt.Errorf("No device minor index detected")
+				}
+
+				_, err = strconv.Atoi(strBuf)
+				if err != nil {
+					return nil, nil, err
+				}
+
+				nvidiaPath = "/dev/nvidia" + strBuf
 				stat := syscall.Stat_t{}
-				err := syscall.Stat(nvidiaPath, &stat)
+				err = syscall.Stat(nvidiaPath, &stat)
 				if err != nil {
-					continue
+					return nil, nil, err
 				}
 				tmpGpu.nvidia.path = nvidiaPath
 				tmpGpu.nvidia.major = int(stat.Rdev / 256)


More information about the lxc-devel mailing list