11package mgr
22
33import (
4+ "errors"
45 "fmt"
6+ "os"
7+ "strconv"
8+ "strings"
59
610 "github.com/alibaba/pouch/apis/types"
711 "github.com/alibaba/pouch/daemon/logger/syslog"
@@ -10,6 +14,20 @@ import (
1014 "github.com/sirupsen/logrus"
1115)
1216
var (
	// supportedDevices is the set of keyword values accepted for
	// NvidiaVisibleDevices, in addition to the empty string and an explicit
	// comma-separated list of GPU indexes/UUIDs:
	//   all:  all GPUs will be accessible, this is the default value in our container images.
	//   none: no GPU will be accessible, but driver capabilities will be enabled.
	//   void: no GPU will be accessible, and driver capabilities will be disabled
	//         (same as leaving the value empty).
	supportedDevices = map[string]struct{}{
		"all":  {},
		"none": {},
		"void": {},
	}

	// supportedDrivers is the set of individual driver capability names that
	// may appear in the comma-separated NvidiaDriverCapabilities list. The
	// special values "" and "all" are handled separately by the validator and
	// are intentionally not members of this set.
	supportedDrivers = map[string]struct{}{
		"compute":  {},
		"compat32": {},
		"graphics": {},
		"utility":  {},
		"video":    {},
		"display":  {},
	}

	// errInvalidDevice is returned when NvidiaVisibleDevices fails validation.
	errInvalidDevice = errors.New("invalid nvidia device")
	// errInvalidDriver is returned when NvidiaDriverCapabilities fails validation.
	errInvalidDriver = errors.New("invalid nvidia driver capability")
)
30+
1331// validateConfig validates container config
1432func (mgr * ContainerManager ) validateConfig (c * Container , update bool ) ([]string , error ) {
1533 // validates container hostconfig
@@ -19,6 +37,10 @@ func (mgr *ContainerManager) validateConfig(c *Container, update bool) ([]string
1937 if err != nil {
2038 return nil , err
2139 }
40+ // validates nvidia config
41+ if err := validateNvidiaConfig (& hostConfig .Resources ); err != nil {
42+ return warnings , err
43+ }
2244 warnings = append (warnings , warns ... )
2345
2446 if hostConfig .OomScoreAdj < - 1000 || hostConfig .OomScoreAdj > 1000 {
@@ -188,3 +210,74 @@ func (mgr *ContainerManager) validateLogConfig(c *Container) error {
188210 return fmt .Errorf ("not support (%v) log driver yet" , logCfg .LogDriver )
189211 }
190212}
213+
214+ // validateNvidiaConfig
215+ func validateNvidiaConfig (r * types.Resources ) error {
216+ if r .NvidiaConfig == nil {
217+ return nil
218+ }
219+
220+ if err := validateNvidiaDriver (r ); err != nil {
221+ return err
222+ }
223+
224+ if err := validateNvidiaDevice (r ); err != nil {
225+ return err
226+ }
227+
228+ return nil
229+ }
230+
231+ func validateNvidiaDriver (r * types.Resources ) error {
232+ n := r .NvidiaConfig
233+ n .NvidiaDriverCapabilities = strings .TrimSpace (n .NvidiaDriverCapabilities )
234+
235+ if n .NvidiaDriverCapabilities == "" {
236+ // use default driver capability: utility
237+ return nil
238+ }
239+
240+ if n .NvidiaDriverCapabilities == "all" {
241+ // enable all capabilities
242+ return nil
243+ }
244+
245+ drivers := strings .Split (n .NvidiaDriverCapabilities , "," )
246+
247+ for _ , d := range drivers {
248+ d = strings .TrimSpace (d )
249+ if _ , found := supportedDrivers [d ]; ! found {
250+ return errInvalidDriver
251+ }
252+ }
253+ return nil
254+ }
255+
256+ func validateNvidiaDevice (r * types.Resources ) error {
257+ n := r .NvidiaConfig
258+ n .NvidiaVisibleDevices = strings .TrimSpace (n .NvidiaVisibleDevices )
259+
260+ if n .NvidiaVisibleDevices == "" {
261+ // no GPU will be accessible, and driver capabilities will be disabled.
262+ return nil
263+ }
264+
265+ if _ , found := supportedDevices [n .NvidiaVisibleDevices ]; found {
266+ return nil
267+ }
268+
269+ // 0,1,2, GPU-fef8089b …: a comma-separated list of GPU UUID(s) or index(es).
270+ devs := strings .Split (n .NvidiaVisibleDevices , "," )
271+ for _ , dev := range devs {
272+ dev = strings .TrimSpace (dev )
273+ if _ , err := strconv .Atoi (dev ); err == nil {
274+ //dev is numeric, the realDev should be /dev/nvidiaN
275+ realDev := fmt .Sprintf ("/dev/nvidia%s" , dev )
276+ if _ , err := os .Stat (realDev ); err != nil {
277+ return errInvalidDevice
278+ }
279+ }
280+ // TODO: how to validate GPU UUID
281+ }
282+ return nil
283+ }
0 commit comments