Skip to content

Commit a77e0da

Browse files
authored
Merge pull request #2029 from CodeJuan/nvidia2
feature: support nvidia-container 2.0 to enable GPU access
2 parents 7a555f1 + cf12087 commit a77e0da

File tree

12 files changed

+462
-0
lines changed

12 files changed

+462
-0
lines changed

apis/swagger.yml

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2483,6 +2483,29 @@ definitions:
24832483
x-nullable: false
24842484
minimum: 0
24852485
maximum: 1
2486+
NvidiaConfig:
2487+
$ref: "#/definitions/NvidiaConfig"
2488+
2489+
NvidiaConfig:
2490+
type: "object"
2491+
properties:
2492+
NvidiaVisibleDevices:
2493+
description: "NvidiaVisibleDevices controls which GPUs will be made accessible inside the container"
2494+
type: "string"
2495+
example: |
2496+
Possible values.
2497+
0,1,2, GPU-fef8089b …: a comma-separated list of GPU UUID(s) or index(es).
2498+
all: all GPUs will be accessible, this is the default value in our container images.
2499+
none: no GPU will be accessible, but driver capabilities will be enabled.
2500+
x-nullable: false
2501+
NvidiaDriverCapabilities:
2502+
description: "NvidiaDriverCapabilities controls which driver libraries/binaries will be mounted inside the container"
2503+
type: "string"
2504+
example: |
2505+
Possible values
2506+
compute,video, graphics,utility …: a comma-separated list of driver features the container needs.
2507+
all: enable all available driver capabilities.
2508+
x-nullable: false
24862509

24872510
ThrottleDevice:
24882511
type: "object"

apis/types/nvidia_config.go

Lines changed: 57 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

apis/types/resources.go

Lines changed: 29 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

cli/common_flags.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,5 +109,9 @@ func addCommonFlags(flagSet *pflag.FlagSet) *container {
109109
// additional runtime spec annotations
110110
flagSet.StringSliceVar(&c.specAnnotation, "annotation", nil, "Additional annotation for runtime")
111111

112+
// nvidia container
113+
flagSet.StringVar(&c.nvidiaDriverCapabilities, "nvidia-capabilities", "", "NvidiaDriverCapabilities controls which driver libraries/binaries will be mounted inside the container")
114+
flagSet.StringVar(&c.nvidiaVisibleDevices, "nvidia-visible-devs", "", "NvidiaVisibleDevices controls which GPUs will be made accessible inside the container")
115+
112116
return c
113117
}

cli/container.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,10 @@ type container struct {
8383
rich bool
8484
richMode string
8585
initScript string
86+
87+
// nvidia container
88+
nvidiaVisibleDevices string
89+
nvidiaDriverCapabilities string
8690
}
8791

8892
func (c *container) config() (*types.ContainerCreateConfig, error) {
@@ -269,5 +273,12 @@ func (c *container) config() (*types.ContainerCreateConfig, error) {
269273
NetworkingConfig: networkingConfig,
270274
}
271275

276+
if c.nvidiaDriverCapabilities != "" || c.nvidiaVisibleDevices != "" {
277+
config.HostConfig.Resources.NvidiaConfig = &types.NvidiaConfig{
278+
NvidiaDriverCapabilities: c.nvidiaDriverCapabilities,
279+
NvidiaVisibleDevices: c.nvidiaVisibleDevices,
280+
}
281+
}
282+
272283
return config, nil
273284
}

daemon/mgr/container_validation.go

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
11
package mgr
22

33
import (
4+
"errors"
45
"fmt"
6+
"os"
7+
"strconv"
8+
"strings"
59

610
"github.com/alibaba/pouch/apis/types"
711
"github.com/alibaba/pouch/daemon/logger/syslog"
@@ -10,6 +14,20 @@ import (
1014
"github.com/sirupsen/logrus"
1115
)
1216

17+
var (
	// supportedDevices is the set of keyword values accepted for
	// NvidiaVisibleDevices. Any other value is treated by
	// validateNvidiaDevice as a comma-separated list of GPU indexes or UUIDs.
	//
	// all: all GPUs will be accessible, this is the default value in our container images.
	// none: no GPU will be accessible, but driver capabilities will be enabled.
	// void or empty: no GPU will be accessible, and driver capabilities will be disabled.
	supportedDevices = map[string]struct{}{"all": {}, "none": {}, "void": {}}

	// supportedDrivers is the set of individual driver capabilities accepted
	// as elements of the comma-separated NvidiaDriverCapabilities list. The
	// special value "all" and the empty string are handled separately by
	// validateNvidiaDriver and are therefore not members of this set.
	supportedDrivers = map[string]struct{}{"compute": {}, "compat32": {}, "graphics": {}, "utility": {}, "video": {}, "display": {}}

	// errInvalidDevice is returned when NvidiaVisibleDevices fails validation.
	errInvalidDevice = errors.New("invalid nvidia device")
	// errInvalidDriver is returned when NvidiaDriverCapabilities contains an
	// entry that is not in supportedDrivers.
	errInvalidDriver = errors.New("invalid nvidia driver capability")
)
30+
1331
// validateConfig validates container config
1432
func (mgr *ContainerManager) validateConfig(c *Container, update bool) ([]string, error) {
1533
// validates container hostconfig
@@ -19,6 +37,10 @@ func (mgr *ContainerManager) validateConfig(c *Container, update bool) ([]string
1937
if err != nil {
2038
return nil, err
2139
}
40+
// validates nvidia config
41+
if err := validateNvidiaConfig(&hostConfig.Resources); err != nil {
42+
return warnings, err
43+
}
2244
warnings = append(warnings, warns...)
2345

2446
if hostConfig.OomScoreAdj < -1000 || hostConfig.OomScoreAdj > 1000 {
@@ -188,3 +210,74 @@ func (mgr *ContainerManager) validateLogConfig(c *Container) error {
188210
return fmt.Errorf("not support (%v) log driver yet", logCfg.LogDriver)
189211
}
190212
}
213+
214+
// validateNvidiaConfig
215+
func validateNvidiaConfig(r *types.Resources) error {
216+
if r.NvidiaConfig == nil {
217+
return nil
218+
}
219+
220+
if err := validateNvidiaDriver(r); err != nil {
221+
return err
222+
}
223+
224+
if err := validateNvidiaDevice(r); err != nil {
225+
return err
226+
}
227+
228+
return nil
229+
}
230+
231+
func validateNvidiaDriver(r *types.Resources) error {
232+
n := r.NvidiaConfig
233+
n.NvidiaDriverCapabilities = strings.TrimSpace(n.NvidiaDriverCapabilities)
234+
235+
if n.NvidiaDriverCapabilities == "" {
236+
// use default driver capability: utility
237+
return nil
238+
}
239+
240+
if n.NvidiaDriverCapabilities == "all" {
241+
// enable all capabilities
242+
return nil
243+
}
244+
245+
drivers := strings.Split(n.NvidiaDriverCapabilities, ",")
246+
247+
for _, d := range drivers {
248+
d = strings.TrimSpace(d)
249+
if _, found := supportedDrivers[d]; !found {
250+
return errInvalidDriver
251+
}
252+
}
253+
return nil
254+
}
255+
256+
func validateNvidiaDevice(r *types.Resources) error {
257+
n := r.NvidiaConfig
258+
n.NvidiaVisibleDevices = strings.TrimSpace(n.NvidiaVisibleDevices)
259+
260+
if n.NvidiaVisibleDevices == "" {
261+
// no GPU will be accessible, and driver capabilities will be disabled.
262+
return nil
263+
}
264+
265+
if _, found := supportedDevices[n.NvidiaVisibleDevices]; found {
266+
return nil
267+
}
268+
269+
// 0,1,2, GPU-fef8089b …: a comma-separated list of GPU UUID(s) or index(es).
270+
devs := strings.Split(n.NvidiaVisibleDevices, ",")
271+
for _, dev := range devs {
272+
dev = strings.TrimSpace(dev)
273+
if _, err := strconv.Atoi(dev); err == nil {
274+
//dev is numeric, the realDev should be /dev/nvidiaN
275+
realDev := fmt.Sprintf("/dev/nvidia%s", dev)
276+
if _, err := os.Stat(realDev); err != nil {
277+
return errInvalidDevice
278+
}
279+
}
280+
// TODO: how to validate GPU UUID
281+
}
282+
return nil
283+
}

0 commit comments

Comments
 (0)