Skip to content

Commit 2b3ef3f

Browse files
committed
feature: add nvidia-container 2.0
Signed-off-by: codejuan <[email protected]>
1 parent f76d3bc commit 2b3ef3f

File tree

12 files changed

+456
-0
lines changed

12 files changed

+456
-0
lines changed

apis/swagger.yml

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2479,6 +2479,29 @@ definitions:
24792479
x-nullable: false
24802480
minimum: 0
24812481
maximum: 1
2482+
NvidiaConfig:
2483+
$ref: "#/definitions/NvidiaConfig"
2484+
2485+
NvidiaConfig:
2486+
type: "object"
2487+
properties:
2488+
NvidiaVisibleDevices:
2489+
description: "NvidiaVisibleDevices controls which GPUs will be made accessible inside the container"
2490+
type: "string"
2491+
example: |
2492+
Possible values.
2493+
0,1,2, GPU-fef8089b …: a comma-separated list of GPU UUID(s) or index(es).
2494+
all: all GPUs will be accessible; this is the default value in our container images.
2495+
none: no GPU will be accessible, but driver capabilities will be enabled.
2496+
x-nullable: false
2497+
NvidiaDriverCapabilities:
2498+
description: "NvidiaDriverCapabilities controls which driver libraries/binaries will be mounted inside the container"
2499+
type: "string"
2500+
example: |
2501+
Possible values
2502+
compute,video,graphics,utility …: a comma-separated list of driver features the container needs.
2503+
all: enable all available driver capabilities.
2504+
x-nullable: false
24822505

24832506
ThrottleDevice:
24842507
type: "object"

apis/types/nvidia_config.go

Lines changed: 57 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

apis/types/resources.go

Lines changed: 29 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

cli/common_flags.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,5 +109,9 @@ func addCommonFlags(flagSet *pflag.FlagSet) *container {
109109
// additional runtime spec annotations
110110
flagSet.StringSliceVar(&c.specAnnotation, "annotation", nil, "Additional annotation for runtime")
111111

112+
// nvidia container
113+
flagSet.StringVar(&c.nvidiaDriverCapabilities, "nvidia-capabilities", "", "NvidiaDriverCapabilities controls which driver libraries/binaries will be mounted inside the container")
114+
flagSet.StringVar(&c.nvidiaVisibleDevices, "nvidia-visible-devs", "", "NvidiaVisibleDevices controls which GPUs will be made accessible inside the container")
115+
112116
return c
113117
}

cli/container.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,10 @@ type container struct {
8383
rich bool
8484
richMode string
8585
initScript string
86+
87+
// nvidia container
88+
nvidiaVisibleDevices string
89+
nvidiaDriverCapabilities string
8690
}
8791

8892
func (c *container) config() (*types.ContainerCreateConfig, error) {
@@ -269,5 +273,12 @@ func (c *container) config() (*types.ContainerCreateConfig, error) {
269273
NetworkingConfig: networkingConfig,
270274
}
271275

276+
if c.nvidiaDriverCapabilities != "" || c.nvidiaVisibleDevices != "" {
277+
config.HostConfig.Resources.NvidiaConfig = &types.NvidiaConfig{
278+
NvidiaDriverCapabilities: c.nvidiaDriverCapabilities,
279+
NvidiaVisibleDevices: c.nvidiaVisibleDevices,
280+
}
281+
}
282+
272283
return config, nil
273284
}

daemon/mgr/container_validation.go

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
11
package mgr
22

33
import (
4+
"errors"
45
"fmt"
6+
"os"
7+
"strconv"
8+
"strings"
59

610
"github.com/alibaba/pouch/apis/types"
711
"github.com/alibaba/pouch/daemon/logger/syslog"
@@ -10,6 +14,14 @@ import (
1014
"github.com/sirupsen/logrus"
1115
)
1216

17+
var (
	// supportedDevices is the set of keyword values accepted for
	// NvidiaVisibleDevices, in addition to a comma-separated list of GPU
	// indexes or UUIDs.
	supportedDevices = map[string]struct{}{
		"all":  {},
		"none": {},
		"void": {},
	}

	// supportedDrivers is the set of individual driver capabilities that may
	// appear in a comma-separated NvidiaDriverCapabilities list. The special
	// value "all" is handled separately by the validator.
	supportedDrivers = map[string]struct{}{
		"compute":  {},
		"compat32": {},
		"graphics": {},
		"utility":  {},
		"video":    {},
		"display":  {},
	}

	// errInvalidDevice is returned when NvidiaVisibleDevices fails validation.
	errInvalidDevice = errors.New("invalid nvidia device")
	// errInvalidDriver is returned when NvidiaDriverCapabilities fails validation.
	errInvalidDriver = errors.New("invalid nvidia driver capability")
)
24+
1325
// validateConfig validates container config
1426
func (mgr *ContainerManager) validateConfig(c *Container, update bool) ([]string, error) {
1527
// validates container hostconfig
@@ -19,6 +31,10 @@ func (mgr *ContainerManager) validateConfig(c *Container, update bool) ([]string
1931
if err != nil {
2032
return nil, err
2133
}
34+
// validates nvidia config
35+
if err := validateNvidiaConfig(&hostConfig.Resources); err != nil {
36+
return warnings, err
37+
}
2238
warnings = append(warnings, warns...)
2339

2440
if hostConfig.OomScoreAdj < -1000 || hostConfig.OomScoreAdj > 1000 {
@@ -188,3 +204,76 @@ func (mgr *ContainerManager) validateLogConfig(c *Container) error {
188204
return fmt.Errorf("not support (%v) log driver yet", logCfg.LogDriver)
189205
}
190206
}
207+
208+
// validateNvidiaConfig
209+
func validateNvidiaConfig(r *types.Resources) error {
210+
if r.NvidiaConfig == nil {
211+
return nil
212+
}
213+
214+
if err := validateNvidiaDriver(r); err != nil {
215+
return err
216+
}
217+
218+
if err := validateNvidiaDevice(r); err != nil {
219+
return err
220+
}
221+
222+
return nil
223+
}
224+
225+
func validateNvidiaDriver(r *types.Resources) error {
226+
n := r.NvidiaConfig
227+
n.NvidiaDriverCapabilities = strings.TrimSpace(n.NvidiaDriverCapabilities)
228+
229+
if n.NvidiaDriverCapabilities == "" {
230+
// use default driver capability: utility
231+
return nil
232+
}
233+
234+
if n.NvidiaDriverCapabilities == "all" {
235+
// enable all capabilities
236+
return nil
237+
}
238+
239+
drivers := strings.Split(n.NvidiaDriverCapabilities, ",")
240+
241+
for _, d := range drivers {
242+
d = strings.TrimSpace(d)
243+
if _, found := supportedDrivers[d]; !found {
244+
return errInvalidDriver
245+
}
246+
}
247+
return nil
248+
}
249+
250+
func validateNvidiaDevice(r *types.Resources) error {
251+
n := r.NvidiaConfig
252+
n.NvidiaVisibleDevices = strings.TrimSpace(n.NvidiaVisibleDevices)
253+
254+
// none: no GPU will be accessible, but driver capabilities will be enabled.
255+
// void or empty: no GPU will be accessible, and driver capabilities will be disabled.
256+
// all: all GPUs will be accessible
257+
if n.NvidiaVisibleDevices == "" {
258+
return nil
259+
}
260+
261+
if _, found := supportedDevices[n.NvidiaVisibleDevices]; found {
262+
return nil
263+
}
264+
265+
// 0,1,2, GPU-fef8089b …: a comma-separated list of GPU UUID(s) or index(es).
266+
devs := strings.Split(n.NvidiaVisibleDevices, ",")
267+
for _, dev := range devs {
268+
dev = strings.TrimSpace(dev)
269+
if _, err := strconv.Atoi(dev); err == nil {
270+
//dev is numeric, the realDev should be /dev/nvidiaN
271+
realDev := fmt.Sprintf("/dev/nvidia%s", dev)
272+
if _, err := os.Stat(realDev); err != nil {
273+
return errInvalidDevice
274+
}
275+
}
276+
// TODO: how to validate GPU UUID
277+
}
278+
return nil
279+
}

0 commit comments

Comments
 (0)