Skip to content

Commit f590d96

Browse files
committed
feature: add nvidia-container 2.0
Signed-off-by: codejuan <[email protected]>
1 parent cec4108 commit f590d96

File tree

8 files changed

+286
-0
lines changed

8 files changed

+286
-0
lines changed

apis/swagger.yml

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2471,6 +2471,29 @@ definitions:
24712471
x-nullable: false
24722472
minimum: 0
24732473
maximum: 1
2474+
NvidiaConfig:
2475+
$ref: "#/definitions/NvidiaConfig"
2476+
2477+
NvidiaConfig:
2478+
type: "object"
2479+
properties:
2480+
NvidiaVisibleDevices:
2481+
description: "NvidiaVisibleDevices controls which GPUs will be made accessible inside the container"
2482+
type: "string"
2483+
example: |
2484+
Possible values.
2485+
0,1,2, GPU-fef8089b …: a comma-separated list of GPU UUID(s) or index(es).
2486+
all: all GPUs will be accessible, this is the default value in our container images.
2487+
none: no GPU will be accessible, but driver capabilities will be enabled.
2488+
x-nullable: false
2489+
NvidiaDriverCapabilities:
2490+
description: "NvidiaDriverCapabilities controls which driver libraries/binaries will be mounted inside the container"
2491+
type: "string"
2492+
example: |
2493+
Possible values
2494+
compute,video, graphics,utility …: a comma-separated list of driver features the container needs.
2495+
all: enable all available driver capabilities.
2496+
x-nullable: false
24742497

24752498
ThrottleDevice:
24762499
type: "object"

apis/types/nvidia_config.go

Lines changed: 57 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

apis/types/resources.go

Lines changed: 29 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

daemon/mgr/container_utils.go

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package mgr
22

33
import (
44
"fmt"
5+
"os"
56
"strconv"
67
"strings"
78

@@ -234,6 +235,10 @@ func validateConfig(config *types.ContainerConfig, hostConfig *types.HostConfig,
234235
if err != nil {
235236
return nil, err
236237
}
238+
// validate the nvidia config
239+
if err := validateNvidiaConfig(hostConfig); err != nil {
240+
return warnings, err
241+
}
237242
warnings = append(warnings, warns...)
238243

239244
if hostConfig.OomScoreAdj < -1000 || hostConfig.OomScoreAdj > 1000 {
@@ -379,3 +384,77 @@ func amendContainerSettings(config *types.ContainerConfig, hostConfig *types.Hos
379384
r.MemorySwap = 2 * r.Memory
380385
}
381386
}
387+
388+
// validateNvidiaConfig
389+
func validateNvidiaConfig(hostConfig *types.HostConfig) error {
390+
r := &hostConfig.Resources
391+
if r.NvidiaConfig == nil {
392+
return nil
393+
}
394+
395+
if err := vlidateNvidiaDriver(r); err != nil {
396+
return err
397+
}
398+
399+
if err := vlidateNvidiaDevice(hostConfig); err != nil {
400+
return err
401+
}
402+
403+
return nil
404+
}
405+
406+
// stringInSlice reports whether a is equal to at least one element of list.
// The comparison is exact (case-sensitive); a nil or empty list never matches.
func stringInSlice(a string, list []string) bool {
	matched := false
	for idx := 0; idx < len(list) && !matched; idx++ {
		matched = list[idx] == a
	}
	return matched
}
414+
415+
func vlidateNvidiaDriver(r *types.Resources) error {
416+
n := r.NvidiaConfig
417+
n.NvidiaDriverCapabilities = strings.TrimSpace(n.NvidiaDriverCapabilities)
418+
if n.NvidiaDriverCapabilities == "" {
419+
return nil
420+
}
421+
422+
if n.NvidiaDriverCapabilities == "all" {
423+
return nil
424+
}
425+
426+
supportedDrivers := []string{"compute", "compat32", "graphics", "utility", "video", "display"}
427+
drivers := strings.Split(n.NvidiaDriverCapabilities, ",")
428+
for _, d := range drivers {
429+
d = strings.TrimSpace(d)
430+
found := stringInSlice(d, supportedDrivers)
431+
if !found {
432+
return fmt.Errorf("invalid nvidia driver capability %s", d)
433+
}
434+
}
435+
return nil
436+
}
437+
438+
func vlidateNvidiaDevice(hostConfig *types.HostConfig) error {
439+
n := hostConfig.Resources.NvidiaConfig
440+
n.NvidiaVisibleDevices = strings.TrimSpace(n.NvidiaVisibleDevices)
441+
442+
supportedDevices := []string{"", "all", "none", "void"}
443+
found := stringInSlice(n.NvidiaVisibleDevices, supportedDevices)
444+
if found {
445+
return nil
446+
}
447+
devs := strings.Split(n.NvidiaVisibleDevices, ",")
448+
for _, dev := range devs {
449+
dev = strings.TrimSpace(dev)
450+
if _, err := strconv.Atoi(dev); err == nil {
451+
//dev is numeric, the realDev should be /dev/nvidiaN
452+
realDev := fmt.Sprintf("/dev/nvidia%s", dev)
453+
if _, err := os.Stat(realDev); err != nil {
454+
return fmt.Errorf("invalid nvidia device %s", realDev)
455+
}
456+
}
457+
// TODO: how to validate GPU UUID
458+
}
459+
return nil
460+
}

daemon/mgr/spec_hook.go

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"fmt"
66
"io/ioutil"
77
"os"
8+
"os/exec"
89
"path/filepath"
910
"sort"
1011
"strconv"
@@ -64,6 +65,11 @@ func setupHook(ctx context.Context, c *Container, specWrapper *SpecWrapper) erro
6465
return errors.Wrap(err, "failed to set volume mount tab prestart hook")
6566
}
6667

68+
// set nvidia config
69+
if err := setNvidiaHook(ctx, c, specWrapper); err != nil {
70+
return errors.Wrap(err, "failed to set nvidia prestart hook")
71+
}
72+
6773
return nil
6874
}
6975

@@ -152,6 +158,24 @@ func setMountTab(ctx context.Context, c *Container, spec *SpecWrapper) error {
152158
return nil
153159
}
154160

161+
func setNvidiaHook(ctx context.Context, c *Container, spec *SpecWrapper) error {
162+
n := c.HostConfig.NvidiaConfig
163+
if n == nil {
164+
return nil
165+
}
166+
path, err := exec.LookPath("nvidia-container-runtime-hook")
167+
if err != nil {
168+
return err
169+
}
170+
args := []string{path}
171+
nvidiaPrestart := specs.Hook{
172+
Path: path,
173+
Args: append(args, "prestart"),
174+
}
175+
spec.s.Hooks.Prestart = append(spec.s.Hooks.Prestart, nvidiaPrestart)
176+
return nil
177+
}
178+
155179
type hookArray []*wrapperEmbedPrestart
156180

157181
// Len is defined in order to support sort

daemon/mgr/spec_process.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package mgr
22

33
import (
44
"context"
5+
"fmt"
56
"io/ioutil"
67
"os"
78
"strings"
@@ -61,6 +62,9 @@ func setupProcess(ctx context.Context, c *Container, s *specs.Spec) error {
6162
return err
6263
}
6364

65+
if err := setupNvidiaEnv(ctx, c, s); err != nil {
66+
return err
67+
}
6468
return nil
6569
}
6670

@@ -163,3 +167,13 @@ func setupRlimits(ctx context.Context, hostConfig *types.HostConfig, s *specs.Sp
163167
s.Process.Rlimits = rlimits
164168
return nil
165169
}
170+
171+
func setupNvidiaEnv(ctx context.Context, c *Container, s *specs.Spec) error {
172+
n := c.HostConfig.NvidiaConfig
173+
if n == nil {
174+
return nil
175+
}
176+
s.Process.Env = append(s.Process.Env, fmt.Sprintf("NVIDIA_DRIVER_CAPABILITIES=%s", n.NvidiaDriverCapabilities))
177+
s.Process.Env = append(s.Process.Env, fmt.Sprintf("NVIDIA_VISIBLE_DEVICES=%s", n.NvidiaVisibleDevices))
178+
return nil
179+
}

hack/package/rpm/build.sh

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ POUCHDIR=$TMP/source
1616
[ -d "$POUCHDIR" ] || mkdir -p "$POUCHDIR"
1717
BINDIR=$POUCHDIR/bin
1818
[ -d "$BINDIR" ] || mkdir -p "$BINDIR"
19+
LIBDIR=$POUCHDIR/lib
20+
[ -d "$LIBDIR" ] || mkdir -p "$LIBDIR"
1921
LXC_DIR=$TMP/lxc
2022
[ -d "$LXC_DIR" ] || mkdir -p "$LXC_DIR"
2123

@@ -34,6 +36,9 @@ CATEGORY='Tools/Pouch'
3436
MAINTAINER='Pouch [email protected]'
3537
VENDOR='Pouch'
3638

39+
LIB_NVIDIA_VERSION="1.0.0-rc.2"
40+
NVIDIA_RUNTIME_VERSION="1.4.0-1"
41+
3742
# build lxcfs
3843
function build_lxcfs ()
3944
{
@@ -69,6 +74,24 @@ function build_pouch()
6974
popd
7075
}
7176

77+
# install nvidia-container-runtime
#
# Fetches the prebuilt libnvidia-container release (version LIB_NVIDIA_VERSION)
# and copies nvidia-container-cli into BINDIR and the shared libraries into
# LIBDIR. Then fetches the nvidia-container-runtime sources (version
# NVIDIA_RUNTIME_VERSION), unpacks them under GOPATH and builds
# nvidia-container-runtime-hook into BINDIR.
#
# Reads globals: TMP, BINDIR, LIBDIR, GOPATH, LIB_NVIDIA_VERSION,
# NVIDIA_RUNTIME_VERSION. Needs network access, wget, tar and a Go toolchain.
function build_nvidia_runtime(){
	echo "Downloading libnvidia-container."
	wget --quiet "https://github.com/NVIDIA/libnvidia-container/releases/download/v${LIB_NVIDIA_VERSION}/libnvidia-container_${LIB_NVIDIA_VERSION}_x86_64.tar.xz" -P "${TMP}"
	tar -xf "${TMP}/libnvidia-container_${LIB_NVIDIA_VERSION}_x86_64.tar.xz" -C "${TMP}"
	cp "${TMP}/libnvidia-container_${LIB_NVIDIA_VERSION}/usr/local/bin/nvidia-container-cli" "${BINDIR}/"
	cp "${TMP}/libnvidia-container_${LIB_NVIDIA_VERSION}/usr/local/lib/libnvidia-container.so" "${LIBDIR}/"
	cp "${TMP}/libnvidia-container_${LIB_NVIDIA_VERSION}/usr/local/lib/libnvidia-container.so.1" "${LIBDIR}/"
	cp "${TMP}/libnvidia-container_${LIB_NVIDIA_VERSION}/usr/local/lib/libnvidia-container.so.1.0.0" "${LIBDIR}/"

	echo "Downloading nvidia-container-runtime."
	wget --quiet "https://github.com/NVIDIA/nvidia-container-runtime/archive/v${NVIDIA_RUNTIME_VERSION}.tar.gz" -P "${TMP}"
	# Quoted like every other path here so a GOPATH containing whitespace or
	# glob characters is not word-split.
	mkdir -p "${GOPATH}/src/github.com/NVIDIA"
	tar -xzf "${TMP}/v${NVIDIA_RUNTIME_VERSION}.tar.gz" -C "${GOPATH}/src/github.com/NVIDIA"
	# The archive unpacks to a versioned directory; rename it to the import
	# path that "go build" expects.
	mv "${GOPATH}/src/github.com/NVIDIA/nvidia-container-runtime-${NVIDIA_RUNTIME_VERSION}" "${GOPATH}/src/github.com/NVIDIA/nvidia-container-runtime"
	go build -o "${BINDIR}/nvidia-container-runtime-hook" "github.com/NVIDIA/nvidia-container-runtime/hook/nvidia-container-runtime-hook"
}
94+
7295
function build_rpm ()
7396
{
7497
pushd $MOUNTDIR
@@ -110,6 +133,7 @@ function build_rpm ()
110133
-d fuse-libs \
111134
-d fuse \
112135
"$BINDIR/"=/usr/local/bin/ \
136+
"$LIBDIR/"=/usr/lib64/ \
113137
"$SERVICEDIR/"=/usr/lib/systemd/system/ \
114138
"$LXC_DIR/usr/local/bin/lxcfs"=/usr/bin/pouch-lxcfs \
115139
"$LXC_DIR/usr/local/lib/lxcfs/libpouchlxcfs.so"=/usr/lib64/libpouchlxcfs.so \
@@ -121,6 +145,7 @@ function main()
121145
echo "Building rpm package."
122146
build_pouch
123147
build_lxcfs
148+
build_nvidia_runtime
124149
build_rpm
125150
}
126151

0 commit comments

Comments
 (0)