Skip to content

Commit d7d928c

Browse files
committed
feature: add nvidia-container 2.0
Signed-off-by: codejuan <[email protected]>
1 parent cec4108 commit d7d928c

File tree

8 files changed

+286
-0
lines changed

8 files changed

+286
-0
lines changed

apis/swagger.yml

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2471,6 +2471,29 @@ definitions:
24712471
x-nullable: false
24722472
minimum: 0
24732473
maximum: 1
2474+
NvidiaConfig:
2475+
$ref: "#/definitions/NvidiaConfig"
2476+
2477+
NvidiaConfig:
2478+
type: "object"
2479+
properties:
2480+
NvidiaVisibleDevices:
2481+
description: "NvidiaVisibleDevices controls which GPUs will be made accessible inside the container"
2482+
type: "string"
2483+
example: |
2484+
Possible values.
2485+
0,1,2, GPU-fef8089b …: a comma-separated list of GPU UUID(s) or index(es).
2486+
all: all GPUs will be accessible, this is the default value in our container images.
2487+
none: no GPU will be accessible, but driver capabilities will be enabled.
2488+
x-nullable: false
2489+
NvidiaDriverCapabilities:
2490+
description: "NvidiaDriverCapabilities controls which driver libraries/binaries will be mounted inside the container"
2491+
type: "string"
2492+
example: |
2493+
Possible values
2494+
compute,video, graphics,utility …: a comma-separated list of driver features the container needs.
2495+
all: enable all available driver capabilities.
2496+
x-nullable: false
24742497

24752498
ThrottleDevice:
24762499
type: "object"

apis/types/nvidia_config.go

Lines changed: 57 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

apis/types/resources.go

Lines changed: 29 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

daemon/mgr/container_utils.go

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package mgr
22

33
import (
44
"fmt"
5+
"os"
56
"strconv"
67
"strings"
78

@@ -11,6 +12,7 @@ import (
1112
"github.com/alibaba/pouch/pkg/meta"
1213
"github.com/alibaba/pouch/pkg/randomid"
1314
"github.com/alibaba/pouch/pkg/system"
15+
"github.com/alibaba/pouch/pkg/utils"
1416

1517
"github.com/opencontainers/selinux/go-selinux/label"
1618
"github.com/pkg/errors"
@@ -234,6 +236,10 @@ func validateConfig(config *types.ContainerConfig, hostConfig *types.HostConfig,
234236
if err != nil {
235237
return nil, err
236238
}
239+
// validates nvidia config
240+
if err := validateNvidiaConfig(hostConfig); err != nil {
241+
return warnings, err
242+
}
237243
warnings = append(warnings, warns...)
238244

239245
if hostConfig.OomScoreAdj < -1000 || hostConfig.OomScoreAdj > 1000 {
@@ -379,3 +385,76 @@ func amendContainerSettings(config *types.ContainerConfig, hostConfig *types.Hos
379385
r.MemorySwap = 2 * r.Memory
380386
}
381387
}
388+
389+
// validateNvidiaConfig
390+
func validateNvidiaConfig(hostConfig *types.HostConfig) error {
391+
r := &hostConfig.Resources
392+
if r.NvidiaConfig == nil {
393+
return nil
394+
}
395+
396+
if err := vlidateNvidiaDriver(r); err != nil {
397+
return err
398+
}
399+
400+
if err := validateNvidiaDevice(hostConfig); err != nil {
401+
return err
402+
}
403+
404+
return nil
405+
}
406+
407+
func vlidateNvidiaDriver(r *types.Resources) error {
408+
n := r.NvidiaConfig
409+
n.NvidiaDriverCapabilities = strings.TrimSpace(n.NvidiaDriverCapabilities)
410+
if n.NvidiaDriverCapabilities == "" {
411+
// use default driver capability: utility
412+
return nil
413+
}
414+
415+
if n.NvidiaDriverCapabilities == "all" {
416+
// enable all capabilities
417+
return nil
418+
}
419+
420+
supportedDrivers := []string{"compute", "compat32", "graphics", "utility", "video", "display"}
421+
drivers := strings.Split(n.NvidiaDriverCapabilities, ",")
422+
for _, d := range drivers {
423+
d = strings.TrimSpace(d)
424+
found := utils.StringInSlice(supportedDrivers, d)
425+
if !found {
426+
return fmt.Errorf("invalid nvidia driver capability (%s)", d)
427+
}
428+
}
429+
return nil
430+
}
431+
432+
func validateNvidiaDevice(hostConfig *types.HostConfig) error {
433+
n := hostConfig.Resources.NvidiaConfig
434+
n.NvidiaVisibleDevices = strings.TrimSpace(n.NvidiaVisibleDevices)
435+
436+
// none: no GPU will be accessible, but driver capabilities will be enabled.
437+
// void or empty: no GPU will be accessible, and driver capabilities will be disabled.
438+
// all: all GPUs will be accessible
439+
if n.NvidiaDriverCapabilities == "" {
440+
return nil
441+
}
442+
supportedDevices := []string{"all", "none", "void"}
443+
found := utils.StringInSlice(supportedDevices, n.NvidiaVisibleDevices)
444+
if found {
445+
return nil
446+
}
447+
devs := strings.Split(n.NvidiaVisibleDevices, ",")
448+
for _, dev := range devs {
449+
dev = strings.TrimSpace(dev)
450+
if _, err := strconv.Atoi(dev); err == nil {
451+
//dev is numeric, the realDev should be /dev/nvidiaN
452+
realDev := fmt.Sprintf("/dev/nvidia%s", dev)
453+
if _, err := os.Stat(realDev); err != nil {
454+
return fmt.Errorf("invalid nvidia device %s", realDev)
455+
}
456+
}
457+
// TODO: how to validate GPU UUID
458+
}
459+
return nil
460+
}

daemon/mgr/spec_hook.go

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"fmt"
66
"io/ioutil"
77
"os"
8+
"os/exec"
89
"path/filepath"
910
"sort"
1011
"strconv"
@@ -64,6 +65,11 @@ func setupHook(ctx context.Context, c *Container, specWrapper *SpecWrapper) erro
6465
return errors.Wrap(err, "failed to set volume mount tab prestart hook")
6566
}
6667

68+
// set nvidia config
69+
if err := setNvidiaHook(ctx, c, specWrapper); err != nil {
70+
return errors.Wrap(err, "failed to set nvidia prestart hook")
71+
}
72+
6773
return nil
6874
}
6975

@@ -152,6 +158,24 @@ func setMountTab(ctx context.Context, c *Container, spec *SpecWrapper) error {
152158
return nil
153159
}
154160

161+
func setNvidiaHook(ctx context.Context, c *Container, spec *SpecWrapper) error {
162+
n := c.HostConfig.NvidiaConfig
163+
if n == nil {
164+
return nil
165+
}
166+
path, err := exec.LookPath("nvidia-container-runtime-hook")
167+
if err != nil {
168+
return err
169+
}
170+
args := []string{path}
171+
nvidiaPrestart := specs.Hook{
172+
Path: path,
173+
Args: append(args, "prestart"),
174+
}
175+
spec.s.Hooks.Prestart = append(spec.s.Hooks.Prestart, nvidiaPrestart)
176+
return nil
177+
}
178+
155179
type hookArray []*wrapperEmbedPrestart
156180

157181
// Len is defined in order to support sort

daemon/mgr/spec_process.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package mgr
22

33
import (
44
"context"
5+
"fmt"
56
"io/ioutil"
67
"os"
78
"strings"
@@ -61,6 +62,9 @@ func setupProcess(ctx context.Context, c *Container, s *specs.Spec) error {
6162
return err
6263
}
6364

65+
if err := setupNvidiaEnv(ctx, c, s); err != nil {
66+
return err
67+
}
6468
return nil
6569
}
6670

@@ -163,3 +167,13 @@ func setupRlimits(ctx context.Context, hostConfig *types.HostConfig, s *specs.Sp
163167
s.Process.Rlimits = rlimits
164168
return nil
165169
}
170+
171+
func setupNvidiaEnv(ctx context.Context, c *Container, s *specs.Spec) error {
172+
n := c.HostConfig.NvidiaConfig
173+
if n == nil {
174+
return nil
175+
}
176+
s.Process.Env = append(s.Process.Env, fmt.Sprintf("NVIDIA_DRIVER_CAPABILITIES=%s", n.NvidiaDriverCapabilities))
177+
s.Process.Env = append(s.Process.Env, fmt.Sprintf("NVIDIA_VISIBLE_DEVICES=%s", n.NvidiaVisibleDevices))
178+
return nil
179+
}

hack/package/rpm/build.sh

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ POUCHDIR=$TMP/source
1616
[ -d "$POUCHDIR" ] || mkdir -p "$POUCHDIR"
1717
BINDIR=$POUCHDIR/bin
1818
[ -d "$BINDIR" ] || mkdir -p "$BINDIR"
19+
LIBDIR=$POUCHDIR/lib
20+
[ -d "$LIBDIR" ] || mkdir -p "$LIBDIR"
1921
LXC_DIR=$TMP/lxc
2022
[ -d "$LXC_DIR" ] || mkdir -p "$LXC_DIR"
2123

@@ -34,6 +36,9 @@ CATEGORY='Tools/Pouch'
3436
MAINTAINER='Pouch [email protected]'
3537
VENDOR='Pouch'
3638

39+
LIB_NVIDIA_VERSION="1.0.0-rc.2"
40+
NVIDIA_RUNTIME_VERSION="1.4.0-1"
41+
3742
# build lxcfs
3843
function build_lxcfs ()
3944
{
@@ -69,6 +74,24 @@ function build_pouch()
6974
popd
7075
}
7176

77+
# install nvidia-container-runtime
#
# Downloads the libnvidia-container release tarball and copies its CLI and
# shared objects into the staging directories, then downloads the
# nvidia-container-runtime sources and builds the runtime hook with go.
#
# Reads:  TMP, BINDIR, LIBDIR, GOPATH, LIB_NVIDIA_VERSION, NVIDIA_RUNTIME_VERSION
# Writes: nvidia-container-cli and nvidia-container-runtime-hook into BINDIR,
#         libnvidia-container.so* into LIBDIR.
function build_nvidia_runtime(){
    local lib_pkg="libnvidia-container_${LIB_NVIDIA_VERSION}"
    local lib_tarball="${lib_pkg}_x86_64.tar.xz"
    local lib_root="${TMP}/${lib_pkg}/usr/local"
    local nvidia_src="${GOPATH}/src/github.com/NVIDIA"
    local so

    echo "Downloading libnvidia-container."
    wget --quiet "https://github.com/NVIDIA/libnvidia-container/releases/download/v${LIB_NVIDIA_VERSION}/${lib_tarball}" -P "${TMP}"
    tar -xf "${TMP}/${lib_tarball}" -C "${TMP}"
    cp "${lib_root}/bin/nvidia-container-cli" "${BINDIR}/"
    # Copy the unversioned and versioned shared-object names.
    for so in libnvidia-container.so libnvidia-container.so.1 libnvidia-container.so.1.0.0; do
        cp "${lib_root}/lib/${so}" "${LIBDIR}/"
    done

    echo "Downloading nvidia-container-runtime."
    wget --quiet "https://github.com/NVIDIA/nvidia-container-runtime/archive/v${NVIDIA_RUNTIME_VERSION}.tar.gz" -P "${TMP}"
    mkdir -p "${nvidia_src}"
    tar -xzf "${TMP}/v${NVIDIA_RUNTIME_VERSION}.tar.gz" -C "${nvidia_src}"
    # Move the versioned archive directory to the import path used by go build.
    mv "${nvidia_src}/nvidia-container-runtime-${NVIDIA_RUNTIME_VERSION}" "${nvidia_src}/nvidia-container-runtime"
    go build -o "${BINDIR}/nvidia-container-runtime-hook" "github.com/NVIDIA/nvidia-container-runtime/hook/nvidia-container-runtime-hook"
}
94+
7295
function build_rpm ()
7396
{
7497
pushd $MOUNTDIR
@@ -110,6 +133,7 @@ function build_rpm ()
110133
-d fuse-libs \
111134
-d fuse \
112135
"$BINDIR/"=/usr/local/bin/ \
136+
"$LIBDIR/"=/usr/lib64/ \
113137
"$SERVICEDIR/"=/usr/lib/systemd/system/ \
114138
"$LXC_DIR/usr/local/bin/lxcfs"=/usr/bin/pouch-lxcfs \
115139
"$LXC_DIR/usr/local/lib/lxcfs/libpouchlxcfs.so"=/usr/lib64/libpouchlxcfs.so \
@@ -121,6 +145,7 @@ function main()
121145
echo "Building rpm package."
122146
build_pouch
123147
build_lxcfs
148+
build_nvidia_runtime
124149
build_rpm
125150
}
126151

0 commit comments

Comments
 (0)