Skip to content

Commit d3516db

Browse files
author
Yuanhong Peng
committed
Add support for cgroup namespace
A cgroup namespace can be configured in `config.json` like the other namespaces. Here is an example: ``` "namespaces": [ { "type": "pid" }, { "type": "network" }, { "type": "ipc" }, { "type": "uts" }, { "type": "mount" }, { "type": "cgroup" } ], ``` Note that if you want to run a container that shares its cgroup namespace with another container, it is strongly recommended that you set a proper `CgroupsPath` for both containers (the second container's cgroup path must be a subdirectory of the first one's); otherwise there might be unexpected results. Signed-off-by: Yuanhong Peng <pengyuanhong@huawei.com>
1 parent 53eeb9d commit d3516db

10 files changed

Lines changed: 197 additions & 50 deletions

File tree

libcontainer/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ config := &configs.Config{
8282
{Type: configs.NEWPID},
8383
{Type: configs.NEWUSER},
8484
{Type: configs.NEWNET},
85+
{Type: configs.NEWCGROUP},
8586
}),
8687
Cgroups: &configs.Cgroup{
8788
Name: "test-container",

libcontainer/SPEC.md

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -21,16 +21,17 @@ Minimum requirements:
2121

2222
### Namespaces
2323

24-
| Flag | Enabled |
25-
| ------------ | ------- |
26-
| CLONE_NEWPID | 1 |
27-
| CLONE_NEWUTS | 1 |
28-
| CLONE_NEWIPC | 1 |
29-
| CLONE_NEWNET | 1 |
30-
| CLONE_NEWNS | 1 |
31-
| CLONE_NEWUSER | 1 |
32-
33-
Namespaces are created for the container via the `clone` syscall.
24+
| Flag | Enabled |
25+
| --------------- | ------- |
26+
| CLONE_NEWPID | 1 |
27+
| CLONE_NEWUTS | 1 |
28+
| CLONE_NEWIPC | 1 |
29+
| CLONE_NEWNET | 1 |
30+
| CLONE_NEWNS | 1 |
31+
| CLONE_NEWUSER | 1 |
32+
| CLONE_NEWCGROUP | 1 |
33+
34+
Namespaces are created for the container via the `unshare` syscall.
3435

3536

3637
### Filesystem

libcontainer/cgroups/utils.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ import (
1717
)
1818

1919
const (
20-
cgroupNamePrefix = "name="
20+
CgroupNamePrefix = "name="
2121
CgroupProcesses = "cgroup.procs"
2222
)
2323

@@ -139,8 +139,8 @@ func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount,
139139
if !ss[opt] {
140140
continue
141141
}
142-
if strings.HasPrefix(opt, cgroupNamePrefix) {
143-
m.Subsystems = append(m.Subsystems, opt[len(cgroupNamePrefix):])
142+
if strings.HasPrefix(opt, CgroupNamePrefix) {
143+
m.Subsystems = append(m.Subsystems, opt[len(CgroupNamePrefix):])
144144
} else {
145145
m.Subsystems = append(m.Subsystems, opt)
146146
}
@@ -294,7 +294,7 @@ func getControllerPath(subsystem string, cgroups map[string]string) (string, err
294294
return p, nil
295295
}
296296

297-
if p, ok := cgroups[cgroupNamePrefix+subsystem]; ok {
297+
if p, ok := cgroups[CgroupNamePrefix+subsystem]; ok {
298298
return p, nil
299299
}
300300

libcontainer/configs/namespaces_syscall.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ func (n *Namespace) Syscall() int {
99
}
1010

1111
// This is not yet in the Go stdlib.
12-
const syscall_CLONE_NEWCGROUP = (1 << 29)
12+
const syscall_CLONE_NEWCGROUP = (1 << 25)
1313

1414
var namespaceInfo = map[NamespaceType]int{
1515
NEWNET: syscall.CLONE_NEWNET,

libcontainer/configs/validate/validator.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,9 @@ func (v *ConfigValidator) Validate(config *configs.Config) error {
3737
if err := v.usernamespace(config); err != nil {
3838
return err
3939
}
40+
if err := v.cgroupnamespace(config); err != nil {
41+
return err
42+
}
4043
if err := v.sysctl(config); err != nil {
4144
return err
4245
}
@@ -107,6 +110,15 @@ func (v *ConfigValidator) usernamespace(config *configs.Config) error {
107110
return nil
108111
}
109112

113+
func (v *ConfigValidator) cgroupnamespace(config *configs.Config) error {
114+
if config.Namespaces.Contains(configs.NEWCGROUP) {
115+
if _, err := os.Stat("/proc/self/ns/cgroup"); os.IsNotExist(err) {
116+
return fmt.Errorf("cgroup namespaces aren't enabled in the kernel")
117+
}
118+
}
119+
return nil
120+
}
121+
110122
// sysctl validates that the specified sysctl keys are valid or not.
111123
// /proc/sys isn't completely namespaced and depending on which namespaces
112124
// are specified, a subset of sysctls are permitted.

libcontainer/container_linux.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1293,6 +1293,7 @@ func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceTyp
12931293
configs.NEWNET,
12941294
configs.NEWPID,
12951295
configs.NEWNS,
1296+
configs.NEWCGROUP,
12961297
}
12971298

12981299
// Remove namespaces that we don't need to join.

libcontainer/integration/exec_test.go

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1694,3 +1694,60 @@ func TestTmpfsCopyUp(t *testing.T) {
16941694
t.Fatalf("/etc/passwd not copied up as expected: %v", outputLs)
16951695
}
16961696
}
1697+
1698+
func TestCGROUPPrivate(t *testing.T) {
1699+
if _, err := os.Stat("/proc/self/ns/cgroup"); os.IsNotExist(err) {
1700+
t.Skip("cgroupns is unsupported")
1701+
}
1702+
if testing.Short() {
1703+
return
1704+
}
1705+
1706+
rootfs, err := newRootfs()
1707+
ok(t, err)
1708+
defer remove(rootfs)
1709+
1710+
l, err := os.Readlink("/proc/1/ns/cgroup")
1711+
ok(t, err)
1712+
1713+
config := newTemplateConfig(rootfs)
1714+
config.Namespaces.Add(configs.NEWCGROUP, "")
1715+
buffers, exitCode, err := runContainer(config, "", "readlink", "/proc/self/ns/cgroup")
1716+
ok(t, err)
1717+
1718+
if exitCode != 0 {
1719+
t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
1720+
}
1721+
1722+
if actual := strings.Trim(buffers.Stdout.String(), "\n"); actual == l {
1723+
t.Fatalf("cgroup link should be private to the container but equals host %q %q", actual, l)
1724+
}
1725+
}
1726+
1727+
func TestCGROUPHost(t *testing.T) {
1728+
if _, err := os.Stat("/proc/self/ns/cgroup"); os.IsNotExist(err) {
1729+
t.Skip("cgroupns is unsupported")
1730+
}
1731+
if testing.Short() {
1732+
return
1733+
}
1734+
1735+
rootfs, err := newRootfs()
1736+
ok(t, err)
1737+
defer remove(rootfs)
1738+
1739+
l, err := os.Readlink("/proc/1/ns/cgroup")
1740+
ok(t, err)
1741+
1742+
config := newTemplateConfig(rootfs)
1743+
buffers, exitCode, err := runContainer(config, "", "readlink", "/proc/self/ns/cgroup")
1744+
ok(t, err)
1745+
1746+
if exitCode != 0 {
1747+
t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
1748+
}
1749+
1750+
if actual := strings.Trim(buffers.Stdout.String(), "\n"); actual != l {
1751+
t.Fatalf("cgroup link not equal to host link %q %q", actual, l)
1752+
}
1753+
}

libcontainer/nsenter/nsexec.c

Lines changed: 34 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,12 @@ enum sync_t {
4040
SYNC_ERR = 0xFF, /* Fatal error, no turning back. The error code follows. */
4141
};
4242

43+
/*
44+
* Synchronisation value for cgroup namespace setup.
45+
* The same constant is defined in process_linux.go as "createCgroupns".
46+
*/
47+
#define CREATECGROUPNS 0x80
48+
4349
/* longjmp() arguments. */
4450
#define JUMP_PARENT 0x00
4551
#define JUMP_CHILD 0xA0
@@ -570,6 +576,17 @@ void nsexec(void)
570576
kill(child, SIGKILL);
571577
bail("failed to sync with child: write(SYNC_RECVPID_ACK)");
572578
}
579+
580+
/* Send the init_func pid back to our parent. */
581+
len = snprintf(buf, JSON_MAX, "{\"pid\": %d}\n", child);
582+
if (len < 0) {
583+
kill(child, SIGKILL);
584+
bail("unable to generate JSON for child pid");
585+
}
586+
if (write(pipenum, buf, len) != len) {
587+
kill(child, SIGKILL);
588+
bail("unable to send child pid to bootstrapper");
589+
}
573590
}
574591
break;
575592
case SYNC_CHILD_READY:
@@ -614,17 +631,6 @@ void nsexec(void)
614631
}
615632
}
616633

617-
/* Send the init_func pid back to our parent. */
618-
len = snprintf(buf, JSON_MAX, "{\"pid\": %d}\n", child);
619-
if (len < 0) {
620-
kill(child, SIGKILL);
621-
bail("unable to generate JSON for child pid");
622-
}
623-
if (write(pipenum, buf, len) != len) {
624-
kill(child, SIGKILL);
625-
bail("unable to send child pid to bootstrapper");
626-
}
627-
628634
exit(0);
629635
}
630636

@@ -640,6 +646,7 @@ void nsexec(void)
640646
case JUMP_CHILD: {
641647
pid_t child;
642648
enum sync_t s;
649+
uint32_t actual_flags = config.cloneflags;
643650

644651
/* We're in a child and thus need to tell the parent if we die. */
645652
syncfd = sync_child_pipe[0];
@@ -667,7 +674,9 @@ void nsexec(void)
667674
* some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID)
668675
* was broken, so we'll just do it the long way anyway.
669676
*/
670-
if (unshare(config.cloneflags) < 0)
677+
if (actual_flags & CLONE_NEWCGROUP)
678+
actual_flags &= ~CLONE_NEWCGROUP;
679+
if (unshare(actual_flags) < 0)
671680
bail("failed to unshare namespaces");
672681

673682
/*
@@ -777,6 +786,19 @@ void nsexec(void)
777786
if (setgroups(0, NULL) < 0)
778787
bail("setgroups failed");
779788

789+
/* ... wait until our topmost parent has finished cgroup setup in p.manager.Apply() ... */
790+
if (config.cloneflags & CLONE_NEWCGROUP) {
791+
uint8_t value;
792+
if (read(pipenum, &value, sizeof(value)) != sizeof(value))
793+
bail("read synchronisation value failed");
794+
if (value == CREATECGROUPNS) {
795+
if (unshare(CLONE_NEWCGROUP) < 0)
796+
bail("failed to unshare cgroup namespace");
797+
}
798+
else
799+
bail("received unknown synchronisation value");
800+
}
801+
780802
s = SYNC_CHILD_READY;
781803
if (write(syncfd, &s, sizeof(s)) != sizeof(s))
782804
bail("failed to sync with patent: write(SYNC_CHILD_READY)");

libcontainer/process_linux.go

Lines changed: 34 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,10 @@ import (
1919
"github.com/opencontainers/runc/libcontainer/utils"
2020
)
2121

22+
// Synchronisation value for cgroup namespace setup.
23+
// The same constant is defined in nsexec.c as "CREATECGROUPNS".
24+
const createCgroupns byte = (1 << 7)
25+
2226
type parentProcess interface {
2327
// pid returns the pid for the running process.
2428
pid() int
@@ -224,12 +228,17 @@ func (p *initProcess) externalDescriptors() []string {
224228
return p.fds
225229
}
226230

227-
// execSetns runs the process that executes C code to perform the setns calls
228-
// because setns support requires the C process to fork off a child and perform the setns
229-
// before the go runtime boots, we wait on the process to die and receive the child's pid
230-
// over the provided pipe.
231-
// This is called by initProcess.start function
232-
func (p *initProcess) execSetns() error {
231+
// getChildPid receives the final child's pid over the provided pipe.
232+
func (p *initProcess) getChildPid() (int, error) {
233+
var pid *pid
234+
if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil {
235+
p.cmd.Wait()
236+
return -1, err
237+
}
238+
return pid.Pid, nil
239+
}
240+
241+
func (p *initProcess) waitForChildExit(childPid int) error {
233242
status, err := p.cmd.Process.Wait()
234243
if err != nil {
235244
p.cmd.Wait()
@@ -239,12 +248,7 @@ func (p *initProcess) execSetns() error {
239248
p.cmd.Wait()
240249
return &exec.ExitError{ProcessState: status}
241250
}
242-
var pid *pid
243-
if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil {
244-
p.cmd.Wait()
245-
return err
246-
}
247-
process, err := os.FindProcess(pid.Pid)
251+
process, err := os.FindProcess(childPid)
248252
if err != nil {
249253
return err
250254
}
@@ -266,22 +270,35 @@ func (p *initProcess) start() error {
266270
if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
267271
return err
268272
}
269-
if err := p.execSetns(); err != nil {
270-
return newSystemErrorWithCause(err, "running exec setns process for init")
273+
childPid, err := p.getChildPid()
274+
if err != nil {
275+
return newSystemErrorWithCause(err, "getting the final child's pid from pipe")
271276
}
272277
// Save the standard descriptor names before the container process
273278
// can potentially move them (e.g., via dup2()). If we don't do this now,
274279
// we won't know at checkpoint time which file descriptor to look up.
275-
fds, err := getPipeFds(p.pid())
280+
fds, err := getPipeFds(childPid)
276281
if err != nil {
277-
return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", p.pid())
282+
return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", childPid)
278283
}
279284
p.setExternalDescriptors(fds)
280285
// Do this before syncing with child so that no children
281286
// can escape the cgroup
282-
if err := p.manager.Apply(p.pid()); err != nil {
287+
if err := p.manager.Apply(childPid); err != nil {
283288
return newSystemErrorWithCause(err, "applying cgroup configuration for process")
284289
}
290+
// Now it's time to set up the cgroup namespace
291+
if p.config.Config.Namespaces.Contains(configs.NEWCGROUP) && p.config.Config.Namespaces.PathOf(configs.NEWCGROUP) == "" {
292+
if _, err := p.parentPipe.Write([]byte{createCgroupns}); err != nil {
293+
return newSystemErrorWithCause(err, "sending synchronization value to init process")
294+
}
295+
}
296+
297+
// Wait for our first child to exit
298+
if err := p.waitForChildExit(childPid); err != nil {
299+
return newSystemErrorWithCause(err, "waiting for our first child to exit")
300+
}
301+
285302
defer func() {
286303
if err != nil {
287304
// TODO: should not be the responsibility to call here

0 commit comments

Comments
 (0)