Skip to content

Commit be35225

Browse files
committed
libct/nsenter: namespace the bindfd shuffle
Processes can watch /proc/self/mounts or /proc/self/mountinfo, and the kernel will notify them whenever the namespace's mount table is modified. Once notified, the process still needs to read and parse the mountinfo to determine what changed. Many such processes, including udisksd and systemd < v248, make no attempt to rate-limit their handling of mountinfo notifications. This tends not to be a problem on many systems, where mount tables are small and mounting and unmounting is uncommon. Every runC exec which successfully uses the try_bindfd container-escape mitigation performs two mount()s and one umount() in the host's mount namespace, causing any mount-watching processes to wake up and parse the mountinfo file three times in a row. Consequently, using 'exec' health checks on containers has a larger-than-expected impact on system load when such mount-watching daemons are running. Furthermore, the size of the mount table in the host's mount namespace tends to be proportional to the number of OCI containers, as a unique mount is required for the rootfs of each container. Therefore, on systems with mount-watching processes, the system load increases *quadratically* with the number of running containers which use health checks! Prevent runC from incidentally modifying the host's mount namespace for container-escape mitigations by setting up the mitigation in a private mount namespace. Signed-off-by: Cory Snider <[email protected]>
1 parent f992c6b commit be35225

File tree

1 file changed

+87
-19
lines changed

1 file changed

+87
-19
lines changed

libcontainer/nsenter/cloned_binary.c

Lines changed: 87 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -49,14 +49,20 @@
4949
#include <fcntl.h>
5050
#include <errno.h>
5151

52+
#include <sched.h>
5253
#include <sys/types.h>
5354
#include <sys/stat.h>
5455
#include <sys/statfs.h>
5556
#include <sys/vfs.h>
5657
#include <sys/mman.h>
5758
#include <sys/mount.h>
5859
#include <sys/sendfile.h>
60+
#include <sys/socket.h>
5961
#include <sys/syscall.h>
62+
#include <sys/wait.h>
63+
64+
#include "clone.h"
65+
#include "ipc.h"
6066

6167
/* Use our own wrapper for memfd_create. */
6268
#ifndef SYS_memfd_create
@@ -399,6 +405,8 @@ static int try_bindfd(void)
399405
int fd, ret = -1;
400406
char template[PATH_MAX] = { 0 };
401407
char *prefix = getenv("_LIBCONTAINER_STATEDIR");
408+
jmp_buf env;
409+
int pipe[2], is_child;
402410

403411
if (!prefix || *prefix != '/')
404412
prefix = "/tmp";
@@ -415,29 +423,87 @@ static int try_bindfd(void)
415423
close(fd);
416424

417425
/*
418-
* For obvious reasons this won't work in rootless mode because we haven't
419-
* created a userns+mntns -- but getting that to work will be a bit
420-
* complicated and it's only worth doing if someone actually needs it.
426+
* Daemons such as systemd and udisks2 watch /proc/self/mountinfo and
427+
* re-parse it on every change, which gets expensive when the mount table
428+
* is large and/or changes frequently. Set up the bind-mounts in a new,
429+
* private mount namespace so as not to wake up those processes every
430+
* time we nsexec into a container. We clone a child process into a new
431+
* mount namespace to do the dirty work so the side effects of unsharing
432+
* the mount namespace do not leak into the current process.
421433
*/
422-
ret = -EPERM;
423-
if (mount("/proc/self/exe", template, "", MS_BIND, "") < 0)
434+
if ((ret = socketpair(AF_LOCAL, SOCK_STREAM, 0, pipe)) < 0)
424435
goto out;
425-
if (mount("", template, "", MS_REMOUNT | MS_BIND | MS_RDONLY, "") < 0)
426-
goto out_umount;
427436

428-
/* Get read-only handle that we're sure can't be made read-write. */
429-
ret = open(template, O_PATH | O_CLOEXEC);
437+
is_child = setjmp(env);
438+
if (is_child) {
439+
close(pipe[0]);
440+
fd = -1;
441+
/*
442+
* For obvious reasons this won't work in rootless mode because we haven't
443+
* created a userns -- but getting that to work will be a bit complicated and
444+
* it's only worth doing if someone actually needs it.
445+
*/
446+
if ((ret = mount("none", "/", NULL, MS_PRIVATE | MS_REC, NULL)) < 0)
447+
goto out_child;
448+
/*
449+
* The kernel refuses to bind-mount from the magic symlink when the process has
450+
* been cloned (or unshared) into a new mount namespace, at least on Linux 4.4.
451+
*/
452+
char linkbuf[PATH_MAX + 1] = { 0 };
453+
ssize_t linkpathlen = readlink("/proc/self/exe", linkbuf, sizeof(linkbuf));
454+
if (linkpathlen == sizeof(linkbuf)) {
455+
/*
456+
* The link path is longer than PATH_MAX, and the contents of
457+
* linkbuf might have been truncated. A truncated path could
458+
* happen to be a valid path to a different file, which could
459+
* allow for local privilege escalation if we were to exec it.
460+
* The mount syscall doesn't accept paths longer than PATH_MAX,
461+
* anyway.
462+
*/
463+
ret = -ENAMETOOLONG;
464+
goto out_child;
465+
}
466+
linkbuf[linkpathlen] = '\0';
467+
if ((ret = mount(linkbuf, template, "", MS_BIND, "")) < 0)
468+
goto out_child;
469+
if ((ret = mount("", template, "", MS_REMOUNT | MS_BIND | MS_RDONLY, "")) < 0)
470+
goto out_child;
471+
472+
/* Get read-only handle that we're sure can't be made read-write. */
473+
fd = open(template, O_PATH | O_CLOEXEC);
474+
if (fd < 0) {
475+
ret = fd;
476+
goto out_child;
477+
}
430478

431-
out_umount:
432-
/*
433-
* Make sure the MNT_DETACH works, otherwise we could get remounted
434-
* read-write and that would be quite bad (the fd would be made read-write
435-
* too, invalidating the protection).
436-
*/
437-
if (umount2(template, MNT_DETACH) < 0) {
438-
if (ret >= 0)
439-
close(ret);
440-
ret = -ENOTRECOVERABLE;
479+
/*
480+
* Make sure the MNT_DETACH works, otherwise we could get remounted
481+
* read-write and that would be quite bad (the fd would be made read-write
482+
* too, invalidating the protection).
483+
*/
484+
if (umount2(template, MNT_DETACH) < 0) {
485+
ret = -ENOTRECOVERABLE;
486+
goto out_child;
487+
}
488+
send_fd(pipe[1], fd);
489+
out_child:
490+
if (fd >= 0)
491+
close(fd);
492+
close(pipe[1]);
493+
exit(-ret);
494+
} else {
495+
int cpid = clone_longjmp(&env, 1, CLONE_NEWNS | SIGCHLD);
496+
if (cpid < 0)
497+
goto out;
498+
int wstatus = 0;
499+
if ((ret = waitpid(cpid, &wstatus, 0)) < 0)
500+
goto out;
501+
if ((ret = -WEXITSTATUS(wstatus)))
502+
goto out;
503+
fd = receive_fd(pipe[0]);
504+
if ((ret = fcntl(ret, F_SETFD, O_PATH | O_CLOEXEC)) < 0)
505+
goto out;
506+
ret = fd;
441507
}
442508

443509
out:
@@ -446,6 +512,8 @@ static int try_bindfd(void)
446512
* there's an empty file left around in STATEDIR.
447513
*/
448514
unlink(template);
515+
close(pipe[0]);
516+
close(pipe[1]);
449517
return ret;
450518
}
451519

0 commit comments

Comments
 (0)