From 7f84857e00f25b1625bf70ffa2552743d0ada60d Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Thu, 10 Oct 2024 11:20:21 +1100 Subject: [PATCH 1/6] rootfs: make pivot_root(2) dance handle initramfs case While pivot_root(2) normally refuses to pivot a mount if you are running with / as initramfs (because initramfs doesn't have a parent mount), you can create a bind-mount of / and make that your new root to work around this problem. This does use chroot(2), but this is only done temporarily to set current->fs->root to the new mount. Once pivot_root(2) finishes, the chroot(2) and / are gone. Variants of this hack are fairly well known and is used all over the place (see [1,2]) but until now we have forced users to have a far less secure configuration with --no-pivot. This is a slightly modified version that uses the container rootfs as the temporary spot for the / clone -- this allows runc to continue working with read-only image-based OS images. [1]: https://github.com/containers/bubblewrap/issues/592#issuecomment-2243087731 [2]: https://aconz2.github.io/2024/07/29/container-from-initramfs.html Signed-off-by: Aleksa Sarai --- libcontainer/rootfs_linux.go | 59 ++++++++++++++++++++++++++++++------ 1 file changed, 49 insertions(+), 10 deletions(-) diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go index f7cd95dd189..d1ec02ef6c2 100644 --- a/libcontainer/rootfs_linux.go +++ b/libcontainer/rootfs_linux.go @@ -1068,19 +1068,58 @@ func pivotRoot(rootfs string) error { } defer unix.Close(oldroot) //nolint: errcheck - newroot, err := unix.Open(rootfs, unix.O_DIRECTORY|unix.O_RDONLY, 0) - if err != nil { - return &os.PathError{Op: "open", Path: rootfs, Err: err} - } - defer unix.Close(newroot) //nolint: errcheck - // Change to the new root so that the pivot_root actually acts on it. - if err := unix.Fchdir(newroot); err != nil { - return &os.PathError{Op: "fchdir", Path: "fd " + strconv.Itoa(newroot), Err: err} + if err := os.Chdir(rootfs); err != nil { + return err } - if err := unix.PivotRoot(".", "."); err != nil { - return &os.PathError{Op: "pivot_root", Path: ".", Err: err} + pivotErr := unix.PivotRoot(".", ".") + if errors.Is(pivotErr, unix.EINVAL) { + // If pivot_root(2) failed with -EINVAL, one of the possible reasons is + // that we are in early boot and trying pivot_root on top of the + // initramfs (which isn't allowed because initramfs/rootfs doesn't have + // a parent mount). + // + // Traditionally, users were told to pass --no-pivot (which used chroot + // instead) but this is very insecure (even with the hardenings we've + // put into our chroot() wrapper). + // + // A much better solution is to create a bind-mount clone of / (which + // would have a parent) and then chroot into that clone so that we are + // properly rooted within a mount that has a parent mount. Then we can + // retry the pivot_root(). + + // Clone / on top of . to create a version of / that has a parent and + // so can be pivot-rooted. + if err := unix.Mount("/", ".", "", unix.MS_BIND|unix.MS_REC, ""); err != nil { + err := &os.PathError{Op: "make clone of / mount", Path: rootfs, Err: err} + return fmt.Errorf("error during fallback for failed pivot_root (%w): %w", pivotErr, err) + } + // Switch to the cloned mount. We have to use the full path here + // because we need to get the kernel to move us into the new mount + // (chdir(".") will keep us in the old non-cloned / mount). + if err := os.Chdir(rootfs); err != nil { + return fmt.Errorf("error during fallback for failed pivot_root (%w): switch to cloned mount: %w", pivotErr, err) + } + // Move the cloned mount to /. + if err := unix.Mount(".", "/", "", unix.MS_MOVE, ""); err != nil { + err := &os.PathError{Op: "move / clone mount to /", Path: rootfs, Err: err} + return fmt.Errorf("error during fallback for failed pivot_root (%w): %w", pivotErr, err) + } + // Update current->fs->root to be the cloned / (to be pivot_root'd). + if err := unix.Chroot("."); err != nil { + err := &os.PathError{Op: "chroot into cloned /", Path: rootfs, Err: err} + return fmt.Errorf("error during fallback for failed pivot_root (%w): %w", pivotErr, err) + } + + // Go back to the container rootfs and retry pivot_root. + if err := os.Chdir(rootfs); err != nil { + return fmt.Errorf("error during fallback for failed pivot_root (%w): %w", pivotErr, err) + } + pivotErr = unix.PivotRoot(".", ".") + } + if pivotErr != nil { + return &os.PathError{Op: "pivot_root", Path: rootfs, Err: pivotErr} } // Currently our "." is oldroot (according to the current kernel code). From 17927cc386526e6d8942a7f54988e0249eda6193 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Thu, 10 Oct 2024 11:09:06 +1100 Subject: [PATCH 2/6] rootfs: always pivot_root(2) and treat --no-pivot as a fallback Despite the hardenings we've added to the MS_MOVE+chroot dance over the years like commit 28a697cce3e4 ("rootfs: umount all procfs and sysfs with --no-pivot"), --no-pivot is fundamentally insecure and the primary reason why people use it (to run containers from initramfs) can now be done safely with pivot_root(2). So we should always try to pivot_root(2) and give a warning to the user that their configuration is insecure if we have to use the --no-pivot fallback (users should not see this message in practice, because the primary users that couldn't use pivot_root(2) now can and will transparently use it if possible). Signed-off-by: Aleksa Sarai --- create.go | 2 +- libcontainer/rootfs_linux.go | 15 ++++++++++++--- run.go | 2 +- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/create.go b/create.go index 8ed59b2e75c..0c76a268fb5 100644 --- a/create.go +++ b/create.go @@ -45,7 +45,7 @@ command(s) that get executed on start, edit the args parameter of the spec. See }, cli.BoolFlag{ Name: "no-pivot", - Usage: "do not use pivot root to jail process inside rootfs. This should be used whenever the rootfs is on top of a ramdisk", + Usage: "do not use pivot root to jail process inside rootfs to work around limitations when running in an initramfs (this option is deprecated, insecure, and unnecessary now that pivot_root works with initramfs -- see )", }, cli.BoolFlag{ Name: "no-new-keyring", diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go index d1ec02ef6c2..8a26726b7a7 100644 --- a/libcontainer/rootfs_linux.go +++ b/libcontainer/rootfs_linux.go @@ -202,10 +202,19 @@ func prepareRootfs(pipe *syncSocket, iConfig *initConfig) (err error) { return err } - if config.NoPivotRoot { - err = msMoveRoot(config.Rootfs) - } else if config.Namespaces.Contains(configs.NEWNS) { + if config.Namespaces.Contains(configs.NEWNS) { err = pivotRoot(config.Rootfs) + if config.NoPivotRoot { + logrus.Warnf("--no-pivot is deprecated and may be removed or silently ignored in a future version of runc -- see for more details") + if err != nil { + // Always try to do pivot_root(2) because it's safe, and only fallback + // to the unsafe MS_MOVE+chroot(2) dance if pivot_root(2) fails. + logrus.Warnf("your container failed to start with pivot_root(2) (%v) -- please open a bug report to let us know about your usecase", err) + err = msMoveRoot(config.Rootfs) + } else { + logrus.Warnf("despite setting --no-pivot, this container successfully started using pivot_root(2) -- consider removing the --no-pivot flag") + } + } } else { err = chroot() } diff --git a/run.go b/run.go index b03b8129bd9..0b907528b0a 100644 --- a/run.go +++ b/run.go @@ -58,7 +58,7 @@ command(s) that get executed on start, edit the args parameter of the spec. See }, cli.BoolFlag{ Name: "no-pivot", - Usage: "do not use pivot root to jail process inside rootfs. This should be used whenever the rootfs is on top of a ramdisk", + Usage: "do not use pivot root to jail process inside rootfs to work around limitations when running in an initramfs (this option is deprecated, insecure, and unnecessary now that pivot_root works with initramfs -- see )", }, cli.BoolFlag{ Name: "no-new-keyring", From 1f3707b911e117f9b178f6bc356fa03a7031aaa1 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Tue, 29 Oct 2024 13:52:56 +1100 Subject: [PATCH 3/6] bats: update to v1.11.0 This includes bats_pipe and some other nice features we can use in our tests. Signed-off-by: Aleksa Sarai --- .cirrus.yml | 2 +- .github/workflows/test.yml | 2 +- Dockerfile | 4 +++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index e63ef51cf1a..9e30298e37b 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -78,7 +78,7 @@ task: HOME: /root CIRRUS_WORKING_DIR: /home/runc GO_VERSION: "1.23" - BATS_VERSION: "v1.9.0" + BATS_VERSION: "v1.11.0" RPMS: gcc git iptables jq glibc-static libseccomp-devel make criu fuse-sshfs container-selinux # yamllint disable rule:key-duplicates matrix: diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 754fd87f155..b07d1bd4e7e 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -140,7 +140,7 @@ jobs: - name: Setup Bats and bats libs uses: bats-core/bats-action@3.0.0 with: - bats-version: 1.9.0 + bats-version: 1.11.0 support-install: false assert-install: false detik-install: false diff --git a/Dockerfile b/Dockerfile index f51f5956c85..bd21290ce82 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ ARG GO_VERSION=1.23 -ARG BATS_VERSION=v1.9.0 +ARG BATS_VERSION=v1.11.0 ARG LIBSECCOMP_VERSION=2.5.5 FROM golang:${GO_VERSION}-bookworm @@ -16,6 +16,7 @@ RUN KEYFILE=/usr/share/keyrings/criu-repo-keyring.gpg; \ criu \ gcc \ gcc-multilib \ + cpio \ curl \ gawk \ gperf \ @@ -24,6 +25,7 @@ RUN KEYFILE=/usr/share/keyrings/criu-repo-keyring.gpg; \ kmod \ pkg-config \ python3-minimal \ + qemu-kvm \ sshfs \ sudo \ uidmap \ From 59b0465b7481448dcb17f919e568f9478c0e547d Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Mon, 28 Oct 2024 11:29:06 +1100 Subject: [PATCH 4/6] tests: helpers: provide a "sane" run helper The runc wrapper we have had for a long time is useful for debugging errors (because bats's built-in "run" doesn't provide the output of the program in case of an error). However, it might be necessary to run other programs in a similar wrapper. In addition, it might be needed to add timeouts or use the bats_pipe feature (bats v1.10.0) with similar wrapping. Adding support for this in the sane_run wrapper makes it a little easier to use these things when writing tests. This is somewhat adapted from umoci's sane_run helper. Signed-off-by: Aleksa Sarai --- Makefile | 1 + tests/integration/helpers.bash | 44 ++++++++++++++++++++++++++++++---- 2 files changed, 41 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 0a15fd908ea..730b853ca1a 100644 --- a/Makefile +++ b/Makefile @@ -164,6 +164,7 @@ localunittest: all integration: runcimage $(CONTAINER_ENGINE) run $(CONTAINER_ENGINE_RUN_FLAGS) \ -t --privileged --rm \ + -v /boot:/boot:ro \ -v /lib/modules:/lib/modules:ro \ -v $(CURDIR):/go/src/$(PROJECT) \ $(RUNC_IMAGE) make localintegration TESTPATH="$(TESTPATH)" diff --git a/tests/integration/helpers.bash b/tests/integration/helpers.bash index 85f1fb0a558..726f61faf28 100755 --- a/tests/integration/helpers.bash +++ b/tests/integration/helpers.bash @@ -39,18 +39,54 @@ ARCH=$(uname -m) # Seccomp agent socket. SECCCOMP_AGENT_SOCKET="$BATS_TMPDIR/seccomp-agent.sock" -# Wrapper for runc. -function runc() { - run __runc "$@" +function sane_run() { # --pipe --timeout= + local getopt + getopt="$(getopt -o + --long pipe,timeout: -- "$@")" + eval set -- "$getopt" + + pipe= + timeout= + while true; do + case "$1" in + --pipe) + pipe=1 + shift + ;; + --timeout) + timeout="$2" + shift 2 + ;; + --) + shift + break + ;; + *) + fail "unknown argument $1 ${2:-} to sane_run" + ;; + esac + done + cmd_prefix=() + [ -n "$pipe" ] && cmd_prefix+=("bats_pipe") + [ -n "$timeout" ] && cmd_prefix+=("timeout" "--foreground" "--signal=KILL" "$timeout") + + local cmd="$1" + shift + + run "${cmd_prefix[@]}" "$cmd" "$@" # Some debug information to make life easier. bats will only print it if the # test failed, in which case the output is useful. # shellcheck disable=SC2154 - echo "$(basename "$RUNC") $* (status=$status):" >&2 + echo "$cmd $* (status=$status):" >&2 # shellcheck disable=SC2154 echo "$output" >&2 } +# Wrapper for runc. +function runc() { + sane_run __runc "$@" +} + # Raw wrapper for runc. function __runc() { "$RUNC" ${RUNC_USE_SYSTEMD+--systemd-cgroup} \ From 0b283ea21d737fccfe157931211b5b51a50ae593 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Tue, 29 Oct 2024 13:51:54 +1100 Subject: [PATCH 5/6] tests: add test for pivot_root in initramfs support The only way to run in the proper initramfs is to start a VM using a custom initrd that runs runc. This should be a fairly reasonable smoke-test that matches what minikube and kata do. Unfortunately, running the right qemu for the native architecture on various distros is a little different, so we need a helper function to get it to work on both Debian and AlmaLinux. Signed-off-by: Aleksa Sarai --- .cirrus.yml | 2 +- .github/workflows/test.yml | 2 +- Vagrantfile.fedora | 2 +- tests/integration/initramfs.bats | 167 +++++++++++++++++++++++++++++++ 4 files changed, 170 insertions(+), 3 deletions(-) create mode 100644 tests/integration/initramfs.bats diff --git a/.cirrus.yml b/.cirrus.yml index 9e30298e37b..bf97471d393 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -79,7 +79,7 @@ task: CIRRUS_WORKING_DIR: /home/runc GO_VERSION: "1.23" BATS_VERSION: "v1.11.0" - RPMS: gcc git iptables jq glibc-static libseccomp-devel make criu fuse-sshfs container-selinux + RPMS: cpio gcc git iptables jq qemu-kvm glibc-static libseccomp-devel make criu fuse-sshfs container-selinux # yamllint disable rule:key-duplicates matrix: DISTRO: almalinux-8 diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index b07d1bd4e7e..8d7703d91c9 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -105,7 +105,7 @@ jobs: - name: install deps run: | sudo apt update - sudo apt -y install libseccomp-dev sshfs uidmap + sudo apt -y install cpio libseccomp-dev qemu-kvm sshfs uidmap - name: install CRIU if: ${{ matrix.criu == '' }} diff --git a/Vagrantfile.fedora b/Vagrantfile.fedora index 4863808ee51..018a10c879a 100644 --- a/Vagrantfile.fedora +++ b/Vagrantfile.fedora @@ -24,7 +24,7 @@ Vagrant.configure("2") do |config| cat << EOF | dnf -y --exclude=kernel,kernel-core shell && break config install_weak_deps false update -install iptables gcc golang-go make glibc-static libseccomp-devel bats jq git-core criu fuse-sshfs container-selinux +install cpio iptables gcc golang-go make qemu-kvm glibc-static libseccomp-devel bats jq git-core criu fuse-sshfs container-selinux ts run EOF done diff --git a/tests/integration/initramfs.bats b/tests/integration/initramfs.bats new file mode 100644 index 00000000000..39332f541b1 --- /dev/null +++ b/tests/integration/initramfs.bats @@ -0,0 +1,167 @@ +#!/usr/bin/env bats + +load helpers + +# Rather than building our own kernel for use with qemu, just reuse the host's +# kernel since we just need some kernel that supports containers that we can +# use to run our custom initramfs. +function find_vmlinuz() { + shopt -s nullglob + local candidate candidates=( + /boot/vmlinuz + /boot/vmlinuz-"$(uname -r)"* + /usr/lib*/modules/"$(uname -r)"/vmlinuz* + ) + shopt -u nullglob + + for candidate in "${candidates[@]}"; do + [ -e "$candidate" ] || continue + export HOST_KERNEL="$candidate" + return 0 + done + + # Actuated doesn't provide a copy of the boot kernel, so we have to skip + # the test in that case. It also seems they don't allow aarch64 guests + # either (see ). + skip "could not find host vmlinuz kernel" +} + +function setup() { + INITRAMFS_ROOT="$(mktemp -d "$BATS_RUN_TMPDIR/runc-initramfs.XXXXXX")" + find_vmlinuz +} + +function teardown() { + [ -v INITRAMFS_ROOT ] && rm -rf "$INITRAMFS_ROOT" +} + +function qemu_native() { + # Different distributions put qemu-kvm in different locations and with + # different names. Debian and Ubuntu have a "kvm" binary, while AlmaLinux + # has /usr/libexec/qemu-kvm. + local qemu_binary="" qemu_candidates=("kvm" "qemu-kvm" "/usr/libexec/qemu-kvm") + local candidate + for candidate in "${qemu_candidates[@]}"; do + "$candidate" -help &>/dev/null || continue + qemu_binary="$candidate" + break + done + # TODO: Maybe we should also try to call qemu-system-FOO for the current + # architecture if qemu-kvm is missing? + [ -n "$qemu_binary" ] || skip "could not find qemu-kvm binary" + + local machine= + case "$(go env GOARCH)" in + 386 | amd64) + # Try to use a slightly newer PC CPU. + machine="pc" + ;; + arm | arm64) + # ARM doesn't provide a "default" machine value (because its use is so + # varied) so we have to specify the machine manually. + machine="virt" + ;; + *) + echo "could not figure out -machine argument for qemu -- using default" >&2 + ;; + esac + # We use -cpu max to ensure that the glibc we built runc with doesn't rely + # on CPU features that the default QEMU CPU doesn't support (such as on + # AlmaLinux 9). + local machine_args=("-cpu" "max") + [ -n "$machine" ] && machine_args+=("-machine" "$machine") + + sane_run --timeout=3m \ + "$qemu_binary" "${machine_args[@]}" "$@" + if [ "$status" -ne 0 ]; then + # To help with debugging, output the set of valid machine values. + "$qemu_binary" -machine help >&2 + fi +} + +@test "runc run [initramfs + pivot_root]" { + requires root + + # Configure our minimal initrd. + mkdir -p "$INITRAMFS_ROOT/initrd" + pushd "$INITRAMFS_ROOT/initrd" + + # Use busybox as a base for our initrd. + tar --exclude './dev/*' -xf "$BUSYBOX_IMAGE" + # Make sure that "sh" and "poweroff" are installed, otherwise qemu will + # boot loop when init stops. + [ -x ./bin/sh ] || skip "busybox image is missing /bin/sh" + [ -x ./bin/poweroff ] || skip "busybox image is missing /bin/poweroff" + + # Copy the runc binary into the container. In theory we would prefer to + # copy a static binary, but some distros (like openSUSE) don't ship + # libseccomp-static so requiring a static build for any integration test + # run wouldn't work. Instead, we copy all of the library dependencies into + # the rootfs (note that we also have to copy ld-linux-*.so because runc was + # probably built with a newer glibc than the one in our busybox image. + cp "$RUNC" ./bin/runc + readarray -t runclibs \ + <<<"$(ldd "$RUNC" | grep -Eo '/[^ ]*lib[^ ]*.so.[^ ]*')" + cp -vt ./lib64/ "${runclibs[@]}" + # busybox has /lib64 -> /lib so we can just fill in one path. + + # Create a container bundle using the same busybox image. + mkdir -p ./run/bundle + pushd ./run/bundle + mkdir -p rootfs + tar --exclude './dev/*' -C rootfs -xf "$BUSYBOX_IMAGE" + runc spec + update_config '.process.args = ["/bin/echo", "hello from inside the container"]' + popd + + # Build a custom /init script. + cat >./init <<-EOF + #!/bin/sh + + set -x + echo "==START INIT SCRIPT==" + + mkdir -p /proc + mount -t proc proc /proc + mkdir -p /sys + mount -t sysfs sysfs /sys + + mkdir -p /sys/fs/cgroup + mount -t cgroup2 cgroup2 /sys/fs/cgroup + + mkdir -p /tmp + mount -t tmpfs tmpfs /tmp + + mkdir -p /dev + mount -t devtmpfs devtmpfs /dev + mkdir -p /dev/pts + mount -t devpts -o newinstance devpts /dev/pts + mkdir -p /dev/shm + mount --bind /tmp /dev/shm + + # Wait for as little as possible if we panic so we can output the error + # log as part of the test failure before the test times out. + echo 1 >/proc/sys/kernel/panic + + runc run -b /run/bundle ctr + + echo "==END INIT SCRIPT==" + poweroff -f + EOF + chmod +x ./init + + find . | cpio -o -H newc >"$INITRAMFS_ROOT/initrd.cpio" + popd + + # Now we can just run the image (use qemu-kvm so that we run on the same + # architecture as the host system). We can just reuse the host kernel. + qemu_native \ + -initrd "$INITRAMFS_ROOT/initrd.cpio" \ + -kernel "$HOST_KERNEL" \ + -m 512M \ + -nographic -append console=ttyS0 -no-reboot + [ "$status" -eq 0 ] + [[ "$output" = *"==START INIT SCRIPT=="* ]] + [[ "$output" = *"hello from inside the container"* ]] + [[ "$output" = *"==END INIT SCRIPT=="* ]] +} From 363768c7f7a8b11a7aa284ea4a07286edee25b4e Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Thu, 31 Oct 2024 14:26:43 +1100 Subject: [PATCH 6/6] *: deprecate --pivot-root and hide it runc --help Existing users will still be able to use it, but new users won't be tempted into using this flag (and their containers should be able to run without issue without this flag anyway). Signed-off-by: Aleksa Sarai --- create.go | 5 +++-- run.go | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/create.go b/create.go index 0c76a268fb5..59827b34248 100644 --- a/create.go +++ b/create.go @@ -44,8 +44,9 @@ command(s) that get executed on start, edit the args parameter of the spec. See Usage: "specify the file to write the process id to", }, cli.BoolFlag{ - Name: "no-pivot", - Usage: "do not use pivot root to jail process inside rootfs to work around limitations when running in an initramfs (this option is deprecated, insecure, and unnecessary now that pivot_root works with initramfs -- see )", + Name: "no-pivot", + Usage: "(deprecated; do not use)", + Hidden: true, }, cli.BoolFlag{ Name: "no-new-keyring", diff --git a/run.go b/run.go index 0b907528b0a..6348ad00cd6 100644 --- a/run.go +++ b/run.go @@ -57,8 +57,9 @@ command(s) that get executed on start, edit the args parameter of the spec. See Usage: "disable the use of the subreaper used to reap reparented processes", }, cli.BoolFlag{ - Name: "no-pivot", - Usage: "do not use pivot root to jail process inside rootfs to work around limitations when running in an initramfs (this option is deprecated, insecure, and unnecessary now that pivot_root works with initramfs -- see )", + Name: "no-pivot", + Usage: "(deprecated; do not use)", + Hidden: true, }, cli.BoolFlag{ Name: "no-new-keyring",