diff --git a/CHANGELOG.md b/CHANGELOG.md index 720e27a9f40..49649793c9e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,22 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - The deprecated `libcontainer/userns` package has been removed; use `github.com/moby/sys/userns` instead. +### Breaking ### +- The handling of `pids.limit` has been updated to match the newer guidance + from the OCI runtime specification. In particular, now a maximum limit value + of `0` will be treated as an actual limit (due to limitations with systemd, + it will be treated the same as a limit value of `1`). We expect only users + that explicitly set `pids.limit` to `0` to see a behaviour change. + (opencontainers/cgroups#48, #4949) + +### Fixed ### +- cgroups: provide iocost statistics for cgroupv2. (opencontainers/cgroups#43) +- cgroups: retry DBus connection when it fails with EAGAIN. + (opencontainers/cgroups#45) +- cgroups: improve `cpuacct.usage_all` resilience when parsing data from + patched kernels (such as the Tencent kernels). (opencontainers/cgroups#46, + opencontainers/cgroups#50) + ## [1.4.0-rc.1] - 2025-09-05 > おめェもボスになったんだろぉ? 
diff --git a/go.mod b/go.mod index 03fcdca6452..df65bf5e83d 100644 --- a/go.mod +++ b/go.mod @@ -14,8 +14,8 @@ require ( github.com/moby/sys/user v0.4.0 github.com/moby/sys/userns v0.1.0 github.com/mrunalp/fileutils v0.5.1 - github.com/opencontainers/cgroups v0.0.5 - github.com/opencontainers/runtime-spec v1.2.2-0.20250818071321-383cadbf08c0 + github.com/opencontainers/cgroups v0.0.6 + github.com/opencontainers/runtime-spec v1.3.0 github.com/opencontainers/selinux v1.13.0 github.com/seccomp/libseccomp-golang v0.11.1 github.com/sirupsen/logrus v1.9.3 diff --git a/go.sum b/go.sum index 5ff1156f22a..01d49385ba1 100644 --- a/go.sum +++ b/go.sum @@ -46,10 +46,10 @@ github.com/moby/sys/userns v0.1.0 h1:tVLXkFOxVu9A64/yh59slHVv9ahO9UIev4JZusOLG/g github.com/moby/sys/userns v0.1.0/go.mod h1:IHUYgu/kao6N8YZlp9Cf444ySSvCmDlmzUcYfDHOl28= github.com/mrunalp/fileutils v0.5.1 h1:F+S7ZlNKnrwHfSwdlgNSkKo67ReVf8o9fel6C3dkm/Q= github.com/mrunalp/fileutils v0.5.1/go.mod h1:M1WthSahJixYnrXQl/DFQuteStB1weuxD2QJNHXfbSQ= -github.com/opencontainers/cgroups v0.0.5 h1:DRITAqcOnY0uSBzIpt1RYWLjh5DPDiqUs4fY6Y0ktls= -github.com/opencontainers/cgroups v0.0.5/go.mod h1:oWVzJsKK0gG9SCRBfTpnn16WcGEqDI8PAcpMGbqWxcs= -github.com/opencontainers/runtime-spec v1.2.2-0.20250818071321-383cadbf08c0 h1:RLn0YfUWkiqPGtgUANvJrcjIkCHGRl3jcz/c557M28M= -github.com/opencontainers/runtime-spec v1.2.2-0.20250818071321-383cadbf08c0/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= +github.com/opencontainers/cgroups v0.0.6 h1:tfZFWTIIGaUUFImTyuTg+Mr5x8XRiSdZESgEBW7UxuI= +github.com/opencontainers/cgroups v0.0.6/go.mod h1:oWVzJsKK0gG9SCRBfTpnn16WcGEqDI8PAcpMGbqWxcs= +github.com/opencontainers/runtime-spec v1.3.0 h1:YZupQUdctfhpZy3TM39nN9Ika5CBWT5diQ8ibYCRkxg= +github.com/opencontainers/runtime-spec v1.3.0/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= github.com/opencontainers/selinux v1.13.0 h1:Zza88GWezyT7RLql12URvoxsbLfjFx988+LGaWfbL84= github.com/opencontainers/selinux v1.13.0/go.mod 
h1:XxWTed+A/s5NNq4GmYScVy+9jzXhGBVEOAyucdRUY8s= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= diff --git a/libcontainer/integration/exec_test.go b/libcontainer/integration/exec_test.go index c46a576eaab..bdb90a0ef73 100644 --- a/libcontainer/integration/exec_test.go +++ b/libcontainer/integration/exec_test.go @@ -526,20 +526,22 @@ func TestPidsSystemd(t *testing.T) { testPids(t, true) } +func mkPtr[T any](v T) *T { return &v } + func testPids(t *testing.T, systemd bool) { if testing.Short() { return } config := newTemplateConfig(t, &tParam{systemd: systemd}) - config.Cgroups.Resources.PidsLimit = -1 + config.Cgroups.Resources.PidsLimit = mkPtr[int64](-1) // Running multiple processes, expecting it to succeed with no pids limit. runContainerOk(t, config, "/bin/sh", "-c", "/bin/true | /bin/true | /bin/true | /bin/true") // Enforce a permissive limit. This needs to be fairly hand-wavey due to the // issues with running Go binaries with pids restrictions (see below). - config.Cgroups.Resources.PidsLimit = 64 + config.Cgroups.Resources.PidsLimit = mkPtr[int64](64) runContainerOk(t, config, "/bin/sh", "-c", ` /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true | @@ -548,7 +550,7 @@ func testPids(t *testing.T, systemd bool) { // Enforce a restrictive limit. 64 * /bin/true + 1 * shell should cause // this to fail reliably. 
- config.Cgroups.Resources.PidsLimit = 64 + config.Cgroups.Resources.PidsLimit = mkPtr[int64](64) out, _, err := runContainer(t, config, "/bin/sh", "-c", ` /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true | diff --git a/man/runc-update.8.md b/man/runc-update.8.md index 0e95d85ded1..af23be35ca8 100644 --- a/man/runc-update.8.md +++ b/man/runc-update.8.md @@ -85,7 +85,8 @@ stdin. If this option is used, all other options are ignored. (i.e. use unlimited swap). **--pids-limit** _num_ -: Set the maximum number of processes allowed in the container. +: Set the maximum number of processes allowed in the container. Use **-1** to +unset the limit. A value of **0** is treated the same as **1**. **--l3-cache-schema** _value_ : Set the value for Intel RDT/CAT L3 cache schema. diff --git a/tests/integration/cgroups.bats b/tests/integration/cgroups.bats index a2df63e8002..6d829fd19f6 100644 --- a/tests/integration/cgroups.bats +++ b/tests/integration/cgroups.bats @@ -324,6 +324,37 @@ convert_hugetlb_size() { done } +# https://github.com/opencontainers/runc/issues/4014. +@test "runc run (pids.limit=0 means 1)" { + [ $EUID -ne 0 ] && requires rootless_cgroup + requires cgroups_pids + + set_cgroups_path + update_config '.linux.resources.pids.limit = 0' + + runc run -d --console-socket "$CONSOLE_SOCKET" test_pids + [ "$status" -eq 0 ] + # systemd doesn't support TasksMax=0 so runc will silently remap it to 1 + # (for consistency, we do this for systemd *and* cgroupfs). + check_cgroup_value "pids.max" "1" + check_systemd_value "TasksMax" "1" +} + +# https://github.com/opencontainers/runc/issues/4014. 
+@test "runc run (pids.limit=-1 means unlimited)" { + [ $EUID -ne 0 ] && requires rootless_cgroup + requires cgroups_pids + + set_cgroups_path + update_config '.linux.resources.pids.limit = -1' + + runc run -d --console-socket "$CONSOLE_SOCKET" test_pids + [ "$status" -eq 0 ] + check_cgroup_value "pids.max" "max" + # systemd < v227 shows UINT64_MAX instead of "infinity". + check_systemd_value "TasksMax" "infinity" "18446744073709551615" +} + @test "runc run (cgroup v2 resources.unified only)" { requires root cgroups_v2 diff --git a/tests/integration/update.bats b/tests/integration/update.bats index 29d36fdfa2a..07f3eff1733 100644 --- a/tests/integration/update.bats +++ b/tests/integration/update.bats @@ -327,6 +327,37 @@ EOF check_cpu_shares 100 } +@test "update pids.limit" { + [ $EUID -ne 0 ] && requires rootless_cgroup + requires cgroups_pids + + runc run -d --console-socket "$CONSOLE_SOCKET" test_update + [ "$status" -eq 0 ] + + check_cgroup_value "pids.max" 20 + check_systemd_value "TasksMax" 20 + + runc update test_update --pids-limit 12345 + [ "$status" -eq 0 ] + + check_cgroup_value "pids.max" "12345" + check_systemd_value "TasksMax" "12345" + + runc update test_update --pids-limit -1 + [ "$status" -eq 0 ] + + check_cgroup_value "pids.max" "max" + # systemd < v227 shows UINT64_MAX instead of "infinity". + check_systemd_value "TasksMax" "infinity" "18446744073709551615" + + runc update test_update --pids-limit 0 + [ "$status" -eq 0 ] + + # systemd doesn't support TasksMax=0 so runc will silently remap it to 1. 
+ check_cgroup_value "pids.max" "1" + check_systemd_value "TasksMax" "1" +} + @test "cpu burst" { [ $EUID -ne 0 ] && requires rootless_cgroup requires cgroups_cpu_burst diff --git a/update.go b/update.go index 8510c217cd5..11a157184d7 100644 --- a/update.go +++ b/update.go @@ -17,10 +17,7 @@ import ( "github.com/urfave/cli" ) -func i64Ptr(i int64) *int64 { return &i } -func u64Ptr(i uint64) *uint64 { return &i } -func u16Ptr(i uint16) *uint16 { return &i } -func boolPtr(b bool) *bool { return &b } +func mkPtr[T any](v T) *T { return &v } var updateCommand = cli.Command{ Name: "update", @@ -147,9 +144,9 @@ other options are ignored. } r := specs.LinuxResources{ - // nil and u64Ptr(0) are not interchangeable + // nil and mkPtr(0) are not interchangeable Memory: &specs.LinuxMemory{ - CheckBeforeUpdate: boolPtr(false), // constant + CheckBeforeUpdate: mkPtr(false), // constant }, CPU: &specs.LinuxCPU{}, BlockIO: &specs.LinuxBlockIO{}, @@ -179,7 +176,7 @@ other options are ignored. } } else { if val := context.Int("blkio-weight"); val != 0 { - r.BlockIO.Weight = u16Ptr(uint16(val)) + r.BlockIO.Weight = mkPtr(uint16(val)) } if val := context.String("cpuset-cpus"); val != "" { r.CPU.Cpus = val @@ -192,7 +189,7 @@ other options are ignored. if err != nil { return fmt.Errorf("invalid value for cpu-idle: %w", err) } - r.CPU.Idle = i64Ptr(idle) + r.CPU.Idle = mkPtr(idle) } for _, pair := range []struct { @@ -252,17 +249,19 @@ other options are ignored. 
} } - r.Pids.Limit = int64(context.Int("pids-limit")) + if context.IsSet("pids-limit") { + r.Pids.Limit = mkPtr(int64(context.Int("pids-limit"))) + } } // Fix up values if r.Memory.Limit != nil && *r.Memory.Limit == -1 && r.Memory.Swap == nil { // To avoid error "unable to set swap limit without memory limit" - r.Memory.Swap = i64Ptr(0) + r.Memory.Swap = mkPtr[int64](0) } if r.CPU.Idle != nil && r.CPU.Shares == nil { // To avoid error "failed to write \"4\": write /sys/fs/cgroup/runc-cgroups-integration-test/test-cgroup-7341/cpu.weight: invalid argument" - r.CPU.Shares = u64Ptr(0) + r.CPU.Shares = mkPtr[uint64](0) } if (r.Memory.Kernel != nil) || (r.Memory.KernelTCP != nil) { //nolint:staticcheck // Ignore SA1019. Need to keep deprecated package for compatibility. diff --git a/vendor/github.com/opencontainers/cgroups/config_linux.go b/vendor/github.com/opencontainers/cgroups/config_linux.go index 9bc58a3789b..3d29d938bf0 100644 --- a/vendor/github.com/opencontainers/cgroups/config_linux.go +++ b/vendor/github.com/opencontainers/cgroups/config_linux.go @@ -90,8 +90,8 @@ type Resources struct { // Cgroup's SCHED_IDLE value. CPUIdle *int64 `json:"cpu_idle,omitempty"` - // Process limit; set <= `0' to disable limit. - PidsLimit int64 `json:"pids_limit,omitempty"` + // Process limit; set < `0' to disable limit. `nil` means "keep current limit". + PidsLimit *int64 `json:"pids_limit,omitempty"` // Specifies per cgroup weight, range is from 10 to 1000. 
BlkioWeight uint16 `json:"blkio_weight,omitempty"` diff --git a/vendor/github.com/opencontainers/cgroups/fs/cpuacct.go b/vendor/github.com/opencontainers/cgroups/fs/cpuacct.go index 391a023c751..bde25b07594 100644 --- a/vendor/github.com/opencontainers/cgroups/fs/cpuacct.go +++ b/vendor/github.com/opencontainers/cgroups/fs/cpuacct.go @@ -129,12 +129,16 @@ func getPercpuUsageInModes(path string) ([]uint64, []uint64, error) { defer fd.Close() scanner := bufio.NewScanner(fd) - scanner.Scan() // skipping header line + scanner.Scan() // Read header line. + const want = "cpu user system" + if hdr := scanner.Text(); !strings.HasPrefix(hdr, want) { + return nil, nil, malformedLine(path, file, hdr) + } for scanner.Scan() { - // Each line is: cpu user system - fields := strings.SplitN(scanner.Text(), " ", 3) - if len(fields) != 3 { + // Each line is: cpu user system. Keep N at 4 to ignore extra fields. + fields := strings.SplitN(scanner.Text(), " ", 4) + if len(fields) < 3 { continue } diff --git a/vendor/github.com/opencontainers/cgroups/fs/pids.go b/vendor/github.com/opencontainers/cgroups/fs/pids.go index 9319761e6ad..36bd339af82 100644 --- a/vendor/github.com/opencontainers/cgroups/fs/pids.go +++ b/vendor/github.com/opencontainers/cgroups/fs/pids.go @@ -19,19 +19,24 @@ func (s *PidsGroup) Apply(path string, _ *cgroups.Resources, pid int) error { } func (s *PidsGroup) Set(path string, r *cgroups.Resources) error { - if r.PidsLimit != 0 { - // "max" is the fallback value. - limit := "max" - - if r.PidsLimit > 0 { - limit = strconv.FormatInt(r.PidsLimit, 10) - } - - if err := cgroups.WriteFile(path, "pids.max", limit); err != nil { - return err - } + if r.PidsLimit == nil { + return nil } + // "max" is the fallback value. + val := "max" + if limit := *r.PidsLimit; limit > 0 { + val = strconv.FormatInt(limit, 10) + } else if limit == 0 { + // systemd doesn't support setting pids.max to "0", so when setting + // TasksMax we need to remap it to "1". 
We do the same thing here to + // avoid flip-flop behaviour between the fs and systemd drivers. In + // practice, the pids cgroup behaviour is basically identical. + val = "1" + } + if err := cgroups.WriteFile(path, "pids.max", val); err != nil { + return err + } return nil } diff --git a/vendor/github.com/opencontainers/cgroups/fs2/io.go b/vendor/github.com/opencontainers/cgroups/fs2/io.go index 0f6ef7fea55..3c6dcc3bf2e 100644 --- a/vendor/github.com/opencontainers/cgroups/fs2/io.go +++ b/vendor/github.com/opencontainers/cgroups/fs2/io.go @@ -165,11 +165,22 @@ func statIo(dirPath string, stats *cgroups.Stats) error { case "wios": op = "Write" targetTable = &parsedStats.IoServicedRecursive + + case "cost.usage": + op = "Count" + targetTable = &parsedStats.IoCostUsage + case "cost.wait": + op = "Count" + targetTable = &parsedStats.IoCostWait + case "cost.indebt": + op = "Count" + targetTable = &parsedStats.IoCostIndebt + case "cost.indelay": + op = "Count" + targetTable = &parsedStats.IoCostIndelay + default: - // Skip over entries we cannot map to cgroupv1 stats for now. - // In the future we should expand the stats struct to include - // them. 
- logrus.Debugf("cgroupv2 io stats: skipping over unmappable %s entry", item) + logrus.Debugf("cgroupv2 io stats: unknown entry %s", item) continue } diff --git a/vendor/github.com/opencontainers/cgroups/fs2/pids.go b/vendor/github.com/opencontainers/cgroups/fs2/pids.go index 9b82b90115e..f932259ad56 100644 --- a/vendor/github.com/opencontainers/cgroups/fs2/pids.go +++ b/vendor/github.com/opencontainers/cgroups/fs2/pids.go @@ -4,6 +4,7 @@ import ( "errors" "math" "os" + "strconv" "strings" "golang.org/x/sys/unix" @@ -13,19 +14,26 @@ import ( ) func isPidsSet(r *cgroups.Resources) bool { - return r.PidsLimit != 0 + return r.PidsLimit != nil } func setPids(dirPath string, r *cgroups.Resources) error { if !isPidsSet(r) { return nil } - if val := numToStr(r.PidsLimit); val != "" { - if err := cgroups.WriteFile(dirPath, "pids.max", val); err != nil { - return err - } + val := "max" + if limit := *r.PidsLimit; limit > 0 { + val = strconv.FormatInt(limit, 10) + } else if limit == 0 { + // systemd doesn't support setting pids.max to "0", so when setting + // TasksMax we need to remap it to "1". We do the same thing here to + // avoid flip-flop behaviour between the fs and systemd drivers. In + // practice, the pids cgroup behaviour is basically identical. 
+ val = "1" + } + if err := cgroups.WriteFile(dirPath, "pids.max", val); err != nil { + return err } - return nil } diff --git a/vendor/github.com/opencontainers/cgroups/stats.go b/vendor/github.com/opencontainers/cgroups/stats.go index 6cd6253ee0a..01701333ab3 100644 --- a/vendor/github.com/opencontainers/cgroups/stats.go +++ b/vendor/github.com/opencontainers/cgroups/stats.go @@ -159,6 +159,10 @@ type BlkioStats struct { IoTimeRecursive []BlkioStatEntry `json:"io_time_recursive,omitempty"` SectorsRecursive []BlkioStatEntry `json:"sectors_recursive,omitempty"` PSI *PSIStats `json:"psi,omitempty"` + IoCostUsage []BlkioStatEntry `json:"io_cost_usage,omitempty"` + IoCostWait []BlkioStatEntry `json:"io_cost_wait,omitempty"` + IoCostIndebt []BlkioStatEntry `json:"io_cost_indebt,omitempty"` + IoCostIndelay []BlkioStatEntry `json:"io_cost_indelay,omitempty"` } type HugetlbStats struct { diff --git a/vendor/github.com/opencontainers/cgroups/systemd/dbus.go b/vendor/github.com/opencontainers/cgroups/systemd/dbus.go index bb87ae83aef..c492372a7af 100644 --- a/vendor/github.com/opencontainers/cgroups/systemd/dbus.go +++ b/vendor/github.com/opencontainers/cgroups/systemd/dbus.go @@ -4,10 +4,13 @@ import ( "context" "errors" "fmt" + "math/rand/v2" "sync" + "time" systemdDbus "github.com/coreos/go-systemd/v22/dbus" dbus "github.com/godbus/dbus/v5" + "golang.org/x/sys/unix" ) var ( @@ -64,10 +67,27 @@ func (d *dbusConnManager) getConnection() (*systemdDbus.Conn, error) { } func (d *dbusConnManager) newConnection() (*systemdDbus.Conn, error) { - if dbusRootless { - return newUserSystemdDbus() + newDbusConn := func() (*systemdDbus.Conn, error) { + if dbusRootless { + return newUserSystemdDbus() + } + return systemdDbus.NewWithContext(context.TODO()) + } + + var err error + for retry := range 7 { + var conn *systemdDbus.Conn + conn, err = newDbusConn() + if !errors.Is(err, unix.EAGAIN) { + return conn, err + } + // Exponential backoff (100ms * 2^attempt + ~12.5% jitter). 
+ // At most we would expect 15 seconds of delay with 7 attempts. + delay := 100 * time.Millisecond << retry + delay += time.Duration(rand.Int64N(1 + (delay.Milliseconds() >> 3))) + time.Sleep(delay) } - return systemdDbus.NewWithContext(context.TODO()) + return nil, fmt.Errorf("dbus connection failed after several retries: %w", err) } // resetConnection resets the connection to its initial state diff --git a/vendor/github.com/opencontainers/cgroups/systemd/v1.go b/vendor/github.com/opencontainers/cgroups/systemd/v1.go index 5500a53ac0f..96e69bb8608 100644 --- a/vendor/github.com/opencontainers/cgroups/systemd/v1.go +++ b/vendor/github.com/opencontainers/cgroups/systemd/v1.go @@ -2,6 +2,7 @@ package systemd import ( "errors" + "math" "os" "path/filepath" "strings" @@ -97,9 +98,17 @@ func genV1ResourcesProperties(r *cgroups.Resources, cm *dbusConnManager) ([]syst newProp("BlockIOWeight", uint64(r.BlkioWeight))) } - if r.PidsLimit > 0 || r.PidsLimit == -1 { + if r.PidsLimit != nil { + var tasksMax uint64 + if limit := *r.PidsLimit; limit < 0 { + tasksMax = math.MaxUint64 // "infinity" + } else if limit == 0 { + tasksMax = 1 // systemd does not accept "0" for TasksMax + } else { + tasksMax = uint64(limit) + } properties = append(properties, - newProp("TasksMax", uint64(r.PidsLimit))) + newProp("TasksMax", tasksMax)) } err = addCpuset(cm, &properties, r.CpusetCpus, r.CpusetMems) diff --git a/vendor/github.com/opencontainers/cgroups/systemd/v2.go b/vendor/github.com/opencontainers/cgroups/systemd/v2.go index c2f2e87f3ba..f76c93e8444 100644 --- a/vendor/github.com/opencontainers/cgroups/systemd/v2.go +++ b/vendor/github.com/opencontainers/cgroups/systemd/v2.go @@ -176,6 +176,9 @@ func unifiedResToSystemdProps(cm *dbusConnManager, res map[string]string) (props return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err) } } + if num == 0 { + num = 1 // systemd does not accept "0" for TasksMax + } props = append(props, newProp("TasksMax", num)) @@ 
-256,9 +259,17 @@ func genV2ResourcesProperties(dirPath string, r *cgroups.Resources, cm *dbusConn addCPUQuota(cm, &properties, &r.CpuQuota, r.CpuPeriod) - if r.PidsLimit > 0 || r.PidsLimit == -1 { + if r.PidsLimit != nil { + var tasksMax uint64 + if limit := *r.PidsLimit; limit < 0 { + tasksMax = math.MaxUint64 // "infinity" + } else if limit == 0 { + tasksMax = 1 // systemd does not accept "0" for TasksMax + } else { + tasksMax = uint64(limit) + } properties = append(properties, - newProp("TasksMax", uint64(r.PidsLimit))) + newProp("TasksMax", tasksMax)) } err = addCpuset(cm, &properties, r.CpusetCpus, r.CpusetMems) diff --git a/vendor/github.com/opencontainers/runtime-spec/specs-go/config.go b/vendor/github.com/opencontainers/runtime-spec/specs-go/config.go index 36d28032e66..3ef333387b0 100644 --- a/vendor/github.com/opencontainers/runtime-spec/specs-go/config.go +++ b/vendor/github.com/opencontainers/runtime-spec/specs-go/config.go @@ -31,6 +31,8 @@ type Spec struct { VM *VM `json:"vm,omitempty" platform:"vm"` // ZOS is platform-specific configuration for z/OS based containers. ZOS *ZOS `json:"zos,omitempty" platform:"zos"` + // FreeBSD is platform-specific configuration for FreeBSD based containers. + FreeBSD *FreeBSD `json:"freebsd,omitempty" platform:"freebsd"` } // Scheduler represents the scheduling attributes for a process. It is based on @@ -170,7 +172,7 @@ type Mount struct { // Destination is the absolute path where the mount will be placed in the container. Destination string `json:"destination"` // Type specifies the mount kind. - Type string `json:"type,omitempty" platform:"linux,solaris,zos"` + Type string `json:"type,omitempty" platform:"linux,solaris,zos,freebsd"` // Source specifies the source path of the mount. Source string `json:"source,omitempty"` // Options are fstab style mount options. 
@@ -434,7 +436,7 @@ type LinuxCPU struct { // LinuxPids for Linux cgroup 'pids' resource management (Linux 4.3) type LinuxPids struct { // Maximum number of PIDs. Default is "no limit". - Limit int64 `json:"limit"` + Limit *int64 `json:"limit,omitempty"` } // LinuxNetwork identification and priority configuration @@ -688,6 +690,32 @@ type WindowsHyperV struct { UtilityVMPath string `json:"utilityVMPath,omitempty"` } +// IOMems contains information about iomem addresses that should be passed to the VM. +type IOMems struct { + // Guest Frame Number to map the iomem range. If GFN is not specified, the mapping will be done to the same Frame Number as was provided in FirstMFN. + FirstGFN *uint64 `json:"firstGFN,omitempty"` + // Physical page number of iomem regions. + FirstMFN *uint64 `json:"firstMFN"` + // Number of pages to be mapped. + NrMFNs *uint64 `json:"nrMFNs"` +} + +// Hardware configuration for the VM image +type HWConfig struct { + // Path to the container device-tree file that should be passed to the VM configuration. + DeviceTree string `json:"deviceTree,omitempty"` + // Number of virtual cpus for the VM. + VCPUs *uint32 `json:"vcpus,omitempty"` + // Maximum memory in bytes allocated to the VM. + Memory *uint64 `json:"memory,omitempty"` + // Host device tree nodes to passthrough to the VM. + DtDevs []string `json:"dtdevs,omitempty"` + // Allow auto-translated domains to access specific hardware I/O memory pages. + IOMems []IOMems `json:"iomems,omitempty"` + // Allows VM to access specific physical IRQs. + Irqs []uint32 `json:"irqs,omitempty"` +} + // VM contains information for virtual-machine-based containers. type VM struct { // Hypervisor specifies hypervisor-related configuration for virtual-machine-based containers. @@ -696,6 +724,8 @@ type VM struct { Kernel VMKernel `json:"kernel"` // Image specifies guest image related configuration for virtual-machine-based containers. 
Image VMImage `json:"image,omitempty"` + // Hardware configuration that should be passed to the VM. + HwConfig *HWConfig `json:"hwconfig,omitempty"` } // VMHypervisor contains information about the hypervisor to use for a virtual machine. @@ -963,3 +993,75 @@ const ( // SchedFlagUtilClampMin represents the utilization clamp maximum scheduling flag SchedFlagUtilClampMax LinuxSchedulerFlag = "SCHED_FLAG_UTIL_CLAMP_MAX" ) + +// FreeBSD contains platform-specific configuration for FreeBSD based containers. +type FreeBSD struct { + // Devices which are accessible in the container + Devices []FreeBSDDevice `json:"devices,omitempty"` + // Jail definition for this container + Jail *FreeBSDJail `json:"jail,omitempty"` +} + +type FreeBSDDevice struct { + // Path to the device, relative to /dev. + Path string `json:"path"` + // FileMode permission bits for the device. + Mode *os.FileMode `json:"mode,omitempty"` +} + +// FreeBSDJail describes how to configure the container's jail +type FreeBSDJail struct { + // Parent jail name - this can be used to share a single vnet + // across several containers + Parent string `json:"parent,omitempty"` + // Whether to use parent UTS names or override in the container + Host FreeBSDSharing `json:"host,omitempty"` + // IPv4 address sharing for the container + Ip4 FreeBSDSharing `json:"ip4,omitempty"` + // IPv4 addresses for the container + Ip4Addr []string `json:"ip4Addr,omitempty"` + // IPv6 address sharing for the container + Ip6 FreeBSDSharing `json:"ip6,omitempty"` + // IPv6 addresses for the container + Ip6Addr []string `json:"ip6Addr,omitempty"` + // Which network stack to use for the container + Vnet FreeBSDSharing `json:"vnet,omitempty"` + // If set, Ip4Addr and Ip6Addr addresses will be added to this interface + Interface string `json:"interface,omitempty"` + // List interfaces to be moved to the container's vnet + VnetInterfaces []string `json:"vnetInterfaces,omitempty"` + // SystemV IPC message sharing for the container + SysVMsg 
FreeBSDSharing `json:"sysvmsg,omitempty"` + // SystemV semaphore message sharing for the container + SysVSem FreeBSDSharing `json:"sysvsem,omitempty"` + // SystemV memory sharing for the container + SysVShm FreeBSDSharing `json:"sysvshm,omitempty"` + // Mount visibility (see jail(8) for details) + EnforceStatfs *int `json:"enforceStatfs,omitempty"` + // Jail capabilities + Allow *FreeBSDJailAllow `json:"allow,omitempty"` +} + +// These values are used to control access to features in the container, either +// disabling the feature, sharing state with the parent or creating new private +// state in the container. +type FreeBSDSharing string + +const ( + FreeBSDShareDisable FreeBSDSharing = "disable" + FreeBSDShareNew FreeBSDSharing = "new" + FreeBSDShareInherit FreeBSDSharing = "inherit" +) + +// FreeBSDJailAllow describes jail capabilities +type FreeBSDJailAllow struct { + SetHostname bool `json:"setHostname,omitempty"` + RawSockets bool `json:"rawSockets,omitempty"` + Chflags bool `json:"chflags,omitempty"` + Mount []string `json:"mount,omitempty"` + Quotas bool `json:"quotas,omitempty"` + SocketAf bool `json:"socketAf,omitempty"` + Mlock bool `json:"mlock,omitempty"` + ReservedPorts bool `json:"reservedPorts,omitempty"` + Suser bool `json:"suser,omitempty"` +} diff --git a/vendor/github.com/opencontainers/runtime-spec/specs-go/version.go b/vendor/github.com/opencontainers/runtime-spec/specs-go/version.go index b0a00466b61..0257dba3e74 100644 --- a/vendor/github.com/opencontainers/runtime-spec/specs-go/version.go +++ b/vendor/github.com/opencontainers/runtime-spec/specs-go/version.go @@ -6,12 +6,12 @@ const ( // VersionMajor is for an API incompatible changes VersionMajor = 1 // VersionMinor is for functionality in a backwards-compatible manner - VersionMinor = 2 + VersionMinor = 3 // VersionPatch is for backwards-compatible bug fixes - VersionPatch = 1 + VersionPatch = 0 // VersionDev indicates development branch. Releases will be empty string. 
- VersionDev = "+dev" + VersionDev = "" ) // Version is the specification version that the package types support. diff --git a/vendor/modules.txt b/vendor/modules.txt index 7e9c87b5982..b6d94ce7bc8 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -68,7 +68,7 @@ github.com/moby/sys/userns # github.com/mrunalp/fileutils v0.5.1 ## explicit; go 1.13 github.com/mrunalp/fileutils -# github.com/opencontainers/cgroups v0.0.5 +# github.com/opencontainers/cgroups v0.0.6 ## explicit; go 1.23.0 github.com/opencontainers/cgroups github.com/opencontainers/cgroups/devices @@ -79,7 +79,7 @@ github.com/opencontainers/cgroups/fscommon github.com/opencontainers/cgroups/internal/path github.com/opencontainers/cgroups/manager github.com/opencontainers/cgroups/systemd -# github.com/opencontainers/runtime-spec v1.2.2-0.20250818071321-383cadbf08c0 +# github.com/opencontainers/runtime-spec v1.3.0 ## explicit github.com/opencontainers/runtime-spec/specs-go github.com/opencontainers/runtime-spec/specs-go/features