Skip to content

Commit 5c80d06

Browse files
Re-introduce parent cgroup CPU limit parsing (used on ECS) (#13794)
* Re-introduce parent cgroup CPU limit parsing (used on ECS) * Update releasenotes/notes/fix-ecs-parent-limit-2bcf000a0e895503.yaml Co-authored-by: Kari Halsted <[email protected]> Co-authored-by: Kari Halsted <[email protected]>
1 parent 1d518d2 commit 5c80d06

File tree

3 files changed

+63
-9
lines changed

3 files changed

+63
-9
lines changed

pkg/util/containers/metrics/system/collector_linux.go

Lines changed: 42 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
package system
1010

1111
import (
12+
"errors"
1213
"fmt"
1314
"strconv"
1415
"strings"
@@ -18,6 +19,7 @@ import (
1819
"github.com/DataDog/datadog-agent/pkg/util/cgroups"
1920
"github.com/DataDog/datadog-agent/pkg/util/containers/metrics/provider"
2021
"github.com/DataDog/datadog-agent/pkg/util/log"
22+
"github.com/DataDog/datadog-agent/pkg/util/pointer"
2123
systemutils "github.com/DataDog/datadog-agent/pkg/util/system"
2224
)
2325

@@ -163,10 +165,22 @@ func (c *systemCollector) buildContainerMetrics(cg cgroups.Cgroup, cacheValidity
163165
return nil, fmt.Errorf("cgroup parsing failed, incomplete data for containerID: %s, err: %w", cg.Identifier(), err)
164166
}
165167

168+
parentCPUStatRetriever := func(parentCPUStats *cgroups.CPUStats) error {
169+
parentCg, err := cg.GetParent()
170+
if err != nil {
171+
return err
172+
}
173+
if parentCg == nil {
174+
return errors.New("no parent cgroup")
175+
}
176+
177+
return parentCg.GetCPUStats(parentCPUStats)
178+
}
179+
166180
cs := &provider.ContainerStats{
167181
Timestamp: time.Now(),
168182
Memory: buildMemoryStats(stats.Memory),
169-
CPU: buildCPUStats(stats.CPU),
183+
CPU: buildCPUStats(stats.CPU, parentCPUStatRetriever),
170184
IO: buildIOStats(c.procPath, stats.IO),
171185
PID: buildPIDStats(stats.PID),
172186
}
@@ -204,7 +218,7 @@ func buildMemoryStats(cgs *cgroups.MemoryStats) *provider.ContainerMemStats {
204218
return cs
205219
}
206220

207-
func buildCPUStats(cgs *cgroups.CPUStats) *provider.ContainerCPUStats {
221+
func buildCPUStats(cgs *cgroups.CPUStats, parentCPUStatsRetriever func(parentCPUStats *cgroups.CPUStats) error) *provider.ContainerCPUStats {
208222
if cgs == nil {
209223
return nil
210224
}
@@ -220,12 +234,32 @@ func buildCPUStats(cgs *cgroups.CPUStats) *provider.ContainerCPUStats {
220234
convertField(cgs.ThrottledTime, &cs.ThrottledTime)
221235

222236
// Compute complex fields
223-
cs.Limit = computeCPULimitPct(cgs)
237+
cs.Limit = computeCPULimitPct(cgs, parentCPUStatsRetriever)
224238

225239
return cs
226240
}
227241

228-
func computeCPULimitPct(cgs *cgroups.CPUStats) *float64 {
242+
func computeCPULimitPct(cgs *cgroups.CPUStats, parentCPUStatsRetriever func(parentCPUStats *cgroups.CPUStats) error) *float64 {
243+
limitPct := computeCgroupCPULimitPct(cgs)
244+
245+
// Check parent cgroup as it's used on ECS
246+
if limitPct == nil {
247+
parentCPUStats := &cgroups.CPUStats{}
248+
if err := parentCPUStatsRetriever(parentCPUStats); err == nil {
249+
limitPct = computeCgroupCPULimitPct(parentCPUStats)
250+
}
251+
}
252+
253+
// If no limit is available, setting the limit to number of CPUs.
254+
// Always reporting a limit allows to compute CPU % accurately.
255+
if limitPct == nil {
256+
limitPct = pointer.Float64Ptr(float64(systemutils.HostCPUCount() * 100))
257+
}
258+
259+
return limitPct
260+
}
261+
262+
func computeCgroupCPULimitPct(cgs *cgroups.CPUStats) *float64 {
229263
// Limit is computed using min(CPUSet, CFS CPU Quota)
230264
var limitPct float64
231265
if cgs.CPUCount != nil {
@@ -237,12 +271,11 @@ func computeCPULimitPct(cgs *cgroups.CPUStats) *float64 {
237271
limitPct = quotaLimitPct
238272
}
239273
}
240-
// If no limit is available, setting the limit to number of CPUs.
241-
// Always reporting a limit allows to compute CPU % accurately.
242-
if limitPct == 0 {
243-
limitPct = float64(systemutils.HostCPUCount()) * 100
274+
275+
if limitPct != 0 {
276+
return &limitPct
244277
}
245-
return &limitPct
278+
return nil
246279
}
247280

248281
func buildPIDStats(cgs *cgroups.PIDStats) *provider.ContainerPIDStats {

pkg/util/containers/metrics/system/collector_linux_test.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,23 @@ func TestBuildContainerMetrics(t *testing.T) {
141141
PID: &provider.ContainerPIDStats{},
142142
},
143143
},
144+
{
145+
name: "limit cpu count on parent",
146+
cg: &cgroups.MockCgroup{
147+
CPU: &cgroups.CPUStats{},
148+
Parent: &cgroups.MockCgroup{
149+
CPU: &cgroups.CPUStats{
150+
CPUCount: pointer.UInt64Ptr(10),
151+
},
152+
},
153+
},
154+
want: &provider.ContainerStats{
155+
CPU: &provider.ContainerCPUStats{
156+
Limit: pointer.Float64Ptr(1000),
157+
},
158+
PID: &provider.ContainerPIDStats{},
159+
},
160+
},
144161
}
145162

146163
for _, tt := range tests {
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
---
2+
fixes:
3+
- |
4+
The container CPU limit that is reported by `docker` and `container` checks on ECS was not defaulting to the task limit when no CPU limit is set at container level.

0 commit comments

Comments
 (0)