Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 66 additions & 0 deletions charts/agent-stack-k8s/values.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -677,6 +677,72 @@
"default": 1000000,
"minimum": 1,
"title": "Sets the maximum number of Jobs the controller will hold in the work queue."
},
"resource-classes": {
"type": "object",
"default": null,
"title": "Resource classes",
"description": "Define reusable resource configurations that can be applied to jobs based on the resource_class agent tag.",
"additionalProperties": {
"type": "object",
"properties": {
"resource": {
"$ref": "https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/v1.35.0/_definitions.json#/definitions/io.k8s.api.core.v1.ResourceRequirements"
},
"nodeSelector": {
"type": "object",
"additionalProperties": {
"type": "string"
}
}
}
},
"examples": [
{
"xs": {
"resource": {
"requests": {
"cpu": "100m",
"memory": "128Mi"
},
"limits": {
"cpu": "200m",
"memory": "256Mi"
}
}
},
"gpu": {
"resource": {
"requests": {
"nvidia.com/gpu": "1",
"cpu": "1000m",
"memory": "2Gi"
},
"limits": {
"nvidia.com/gpu": "1",
"cpu": "2000m",
"memory": "4Gi"
}
},
"nodeSelector": {
"accelerator": "nvidia-tesla-k80"
}
},
"spot": {
"nodeSelector": {
"node-type": "spot",
"instance-type": "large"
}
}
}
]
},
"default-resource-class-name": {
"type": "string",
"default": "",
"title": "Default resource class name",
"description": "The name of a resource class (defined in resource-classes) to apply by default to jobs that do not specify a resource_class agent tag.",
"examples": ["xs", "spot"]
}
},
"examples": [
Expand Down
1 change: 1 addition & 0 deletions cmd/controller/controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ func TestReadAndParseConfig(t *testing.T) {
},
},
},
DefaultResourceClassName: "small",

WorkspaceVolume: &corev1.Volume{
Name: "workspace-2-the-reckoning",
Expand Down
2 changes: 2 additions & 0 deletions examples/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ resource-classes:
memory: "512Mi"
hugepages-2Mi: "1Mi"

default-resource-class-name: "small"

queue: my-queue
tags:
- priority=high
Expand Down
3 changes: 2 additions & 1 deletion internal/controller/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,8 @@ type Config struct {
DefaultMetadata Metadata `json:"default-metadata" validate:"omitempty"`
AdditionalRedactedVars []string `json:"additional-redacted-vars" validate:"omitempty"`

ResourceClasses map[string]*ResourceClass `json:"resource-classes" validate:"omitempty"`
ResourceClasses map[string]*ResourceClass `json:"resource-classes" validate:"omitempty"`
DefaultResourceClassName string `json:"default-resource-class-name" validate:"omitempty"`

DefaultImagePullPolicy corev1.PullPolicy `json:"default-image-pull-policy" validate:"omitempty"`
DefaultImageCheckPullPolicy corev1.PullPolicy `json:"default-image-check-pull-policy" validate:"omitempty"`
Expand Down
1 change: 1 addition & 0 deletions internal/controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,7 @@ func Run(ctx context.Context, logger *slog.Logger, k8sClient kubernetes.Interfac
ImageCheckContainerCPULimit: cfg.ImageCheckContainerCPULimit,
ImageCheckContainerMemoryLimit: cfg.ImageCheckContainerMemoryLimit,
ResourceClasses: cfg.ResourceClasses,
DefaultResourceClassName: cfg.DefaultResourceClassName,
})

informerFactory, err := NewInformerFactory(k8sClient, cfg.Namespace, cfg.ID)
Expand Down
27 changes: 21 additions & 6 deletions internal/controller/scheduler/resource.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,24 +6,39 @@ import (
corev1 "k8s.io/api/core/v1"
)

// applyResourceClass applies a configured resource class to podSpec.
//
// The class is selected by the "resource_class" agent tag. When that tag is
// absent, the configured default resource class name is used instead; if no
// default is configured either, podSpec is left untouched and nil is returned.
//
// An error is returned when a class name was selected (by tag or by default)
// but no resource classes are configured, or when the selected name is not
// among the configured resource classes. The error text states whether the
// name came from the tag or from default-resource-class-name.
func (w *worker) applyResourceClass(podSpec *corev1.PodSpec, tags map[string]string) error {
	resourceClassName, resourceClassTagExist := tags["resource_class"]

	if !resourceClassTagExist {
		// No explicit tag: fall back to the default, or do nothing if unset.
		if w.cfg.DefaultResourceClassName == "" {
			return nil
		}
		resourceClassName = w.cfg.DefaultResourceClassName
	}

	if w.cfg.ResourceClasses == nil {
		if resourceClassTagExist {
			return fmt.Errorf("resource classes not configured but resource_class tag specified")
		}
		return fmt.Errorf("resource classes not configured but default-resource-class-name is set")
	}

	resourceClass, resourceClassFound := w.cfg.ResourceClasses[resourceClassName]
	if !resourceClassFound {
		if resourceClassTagExist {
			return fmt.Errorf("resource class not found: %s", resourceClassName)
		}
		return fmt.Errorf("default resource class not found: %s", resourceClassName)
	}

	resourceClass.Apply(podSpec)

	// Log which path selected the class so the two cases can be told apart.
	if resourceClassTagExist {
		w.logger.Debug("Applied resource class", "resource_class", resourceClassName)
	} else {
		w.logger.Debug("Applied default resource class", "resource_class", resourceClassName)
	}

	return nil
}
161 changes: 161 additions & 0 deletions internal/controller/scheduler/resource_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
package scheduler

import (
"log/slog"
"os"
"testing"

"github.com/buildkite/agent-stack-k8s/v2/internal/controller/config"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
)

// TestApplyResourceClass is a table-driven test of worker.applyResourceClass.
// It covers selection by the "resource_class" tag, fallback to the configured
// default class, the tag taking precedence over the default, and the errors
// returned when classes are not configured or the selected name is unknown.
func TestApplyResourceClass(t *testing.T) {
	log := slog.New(slog.NewTextHandler(os.Stdout, nil))

	// newClass builds a resource class carrying only CPU/memory requests and limits.
	newClass := func(cpuReq, memReq, cpuLim, memLim string) *config.ResourceClass {
		requirements := &corev1.ResourceRequirements{
			Requests: corev1.ResourceList{
				corev1.ResourceCPU:    resource.MustParse(cpuReq),
				corev1.ResourceMemory: resource.MustParse(memReq),
			},
			Limits: corev1.ResourceList{
				corev1.ResourceCPU:    resource.MustParse(cpuLim),
				corev1.ResourceMemory: resource.MustParse(memLim),
			},
		}
		return &config.ResourceClass{Resource: requirements}
	}

	small := newClass("100m", "128Mi", "200m", "256Mi")
	large := newClass("1", "1Gi", "2", "2Gi")

	env := []corev1.EnvVar{
		{Name: "BUILDKITE_BOOTSTRAP_PHASES", Value: "plugin,command"},
		{Name: "BUILDKITE_COMMAND", Value: "echo hello"},
	}

	type testCase struct {
		name         string
		classes      map[string]*config.ResourceClass
		defaultClass string
		tags         map[string]string
		errContains  string
		want         *corev1.ResourceRequirements
	}

	// Zero-valued fields are omitted: an empty defaultClass means "no default",
	// an empty errContains means "expect success", and a nil want means
	// "expect no requests or limits on the container".
	cases := []testCase{
		{
			name:    "no tag and no default - no resources applied",
			classes: map[string]*config.ResourceClass{"small": small},
			tags:    map[string]string{"queue": "test"},
		},
		{
			name:    "tag specified - applies tagged resource class",
			classes: map[string]*config.ResourceClass{"small": small, "large": large},
			tags:    map[string]string{"queue": "test", "resource_class": "large"},
			want:    large.Resource,
		},
		{
			name:         "no tag but default set - applies default resource class",
			classes:      map[string]*config.ResourceClass{"small": small, "large": large},
			defaultClass: "small",
			tags:         map[string]string{"queue": "test"},
			want:         small.Resource,
		},
		{
			name:         "tag overrides default - applies tagged resource class",
			classes:      map[string]*config.ResourceClass{"small": small, "large": large},
			defaultClass: "small",
			tags:         map[string]string{"queue": "test", "resource_class": "large"},
			want:         large.Resource,
		},
		{
			name:        "tag specified but resource classes not configured",
			tags:        map[string]string{"queue": "test", "resource_class": "small"},
			errContains: "resource classes not configured but resource_class tag specified",
		},
		{
			name:         "default set but resource classes not configured",
			defaultClass: "small",
			tags:         map[string]string{"queue": "test"},
			errContains:  "resource classes not configured but default-resource-class-name is set",
		},
		{
			name:        "tag references non-existent resource class",
			classes:     map[string]*config.ResourceClass{"small": small},
			tags:        map[string]string{"queue": "test", "resource_class": "nonexistent"},
			errContains: "resource class not found: nonexistent",
		},
		{
			name:         "default references non-existent resource class",
			classes:      map[string]*config.ResourceClass{"small": small},
			defaultClass: "nonexistent",
			tags:         map[string]string{"queue": "test"},
			errContains:  "default resource class not found: nonexistent",
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			subject := &worker{
				cfg: Config{
					ResourceClasses:          tc.classes,
					DefaultResourceClassName: tc.defaultClass,
				},
				logger: log,
			}

			// A fresh single-container pod spec per case, so cases stay independent.
			spec := &corev1.PodSpec{
				Containers: []corev1.Container{
					{Name: "container-0", Env: env},
				},
			}

			err := subject.applyResourceClass(spec, tc.tags)

			if tc.errContains == "" {
				require.NoError(t, err)
			} else {
				require.Error(t, err)
				assert.Contains(t, err.Error(), tc.errContains)
				return
			}

			if tc.want == nil {
				assert.Empty(t, spec.Containers[0].Resources.Requests)
				assert.Empty(t, spec.Containers[0].Resources.Limits)
				return
			}

			assert.Equal(t, tc.want.Requests, spec.Containers[0].Resources.Requests)
			assert.Equal(t, tc.want.Limits, spec.Containers[0].Resources.Limits)
		})
	}
}
1 change: 1 addition & 0 deletions internal/controller/scheduler/scheduler.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ type Config struct {
ImageCheckContainerCPULimit string
ImageCheckContainerMemoryLimit string
ResourceClasses map[string]*config.ResourceClass
DefaultResourceClassName string
}

func New(logger *slog.Logger, client kubernetes.Interface, agentClient *api.AgentClient, cfg Config) *worker {
Expand Down
66 changes: 66 additions & 0 deletions internal/integration/fixtures/default-resource-class.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# Integration fixture: a single step whose agent tags contain only a queue
# (no resource_class tag). From inside the container, the command reads the
# cgroup v1 or v2 files and verifies that memory and CPU limits are in effect.
#
# The step passes only if the memory limit is between 240 and 280 MB and the
# CPU limit is between 200 and 300 millicores. It exits 1 if the cgroup files
# cannot be found, if either value is out of range, or if no CPU limit is set.
#
# NOTE(review): the expected ~256Mi / ~250m presumably match the default
# resource class configured by the integration test that runs this fixture;
# confirm against that test's configuration.
# NOTE(review): "{{.queue}}" looks like a template placeholder filled in by the
# test harness, and "$$" presumably escapes "$" from pipeline interpolation so
# the shell sees a single "$"; confirm.
steps:
- label: ":memory: Default Resource Class Test"
agents:
queue: "{{.queue}}"
image: alpine:latest
# The script below runs inside the job container: memory check first, then CPU.
command: |-
echo "=== Memory Limit Check ==="
# Check memory limit from cgroup v1 or v2
if [ -f /sys/fs/cgroup/memory/memory.limit_in_bytes ]; then
# cgroup v1
MEMORY_LIMIT=$$(cat /sys/fs/cgroup/memory/memory.limit_in_bytes)
elif [ -f /sys/fs/cgroup/memory.max ]; then
# cgroup v2
MEMORY_LIMIT=$$(cat /sys/fs/cgroup/memory.max)
else
echo "Could not find memory limit file"
exit 1
fi

echo "Memory limit: $$MEMORY_LIMIT bytes"

# Convert to MB for easier comparison (256Mi = 268435456 bytes)
MEMORY_LIMIT_MB=$$((MEMORY_LIMIT / 1024 / 1024))
echo "Memory limit: $${MEMORY_LIMIT_MB}MB"

# Check if memory limit is approximately 256MB (allowing some variance)
# We expect 256Mi which is 268435456 bytes = 256MB
if [ "$$MEMORY_LIMIT_MB" -ge 240 ] && [ "$$MEMORY_LIMIT_MB" -le 280 ]; then
echo "✅ Memory limit is correctly set to ~256MB (from default resource class)"
else
echo "❌ Memory limit is not as expected. Got $${MEMORY_LIMIT_MB}MB, expected ~256MB"
exit 1
fi

echo "=== CPU Limit Check ==="
# Check CPU limit from cgroup
if [ -f /sys/fs/cgroup/cpu/cpu.cfs_quota_us ] && [ -f /sys/fs/cgroup/cpu/cpu.cfs_period_us ]; then
# cgroup v1
CPU_QUOTA=$$(cat /sys/fs/cgroup/cpu/cpu.cfs_quota_us)
CPU_PERIOD=$$(cat /sys/fs/cgroup/cpu/cpu.cfs_period_us)
elif [ -f /sys/fs/cgroup/cpu.max ]; then
# cgroup v2
CPU_MAX=$$(cat /sys/fs/cgroup/cpu.max)
CPU_QUOTA=$$(echo $$CPU_MAX | cut -d' ' -f1)
CPU_PERIOD=$$(echo $$CPU_MAX | cut -d' ' -f2)
else
echo "Could not find CPU limit files"
exit 1
fi

if [ "$$CPU_QUOTA" != "-1" ] && [ "$$CPU_QUOTA" != "max" ]; then
CPU_CORES=$$((CPU_QUOTA * 1000 / CPU_PERIOD))
echo "CPU quota: $${CPU_QUOTA}, CPU period: $${CPU_PERIOD}"
echo "CPU limit: $${CPU_CORES}m cores"

# Check if CPU limit is approximately 250m (allowing some variance)
if [ "$$CPU_CORES" -ge 200 ] && [ "$$CPU_CORES" -le 300 ]; then
echo "✅ CPU limit is correctly set to ~250m (from default resource class)"
else
echo "❌ CPU limit is not as expected. Got $${CPU_CORES}m, expected ~250m"
exit 1
fi
else
echo "❌ No CPU limit set"
exit 1
fi
Loading