Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions charts/agent-stack-k8s/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,7 @@ version: 0.1.0
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: "1.16.0"

# Sidecar support requires the SidecarContainers feature gate, first enabled
# by default in Kubernetes 1.29.
kubeVersion: ">=1.29.0"
Comment thread
zhming0 marked this conversation as resolved.
10 changes: 0 additions & 10 deletions internal/controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -216,16 +216,6 @@ func Run(ctx context.Context, logger *slog.Logger, k8sClient kubernetes.Interfac
// But it does bring a trade-off of more likely reservation expiration.
reserver := reserver.New(logger.With("component", "reserver"), agentClient, limiter)

// PodCompletionWatcher watches k8s for pods where the agent has terminated,
// in order to clean up the pod. This is necessary because "sidecars" are
// not internally managed by buildkite-agent, and would continue running
// forever, preventing the pod being cleaned up.
completions := scheduler.NewPodCompletionWatcher(logger.With("component", "completions"), k8sClient)
if err := completions.RegisterInformer(ctx, informerFactory); err != nil {
logger.Error("failed to register completions informer", "error", err)
return
}

// JobWatcher watches for jobs in bad conditions to clean up:
// * Jobs that fail without ever creating a pod
// * Jobs that stall forever without ever creating a pod
Expand Down
117 changes: 0 additions & 117 deletions internal/controller/scheduler/completions.go

This file was deleted.

30 changes: 0 additions & 30 deletions internal/controller/scheduler/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -202,33 +202,3 @@ var (
Help: "Count of failures to delete pod forcefully by podWatcher",
}, []string{"delete_reason", "error_reason"})
)

// Completion watcher metrics

var (
completionWatcherOnAddEventCounter = promauto.NewCounter(prometheus.CounterOpts{
Namespace: promNamespace,
Subsystem: "completion_watcher",
Name: "onadd_events_total",
Help: "Count of OnAdd informer events",
})
completionWatcherOnUpdateEventCounter = promauto.NewCounter(prometheus.CounterOpts{
Namespace: promNamespace,
Subsystem: "completion_watcher",
Name: "onupdate_events_total",
Help: "Count of OnUpdate informer events",
})

completionWatcherJobCleanupsCounter = promauto.NewCounter(prometheus.CounterOpts{
Namespace: promNamespace,
Subsystem: "completion_watcher",
Name: "cleanups_total",
Help: "Count of jobs with finished agents successfully cleaned up",
})
completionWatcherJobCleanupErrorsCounter = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: promNamespace,
Subsystem: "completion_watcher",
Name: "cleanup_errors_total",
Help: "Count of errors during attempts to clean up a job with a finished agent",
}, []string{"reason"})
)
8 changes: 1 addition & 7 deletions internal/controller/scheduler/pod_watcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package scheduler
import (
"cmp"
"context"
"log/slog"
"regexp"
"slices"
"strconv"
Expand All @@ -16,7 +17,6 @@ import (

"github.com/google/uuid"
"github.com/jedib0t/go-pretty/v6/table"
"log/slog"
corev1 "k8s.io/api/core/v1"
kerrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand Down Expand Up @@ -581,12 +581,6 @@ func (w *podWatcher) stopWatchingForImageFailure(jobUUID uuid.UUID) {
// All container-\d containers will have the agent installed as their PID 1.
// Therefore, their lifecycle is well monitored in our backend, allowing us to terminate them if they fail to start.
//
// However, sidecar containers are completely unmonitored.
// We avoid terminating jobs due to sidecar image pull backoff watcher
// to prevent customer confusion.
//
// Most importantly, the CI can still pass (in theory) even if sidecars fail.
//
// (The name "system container" is subject to more debate.)
func isSystemContainer(containerStatus *corev1.ContainerStatus) bool {
name := containerStatus.Name
Expand Down
8 changes: 5 additions & 3 deletions internal/controller/scheduler/scheduler.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"encoding/json"
"errors"
"fmt"
"log/slog"
"maps"
"net/url"
"path/filepath"
Expand All @@ -26,7 +27,6 @@ import (

"github.com/distribution/reference"
"github.com/jedib0t/go-pretty/v6/table"
"log/slog"
batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
kerrors "k8s.io/apimachinery/pkg/api/errors"
Expand Down Expand Up @@ -503,7 +503,9 @@ func (w *worker) Build(podSpec *corev1.PodSpec, skipCheckout bool, inputs buildI
}
maps.Copy(kjob.Spec.Template.Annotations, sidecarAnnotation)

podSpec.Containers = append(podSpec.Containers, c)
// Per https://kubernetes.io/docs/concepts/workloads/pods/sidecar-containers/
c.RestartPolicy = ptr.To(corev1.ContainerRestartPolicyAlways)
podSpec.InitContainers = append(podSpec.InitContainers, c)

sidecarCounterCount += 1
}
Expand Down Expand Up @@ -536,7 +538,7 @@ func (w *worker) Build(podSpec *corev1.PodSpec, skipCheckout bool, inputs buildI
// The coordinating agent container are expecting those managed containers to connect.
//
// Calculating this imperatively is risky given we lack control over the context, this is subject to refactor.
managedContainerCount := len(podSpec.Containers) + systemContainerCount - sidecarCounterCount
managedContainerCount := len(podSpec.Containers) + systemContainerCount

agentContainer := corev1.Container{
Name: AgentContainerName,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ steps:
mv /tmp/extra-volume-mount-sidecar/pass-the-parcel /tmp/extra-volume-mount/pass-the-parcel
ls -lah /tmp/extra-volume-mount-sidecar
ls -lah /tmp/extra-volume-mount
# So Kubernetes can mark this as Started
# In reality a sidecar should not be used like this.
# As this is effectively an ad-hoc initContainer.
sleep infinity
podSpec:
containers:
- image: alpine:latest
Expand Down
8 changes: 2 additions & 6 deletions internal/integration/fixtures/sidecars.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,9 @@ steps:
- label: ":wave:"
agents:
queue: "{{.queue}}"
image: curlimages/curl:latest
command: curl localhost:80
plugins:
- kubernetes:
sidecars:
- image: nginx:latest
podSpec:
containers:
- image: curlimages/curl:latest
name: curl
command:
- curl --retry 5 --retry-all-errors localhost:80