Skip to content

Commit bc7f063

Browse files
committed
PB-862: use Kubernetes native sidecar mechanism
1 parent 6cc7788 commit bc7f063

7 files changed

Lines changed: 13 additions & 173 deletions

File tree

internal/controller/controller.go

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -216,16 +216,6 @@ func Run(ctx context.Context, logger *slog.Logger, k8sClient kubernetes.Interfac
216216
// But it does bring a trade-off of more likely reservation expiration.
217217
reserver := reserver.New(logger.With("component", "reserver"), agentClient, limiter)
218218

219-
// PodCompletionWatcher watches k8s for pods where the agent has terminated,
220-
// in order to clean up the pod. This is necessary because "sidecars" are
221-
// not internally managed by buildkite-agent, and would continue running
222-
// forever, preventing the pod being cleaned up.
223-
completions := scheduler.NewPodCompletionWatcher(logger.With("component", "completions"), k8sClient)
224-
if err := completions.RegisterInformer(ctx, informerFactory); err != nil {
225-
logger.Error("failed to register completions informer", "error", err)
226-
return
227-
}
228-
229219
// JobWatcher watches for jobs in bad conditions to clean up:
230220
// * Jobs that fail without ever creating a pod
231221
// * Jobs that stall forever without ever creating a pod

internal/controller/scheduler/completions.go

Lines changed: 0 additions & 117 deletions
This file was deleted.

internal/controller/scheduler/metrics.go

Lines changed: 0 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -202,33 +202,3 @@ var (
202202
Help: "Count of failures to delete pod forcefully by podWatcher",
203203
}, []string{"delete_reason", "error_reason"})
204204
)
205-
206-
// Completion watcher metrics
207-
208-
var (
209-
completionWatcherOnAddEventCounter = promauto.NewCounter(prometheus.CounterOpts{
210-
Namespace: promNamespace,
211-
Subsystem: "completion_watcher",
212-
Name: "onadd_events_total",
213-
Help: "Count of OnAdd informer events",
214-
})
215-
completionWatcherOnUpdateEventCounter = promauto.NewCounter(prometheus.CounterOpts{
216-
Namespace: promNamespace,
217-
Subsystem: "completion_watcher",
218-
Name: "onupdate_events_total",
219-
Help: "Count of OnUpdate informer events",
220-
})
221-
222-
completionWatcherJobCleanupsCounter = promauto.NewCounter(prometheus.CounterOpts{
223-
Namespace: promNamespace,
224-
Subsystem: "completion_watcher",
225-
Name: "cleanups_total",
226-
Help: "Count of jobs with finished agents successfully cleaned up",
227-
})
228-
completionWatcherJobCleanupErrorsCounter = promauto.NewCounterVec(prometheus.CounterOpts{
229-
Namespace: promNamespace,
230-
Subsystem: "completion_watcher",
231-
Name: "cleanup_errors_total",
232-
Help: "Count of errors during attempts to clean up a job with a finished agent",
233-
}, []string{"reason"})
234-
)

internal/controller/scheduler/pod_watcher.go

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package scheduler
33
import (
44
"cmp"
55
"context"
6+
"log/slog"
67
"regexp"
78
"slices"
89
"strconv"
@@ -16,7 +17,6 @@ import (
1617

1718
"github.com/google/uuid"
1819
"github.com/jedib0t/go-pretty/v6/table"
19-
"log/slog"
2020
corev1 "k8s.io/api/core/v1"
2121
kerrors "k8s.io/apimachinery/pkg/api/errors"
2222
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -581,12 +581,6 @@ func (w *podWatcher) stopWatchingForImageFailure(jobUUID uuid.UUID) {
581581
// All container-\d containers will have the agent installed as their PID 1.
582582
// Therefore, their lifecycle is well monitored in our backend, allowing us to terminate them if they fail to start.
583583
//
584-
// However, sidecar containers are completely unmonitored.
585-
// We avoid terminating jobs due to sidecar image pull backoff watcher
586-
// to prevent customer confusion.
587-
//
588-
// Most importantly, the CI can still pass (in theory) even if sidecars fail.
589-
//
590584
// (The name "system container" is subject to more debate.)
591585
func isSystemContainer(containerStatus *corev1.ContainerStatus) bool {
592586
name := containerStatus.Name

internal/controller/scheduler/scheduler.go

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,10 @@ import (
2424
"github.com/buildkite/agent/v3/agent"
2525
"github.com/buildkite/agent/v3/clicommand"
2626

27+
"log/slog"
28+
2729
"github.com/distribution/reference"
2830
"github.com/jedib0t/go-pretty/v6/table"
29-
"log/slog"
3031
batchv1 "k8s.io/api/batch/v1"
3132
corev1 "k8s.io/api/core/v1"
3233
kerrors "k8s.io/apimachinery/pkg/api/errors"
@@ -503,7 +504,9 @@ func (w *worker) Build(podSpec *corev1.PodSpec, skipCheckout bool, inputs buildI
503504
}
504505
maps.Copy(kjob.Spec.Template.Annotations, sidecarAnnotation)
505506

506-
podSpec.Containers = append(podSpec.Containers, c)
507+
// Per https://kubernetes.io/docs/concepts/workloads/pods/sidecar-containers/
508+
c.RestartPolicy = ptr.To(corev1.ContainerRestartPolicyAlways)
509+
podSpec.InitContainers = append(podSpec.InitContainers, c)
507510

508511
sidecarCounterCount += 1
509512
}
@@ -536,7 +539,7 @@ func (w *worker) Build(podSpec *corev1.PodSpec, skipCheckout bool, inputs buildI
536539
// The coordinating agent container are expecting those managed containers to connect.
537540
//
538541
// Calculating this imperatively is risky given we lack control over the context, this is subject to refactor.
539-
managedContainerCount := len(podSpec.Containers) + systemContainerCount - sidecarCounterCount
542+
managedContainerCount := len(podSpec.Containers) + systemContainerCount
540543

541544
agentContainer := corev1.Container{
542545
Name: AgentContainerName,

internal/integration/fixtures/extra-volume-mounts-sidecar.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,10 @@ steps:
2323
mv /tmp/extra-volume-mount-sidecar/pass-the-parcel /tmp/extra-volume-mount/pass-the-parcel
2424
ls -lah /tmp/extra-volume-mount-sidecar
2525
ls -lah /tmp/extra-volume-mount
26+
# So Kubernetes can mark this as Started
27+
# In reality a sidecar should not be used like this.
28+
# As this is effectively an ad-hoc initContainer.
29+
sleep 120
2630
podSpec:
2731
containers:
2832
- image: alpine:latest

internal/integration/fixtures/sidecars.yaml

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,9 @@ steps:
22
- label: ":wave:"
33
agents:
44
queue: "{{.queue}}"
5+
image: curlimages/curl:latest
6+
command: curl localhost:80
57
plugins:
68
- kubernetes:
79
sidecars:
810
- image: nginx:latest
9-
podSpec:
10-
containers:
11-
- image: curlimages/curl:latest
12-
name: curl
13-
command:
14-
- curl --retry 5 --retry-all-errors localhost:80

0 commit comments

Comments
 (0)