Skip to content

Commit 819a190

Browse files
committed
PB-5: improve exit code propogation for non-command containers
1 parent 6f85fd5 commit 819a190

4 files changed

Lines changed: 53 additions & 9 deletions

File tree

internal/controller/scheduler/fail_job.go

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,13 @@ import (
1919
"k8s.io/client-go/kubernetes"
2020
)
2121

22+
// FailureInfo contains data about a job failure for reporting to Buildkite.
23+
type FailureInfo struct {
24+
Message string
25+
ExitCode int32
26+
Reason string
27+
}
28+
2229
// acquireAndFailForObject figures out how to fail the BK job corresponding to
2330
// the k8s object (a pod or job) by inspecting the object's labels.
2431
func acquireAndFailForObject(
@@ -27,7 +34,7 @@ func acquireAndFailForObject(
2734
k8sClient kubernetes.Interface,
2835
cfg *config.Config,
2936
obj metav1.Object,
30-
message string,
37+
failureCtx FailureInfo,
3138
) error {
3239
agentToken, err := fetchAgentToken(ctx, logger, k8sClient, obj.GetNamespace(), cfg.AgentTokenSecret)
3340
if err != nil {
@@ -45,7 +52,7 @@ func acquireAndFailForObject(
4552
tags := agenttags.TagsFromLabels(labels)
4653
opts := cfg.AgentConfig.ControllerOptions()
4754

48-
if err := acquireAndFail(ctx, logger, agentToken, cfg.JobPrefix, jobUUID, tags, message, opts...); err != nil {
55+
if err := acquireAndFail(ctx, logger, agentToken, cfg.JobPrefix, jobUUID, tags, failureCtx, opts...); err != nil {
4956
logger.Error("failed to acquire and fail the job on Buildkite", zap.Error(err))
5057
return err
5158
}
@@ -61,7 +68,7 @@ func acquireAndFail(
6168
jobPrefix string,
6269
jobUUID string,
6370
tags []string,
64-
message string,
71+
failureCtx FailureInfo,
6572
options ...agentcore.ControllerOption,
6673
) error {
6774
opts := append([]agentcore.ControllerOption{
@@ -89,13 +96,24 @@ func acquireAndFail(
8996
return fmt.Errorf("starting job: %w", err)
9097
}
9198

92-
if err := jctr.WriteLog(ctx, message); err != nil {
99+
if err := jctr.WriteLog(ctx, failureCtx.Message); err != nil {
93100
zapLogger.Error("writing log", zap.Error(err))
94101
return fmt.Errorf("writing log: %w", err)
95102
}
96103

97104
var ignoreAgentInDispatches *bool
98-
if err := jctr.Finish(ctx, agentcore.ProcessExit{Status: 1, SignalReason: agent.SignalReasonStackError}, ignoreAgentInDispatches); err != nil {
105+
106+
status := failureCtx.ExitCode
107+
if status == 0 {
108+
status = 1
109+
}
110+
reason := failureCtx.Reason
111+
if reason == "" {
112+
// By default, we consider all failure triggered by the stack controller are stack errors
113+
reason = agent.SignalReasonStackError
114+
}
115+
116+
if err := jctr.Finish(ctx, agentcore.ProcessExit{Status: int(status), SignalReason: reason}, ignoreAgentInDispatches); err != nil {
99117
zapLogger.Error("finishing job", zap.Error(err))
100118
return fmt.Errorf("finishing job: %w", err)
101119
}

internal/controller/scheduler/job_watcher.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88

99
"github.com/buildkite/agent-stack-k8s/v2/internal/controller/config"
1010
"github.com/buildkite/agent-stack-k8s/v2/internal/controller/model"
11+
"github.com/buildkite/agent/v3/agent"
1112

1213
"github.com/google/uuid"
1314
"github.com/jedib0t/go-pretty/v6/table"
@@ -231,7 +232,12 @@ func (w *jobWatcher) fetchEvents(ctx context.Context, log *zap.Logger, kjob *bat
231232
}
232233

233234
func (w *jobWatcher) failJob(ctx context.Context, log *zap.Logger, kjob *batchv1.Job, message string) {
234-
if err := acquireAndFailForObject(ctx, log, w.k8s, w.cfg, kjob, message); err != nil {
235+
failureCtx := FailureInfo{
236+
Message: message,
237+
// We can know almost all failures triggered by job watcher are stack related error.
238+
Reason: agent.SignalReasonStackError,
239+
}
240+
if err := acquireAndFailForObject(ctx, log, w.k8s, w.cfg, kjob, failureCtx); err != nil {
235241
// Maybe the job was cancelled in the meantime?
236242
log.Error("Could not fail Buildkite job", zap.Error(err))
237243
jobWatcherBuildkiteJobFailErrorsCounter.Inc()

internal/controller/scheduler/pod_watcher.go

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111

1212
"github.com/buildkite/agent-stack-k8s/v2/api"
1313
"github.com/buildkite/agent-stack-k8s/v2/internal/controller/config"
14+
"github.com/buildkite/agent/v3/agent"
1415
"github.com/buildkite/roko"
1516

1617
"github.com/google/uuid"
@@ -297,6 +298,7 @@ func (w *podWatcher) failOnInitContainerFailure(ctx context.Context, log *zap.Lo
297298
log.Debug("Checking pod for failed init containers")
298299

299300
containerFails := make(map[string]*corev1.ContainerStateTerminated)
301+
var lastFailExitCode int32
300302

301303
// If any init container fails, whether it's one we added specifically to
302304
// check for pull failure or not, the pod won't run.
@@ -306,6 +308,7 @@ func (w *podWatcher) failOnInitContainerFailure(ctx context.Context, log *zap.Lo
306308
continue
307309
}
308310
containerFails[containerStatus.Name] = term
311+
lastFailExitCode = term.ExitCode
309312
}
310313

311314
if len(containerFails) == 0 {
@@ -322,7 +325,12 @@ func (w *podWatcher) failOnInitContainerFailure(ctx context.Context, log *zap.Lo
322325
// probably shouldn't interfere.
323326
log.Info("One or more init containers failed. Failing.")
324327
message := w.formatInitContainerFails(containerFails)
325-
if err := acquireAndFailForObject(ctx, log, w.k8s, w.cfg, pod, message); err != nil {
328+
failureCtx := FailureInfo{
329+
Message: message,
330+
ExitCode: lastFailExitCode,
331+
Reason: agent.SignalReasonStackError,
332+
}
333+
if err := acquireAndFailForObject(ctx, log, w.k8s, w.cfg, pod, failureCtx); err != nil {
326334
// Maybe the job was cancelled in the meantime?
327335
log.Error("Could not fail Buildkite job", zap.Error(err))
328336
podWatcherBuildkiteJobFailErrorsCounter.Inc()
@@ -477,7 +485,11 @@ func (w *podWatcher) failForImageFailure(ctx context.Context, log *zap.Logger, f
477485
// We can acquire it and fail it ourselves.
478486
log.Info("One or more job containers are waiting too long for images. Failing.")
479487
message := w.formatImagePullFailureMessage(statuses)
480-
if err := acquireAndFailForObject(ctx, log, w.k8s, w.cfg, pod, message); err != nil {
488+
failureCtx := FailureInfo{
489+
Message: message,
490+
// Do we have a better status code to report here?
491+
}
492+
if err := acquireAndFailForObject(ctx, log, w.k8s, w.cfg, pod, failureCtx); err != nil {
481493
podWatcherBuildkiteJobFailErrorsCounter.Inc()
482494
// Maybe the job was acquired by an agent in the meantime?
483495
// Maybe the job was cancelled in the meantime?

internal/controller/scheduler/scheduler.go

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"github.com/buildkite/agent-stack-k8s/v2/internal/version"
2222
"github.com/buildkite/roko"
2323

24+
"github.com/buildkite/agent/v3/agent"
2425
"github.com/buildkite/agent/v3/clicommand"
2526

2627
"github.com/distribution/reference"
@@ -1272,7 +1273,14 @@ func (w *worker) failJob(ctx context.Context, inputs buildInputs, message string
12721273
}
12731274

12741275
opts := w.cfg.AgentConfig.ControllerOptions()
1275-
if err := acquireAndFail(ctx, w.logger, agentToken, w.cfg.JobPrefix, inputs.uuid, inputs.agentQueryRules, message, opts...); err != nil {
1276+
failureCtx := FailureInfo{
1277+
Message: message,
1278+
// In scheduler worker, often these failure are technically users error in YAML.
1279+
// Using agent refuse seems to be reasonable as that represent that agent won't execute these yaml.
1280+
// Deserve more discussion though..
1281+
Reason: agent.SignalReasonAgentRefused,
1282+
}
1283+
if err := acquireAndFail(ctx, w.logger, agentToken, w.cfg.JobPrefix, inputs.uuid, inputs.agentQueryRules, failureCtx, opts...); err != nil {
12761284
w.logger.Error("failed to acquire and fail the job on Buildkite", zap.Error(err))
12771285
schedulerBuildkiteJobFailErrorsCounter.Inc()
12781286
return err

0 commit comments

Comments
 (0)