@@ -11,6 +11,7 @@ import (
1111
1212 "github.com/buildkite/agent-stack-k8s/v2/api"
1313 "github.com/buildkite/agent-stack-k8s/v2/internal/controller/config"
14+ "github.com/buildkite/agent/v3/agent"
1415 "github.com/buildkite/roko"
1516
1617 "github.com/google/uuid"
@@ -297,6 +298,7 @@ func (w *podWatcher) failOnInitContainerFailure(ctx context.Context, log *zap.Lo
297298 log .Debug ("Checking pod for failed init containers" )
298299
299300 containerFails := make (map [string ]* corev1.ContainerStateTerminated )
301+ var lastFailExitCode int32
300302
301303 // If any init container fails, whether it's one we added specifically to
302304 // check for pull failure or not, the pod won't run.
@@ -306,6 +308,7 @@ func (w *podWatcher) failOnInitContainerFailure(ctx context.Context, log *zap.Lo
306308 continue
307309 }
308310 containerFails [containerStatus .Name ] = term
311+ lastFailExitCode = term .ExitCode
309312 }
310313
311314 if len (containerFails ) == 0 {
@@ -322,7 +325,12 @@ func (w *podWatcher) failOnInitContainerFailure(ctx context.Context, log *zap.Lo
322325 // probably shouldn't interfere.
323326 log .Info ("One or more init containers failed. Failing." )
324327 message := w .formatInitContainerFails (containerFails )
325- if err := acquireAndFailForObject (ctx , log , w .k8s , w .cfg , pod , message ); err != nil {
328+ failureCtx := FailureInfo {
329+ Message : message ,
330+ ExitCode : lastFailExitCode ,
331+ Reason : agent .SignalReasonStackError ,
332+ }
333+ if err := acquireAndFailForObject (ctx , log , w .k8s , w .cfg , pod , failureCtx ); err != nil {
326334 // Maybe the job was cancelled in the meantime?
327335 log .Error ("Could not fail Buildkite job" , zap .Error (err ))
328336 podWatcherBuildkiteJobFailErrorsCounter .Inc ()
@@ -477,7 +485,11 @@ func (w *podWatcher) failForImageFailure(ctx context.Context, log *zap.Logger, f
477485 // We can acquire it and fail it ourselves.
478486 log .Info ("One or more job containers are waiting too long for images. Failing." )
479487 message := w .formatImagePullFailureMessage (statuses )
480- if err := acquireAndFailForObject (ctx , log , w .k8s , w .cfg , pod , message ); err != nil {
488+ failureCtx := FailureInfo {
489+ Message : message ,
490+ // Do we have a better status code to report here?
491+ }
492+ if err := acquireAndFailForObject (ctx , log , w .k8s , w .cfg , pod , failureCtx ); err != nil {
481493 podWatcherBuildkiteJobFailErrorsCounter .Inc ()
482494 // Maybe the job was acquired by an agent in the meantime?
483495 // Maybe the job was cancelled in the meantime?
0 commit comments