Skip to content

Commit 0773b47

Browse files
committed
wait for a given duration in case of imagePullBackOff
We have implemented imagePullBackOff as fail fast. The issue with this approach is that the node where the pod is scheduled often experiences registry rate limiting. The image pull failure caused by the rate limit returns the same warning (reason: Failed and message: ImagePullBackOff). The pod can potentially recover after waiting for enough time until the cap is expired. Kubernetes can then successfully pull the image and bring the pod up. Introducing a default configuration to specify cluster level timeout to allow the imagePullBackOff to retry for a given duration. Once that duration has passed, return a permanent failure. tektoncd#5987 tektoncd#7184 This is a manual cherry-pick of tektoncd#7666 Signed-off-by: Priti Desai <[email protected]>
1 parent c58fa31 commit 0773b47

File tree

9 files changed

+245
-22
lines changed

9 files changed

+245
-22
lines changed

config/config-defaults.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,3 +86,8 @@ data:
8686
# default-resolver-type contains the default resolver type to be used in the cluster,
8787
# no default-resolver-type is specified by default
8888
default-resolver-type:
89+
90+
# default-imagepullbackoff-timeout contains the default duration to wait
91+
# before requeuing the TaskRun to retry, specifying 0 here is equivalent to fail fast
92+
# possible values could be 1m, 5m, 10s, 1h, etc
93+
# default-imagepullbackoff-timeout: "5m"

docs/additional-configs.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ installation.
3131
- [Verify the transparency logs using `rekor-cli`](#verify-the-transparency-logs-using-rekor-cli)
3232
- [Verify Tekton Resources](#verify-tekton-resources)
3333
- [Pipelinerun with Affinity Assistant](#pipelineruns-with-affinity-assistant)
34+
- [TaskRuns with `imagePullBackOff` Timeout](#taskruns-with-imagepullbackoff-timeout)
3435
- [Next steps](#next-steps)
3536

3637

@@ -607,6 +608,26 @@ please take a look at [Trusted Resources](./trusted-resources.md).
607608
The cluster operators can review the [guidelines](developers/affinity-assistant.md) to `cordon` a node in the cluster
608609
with the tekton controller and the affinity assistant is enabled.
609610
611+
## TaskRuns with `imagePullBackOff` Timeout
612+
613+
Tekton pipelines has adopted a fail fast strategy with a taskRun failing with `TaskRunImagePullFailed` in case of an
614+
`imagePullBackOff`. This can be limiting in some cases, and it generally depends on the infrastructure. To allow the
615+
cluster operators to decide whether to wait in case of an `imagePullBackOff`, a setting is available to configure
616+
the wait time as a duration (for example `10s`, `5m`, or `1h`) such that the controller will wait for the specified duration before declaring a failure.
617+
For example, with the following `config-defaults`, if the image pull fails with `imagePullBackOff`, the controller does not
618+
mark the taskRun as failed until 5 minutes have passed since the pod was scheduled.
619+
See issue https://github.com/tektoncd/pipeline/issues/5987 for more details.
620+
621+
```yaml
622+
apiVersion: v1
623+
kind: ConfigMap
624+
metadata:
625+
name: config-defaults
626+
namespace: tekton-pipelines
627+
data:
628+
default-imagepullbackoff-timeout: "5m"
629+
```
630+
610631
## Next steps
611632
612633
To get started with Tekton check the [Introductory tutorials][quickstarts],

pkg/apis/config/default.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,8 @@ const (
4646
DefaultMaxMatrixCombinationsCount = 256
4747
// DefaultResolverTypeValue is used when no default resolver type is specified
4848
DefaultResolverTypeValue = ""
49+
// DefaultImagePullBackOffTimeout is used when no imagePullBackOff timeout is specified
50+
DefaultImagePullBackOffTimeout = 0 * time.Minute
4951

5052
defaultTimeoutMinutesKey = "default-timeout-minutes"
5153
defaultServiceAccountKey = "default-service-account"
@@ -57,6 +59,7 @@ const (
5759
defaultMaxMatrixCombinationsCountKey = "default-max-matrix-combinations-count"
5860
defaultForbiddenEnv = "default-forbidden-env"
5961
defaultResolverTypeKey = "default-resolver-type"
62+
defaultImagePullBackOffTimeout = "default-imagepullbackoff-timeout"
6063
)
6164

6265
// DefaultConfig holds all the default configurations for the config.
@@ -75,6 +78,7 @@ type Defaults struct {
7578
DefaultMaxMatrixCombinationsCount int
7679
DefaultForbiddenEnv []string
7780
DefaultResolverType string
81+
DefaultImagePullBackOffTimeout time.Duration
7882
}
7983

8084
// GetDefaultsConfigName returns the name of the configmap containing all
@@ -105,6 +109,7 @@ func (cfg *Defaults) Equals(other *Defaults) bool {
105109
other.DefaultTaskRunWorkspaceBinding == cfg.DefaultTaskRunWorkspaceBinding &&
106110
other.DefaultMaxMatrixCombinationsCount == cfg.DefaultMaxMatrixCombinationsCount &&
107111
other.DefaultResolverType == cfg.DefaultResolverType &&
112+
other.DefaultImagePullBackOffTimeout == cfg.DefaultImagePullBackOffTimeout &&
108113
reflect.DeepEqual(other.DefaultForbiddenEnv, cfg.DefaultForbiddenEnv)
109114
}
110115

@@ -117,6 +122,7 @@ func NewDefaultsFromMap(cfgMap map[string]string) (*Defaults, error) {
117122
DefaultCloudEventsSink: DefaultCloudEventSinkValue,
118123
DefaultMaxMatrixCombinationsCount: DefaultMaxMatrixCombinationsCount,
119124
DefaultResolverType: DefaultResolverTypeValue,
125+
DefaultImagePullBackOffTimeout: DefaultImagePullBackOffTimeout,
120126
}
121127

122128
if defaultTimeoutMin, ok := cfgMap[defaultTimeoutMinutesKey]; ok {
@@ -179,6 +185,14 @@ func NewDefaultsFromMap(cfgMap map[string]string) (*Defaults, error) {
179185
tc.DefaultResolverType = defaultResolverType
180186
}
181187

188+
if defaultImagePullBackOff, ok := cfgMap[defaultImagePullBackOffTimeout]; ok {
189+
timeout, err := time.ParseDuration(defaultImagePullBackOff)
190+
if err != nil {
191+
return nil, fmt.Errorf("failed parsing tracing config %q", defaultImagePullBackOffTimeout)
192+
}
193+
tc.DefaultImagePullBackOffTimeout = timeout
194+
}
195+
182196
return &tc, nil
183197
}
184198

pkg/apis/config/default_test.go

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ package config_test
1818

1919
import (
2020
"testing"
21+
"time"
2122

2223
"github.com/google/go-cmp/cmp"
2324
"github.com/tektoncd/pipeline/pkg/apis/config"
@@ -41,6 +42,7 @@ func TestNewDefaultsFromConfigMap(t *testing.T) {
4142
DefaultManagedByLabelValue: "something-else",
4243
DefaultMaxMatrixCombinationsCount: 256,
4344
DefaultResolverType: "git",
45+
DefaultImagePullBackOffTimeout: time.Duration(5) * time.Second,
4446
},
4547
fileName: config.GetDefaultsConfigName(),
4648
},
@@ -60,12 +62,16 @@ func TestNewDefaultsFromConfigMap(t *testing.T) {
6062
},
6163
},
6264
DefaultMaxMatrixCombinationsCount: 256,
65+
DefaultImagePullBackOffTimeout: 0,
6366
},
6467
fileName: "config-defaults-with-pod-template",
6568
},
6669
{
6770
expectedError: true,
6871
fileName: "config-defaults-timeout-err",
72+
}, {
73+
expectedError: true,
74+
fileName: "config-defaults-imagepullbackoff-timeout-err",
6975
},
7076
// Previously the yaml package did not support UnmarshalStrict, though
7177
// it's supported now however it may introduce incompatibility, so we decide
@@ -79,6 +85,7 @@ func TestNewDefaultsFromConfigMap(t *testing.T) {
7985
DefaultManagedByLabelValue: config.DefaultManagedByLabelValue,
8086
DefaultPodTemplate: &pod.Template{},
8187
DefaultMaxMatrixCombinationsCount: 256,
88+
DefaultImagePullBackOffTimeout: 0,
8289
},
8390
},
8491
{
@@ -90,6 +97,7 @@ func TestNewDefaultsFromConfigMap(t *testing.T) {
9097
DefaultManagedByLabelValue: config.DefaultManagedByLabelValue,
9198
DefaultAAPodTemplate: &pod.AffinityAssistantTemplate{},
9299
DefaultMaxMatrixCombinationsCount: 256,
100+
DefaultImagePullBackOffTimeout: 0,
93101
},
94102
},
95103
{
@@ -104,6 +112,7 @@ func TestNewDefaultsFromConfigMap(t *testing.T) {
104112
DefaultTimeoutMinutes: 60,
105113
DefaultServiceAccount: "default",
106114
DefaultManagedByLabelValue: config.DefaultManagedByLabelValue,
115+
DefaultImagePullBackOffTimeout: 0,
107116
},
108117
},
109118
{
@@ -115,6 +124,7 @@ func TestNewDefaultsFromConfigMap(t *testing.T) {
115124
DefaultMaxMatrixCombinationsCount: 256,
116125
DefaultManagedByLabelValue: "tekton-pipelines",
117126
DefaultForbiddenEnv: []string{"TEKTON_POWER_MODE", "TEST_ENV", "TEST_TEKTON"},
127+
DefaultImagePullBackOffTimeout: time.Duration(15) * time.Second,
118128
},
119129
},
120130
}
@@ -137,6 +147,7 @@ func TestNewDefaultsFromEmptyConfigMap(t *testing.T) {
137147
DefaultManagedByLabelValue: "tekton-pipelines",
138148
DefaultServiceAccount: "default",
139149
DefaultMaxMatrixCombinationsCount: 256,
150+
DefaultImagePullBackOffTimeout: 0,
140151
}
141152
verifyConfigFileWithExpectedConfig(t, DefaultsConfigEmptyName, expectedConfig)
142153
}
@@ -285,6 +296,24 @@ func TestEquals(t *testing.T) {
285296
DefaultForbiddenEnv: []string{"TEST_ENV", "TEKTON_POWER_MODE"},
286297
},
287298
expected: true,
299+
}, {
300+
name: "different default ImagePullBackOff timeout",
301+
left: &config.Defaults{
302+
DefaultImagePullBackOffTimeout: 10,
303+
},
304+
right: &config.Defaults{
305+
DefaultImagePullBackOffTimeout: 20,
306+
},
307+
expected: false,
308+
}, {
309+
name: "same default ImagePullBackOff timeout",
310+
left: &config.Defaults{
311+
DefaultImagePullBackOffTimeout: 20,
312+
},
313+
right: &config.Defaults{
314+
DefaultImagePullBackOffTimeout: 20,
315+
},
316+
expected: true,
288317
},
289318
}
290319

pkg/apis/config/testdata/config-defaults-forbidden-env.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,3 +21,4 @@ data:
2121
default-timeout-minutes: "50"
2222
default-service-account: "tekton"
2323
default-forbidden-env: "TEST_TEKTON, TEKTON_POWER_MODE,TEST_ENV,TEST_TEKTON"
24+
default-imagepullbackoff-timeout: "15s"
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# Copyright 2019 The Tekton Authors
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
apiVersion: v1
16+
kind: ConfigMap
17+
metadata:
18+
name: config-defaults
19+
namespace: tekton-pipelines
20+
data:
21+
default-imagepullbackoff-timeout: "not-a-timeout"

pkg/apis/config/testdata/config-defaults.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,3 +22,4 @@ data:
2222
default-service-account: "tekton"
2323
default-managed-by-label-value: "something-else"
2424
default-resolver-type: "git"
25+
default-imagepullbackoff-timeout: "5s"

pkg/reconciler/taskrun/taskrun.go

Lines changed: 45 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -93,13 +93,15 @@ type Reconciler struct {
9393
tracerProvider trace.TracerProvider
9494
}
9595

96+
// ImagePullBackOff is the container waiting reason that checkPodFailed matches
// on to decide whether the configured imagePullBackOff timeout applies; it is
// also one of the keys of podFailureReasons.
const ImagePullBackOff = "ImagePullBackOff"
97+
9698
var (
9799
// Check that our Reconciler implements taskrunreconciler.Interface
98100
_ taskrunreconciler.Interface = (*Reconciler)(nil)
99101

100102
// Pod failure reasons that trigger failure of the TaskRun
101103
podFailureReasons = map[string]struct{}{
102-
"ImagePullBackOff": {},
104+
ImagePullBackOff: {},
103105
"InvalidImageName": {},
104106
}
105107
)
@@ -170,7 +172,7 @@ func (c *Reconciler) ReconcileKind(ctx context.Context, tr *v1.TaskRun) pkgrecon
170172
}
171173

172174
// Check for Pod Failures
173-
if failed, reason, message := c.checkPodFailed(tr); failed {
175+
if failed, reason, message := c.checkPodFailed(ctx, tr); failed {
174176
err := c.failTaskRun(ctx, tr, reason, message)
175177
return c.finishReconcileUpdateEmitEvents(ctx, tr, before, err)
176178
}
@@ -216,10 +218,30 @@ func (c *Reconciler) ReconcileKind(ctx context.Context, tr *v1.TaskRun) pkgrecon
216218
return nil
217219
}
218220

219-
func (c *Reconciler) checkPodFailed(tr *v1.TaskRun) (bool, v1.TaskRunReason, string) {
221+
func (c *Reconciler) checkPodFailed(ctx context.Context, tr *v1.TaskRun) (bool, v1.TaskRunReason, string) {
220222
for _, step := range tr.Status.Steps {
221223
if step.Waiting != nil {
222224
if _, found := podFailureReasons[step.Waiting.Reason]; found {
225+
if step.Waiting.Reason == ImagePullBackOff {
226+
imagePullBackOffTimeOut := config.FromContextOrDefaults(ctx).Defaults.DefaultImagePullBackOffTimeout
227+
// only attempt to recover from the imagePullBackOff if specified
228+
if imagePullBackOffTimeOut.Seconds() != 0 {
229+
p, err := c.KubeClientSet.CoreV1().Pods(tr.Namespace).Get(ctx, tr.Status.PodName, metav1.GetOptions{})
230+
if err != nil {
231+
message := fmt.Sprintf(`The step %q in TaskRun %q failed to pull the image %q and the pod with error: "%s."`, step.Name, tr.Name, step.ImageID, err)
232+
return true, v1.TaskRunReasonImagePullFailed, message
233+
}
234+
for _, condition := range p.Status.Conditions {
235+
// check the pod condition to get the time when the pod was scheduled
236+
// keep trying until the pod schedule time has exceeded the specified imagePullBackOff timeout duration
237+
if condition.Type == corev1.PodScheduled {
238+
if c.Clock.Since(condition.LastTransitionTime.Time) < imagePullBackOffTimeOut {
239+
return false, "", ""
240+
}
241+
}
242+
}
243+
}
244+
}
223245
image := step.ImageID
224246
message := fmt.Sprintf(`The step %q in TaskRun %q failed to pull the image %q. The pod errored with the message: "%s."`, step.Name, tr.Name, image, step.Waiting.Message)
225247
return true, v1.TaskRunReasonImagePullFailed, message
@@ -229,6 +251,26 @@ func (c *Reconciler) checkPodFailed(tr *v1.TaskRun) (bool, v1.TaskRunReason, str
229251
for _, sidecar := range tr.Status.Sidecars {
230252
if sidecar.Waiting != nil {
231253
if _, found := podFailureReasons[sidecar.Waiting.Reason]; found {
254+
if sidecar.Waiting.Reason == ImagePullBackOff {
255+
imagePullBackOffTimeOut := config.FromContextOrDefaults(ctx).Defaults.DefaultImagePullBackOffTimeout
256+
// only attempt to recover from the imagePullBackOff if specified
257+
if imagePullBackOffTimeOut.Seconds() != 0 {
258+
p, err := c.KubeClientSet.CoreV1().Pods(tr.Namespace).Get(ctx, tr.Status.PodName, metav1.GetOptions{})
259+
if err != nil {
260+
message := fmt.Sprintf(`The sidecar %q in TaskRun %q failed to pull the image %q and the pod with error: "%s."`, sidecar.Name, tr.Name, sidecar.ImageID, err)
261+
return true, v1.TaskRunReasonImagePullFailed, message
262+
}
263+
for _, condition := range p.Status.Conditions {
264+
// check the pod condition to get the time when the pod was scheduled
265+
// keep trying until the pod schedule time has exceeded the specified imagePullBackOff timeout duration
266+
if condition.Type == corev1.PodScheduled {
267+
if c.Clock.Since(condition.LastTransitionTime.Time) < imagePullBackOffTimeOut {
268+
return false, "", ""
269+
}
270+
}
271+
}
272+
}
273+
}
232274
image := sidecar.ImageID
233275
message := fmt.Sprintf(`The sidecar %q in TaskRun %q failed to pull the image %q. The pod errored with the message: "%s."`, sidecar.Name, tr.Name, image, sidecar.Waiting.Message)
234276
return true, v1.TaskRunReasonImagePullFailed, message

0 commit comments

Comments
 (0)