Skip to content

Commit 44fce0e

Browse files
Hazel Sudzilouskiagaudreault
andauthored
fix: add fatal timeout upgrade with SIGKILL to ARGO_EXEC_TIMEOUT (closes #20785, #18478) (#22713)
Signed-off-by: Hazel Sudzilouski <[email protected]> Signed-off-by: Alexandre Gaudreault <[email protected]> Co-authored-by: Alexandre Gaudreault <[email protected]>
1 parent 8f1f5c7 commit 44fce0e

File tree

4 files changed

+71
-7
lines changed

4 files changed

+71
-7
lines changed

docs/operator-manual/config-management-plugins.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -326,7 +326,9 @@ If you don't need to set any environment variables, you can set an empty plugin
326326
if version was mentioned in the `ConfigManagementPlugin` spec or else just use `<metadata.name>`. You can also remove the name altogether
327327
and let the automatic discovery to identify the plugin.
328328
!!! note
329-
If a CMP renders blank manifests, and `prune` is set to `true`, Argo CD will automatically remove resources. CMP plugin authors should ensure errors are part of the exit code. Commonly something like `kustomize build . | cat` won't pass errors because of the pipe. Consider setting `set -o pipefail` so anything piped will pass errors on failure.
329+
If a CMP renders blank manfiests, and `prune` is set to `true`, Argo CD will automatically remove resources. CMP plugin authors should ensure errors are part of the exit code. Commonly something like `kustomize build . | cat` won't pass errors because of the pipe. Consider setting `set -o pipefail` so anything piped will pass errors on failure.
330+
!!! note
331+
If a CMP command fails to gracefully exit on `ARGOCD_EXEC_TIMEOUT`, it will be forcefully killed after an additional timeout of `ARGOCD_EXEC_FATAL_TIMEOUT`.
330332

331333
## Debugging a CMP
332334

docs/operator-manual/high_availability.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ and might fail. To avoid failed syncs use the `ARGOCD_GIT_ATTEMPTS_COUNT` enviro
3434

3535
* `argocd-repo-server` executes config management tools such as `helm` or `kustomize` and enforces a 90 second timeout. This timeout can be changed by using the `ARGOCD_EXEC_TIMEOUT` env variable. The value should be in the Go time duration string format, for example, `2m30s`.
3636

37+
* `argocd-repo-server` will issue a `SIGTERM` signal to a command that has elapsed the `ARGOCD_EXEC_TIMEOUT`. In most cases, well-behaved commands will exit immediately when receiving the signal. However, if this does not happen, `argocd-repo-server` will wait an additional timeout of `ARGOCD_EXEC_FATAL_TIMEOUT` and then forcefully exit the command with a `SIGKILL` to prevent stalling. Note that a failure to exit with `SIGTERM` is usually a bug in either the offending command or in the way `argocd-repo-server` calls it and should be reported to the issue tracker for further investigation.
38+
3739
**metrics:**
3840

3941
* `argocd_git_request_total` - Number of git requests. This metric provides two tags:

util/exec/exec.go

Lines changed: 43 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,9 @@ import (
2020
)
2121

2222
var (
23-
timeout time.Duration
24-
Unredacted = Redact(nil)
23+
timeout time.Duration
24+
fatalTimeout time.Duration
25+
Unredacted = Redact(nil)
2526
)
2627

2728
type ExecRunOpts struct {
@@ -45,6 +46,10 @@ func initTimeout() {
4546
if err != nil {
4647
timeout = 90 * time.Second
4748
}
49+
fatalTimeout, err = time.ParseDuration(os.Getenv("ARGOCD_EXEC_FATAL_TIMEOUT"))
50+
if err != nil {
51+
fatalTimeout = 10 * time.Second
52+
}
4853
}
4954

5055
func Run(cmd *exec.Cmd) (string, error) {
@@ -57,7 +62,7 @@ func RunWithRedactor(cmd *exec.Cmd, redactor func(text string) string) (string,
5762
}
5863

5964
func RunWithExecRunOpts(cmd *exec.Cmd, opts ExecRunOpts) (string, error) {
60-
cmdOpts := CmdOpts{Timeout: timeout, Redactor: opts.Redactor, TimeoutBehavior: opts.TimeoutBehavior, SkipErrorLogging: opts.SkipErrorLogging}
65+
cmdOpts := CmdOpts{Timeout: timeout, FatalTimeout: fatalTimeout, Redactor: opts.Redactor, TimeoutBehavior: opts.TimeoutBehavior, SkipErrorLogging: opts.SkipErrorLogging}
6166
span := tracing.NewLoggingTracer(log.NewLogrusLogger(log.NewWithCurrentConfig())).StartSpan(fmt.Sprintf("exec %v", cmd.Args[0]))
6267
span.SetBaggageItem("dir", cmd.Dir)
6368
if cmdOpts.Redactor != nil {
@@ -130,6 +135,8 @@ type TimeoutBehavior struct {
130135
type CmdOpts struct {
131136
// Timeout determines how long to wait for the command to exit
132137
Timeout time.Duration
138+
// FatalTimeout is the amount of additional time to wait after Timeout before fatal SIGKILL
139+
FatalTimeout time.Duration
133140
// Redactor redacts tokens from the output
134141
Redactor func(text string) string
135142
// TimeoutBehavior configures what to do in case of timeout
@@ -142,6 +149,7 @@ type CmdOpts struct {
142149

143150
var DefaultCmdOpts = CmdOpts{
144151
Timeout: time.Duration(0),
152+
FatalTimeout: time.Duration(0),
145153
Redactor: Unredacted,
146154
TimeoutBehavior: TimeoutBehavior{syscall.SIGKILL, false},
147155
SkipErrorLogging: false,
@@ -189,30 +197,60 @@ func RunCommandExt(cmd *exec.Cmd, opts CmdOpts) (string, error) {
189197
done := make(chan error)
190198
go func() { done <- cmd.Wait() }()
191199

192-
// Start a timer
200+
// Start timers for timeout
193201
timeout := DefaultCmdOpts.Timeout
202+
fatalTimeout := DefaultCmdOpts.FatalTimeout
194203

195204
if opts.Timeout != time.Duration(0) {
196205
timeout = opts.Timeout
197206
}
198207

208+
if opts.FatalTimeout != time.Duration(0) {
209+
fatalTimeout = opts.FatalTimeout
210+
}
211+
199212
var timoutCh <-chan time.Time
200213
if timeout != 0 {
201214
timoutCh = time.NewTimer(timeout).C
202215
}
203216

217+
var fatalTimeoutCh <-chan time.Time
218+
if fatalTimeout != 0 {
219+
fatalTimeoutCh = time.NewTimer(timeout + fatalTimeout).C
220+
}
221+
204222
timeoutBehavior := DefaultCmdOpts.TimeoutBehavior
223+
fatalTimeoutBehaviour := syscall.SIGKILL
205224
if opts.TimeoutBehavior.Signal != syscall.Signal(0) {
206225
timeoutBehavior = opts.TimeoutBehavior
207226
}
208227

209228
select {
210229
// noinspection ALL
211230
case <-timoutCh:
231+
// send timeout signal
212232
_ = cmd.Process.Signal(timeoutBehavior.Signal)
233+
// wait on timeout signal and fallback to fatal timeout signal
213234
if timeoutBehavior.ShouldWait {
214-
<-done
235+
select {
236+
case <-done:
237+
case <-fatalTimeoutCh:
238+
// upgrades to SIGKILL if cmd does not respect SIGTERM
239+
_ = cmd.Process.Signal(fatalTimeoutBehaviour)
240+
// now original cmd should exit immediately after SIGKILL
241+
<-done
242+
// return error with a marker indicating that cmd exited only after fatal SIGKILL
243+
output := stdout.String()
244+
if opts.CaptureStderr {
245+
output += stderr.String()
246+
}
247+
logCtx.WithFields(logrus.Fields{"duration": time.Since(start)}).Debug(redactor(output))
248+
err = newCmdError(redactor(args), fmt.Errorf("fatal timeout after %v", timeout+fatalTimeout), "")
249+
logCtx.Error(err.Error())
250+
return strings.TrimSuffix(output, "\n"), err
251+
}
215252
}
253+
// either did not wait for timeout or cmd did respect SIGTERM
216254
output := stdout.String()
217255
if opts.CaptureStderr {
218256
output += stderr.String()
@@ -235,7 +273,6 @@ func RunCommandExt(cmd *exec.Cmd, opts CmdOpts) (string, error) {
235273
return strings.TrimSuffix(output, "\n"), err
236274
}
237275
}
238-
239276
output := stdout.String()
240277
if opts.CaptureStderr {
241278
output += stderr.String()

util/exec/exec_test.go

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,14 @@ func Test_timeout(t *testing.T) {
1717
t.Run("Default", func(t *testing.T) {
1818
initTimeout()
1919
assert.Equal(t, 90*time.Second, timeout)
20+
assert.Equal(t, 10*time.Second, fatalTimeout)
2021
})
2122
t.Run("Default", func(t *testing.T) {
2223
t.Setenv("ARGOCD_EXEC_TIMEOUT", "1s")
24+
t.Setenv("ARGOCD_EXEC_FATAL_TIMEOUT", "2s")
2325
initTimeout()
2426
assert.Equal(t, 1*time.Second, timeout)
27+
assert.Equal(t, 2*time.Second, fatalTimeout)
2528
})
2629
}
2730

@@ -42,6 +45,7 @@ func TestHideUsernamePassword(t *testing.T) {
4245
require.Error(t, err)
4346
}
4447

48+
// This tests a cmd that properly handles a SIGTERM signal
4549
func TestRunWithExecRunOpts(t *testing.T) {
4650
t.Setenv("ARGOCD_EXEC_TIMEOUT", "200ms")
4751
initTimeout()
@@ -56,6 +60,25 @@ func TestRunWithExecRunOpts(t *testing.T) {
5660
assert.ErrorContains(t, err, "failed timeout after 200ms")
5761
}
5862

63+
// This tests a mis-behaved cmd that stalls on SIGTERM and requires a SIGKILL
64+
func TestRunWithExecRunOptsFatal(t *testing.T) {
65+
t.Setenv("ARGOCD_EXEC_TIMEOUT", "200ms")
66+
t.Setenv("ARGOCD_EXEC_FATAL_TIMEOUT", "100ms")
67+
68+
initTimeout()
69+
70+
opts := ExecRunOpts{
71+
TimeoutBehavior: TimeoutBehavior{
72+
Signal: syscall.SIGTERM,
73+
ShouldWait: true,
74+
},
75+
}
76+
// The returned error string in this case should contain a "fatal" in this case
77+
_, err := RunWithExecRunOpts(exec.Command("sh", "-c", "trap 'trap - 15 && echo captured && sleep 10000' 15 && sleep 2"), opts)
78+
// The expected timeout is ARGOCD_EXEC_TIMEOUT + ARGOCD_EXEC_FATAL_TIMEOUT = 200ms + 100ms = 300ms
79+
assert.ErrorContains(t, err, "failed fatal timeout after 300ms")
80+
}
81+
5982
func Test_getCommandArgsToLog(t *testing.T) {
6083
t.Parallel()
6184

0 commit comments

Comments
 (0)