Skip to content

Commit 328ef67

Browse files
reggie-kcrenshaw-devHazel Sudzilouskiagaudreault
authored
feat: move pkg/exec in-tree and add fatal timeout upgrade with SIGKILL to ARGO_EXEC_TIMEOUT (#419)
* chore: move pkg/exec in-tree (argoproj#22175) (argoproj#22460) Signed-off-by: Michael Crenshaw <[email protected]> * fix: add fatal timeout upgrade with SIGKILL to ARGO_EXEC_TIMEOUT (closes argoproj#20785, argoproj#18478) (argoproj#22713) Signed-off-by: Hazel Sudzilouski <[email protected]> Signed-off-by: Alexandre Gaudreault <[email protected]> Co-authored-by: Alexandre Gaudreault <[email protected]> --------- Signed-off-by: Michael Crenshaw <[email protected]> Signed-off-by: Hazel Sudzilouski <[email protected]> Signed-off-by: Alexandre Gaudreault <[email protected]> Co-authored-by: Michael Crenshaw <[email protected]> Co-authored-by: Hazel Sudzilouski <[email protected]> Co-authored-by: Alexandre Gaudreault <[email protected]>
1 parent ac26d0e commit 328ef67

File tree

13 files changed

+407
-17
lines changed

13 files changed

+407
-17
lines changed

docs/operator-manual/config-management-plugins.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -326,7 +326,9 @@ If you don't need to set any environment variables, you can set an empty plugin
326326
if version was mentioned in the `ConfigManagementPlugin` spec or else just use `<metadata.name>`. You can also remove the name altogether
327327
and let the automatic discovery to identify the plugin.
328328
!!! note
329-
If a CMP renders blank manfiests, and `prune` is set to `true`, Argo CD will automatically remove resources. CMP plugin authors should ensure errors are part of the exit code. Commonly something like `kustomize build . | cat` won't pass errors because of the pipe. Consider setting `set -o pipefail` so anything piped will pass errors on failure.
329+
If a CMP renders blank manifests, and `prune` is set to `true`, Argo CD will automatically remove resources. CMP plugin authors should ensure errors are part of the exit code. Commonly something like `kustomize build . | cat` won't pass errors because of the pipe. Consider setting `set -o pipefail` so anything piped will pass errors on failure.
330+
!!! note
331+
If a CMP command fails to gracefully exit on `ARGOCD_EXEC_TIMEOUT`, it will be forcefully killed after an additional timeout of `ARGOCD_EXEC_FATAL_TIMEOUT`.
330332

331333
## Debugging a CMP
332334

docs/operator-manual/high_availability.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@ and might fail. To avoid failed syncs use the `ARGOCD_GIT_ATTEMPTS_COUNT` enviro
3232

3333
* `argocd-repo-server` executes config management tools such as `helm` or `kustomize` and enforces a 90 second timeout. This timeout can be changed by using the `ARGOCD_EXEC_TIMEOUT` env variable. The value should be in the Go time duration string format, for example, `2m30s`.
3434

35+
* `argocd-repo-server` will issue a `SIGTERM` signal to a command that has elapsed the `ARGOCD_EXEC_TIMEOUT`. In most cases, well-behaved commands will exit immediately when receiving the signal. However, if this does not happen, `argocd-repo-server` will wait an additional timeout of `ARGOCD_EXEC_FATAL_TIMEOUT` and then forcefully exit the command with a `SIGKILL` to prevent stalling. Note that a failure to exit with `SIGTERM` is usually a bug in either the offending command or in the way `argocd-repo-server` calls it and should be reported to the issue tracker for further investigation.
36+
3537
**metrics:**
3638

3739
* `argocd_git_request_total` - Number of git requests. This metric provides two tags: `repo` - Git repo URL; `request_type` - `ls-remote` or `fetch`.

test/e2e/fixture/applicationsets/utils/cmd.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ import (
55
"os/exec"
66
"strings"
77

8-
argoexec "github.com/argoproj/pkg/exec"
8+
argoexec "github.com/argoproj/argo-cd/v3/util/exec"
99
)
1010

1111
func Run(workDir, name string, args ...string) (string, error) {

test/e2e/fixture/cmd.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ import (
55
"os/exec"
66
"strings"
77

8-
argoexec "github.com/argoproj/pkg/exec"
8+
argoexec "github.com/argoproj/argo-cd/v3/util/exec"
99
)
1010

1111
func Run(workDir, name string, args ...string) (string, error) {

test/fixture/revision_metadata/author.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,8 @@ import (
44
"fmt"
55
"strings"
66

7-
argoexec "github.com/argoproj/pkg/exec"
8-
97
"github.com/argoproj/argo-cd/v3/util/errors"
8+
argoexec "github.com/argoproj/argo-cd/v3/util/exec"
109
)
1110

1211
var Author string

test/manifests_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,11 @@ import (
66
"path/filepath"
77
"testing"
88

9-
argoexec "github.com/argoproj/pkg/exec"
109
"github.com/stretchr/testify/assert"
1110
"github.com/stretchr/testify/require"
1211

1312
"github.com/argoproj/argo-cd/v3/test/fixture/test"
13+
argoexec "github.com/argoproj/argo-cd/v3/util/exec"
1414
)
1515

1616
func TestKustomizeVersion(t *testing.T) {

util/exec/exec.go

Lines changed: 202 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,35 @@
11
package exec
22

33
import (
4+
"bytes"
5+
"errors"
46
"fmt"
57
"os"
68
"os/exec"
79
"strconv"
810
"strings"
11+
"syscall"
912
"time"
1013
"unicode"
1114

1215
"github.com/argoproj/gitops-engine/pkg/utils/tracing"
13-
argoexec "github.com/argoproj/pkg/exec"
16+
"github.com/sirupsen/logrus"
1417

1518
"github.com/argoproj/argo-cd/v3/util/log"
19+
"github.com/argoproj/argo-cd/v3/util/rand"
1620
)
1721

18-
var timeout time.Duration
22+
var (
23+
timeout time.Duration
24+
fatalTimeout time.Duration
25+
Unredacted = Redact(nil)
26+
)
1927

2028
type ExecRunOpts struct {
2129
// Redactor redacts tokens from the output
2230
Redactor func(text string) string
2331
// TimeoutBehavior configures what to do in case of timeout
24-
TimeoutBehavior argoexec.TimeoutBehavior
32+
TimeoutBehavior TimeoutBehavior
2533
// SkipErrorLogging determines whether to skip logging of execution errors (rc > 0)
2634
SkipErrorLogging bool
2735
// CaptureStderr determines whether to capture stderr in addition to stdout
@@ -38,6 +46,10 @@ func initTimeout() {
3846
if err != nil {
3947
timeout = 90 * time.Second
4048
}
49+
fatalTimeout, err = time.ParseDuration(os.Getenv("ARGOCD_EXEC_FATAL_TIMEOUT"))
50+
if err != nil {
51+
fatalTimeout = 10 * time.Second
52+
}
4153
}
4254

4355
func Run(cmd *exec.Cmd) (string, error) {
@@ -50,7 +62,7 @@ func RunWithRedactor(cmd *exec.Cmd, redactor func(text string) string) (string,
5062
}
5163

5264
func RunWithExecRunOpts(cmd *exec.Cmd, opts ExecRunOpts) (string, error) {
53-
cmdOpts := argoexec.CmdOpts{Timeout: timeout, Redactor: opts.Redactor, TimeoutBehavior: opts.TimeoutBehavior, SkipErrorLogging: opts.SkipErrorLogging}
65+
cmdOpts := CmdOpts{Timeout: timeout, FatalTimeout: fatalTimeout, Redactor: opts.Redactor, TimeoutBehavior: opts.TimeoutBehavior, SkipErrorLogging: opts.SkipErrorLogging}
5466
span := tracing.NewLoggingTracer(log.NewLogrusLogger(log.NewWithCurrentConfig())).StartSpan(fmt.Sprintf("exec %v", cmd.Args[0]))
5567
span.SetBaggageItem("dir", cmd.Dir)
5668
if cmdOpts.Redactor != nil {
@@ -59,7 +71,7 @@ func RunWithExecRunOpts(cmd *exec.Cmd, opts ExecRunOpts) (string, error) {
5971
span.SetBaggageItem("args", fmt.Sprintf("%v", cmd.Args))
6072
}
6173
defer span.Finish()
62-
return argoexec.RunCommandExt(cmd, cmdOpts)
74+
return RunCommandExt(cmd, cmdOpts)
6375
}
6476

6577
// GetCommandArgsToLog represents the given command in a way that we can copy-and-paste into a terminal
@@ -88,3 +100,188 @@ func GetCommandArgsToLog(cmd *exec.Cmd) string {
88100
args := strings.Join(argsToLog, " ")
89101
return args
90102
}
103+
104+
type CmdError struct {
105+
Args string
106+
Stderr string
107+
Cause error
108+
}
109+
110+
func (ce *CmdError) Error() string {
111+
res := fmt.Sprintf("`%v` failed %v", ce.Args, ce.Cause)
112+
if ce.Stderr != "" {
113+
res = fmt.Sprintf("%s: %s", res, ce.Stderr)
114+
}
115+
return res
116+
}
117+
118+
func (ce *CmdError) String() string {
119+
return ce.Error()
120+
}
121+
122+
func newCmdError(args string, cause error, stderr string) *CmdError {
123+
return &CmdError{Args: args, Stderr: stderr, Cause: cause}
124+
}
125+
126+
// TimeoutBehavior defines behavior for when the command takes longer than the passed in timeout to exit
127+
// By default, SIGKILL is sent to the process and it is not waited upon
128+
type TimeoutBehavior struct {
129+
// Signal determines the signal to send to the process
130+
Signal syscall.Signal
131+
// ShouldWait determines whether to wait for the command to exit once timeout is reached
132+
ShouldWait bool
133+
}
134+
135+
type CmdOpts struct {
136+
// Timeout determines how long to wait for the command to exit
137+
Timeout time.Duration
138+
// FatalTimeout is the amount of additional time to wait after Timeout before fatal SIGKILL
139+
FatalTimeout time.Duration
140+
// Redactor redacts tokens from the output
141+
Redactor func(text string) string
142+
// TimeoutBehavior configures what to do in case of timeout
143+
TimeoutBehavior TimeoutBehavior
144+
// SkipErrorLogging defines whether to skip logging of execution errors (rc > 0)
145+
SkipErrorLogging bool
146+
// CaptureStderr defines whether to capture stderr in addition to stdout
147+
CaptureStderr bool
148+
}
149+
150+
var DefaultCmdOpts = CmdOpts{
151+
Timeout: time.Duration(0),
152+
FatalTimeout: time.Duration(0),
153+
Redactor: Unredacted,
154+
TimeoutBehavior: TimeoutBehavior{syscall.SIGKILL, false},
155+
SkipErrorLogging: false,
156+
CaptureStderr: false,
157+
}
158+
159+
func Redact(items []string) func(text string) string {
160+
return func(text string) string {
161+
for _, item := range items {
162+
text = strings.ReplaceAll(text, item, "******")
163+
}
164+
return text
165+
}
166+
}
167+
168+
// RunCommandExt is a convenience function to run/log a command and return/log stderr in an error upon
169+
// failure.
170+
func RunCommandExt(cmd *exec.Cmd, opts CmdOpts) (string, error) {
171+
execId, err := rand.RandHex(5)
172+
if err != nil {
173+
return "", err
174+
}
175+
logCtx := logrus.WithFields(logrus.Fields{"execID": execId})
176+
177+
redactor := DefaultCmdOpts.Redactor
178+
if opts.Redactor != nil {
179+
redactor = opts.Redactor
180+
}
181+
182+
// log in a way we can copy-and-paste into a terminal
183+
args := strings.Join(cmd.Args, " ")
184+
logCtx.WithFields(logrus.Fields{"dir": cmd.Dir}).Info(redactor(args))
185+
186+
var stdout bytes.Buffer
187+
var stderr bytes.Buffer
188+
cmd.Stdout = &stdout
189+
cmd.Stderr = &stderr
190+
191+
start := time.Now()
192+
err = cmd.Start()
193+
if err != nil {
194+
return "", err
195+
}
196+
197+
done := make(chan error)
198+
go func() { done <- cmd.Wait() }()
199+
200+
// Start timers for timeout
201+
timeout := DefaultCmdOpts.Timeout
202+
fatalTimeout := DefaultCmdOpts.FatalTimeout
203+
204+
if opts.Timeout != time.Duration(0) {
205+
timeout = opts.Timeout
206+
}
207+
208+
if opts.FatalTimeout != time.Duration(0) {
209+
fatalTimeout = opts.FatalTimeout
210+
}
211+
212+
var timoutCh <-chan time.Time
213+
if timeout != 0 {
214+
timoutCh = time.NewTimer(timeout).C
215+
}
216+
217+
var fatalTimeoutCh <-chan time.Time
218+
if fatalTimeout != 0 {
219+
fatalTimeoutCh = time.NewTimer(timeout + fatalTimeout).C
220+
}
221+
222+
timeoutBehavior := DefaultCmdOpts.TimeoutBehavior
223+
fatalTimeoutBehaviour := syscall.SIGKILL
224+
if opts.TimeoutBehavior.Signal != syscall.Signal(0) {
225+
timeoutBehavior = opts.TimeoutBehavior
226+
}
227+
228+
select {
229+
// noinspection ALL
230+
case <-timoutCh:
231+
// send timeout signal
232+
_ = cmd.Process.Signal(timeoutBehavior.Signal)
233+
// wait on timeout signal and fallback to fatal timeout signal
234+
if timeoutBehavior.ShouldWait {
235+
select {
236+
case <-done:
237+
case <-fatalTimeoutCh:
238+
// upgrades to SIGKILL if cmd does not respect SIGTERM
239+
_ = cmd.Process.Signal(fatalTimeoutBehaviour)
240+
// now original cmd should exit immediately after SIGKILL
241+
<-done
242+
// return error with a marker indicating that cmd exited only after fatal SIGKILL
243+
output := stdout.String()
244+
if opts.CaptureStderr {
245+
output += stderr.String()
246+
}
247+
logCtx.WithFields(logrus.Fields{"duration": time.Since(start)}).Debug(redactor(output))
248+
err = newCmdError(redactor(args), fmt.Errorf("fatal timeout after %v", timeout+fatalTimeout), "")
249+
logCtx.Error(err.Error())
250+
return strings.TrimSuffix(output, "\n"), err
251+
}
252+
}
253+
// either did not wait for timeout or cmd did respect SIGTERM
254+
output := stdout.String()
255+
if opts.CaptureStderr {
256+
output += stderr.String()
257+
}
258+
logCtx.WithFields(logrus.Fields{"duration": time.Since(start)}).Debug(redactor(output))
259+
err = newCmdError(redactor(args), fmt.Errorf("timeout after %v", timeout), "")
260+
logCtx.Error(err.Error())
261+
return strings.TrimSuffix(output, "\n"), err
262+
case err := <-done:
263+
if err != nil {
264+
output := stdout.String()
265+
if opts.CaptureStderr {
266+
output += stderr.String()
267+
}
268+
logCtx.WithFields(logrus.Fields{"duration": time.Since(start)}).Debug(redactor(output))
269+
err := newCmdError(redactor(args), errors.New(redactor(err.Error())), strings.TrimSpace(redactor(stderr.String())))
270+
if !opts.SkipErrorLogging {
271+
logCtx.Error(err.Error())
272+
}
273+
return strings.TrimSuffix(output, "\n"), err
274+
}
275+
}
276+
output := stdout.String()
277+
if opts.CaptureStderr {
278+
output += stderr.String()
279+
}
280+
logCtx.WithFields(logrus.Fields{"duration": time.Since(start)}).Debug(redactor(output))
281+
282+
return strings.TrimSuffix(output, "\n"), nil
283+
}
284+
285+
func RunCommand(name string, opts CmdOpts, arg ...string) (string, error) {
286+
return RunCommandExt(exec.Command(name, arg...), opts)
287+
}

util/exec/exec_norace_test.go

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
//go:build !race
2+
3+
package exec
4+
5+
import (
6+
"testing"
7+
"time"
8+
9+
log "github.com/sirupsen/logrus"
10+
"github.com/sirupsen/logrus/hooks/test"
11+
"github.com/stretchr/testify/assert"
12+
"github.com/stretchr/testify/require"
13+
)
14+
15+
// FIXME: there's a race in RunCommand that causes the test to fail when -race is enabled. The race is in the timeout
16+
// handling, which shares bytes buffers between the exec goroutine and the timeout handler code.
17+
func TestRunCommandTimeout(t *testing.T) {
18+
hook := test.NewGlobal()
19+
log.SetLevel(log.DebugLevel)
20+
defer log.SetLevel(log.InfoLevel)
21+
22+
output, err := RunCommand("sh", CmdOpts{Timeout: 500 * time.Millisecond}, "-c", "echo hello world && echo my-error >&2 && sleep 2")
23+
assert.Equal(t, "hello world", output)
24+
require.EqualError(t, err, "`sh -c echo hello world && echo my-error >&2 && sleep 2` failed timeout after 500ms")
25+
26+
assert.Len(t, hook.Entries, 3)
27+
28+
entry := hook.Entries[0]
29+
assert.Equal(t, log.InfoLevel, entry.Level)
30+
assert.Equal(t, "sh -c echo hello world && echo my-error >&2 && sleep 2", entry.Message)
31+
assert.Contains(t, entry.Data, "dir")
32+
assert.Contains(t, entry.Data, "execID")
33+
34+
entry = hook.Entries[1]
35+
assert.Equal(t, log.DebugLevel, entry.Level)
36+
assert.Equal(t, "hello world\n", entry.Message)
37+
assert.Contains(t, entry.Data, "duration")
38+
assert.Contains(t, entry.Data, "execID")
39+
40+
entry = hook.Entries[2]
41+
assert.Equal(t, log.ErrorLevel, entry.Level)
42+
assert.Equal(t, "`sh -c echo hello world && echo my-error >&2 && sleep 2` failed timeout after 500ms", entry.Message)
43+
assert.Contains(t, entry.Data, "execID")
44+
}

0 commit comments

Comments
 (0)