Skip to content

Commit 4acf0b2

Browse files
thampiotrwildum
authored andcommitted
Fix deadlock due to infinite retry (#2174)
* Fix deadlock due to infinite retry * changelog
1 parent 3d8c07d commit 4acf0b2

1 file changed

Lines changed: 23 additions & 9 deletions

File tree

  • internal/runtime/internal/controller

internal/runtime/internal/controller/loader.go

Lines changed: 23 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,12 @@ import (
1010
"time"
1111

1212
"github.com/go-kit/log"
13+
"github.com/grafana/dskit/backoff"
14+
"github.com/hashicorp/go-multierror"
15+
"go.opentelemetry.io/otel/attribute"
16+
"go.opentelemetry.io/otel/codes"
17+
"go.opentelemetry.io/otel/trace"
18+
1319
"github.com/grafana/alloy/internal/featuregate"
1420
"github.com/grafana/alloy/internal/runtime/internal/dag"
1521
"github.com/grafana/alloy/internal/runtime/internal/worker"
@@ -18,11 +24,6 @@ import (
1824
"github.com/grafana/alloy/internal/service"
1925
"github.com/grafana/alloy/syntax/ast"
2026
"github.com/grafana/alloy/syntax/diag"
21-
"github.com/grafana/dskit/backoff"
22-
"github.com/hashicorp/go-multierror"
23-
"go.opentelemetry.io/otel/attribute"
24-
"go.opentelemetry.io/otel/codes"
25-
"go.opentelemetry.io/otel/trace"
2627
)
2728

2829
// The Loader builds and evaluates ComponentNodes from Alloy blocks.
@@ -92,10 +93,11 @@ func NewLoader(opts LoaderOptions) *Loader {
9293
componentNodeManager: NewComponentNodeManager(globals, reg),
9394

9495
// This is a reasonable default which should work for most cases. If a component is completely stuck, we would
95-
// retry and log an error every 10 seconds, at most.
96+
// retry and log an error every 10 seconds, at most. We give up after some time to prevent lasting deadlocks.
9697
backoffConfig: backoff.Config{
9798
MinBackoff: 1 * time.Millisecond,
9899
MaxBackoff: 10 * time.Second,
100+
MaxRetries: 20, // Give up after 20 attempts - it could be a deadlock instead of an overload.
99101
},
100102

101103
graph: &dag.Graph{},
@@ -744,19 +746,31 @@ func (l *Loader) EvaluateDependants(ctx context.Context, updatedNodes []*QueuedN
744746
l.concurrentEvalFn(nodeRef, dependantCtx, tracer, parentRef)
745747
})
746748
if err != nil {
747-
level.Error(l.log).Log(
748-
"msg", "failed to submit node for evaluation - Alloy is likely overloaded "+
749-
"and cannot keep up with evaluating components - will retry",
749+
level.Warn(l.log).Log(
750+
"msg", "failed to submit node for evaluation - will retry",
750751
"err", err,
751752
"node_id", n.NodeID(),
752753
"originator_id", parent.Node.NodeID(),
753754
"retries", retryBackoff.NumRetries(),
754755
)
756+
// When backing off, release the mut in case the evaluation requires to interact with the loader itself.
757+
l.mut.RUnlock()
755758
retryBackoff.Wait()
759+
l.mut.RLock()
756760
} else {
757761
break
758762
}
759763
}
764+
if err != nil && !retryBackoff.Ongoing() {
765+
level.Error(l.log).Log(
766+
"msg", "retry attempts exhausted when submitting node for evaluation to the worker pool - "+
767+
"this could be a deadlock, performance bottleneck or severe overload leading to goroutine starvation",
768+
"err", err,
769+
"node_id", n.NodeID(),
770+
"originator_id", parent.Node.NodeID(),
771+
"retries", retryBackoff.NumRetries(),
772+
)
773+
}
760774
span.SetAttributes(attribute.Int("retries", retryBackoff.NumRetries()))
761775
if err != nil {
762776
span.SetStatus(codes.Error, err.Error())

0 commit comments

Comments
 (0)