@@ -10,6 +10,12 @@ import (
1010 "time"
1111
1212 "github.com/go-kit/log"
13+ "github.com/grafana/dskit/backoff"
14+ "github.com/hashicorp/go-multierror"
15+ "go.opentelemetry.io/otel/attribute"
16+ "go.opentelemetry.io/otel/codes"
17+ "go.opentelemetry.io/otel/trace"
18+
1319 "github.com/grafana/alloy/internal/featuregate"
1420 "github.com/grafana/alloy/internal/runtime/internal/dag"
1521 "github.com/grafana/alloy/internal/runtime/internal/worker"
@@ -18,11 +24,6 @@ import (
1824 "github.com/grafana/alloy/internal/service"
1925 "github.com/grafana/alloy/syntax/ast"
2026 "github.com/grafana/alloy/syntax/diag"
21- "github.com/grafana/dskit/backoff"
22- "github.com/hashicorp/go-multierror"
23- "go.opentelemetry.io/otel/attribute"
24- "go.opentelemetry.io/otel/codes"
25- "go.opentelemetry.io/otel/trace"
2627)
2728
2829// The Loader builds and evaluates ComponentNodes from Alloy blocks.
@@ -92,10 +93,11 @@ func NewLoader(opts LoaderOptions) *Loader {
9293 componentNodeManager : NewComponentNodeManager (globals , reg ),
9394
9495 // This is a reasonable default which should work for most cases. If a component is completely stuck, we would
95- // retry and log an error every 10 seconds, at most.
96+ // retry and log an error every 10 seconds, at most. We give up after some time to prevent lasting deadlocks.
9697 backoffConfig : backoff.Config {
9798 MinBackoff : 1 * time .Millisecond ,
9899 MaxBackoff : 10 * time .Second ,
100+ MaxRetries : 20 , // Give up after 20 attempts - it could be a deadlock instead of an overload.
99101 },
100102
101103 graph : & dag.Graph {},
@@ -744,19 +746,31 @@ func (l *Loader) EvaluateDependants(ctx context.Context, updatedNodes []*QueuedN
744746 l .concurrentEvalFn (nodeRef , dependantCtx , tracer , parentRef )
745747 })
746748 if err != nil {
747- level .Error (l .log ).Log (
748- "msg" , "failed to submit node for evaluation - Alloy is likely overloaded " +
749- "and cannot keep up with evaluating components - will retry" ,
749+ level .Warn (l .log ).Log (
750+ "msg" , "failed to submit node for evaluation - will retry" ,
750751 "err" , err ,
751752 "node_id" , n .NodeID (),
752753 "originator_id" , parent .Node .NodeID (),
753754 "retries" , retryBackoff .NumRetries (),
754755 )
756+ // When backing off, release the mut in case the evaluation requires to interact with the loader itself.
757+ l .mut .RUnlock ()
755758 retryBackoff .Wait ()
759+ l .mut .RLock ()
756760 } else {
757761 break
758762 }
759763 }
764+ if err != nil && ! retryBackoff .Ongoing () {
765+ level .Error (l .log ).Log (
766+ "msg" , "retry attempts exhausted when submitting node for evaluation to the worker pool - " +
767+ "this could be a deadlock, performance bottleneck or severe overload leading to goroutine starvation" ,
768+ "err" , err ,
769+ "node_id" , n .NodeID (),
770+ "originator_id" , parent .Node .NodeID (),
771+ "retries" , retryBackoff .NumRetries (),
772+ )
773+ }
760774 span .SetAttributes (attribute .Int ("retries" , retryBackoff .NumRetries ()))
761775 if err != nil {
762776 span .SetStatus (codes .Error , err .Error ())
0 commit comments