@@ -172,7 +172,6 @@ func (b *lbBuilder) Build(cc balancer.ClientConn, opt balancer.BuildOptions) bal
172172 doneCh : make (chan struct {}),
173173
174174 manualResolver : r ,
175- csEvltr : & balancer.ConnectivityStateEvaluator {},
176175 subConns : make (map [resolver.Address ]balancer.SubConn ),
177176 scStates : make (map [balancer.SubConn ]connectivity.State ),
178177 picker : & errPicker {err : balancer .ErrNoSubConnAvailable },
@@ -238,15 +237,15 @@ type lbBalancer struct {
238237 // but with only READY SCs will be generated.
239238 backendAddrs []resolver.Address
240239 // Roundrobin functionalities.
241- csEvltr * balancer.ConnectivityStateEvaluator
242240 state connectivity.State
243241 subConns map [resolver.Address ]balancer.SubConn // Used to new/remove SubConn.
244242 scStates map [balancer.SubConn ]connectivity.State // Used to filter READY SubConns.
245243 picker balancer.Picker
246244 // Support fallback to resolved backend addresses if there's no response
247245 // from remote balancer within fallbackTimeout.
248- fallbackTimerExpired bool
249- serverListReceived bool
246+ remoteBalancerConnected bool
247+ serverListReceived bool
248+ inFallback bool
250249 // resolvedBackendAddrs is resolvedAddrs minus remote balancers. It's set
251250 // when resolved address updates are received, and read in the goroutine
252251 // handling fallback.
@@ -264,13 +263,16 @@ func (lb *lbBalancer) regeneratePicker(resetDrop bool) {
264263 return
265264 }
266265
266+ if lb .state == connectivity .Connecting {
267+ lb .picker = & errPicker {err : balancer .ErrNoSubConnAvailable }
268+ return
269+ }
270+
267271 var readySCs []balancer.SubConn
268272 if lb .usePickFirst {
269- if lb .state == connectivity .Ready || lb .state == connectivity .Idle {
270- for _ , sc := range lb .subConns {
271- readySCs = append (readySCs , sc )
272- break
273- }
273+ for _ , sc := range lb .subConns {
274+ readySCs = append (readySCs , sc )
275+ break
274276 }
275277 } else {
276278 for _ , a := range lb .backendAddrs {
@@ -286,10 +288,13 @@ func (lb *lbBalancer) regeneratePicker(resetDrop bool) {
286288 // If there's no ready SubConns, always re-pick. This is to avoid drops
287289 // unless at least one SubConn is ready. Otherwise we may drop more
288290 // often than want because of drops + re-picks(which become re-drops).
291+ //
292+ // This doesn't seem to be necessary after the connecting check above.
293+ // Kept for safety.
289294 lb .picker = & errPicker {err : balancer .ErrNoSubConnAvailable }
290295 return
291296 }
292- if len ( lb .fullServerList ) <= 0 {
297+ if lb .inFallback {
293298 lb .picker = newRRPicker (readySCs )
294299 return
295300 }
@@ -305,6 +310,34 @@ func (lb *lbBalancer) regeneratePicker(resetDrop bool) {
305310 prevLBPicker .updateReadySCs (readySCs )
306311}
307312
313+ // aggregateSubConnStats calculate the aggregated state of SubConns in
314+ // lb.SubConns. These SubConns are subconns in use (when switching between
315+ // fallback and grpclb). lb.scState contains states for all SubConns, including
316+ // those in cache (SubConns are cached for 10 seconds after remove).
317+ //
318+ // The aggregated state is:
319+ // - If at least one SubConn in Ready, the aggregated state is Ready;
320+ // - Else if at least one SubConn in Connecting, the aggregated state is Connecting;
321+ // - Else the aggregated state is TransientFailure.
322+ func (lb * lbBalancer ) aggregateSubConnStates () connectivity.State {
323+ var numConnecting uint64
324+
325+ for _ , sc := range lb .subConns {
326+ if state , ok := lb .scStates [sc ]; ok {
327+ switch state {
328+ case connectivity .Ready :
329+ return connectivity .Ready
330+ case connectivity .Connecting :
331+ numConnecting ++
332+ }
333+ }
334+ }
335+ if numConnecting > 0 {
336+ return connectivity .Connecting
337+ }
338+ return connectivity .TransientFailure
339+ }
340+
308341func (lb * lbBalancer ) HandleSubConnStateChange (sc balancer.SubConn , s connectivity.State ) {
309342 if grpclog .V (2 ) {
310343 grpclog .Infof ("lbBalancer: handle SubConn state change: %p, %v" , sc , s )
@@ -328,18 +361,33 @@ func (lb *lbBalancer) HandleSubConnStateChange(sc balancer.SubConn, s connectivi
328361 // kept the sc's state in scStates. Remove state for this sc here.
329362 delete (lb .scStates , sc )
330363 }
364+ // Force regenerate picker if
365+ // - this sc became ready from not-ready
366+ // - this sc became not-ready from ready
367+ lb .updateStateAndPicker ((oldS == connectivity .Ready ) != (s == connectivity .Ready ), false )
368+
369+ // Enter fallback when the aggregated state is not Ready and the connection
370+ // to remote balancer is lost.
371+ if lb .state != connectivity .Ready {
372+ if ! lb .inFallback && ! lb .remoteBalancerConnected {
373+ // Enter fallback.
374+ lb .refreshSubConns (lb .resolvedBackendAddrs , false )
375+ }
376+ }
377+ }
331378
379+ // updateStateAndPicker recalculates the aggregated state, and regenerates the
380+ // picker if the overall state has changed.
381+ //
382+ // If forceRegeneratePicker is true, picker will be regenerated.
383+ func (lb * lbBalancer ) updateStateAndPicker (forceRegeneratePicker bool , resetDrop bool ) {
332384 oldAggrState := lb .state
333- lb .state = lb .csEvltr .RecordTransition (oldS , s )
334-
385+ lb .state = lb .aggregateSubConnStates ()
335386 // Regenerate picker when one of the following happens:
336- // - this sc became ready from not-ready
337- // - this sc became not-ready from ready
338- // - the aggregated state of balancer became TransientFailure from non-TransientFailure
339- // - the aggregated state of balancer became non-TransientFailure from TransientFailure
340- if (oldS == connectivity .Ready ) != (s == connectivity .Ready ) ||
341- (lb .state == connectivity .TransientFailure ) != (oldAggrState == connectivity .TransientFailure ) {
342- lb .regeneratePicker (false )
387+ // - caller wants to regenerate
388+ // - the aggregated state changed
389+ if forceRegeneratePicker || (lb .state != oldAggrState ) {
390+ lb .regeneratePicker (resetDrop )
343391 }
344392
345393 lb .cc .UpdateBalancerState (lb .state , lb .picker )
@@ -357,11 +405,11 @@ func (lb *lbBalancer) fallbackToBackendsAfter(fallbackTimeout time.Duration) {
357405 return
358406 }
359407 lb .mu .Lock ()
360- if lb .serverListReceived {
408+ if lb .inFallback || lb . serverListReceived {
361409 lb .mu .Unlock ()
362410 return
363411 }
364- lb . fallbackTimerExpired = true
412+ // Enter fallback.
365413 lb .refreshSubConns (lb .resolvedBackendAddrs , false )
366414 lb .mu .Unlock ()
367415}
@@ -405,10 +453,7 @@ func (lb *lbBalancer) HandleResolvedAddrs(addrs []resolver.Address, err error) {
405453
406454 lb .mu .Lock ()
407455 lb .resolvedBackendAddrs = backendAddrs
408- // If serverListReceived is true, connection to remote balancer was
409- // successful and there's no need to do fallback anymore.
410- // If fallbackTimerExpired is false, fallback hasn't happened yet.
411- if ! lb .serverListReceived && lb .fallbackTimerExpired {
456+ if lb .inFallback {
412457 // This means we received a new list of resolved backends, and we are
413458 // still in fallback mode. Need to update the list of backends we are
414459 // using to the new list of backends.
0 commit comments