2222import java .util .ArrayList ;
2323import java .util .Collections ;
2424import java .util .List ;
25- import java .util .Set ;
2625import java .util .stream .Collectors ;
2726import org .apache .hadoop .hbase .HRegionLocation ;
2827import org .apache .hadoop .hbase .TableName ;
@@ -159,10 +158,12 @@ protected Flow executeFromState(MasterProcedureEnv env, ReopenTableRegionsState
159158 setNextState (ReopenTableRegionsState .REOPEN_TABLE_REGIONS_REOPEN_REGIONS );
160159 return Flow .HAS_MORE_STATE ;
161160 case REOPEN_TABLE_REGIONS_REOPEN_REGIONS :
162- if (!regions .isEmpty ()) {
161+ // if we didn't finish reopening the last batch yet, let's keep trying until we do.
162+ // at that point, the batch will be empty and we can generate a new batch
163+ if (!regions .isEmpty () && currentRegionBatch .isEmpty ()) {
164+ currentRegionBatch = regions .stream ().limit (reopenBatchSize ).collect (Collectors .toList ());
163165 batchesProcessed ++;
164166 }
165- currentRegionBatch = regions .stream ().limit (reopenBatchSize ).collect (Collectors .toList ());
166167 for (HRegionLocation loc : currentRegionBatch ) {
167168 RegionStateNode regionNode =
168169 env .getAssignmentManager ().getRegionStates ().getRegionStateNode (loc .getRegion ());
@@ -187,59 +188,65 @@ protected Flow executeFromState(MasterProcedureEnv env, ReopenTableRegionsState
187188 setNextState (ReopenTableRegionsState .REOPEN_TABLE_REGIONS_CONFIRM_REOPENED );
188189 return Flow .HAS_MORE_STATE ;
189190 case REOPEN_TABLE_REGIONS_CONFIRM_REOPENED :
190- regions = regions .stream ().map (env .getAssignmentManager ().getRegionStates ()::checkReopened )
191- .filter (l -> l != null ).collect (Collectors .toList ());
192- // we need to create a set of region names because the HRegionLocation hashcode is only
193- // based
194- // on the server name
195- Set <byte []> currentRegionBatchNames = currentRegionBatch .stream ()
196- .map (r -> r .getRegion ().getRegionName ()).collect (Collectors .toSet ());
197- currentRegionBatch = regions .stream ()
198- .filter (r -> currentRegionBatchNames .contains (r .getRegion ().getRegionName ()))
199- .collect (Collectors .toList ());
200- if (currentRegionBatch .isEmpty ()) {
201- if (regions .isEmpty ()) {
202- return Flow .NO_MORE_STATE ;
203- } else {
204- setNextState (ReopenTableRegionsState .REOPEN_TABLE_REGIONS_REOPEN_REGIONS );
205- reopenBatchSize = Math .min (reopenBatchSizeMax , 2 * reopenBatchSize );
206- if (reopenBatchBackoffMillis > 0 ) {
207- setBackoffStateAndSuspend (reopenBatchBackoffMillis );
208- } else {
209- return Flow .HAS_MORE_STATE ;
210- }
211- }
191+ // update region lists based on what's been reopened
192+ regions = filterReopened (env , regions );
193+ currentRegionBatch = filterReopened (env , currentRegionBatch );
194+
195+ // existing batch didn't fully reopen, so try to resolve that first.
196+ // since this is a retry, don't do the batch backoff
197+ if (!currentRegionBatch .isEmpty ()) {
198+ return reopenIfSchedulable (env , currentRegionBatch , false );
212199 }
213- if (currentRegionBatch .stream ().anyMatch (loc -> canSchedule (env , loc ))) {
214- retryCounter = null ;
215- setNextState (ReopenTableRegionsState .REOPEN_TABLE_REGIONS_REOPEN_REGIONS );
216- if (reopenBatchBackoffMillis > 0 ) {
217- setBackoffStateAndSuspend (reopenBatchBackoffMillis );
218- } else {
219- return Flow .HAS_MORE_STATE ;
220- }
221- }
222- // We can not schedule TRSP for all the regions need to reopen, wait for a while and retry
223- // again.
224- if (retryCounter == null ) {
225- retryCounter = ProcedureUtil .createRetryCounter (env .getMasterConfiguration ());
200+
201+ if (regions .isEmpty ()) {
202+ return Flow .NO_MORE_STATE ;
226203 }
227- long backoffMillis = retryCounter .getBackoffTimeAndIncrementAttempts ();
228- LOG .info (
229- "There are still {} region(s) which need to be reopened for table {}. {} are in "
230- + "OPENING state, suspend {}secs and try again later" ,
231- regions .size (), tableName , currentRegionBatch .size (), backoffMillis / 1000 );
232- setBackoffStateAndSuspend (backoffMillis );
204+
205+ // current batch is finished, schedule more regions
206+ return reopenIfSchedulable (env , regions , true );
233207 default :
234208 throw new UnsupportedOperationException ("unhandled state=" + state );
235209 }
236210 }
237211
238- private void setBackoffStateAndSuspend (long millis ) throws ProcedureSuspendedException {
// Maps every location in regionsToCheck through RegionStates::checkReopened and collects the
// non-null results into a new list; the input list itself is not modified.
// NOTE(review): presumably checkReopened returns null for a region that has already been
// reopened, so the result is "what still needs reopening" -- confirm against RegionStates.
212+ private List <HRegionLocation > filterReopened (MasterProcedureEnv env ,
213+ List <HRegionLocation > regionsToCheck ) {
214+ return regionsToCheck .stream ().map (env .getAssignmentManager ().getRegionStates ()::checkReopened )
215+ .filter (l -> l != null ).collect (Collectors .toList ());
216+ }
217+
// Decides how the procedure continues for the given regions.
//
// If canSchedule(env, loc) is true for at least one location in regionsToReopen: resets
// retryCounter, sets the next state to REOPEN_TABLE_REGIONS_REOPEN_REGIONS and doubles
// reopenBatchSize (capped at reopenBatchSizeMax). Then, when shouldBatchBackoff is true and
// reopenBatchBackoffMillis > 0, it sets the backoff state and throws ProcedureSuspendedException;
// otherwise it returns Flow.HAS_MORE_STATE.
//
// If nothing can be scheduled: lazily creates retryCounter, takes the next backoff from it,
// logs, sets the backoff state and throws ProcedureSuspendedException (never returns normally
// on this path).
218+ private Flow reopenIfSchedulable (MasterProcedureEnv env , List <HRegionLocation > regionsToReopen ,
219+ boolean shouldBatchBackoff ) throws ProcedureSuspendedException {
220+ if (regionsToReopen .stream ().anyMatch (loc -> canSchedule (env , loc ))) {
221+ retryCounter = null ;
222+ setNextState (ReopenTableRegionsState .REOPEN_TABLE_REGIONS_REOPEN_REGIONS );
// NOTE(review): this doubling runs on every schedulable pass, including retry calls made with
// shouldBatchBackoff == false. The removed code in this diff doubled the batch size only when
// the current batch was finished and more regions remained -- confirm the change is intended.
// NOTE(review): 2 * reopenBatchSize could overflow if these are ints near Integer.MAX_VALUE / 2
// -- verify the configured maximum.
223+ reopenBatchSize = Math .min (reopenBatchSizeMax , 2 * reopenBatchSize );
224+ if (shouldBatchBackoff && reopenBatchBackoffMillis > 0 ) {
225+ setBackoffState (reopenBatchBackoffMillis );
226+ throw new ProcedureSuspendedException ();
227+ } else {
228+ return Flow .HAS_MORE_STATE ;
229+ }
230+ }
231+
232+ // We can not schedule TRSP for all the regions need to reopen, wait for a while and retry
233+ // again.
234+ if (retryCounter == null ) {
235+ retryCounter = ProcedureUtil .createRetryCounter (env .getMasterConfiguration ());
236+ }
237+ long backoffMillis = retryCounter .getBackoffTimeAndIncrementAttempts ();
238+ LOG .info (
239+ "There are still {} region(s) which need to be reopened for table {}. {} are in "
240+ + "OPENING state, suspend {}secs and try again later" ,
// NOTE(review): the counts logged here come from the regions and currentRegionBatch fields,
// not from the regionsToReopen argument; when the caller passes regions (current batch already
// empty) the "OPENING" count will be 0 -- confirm this is the intended message.
241+ regions .size (), tableName , currentRegionBatch .size (), backoffMillis / 1000 );
242+ setBackoffState (backoffMillis );
243+ throw new ProcedureSuspendedException ();
244+ }
245+
// Puts the procedure into a timed wait: sets the timeout to millis, sets the state to
// WAITING_TIMEOUT and calls skipPersistence().
// Math.toIntExact throws ArithmeticException if millis does not fit in an int.
// In the added (+) version this method does not throw ProcedureSuspendedException itself (the
// throw below is a removed line); the callers visible in this diff throw it right after calling.
246+ private void setBackoffState (long millis ) {
239247 setTimeout (Math .toIntExact (millis ));
240248 setState (ProcedureProtos .ProcedureState .WAITING_TIMEOUT );
241249 skipPersistence ();
242- throw new ProcedureSuspendedException ();
243250 }
244251
245252 private List <HRegionLocation >
0 commit comments