2727import java .util .List ;
2828import java .util .Map ;
2929import java .util .Set ;
30+ import java .util .function .Function ;
31+
3032import org .apache .commons .lang3 .StringUtils ;
33+ import org .apache .hadoop .hbase .DoNotRetryIOException ;
3134import org .apache .hadoop .hbase .NamespaceDescriptor ;
3235import org .apache .hadoop .hbase .ServerName ;
3336import org .apache .hadoop .hbase .TableName ;
5457@ InterfaceAudience .Private
5558public class RSGroupAdminServer implements RSGroupAdmin {
5659 private static final Logger LOG = LoggerFactory .getLogger (RSGroupAdminServer .class );
57- public static final String KEEP_ONE_SERVER_IN_DEFAULT_ERROR_MESSAGE = "should keep at least " +
60+ static final String KEEP_ONE_SERVER_IN_DEFAULT_ERROR_MESSAGE = "should keep at least " +
5861 "one server in 'default' RSGroup." ;
5962
6063 private MasterServices master ;
6164 private final RSGroupInfoManager rsGroupInfoManager ;
6265
66+ /** Define the config key of retries threshold when movements failed */
67+ //made package private for testing
68+ static final String FAILED_MOVE_MAX_RETRY = "hbase.rsgroup.move.max.retry" ;
69+
70+ /** Define the default number of retries */
71+ //made package private for testing
72+ static final int DEFAULT_MAX_RETRY_VALUE = 50 ;
73+
74+ private int moveMaxRetry ;
75+
6376 public RSGroupAdminServer (MasterServices master , RSGroupInfoManager rsGroupInfoManager ) {
6477 this .master = master ;
6578 this .rsGroupInfoManager = rsGroupInfoManager ;
79+ this .moveMaxRetry = master .getConfiguration ().getInt (FAILED_MOVE_MAX_RETRY ,
80+ DEFAULT_MAX_RETRY_VALUE );
6681 }
6782
6883 @ Override
@@ -202,20 +217,77 @@ private void checkServersAndTables(Set<Address> servers, Set<TableName> tables,
202217 * @throws IOException if moving the server and tables fail
203218 */
204219 private void moveServerRegionsFromGroup (Set <Address > servers , String targetGroupName )
205- throws IOException {
220+ throws IOException {
221+ moveRegionsBetweenGroups (servers , targetGroupName ,
222+ rs -> getRegions (rs ),
223+ info -> {
224+ try {
225+ RSGroupInfo group = getRSGroupInfo (targetGroupName );
226+ return group .containsTable (info .getTable ());
227+ } catch (IOException e ) {
228+ e .printStackTrace ();
229+ return false ;
230+ }
231+ },
232+ rs -> rs .getHostname ());
233+ }
234+
235+ /**
236+ * Moves regions of tables which are not on target group servers.
237+ *
238+ * @param tables the tables that will move to new group
239+ * @param targetGroupName the target group name
240+ * @throws IOException if moving the region fails
241+ */
242+ private void moveTableRegionsToGroup (Set <TableName > tables , String targetGroupName )
243+ throws IOException {
244+ moveRegionsBetweenGroups (tables , targetGroupName ,
245+ table -> {
246+ if (master .getAssignmentManager ().isTableDisabled (table )) {
247+ return new ArrayList <>();
248+ }
249+ return master .getAssignmentManager ().getRegionStates ().getRegionsOfTable (table );
250+ },
251+ info -> {
252+ try {
253+ RSGroupInfo group = getRSGroupInfo (targetGroupName );
254+ ServerName sn =
255+ master .getAssignmentManager ().getRegionStates ().getRegionServerOfRegion (info );
256+ return group .containsServer (sn .getAddress ());
257+ } catch (IOException e ) {
258+ e .printStackTrace ();
259+ return false ;
260+ }
261+ },
262+ table -> table .getNameWithNamespaceInclAsString ());
263+ }
264+
265+ private <T > void moveRegionsBetweenGroups (Set <T > regionsOwners , String targetGroupName ,
266+ Function <T , List <RegionInfo >> getRegionsInfo , Function <RegionInfo , Boolean > validation ,
267+ Function <T , String > getOwnerName ) throws IOException {
206268 boolean hasRegionsToMove ;
207- RSGroupInfo targetGrp = getRSGroupInfo (targetGroupName );
208- Set <Address > allSevers = new HashSet <>(servers );
269+ int retry = 0 ;
270+ Set <T > allOwners = new HashSet <>(regionsOwners );
271+ Set <String > failedRegions = new HashSet <>();
272+ IOException toThrow = null ;
209273 do {
210274 hasRegionsToMove = false ;
211- for (Iterator <Address > iter = allSevers .iterator (); iter .hasNext ();) {
212- Address rs = iter .next ();
275+ for (Iterator <T > iter = allOwners .iterator (); iter .hasNext (); ) {
276+ T owner = iter .next ();
213277 // Get regions that are associated with this server and filter regions by group tables.
214- for (RegionInfo region : getRegions ( rs )) {
215- if (!targetGrp . containsTable (region . getTable () )) {
216- LOG .info ("Moving server region {}, which do not belong to RSGroup {}" ,
278+ for (RegionInfo region : getRegionsInfo . apply ( owner )) {
279+ if (!validation . apply (region )) {
280+ LOG .info ("Moving region {}, which do not belong to RSGroup {}" ,
217281 region .getShortNameToLog (), targetGroupName );
218- this .master .getAssignmentManager ().move (region );
282+ try {
283+ this .master .getAssignmentManager ().move (region );
284+ failedRegions .remove (region .getRegionNameAsString ());
285+ } catch (IOException ioe ) {
286+ LOG .debug ("Move region {} from group failed, will retry, current retry time is {}" ,
287+ region .getShortNameToLog (), retry , ioe );
288+ toThrow = ioe ;
289+ failedRegions .add (region .getRegionNameAsString ());
290+ }
219291 if (master .getAssignmentManager ().getRegionStates ().
220292 getRegionState (region ).isFailedOpen ()) {
221293 continue ;
@@ -225,44 +297,30 @@ private void moveServerRegionsFromGroup(Set<Address> servers, String targetGroup
225297 }
226298
227299 if (!hasRegionsToMove ) {
228- LOG .info ("Server {} has no more regions to move for RSGroup" , rs . getHostname ( ));
300+ LOG .info ("No more regions to move from {} to RSGroup" , getOwnerName . apply ( owner ));
229301 iter .remove ();
230302 }
231303 }
304+
305+ retry ++;
232306 try {
233307 rsGroupInfoManager .wait (1000 );
234308 } catch (InterruptedException e ) {
235309 LOG .warn ("Sleep interrupted" , e );
236310 Thread .currentThread ().interrupt ();
237311 }
238- } while (hasRegionsToMove );
239- }
240-
241- /**
242- * Moves regions of tables which are not on target group servers.
243- *
244- * @param tables the tables that will move to new group
245- * @param targetGroupName the target group name
246- * @throws IOException if moving the region fails
247- */
248- private void moveTableRegionsToGroup (Set <TableName > tables , String targetGroupName )
249- throws IOException {
250- RSGroupInfo targetGrp = getRSGroupInfo (targetGroupName );
251- for (TableName table : tables ) {
252- if (master .getAssignmentManager ().isTableDisabled (table )) {
253- LOG .debug ("Skipping move regions because the table {} is disabled" , table );
254- continue ;
255- }
256- LOG .info ("Moving region(s) for table {} to RSGroup {}" , table , targetGroupName );
257- for (RegionInfo region : master .getAssignmentManager ().getRegionStates ()
258- .getRegionsOfTable (table )) {
259- ServerName sn =
260- master .getAssignmentManager ().getRegionStates ().getRegionServerOfRegion (region );
261- if (!targetGrp .containsServer (sn .getAddress ())) {
262- LOG .info ("Moving region {} to RSGroup {}" , region .getShortNameToLog (), targetGroupName );
263- master .getAssignmentManager ().move (region );
264- }
265- }
312+ } while (hasRegionsToMove && retry <= moveMaxRetry );
313+
314+ //has up to max retry time or there are no more regions to move
315+ if (hasRegionsToMove ) {
316+ // print failed moved regions, for later process conveniently
317+ String msg = String
318+ .format ("move regions for group %s failed, failed regions: %s" , targetGroupName ,
319+ failedRegions );
320+ LOG .error (msg );
321+ throw new DoNotRetryIOException (
322+ msg + ", just record the last failed region's cause, more details in server log" ,
323+ toThrow );
266324 }
267325 }
268326
0 commit comments