Skip to content

Commit 74731c2

Browse files
sunhellywchevreuil
authored andcommitted
HBASE-22414 Interruption of moving regions in RSGroup will cause regions on wrong rs
Signed-off-by: Wellington Chevreuil <wchevreuil@apache.org>
1 parent 380f80f commit 74731c2

3 files changed

Lines changed: 315 additions & 42 deletions

File tree

hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupAdminServer.java

Lines changed: 97 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,10 @@
2727
import java.util.List;
2828
import java.util.Map;
2929
import java.util.Set;
30+
import java.util.function.Function;
31+
3032
import org.apache.commons.lang3.StringUtils;
33+
import org.apache.hadoop.hbase.DoNotRetryIOException;
3134
import org.apache.hadoop.hbase.NamespaceDescriptor;
3235
import org.apache.hadoop.hbase.ServerName;
3336
import org.apache.hadoop.hbase.TableName;
@@ -54,15 +57,27 @@
5457
@InterfaceAudience.Private
5558
public class RSGroupAdminServer implements RSGroupAdmin {
5659
private static final Logger LOG = LoggerFactory.getLogger(RSGroupAdminServer.class);
57-
public static final String KEEP_ONE_SERVER_IN_DEFAULT_ERROR_MESSAGE = "should keep at least " +
60+
static final String KEEP_ONE_SERVER_IN_DEFAULT_ERROR_MESSAGE = "should keep at least " +
5861
"one server in 'default' RSGroup.";
5962

6063
private MasterServices master;
6164
private final RSGroupInfoManager rsGroupInfoManager;
6265

66+
/** Define the config key of retries threshold when movements failed */
67+
//made package private for testing
68+
static final String FAILED_MOVE_MAX_RETRY = "hbase.rsgroup.move.max.retry";
69+
70+
/** Define the default number of retries */
71+
//made package private for testing
72+
static final int DEFAULT_MAX_RETRY_VALUE = 50;
73+
74+
private int moveMaxRetry;
75+
6376
public RSGroupAdminServer(MasterServices master, RSGroupInfoManager rsGroupInfoManager) {
6477
this.master = master;
6578
this.rsGroupInfoManager = rsGroupInfoManager;
79+
this.moveMaxRetry = master.getConfiguration().getInt(FAILED_MOVE_MAX_RETRY,
80+
DEFAULT_MAX_RETRY_VALUE);
6681
}
6782

6883
@Override
@@ -202,20 +217,77 @@ private void checkServersAndTables(Set<Address> servers, Set<TableName> tables,
202217
* @throws IOException if moving the server and tables fail
203218
*/
204219
private void moveServerRegionsFromGroup(Set<Address> servers, String targetGroupName)
205-
throws IOException {
220+
throws IOException {
221+
moveRegionsBetweenGroups(servers, targetGroupName,
222+
rs -> getRegions(rs),
223+
info -> {
224+
try {
225+
RSGroupInfo group = getRSGroupInfo(targetGroupName);
226+
return group.containsTable(info.getTable());
227+
} catch (IOException e) {
228+
e.printStackTrace();
229+
return false;
230+
}
231+
},
232+
rs -> rs.getHostname());
233+
}
234+
235+
/**
236+
* Moves regions of tables which are not on target group servers.
237+
*
238+
* @param tables the tables that will move to new group
239+
* @param targetGroupName the target group name
240+
* @throws IOException if moving the region fails
241+
*/
242+
private void moveTableRegionsToGroup(Set<TableName> tables, String targetGroupName)
243+
throws IOException {
244+
moveRegionsBetweenGroups(tables, targetGroupName,
245+
table -> {
246+
if (master.getAssignmentManager().isTableDisabled(table)) {
247+
return new ArrayList<>();
248+
}
249+
return master.getAssignmentManager().getRegionStates().getRegionsOfTable(table);
250+
},
251+
info -> {
252+
try {
253+
RSGroupInfo group = getRSGroupInfo(targetGroupName);
254+
ServerName sn =
255+
master.getAssignmentManager().getRegionStates().getRegionServerOfRegion(info);
256+
return group.containsServer(sn.getAddress());
257+
} catch (IOException e) {
258+
e.printStackTrace();
259+
return false;
260+
}
261+
},
262+
table -> table.getNameWithNamespaceInclAsString());
263+
}
264+
265+
private <T> void moveRegionsBetweenGroups(Set<T> regionsOwners, String targetGroupName,
266+
Function<T, List<RegionInfo>> getRegionsInfo, Function<RegionInfo, Boolean> validation,
267+
Function<T, String> getOwnerName) throws IOException {
206268
boolean hasRegionsToMove;
207-
RSGroupInfo targetGrp = getRSGroupInfo(targetGroupName);
208-
Set<Address> allSevers = new HashSet<>(servers);
269+
int retry = 0;
270+
Set<T> allOwners = new HashSet<>(regionsOwners);
271+
Set<String> failedRegions = new HashSet<>();
272+
IOException toThrow = null;
209273
do {
210274
hasRegionsToMove = false;
211-
for (Iterator<Address> iter = allSevers.iterator(); iter.hasNext();) {
212-
Address rs = iter.next();
275+
for (Iterator<T> iter = allOwners.iterator(); iter.hasNext(); ) {
276+
T owner = iter.next();
213277
// Get regions that are associated with this server and filter regions by group tables.
214-
for (RegionInfo region : getRegions(rs)) {
215-
if (!targetGrp.containsTable(region.getTable())) {
216-
LOG.info("Moving server region {}, which do not belong to RSGroup {}",
278+
for (RegionInfo region : getRegionsInfo.apply(owner)) {
279+
if (!validation.apply(region)) {
280+
LOG.info("Moving region {}, which do not belong to RSGroup {}",
217281
region.getShortNameToLog(), targetGroupName);
218-
this.master.getAssignmentManager().move(region);
282+
try {
283+
this.master.getAssignmentManager().move(region);
284+
failedRegions.remove(region.getRegionNameAsString());
285+
} catch (IOException ioe) {
286+
LOG.debug("Move region {} from group failed, will retry, current retry time is {}",
287+
region.getShortNameToLog(), retry, ioe);
288+
toThrow = ioe;
289+
failedRegions.add(region.getRegionNameAsString());
290+
}
219291
if (master.getAssignmentManager().getRegionStates().
220292
getRegionState(region).isFailedOpen()) {
221293
continue;
@@ -225,44 +297,30 @@ private void moveServerRegionsFromGroup(Set<Address> servers, String targetGroup
225297
}
226298

227299
if (!hasRegionsToMove) {
228-
LOG.info("Server {} has no more regions to move for RSGroup", rs.getHostname());
300+
LOG.info("No more regions to move from {} to RSGroup", getOwnerName.apply(owner));
229301
iter.remove();
230302
}
231303
}
304+
305+
retry++;
232306
try {
233307
rsGroupInfoManager.wait(1000);
234308
} catch (InterruptedException e) {
235309
LOG.warn("Sleep interrupted", e);
236310
Thread.currentThread().interrupt();
237311
}
238-
} while (hasRegionsToMove);
239-
}
240-
241-
/**
242-
* Moves regions of tables which are not on target group servers.
243-
*
244-
* @param tables the tables that will move to new group
245-
* @param targetGroupName the target group name
246-
* @throws IOException if moving the region fails
247-
*/
248-
private void moveTableRegionsToGroup(Set<TableName> tables, String targetGroupName)
249-
throws IOException {
250-
RSGroupInfo targetGrp = getRSGroupInfo(targetGroupName);
251-
for (TableName table : tables) {
252-
if (master.getAssignmentManager().isTableDisabled(table)) {
253-
LOG.debug("Skipping move regions because the table {} is disabled", table);
254-
continue;
255-
}
256-
LOG.info("Moving region(s) for table {} to RSGroup {}", table, targetGroupName);
257-
for (RegionInfo region : master.getAssignmentManager().getRegionStates()
258-
.getRegionsOfTable(table)) {
259-
ServerName sn =
260-
master.getAssignmentManager().getRegionStates().getRegionServerOfRegion(region);
261-
if (!targetGrp.containsServer(sn.getAddress())) {
262-
LOG.info("Moving region {} to RSGroup {}", region.getShortNameToLog(), targetGroupName);
263-
master.getAssignmentManager().move(region);
264-
}
265-
}
312+
} while (hasRegionsToMove && retry <= moveMaxRetry);
313+
314+
//has up to max retry time or there are no more regions to move
315+
if (hasRegionsToMove) {
316+
// print failed moved regions, for later process conveniently
317+
String msg = String
318+
.format("move regions for group %s failed, failed regions: %s", targetGroupName,
319+
failedRegions);
320+
LOG.error(msg);
321+
throw new DoNotRetryIOException(
322+
msg + ", just record the last failed region's cause, more details in server log",
323+
toThrow);
266324
}
267325
}
268326

0 commit comments

Comments
 (0)