Skip to content

Commit 1eb159b

Browse files
authored
HBASE-24511 Ability to configure timeout between RPC retry to RS from master (#1861)
Signed-off-by: Viraj Jasani <vjasani@apache.org>
1 parent 7b396e9 commit 1eb159b

2 files changed

Lines changed: 50 additions & 5 deletions

File tree

hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -234,12 +234,19 @@ protected class ExecuteProceduresRemoteCall implements RemoteProcedureResolver,
234234
private int numberOfAttemptsSoFar = 0;
235235
private long maxWaitTime = -1;
236236

237+
private final long rsRpcRetryInterval;
238+
private static final String RS_RPC_RETRY_INTERVAL_CONF_KEY =
239+
"hbase.regionserver.rpc.retry.interval";
240+
private static final int DEFAULT_RS_RPC_RETRY_INTERVAL = 100;
241+
237242
private ExecuteProceduresRequest.Builder request = null;
238243

239244
public ExecuteProceduresRemoteCall(final ServerName serverName,
240245
final Set<RemoteProcedure> remoteProcedures) {
241246
this.serverName = serverName;
242247
this.remoteProcedures = remoteProcedures;
248+
this.rsRpcRetryInterval = master.getConfiguration().getLong(RS_RPC_RETRY_INTERVAL_CONF_KEY,
249+
DEFAULT_RS_RPC_RETRY_INTERVAL);
243250
}
244251

245252
private AsyncRegionServerAdmin getRsAdmin() throws IOException {
@@ -259,8 +266,8 @@ private boolean scheduleForRetry(IOException e) {
259266
LOG.warn("Waiting a little before retrying {}, try={}, can wait up to {}ms",
260267
serverName, numberOfAttemptsSoFar, remainingTime);
261268
numberOfAttemptsSoFar++;
262-
// Retry every 100ms up to maximum wait time.
263-
submitTask(this, 100, TimeUnit.MILLISECONDS);
269+
// Retry every rsRpcRetryInterval millis up to maximum wait time.
270+
submitTask(this, rsRpcRetryInterval, TimeUnit.MILLISECONDS);
264271
return true;
265272
}
266273
LOG.warn("{} is throwing ServerNotRunningYetException for {}ms; trying another server",
@@ -305,10 +312,12 @@ private boolean scheduleForRetry(IOException e) {
305312
numberOfAttemptsSoFar++;
306313
// Add some backoff here as the attempts rise otherwise if a stuck condition, will fill logs
307314
// with failed attempts. None of our backoff classes -- RetryCounter or ClientBackoffPolicy
308-
// -- fit here nicely so just do something simple; increment by 100ms * retry^2 on each try
315+
// -- fit here nicely so just do something simple; increment by rsRpcRetryInterval millis *
316+
// retry^2 on each try
309317
// up to max of 10 seconds (don't want to back off too much in case of situation change).
310318
submitTask(this,
311-
Math.min(100 * (this.numberOfAttemptsSoFar * this.numberOfAttemptsSoFar), 10 * 1000),
319+
Math.min(rsRpcRetryInterval * (this.numberOfAttemptsSoFar * this.numberOfAttemptsSoFar),
320+
10 * 1000),
312321
TimeUnit.MILLISECONDS);
313322
return true;
314323
}

hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerReportForDuty.java

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,8 @@
2222

2323
import java.io.IOException;
2424
import java.io.StringWriter;
25-
25+
import java.util.concurrent.ScheduledThreadPoolExecutor;
26+
import java.util.concurrent.TimeUnit;
2627
import org.apache.commons.lang3.StringUtils;
2728
import org.apache.hadoop.conf.Configuration;
2829
import org.apache.hadoop.hbase.HBaseClassTestRule;
@@ -38,6 +39,7 @@
3839
import org.apache.hadoop.hbase.testclassification.LargeTests;
3940
import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
4041
import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
42+
import org.apache.hadoop.hbase.util.Threads;
4143
import org.apache.log4j.Appender;
4244
import org.apache.log4j.Layout;
4345
import org.apache.log4j.PatternLayout;
@@ -222,6 +224,40 @@ public void testReportForDutyWithMasterChange() throws Exception {
222224
tablesOnMaster? 3: 2);
223225

224226
}
227+
228+
/**
229+
* Tests region server reportForDuty with RS RPC retry
230+
*/
231+
@Test
232+
public void testReportForDutyWithRSRpcRetry() throws Exception {
233+
ScheduledThreadPoolExecutor scheduledThreadPoolExecutor =
234+
new ScheduledThreadPoolExecutor(1, Threads.newDaemonThreadFactory("RSDelayedStart"));
235+
236+
// Start a master and wait for it to become the active/primary master.
237+
// Use a random unique port
238+
cluster.getConfiguration().setInt(HConstants.MASTER_PORT, HBaseTestingUtility.randomFreePort());
239+
// Override the default RS RPC retry interval of 100ms to 300ms
240+
cluster.getConfiguration().setLong("hbase.regionserver.rpc.retry.interval", 300);
241+
// master has a rs. defaultMinToStart = 2
242+
boolean tablesOnMaster = LoadBalancer.isTablesOnMaster(testUtil.getConfiguration());
243+
cluster.getConfiguration().setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART,
244+
tablesOnMaster ? 2 : 1);
245+
cluster.getConfiguration().setInt(ServerManager.WAIT_ON_REGIONSERVERS_MAXTOSTART,
246+
tablesOnMaster ? 2 : 1);
247+
master = cluster.addMaster();
248+
rs = cluster.addRegionServer();
249+
LOG.debug("Starting master: " + master.getMaster().getServerName());
250+
master.start();
251+
// Delay the RS start so that the meta assignment fails on the first attempt and goes to the retry block
252+
scheduledThreadPoolExecutor.schedule(new Runnable() {
253+
@Override
254+
public void run() {
255+
rs.start();
256+
}
257+
}, 1000, TimeUnit.MILLISECONDS);
258+
259+
waitForClusterOnline(master);
260+
}
225261

226262
private void waitForClusterOnline(MasterThread master) throws InterruptedException {
227263
while (true) {

0 commit comments

Comments
 (0)