Skip to content

Commit e379b25

Browse files
author
Daniel Roudnitsky
committed
HBASE-27781 Fix a case where the action counter assertion fails when handling a batch operation that exceeded its operation timeout
1 parent c3b807c commit e379b25

2 files changed

Lines changed: 111 additions & 17 deletions

File tree

hbase-client/src/main/java/org/apache/hadoop/hbase/client/AsyncRequestFutureImpl.java

Lines changed: 46 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -448,30 +448,22 @@ void groupAndSendMultiAction(List<Action> currentActions, int numAttempt) {
448448

449449
boolean isReplica = false;
450450
List<Action> unknownReplicaActions = null;
451+
List<Action> locateRegionFailedActions = null;
451452
for (Action action : currentActions) {
452453
if (isOperationTimeoutExceeded()) {
453-
String message = numAttempt == 1
454-
? "Operation timeout exceeded during resolution of region locations, "
455-
+ "prior to executing any actions."
456-
: "Operation timeout exceeded during re-resolution of region locations on retry "
457-
+ (numAttempt - 1) + ".";
458-
459-
message += " Meta may be slow or operation timeout too short for batch size or retries.";
460-
OperationTimeoutExceededException exception =
461-
new OperationTimeoutExceededException(message);
462-
463-
// Clear any actions we already resolved, because none will have been executed yet
464-
// We are going to fail all passed actions because there's no way we can execute any
465-
// if operation timeout is exceeded.
466454
actionsByServer.clear();
467-
for (Action actionToFail : currentActions) {
468-
manageLocationError(actionToFail, exception);
469-
}
455+
failIncompleteActionsWithOpTimeout(currentActions, locateRegionFailedActions, numAttempt);
470456
return;
471457
}
472458

473459
RegionLocations locs = findAllLocationsOrFail(action, true);
474-
if (locs == null) continue;
460+
if (locs == null) {
461+
if (locateRegionFailedActions == null) {
462+
locateRegionFailedActions = new ArrayList<>(1);
463+
}
464+
locateRegionFailedActions.add(action);
465+
continue;
466+
}
475467
boolean isReplicaAction = !RegionReplicaUtil.isDefaultReplica(action.getReplicaId());
476468
if (isReplica && !isReplicaAction) {
477469
// This is the property of the current implementation, not a requirement.
@@ -488,6 +480,10 @@ void groupAndSendMultiAction(List<Action> currentActions, int numAttempt) {
488480
} else {
489481
// TODO: relies on primary location always being fetched
490482
manageLocationError(action, null);
483+
if (locateRegionFailedActions == null) {
484+
locateRegionFailedActions = new ArrayList<>(1);
485+
}
486+
locateRegionFailedActions.add(action);
491487
}
492488
} else {
493489
byte[] regionName = loc.getRegionInfo().getRegionName();
@@ -561,6 +557,39 @@ private RegionLocations findAllLocationsOrFail(Action action, boolean useCache)
561557
return loc;
562558
}
563559

560+
/**
561+
* For failing all actions that were being grouped during a groupAndSendMultiAction when operation
562+
* timeout was exceeded and there is no time remaining to continue grouping/sending any of the
563+
* actions. We don't fail any actions which have already failed to completion during grouping due
564+
* to location error (they already have an error set and had action counter decremented for)
565+
* @param actions actions being processed by the groupAndSend when operation
566+
* timeout occurred
567+
* @param locateRegionFailedActions actions already failed to completion due to location error
568+
* @param numAttempt the number of attempts so far
569+
*/
570+
private void failIncompleteActionsWithOpTimeout(List<Action> actions,
571+
List<Action> locateRegionFailedActions, int numAttempt) {
572+
String message = numAttempt == 1
573+
? "Operation timeout exceeded during resolution of region locations, "
574+
+ "prior to executing any actions."
575+
: "Operation timeout exceeded during re-resolution of region locations on retry "
576+
+ (numAttempt - 1) + ".";
577+
message += " Meta may be slow or operation timeout too short for batch size or retries.";
578+
OperationTimeoutExceededException exception = new OperationTimeoutExceededException(message);
579+
580+
for (Action actionToFail : actions) {
581+
// Action equality is implemented as row equality so we check action index equality
582+
// since we don't want two different actions for the same row to be considered equal here
583+
boolean actionAlreadyFailed =
584+
locateRegionFailedActions != null && locateRegionFailedActions.stream().anyMatch(
585+
failedAction -> failedAction.getOriginalIndex() == actionToFail.getOriginalIndex()
586+
&& failedAction.getReplicaId() == actionToFail.getReplicaId());
587+
if (!actionAlreadyFailed) {
588+
manageLocationError(actionToFail, exception);
589+
}
590+
}
591+
}
592+
564593
/**
565594
* Send a multi action structure to the servers, after a delay depending on the attempt number.
566595
* Asynchronous.

hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestClientOperationTimeout.java

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import java.io.IOException;
2323
import java.net.SocketTimeoutException;
2424
import java.util.ArrayList;
25+
import java.util.Arrays;
2526
import java.util.List;
2627
import org.apache.hadoop.conf.Configuration;
2728
import org.apache.hadoop.hbase.HBaseClassTestRule;
@@ -321,6 +322,70 @@ public void testMultiGetRetryTimeout() {
321322
}
322323
}
323324

325+
/**
326+
* Test that for a batch operation where region location resolution fails for the first action in
327+
* the batch and consumes the entire operation timeout, that the location error is preserved for
328+
* the first action and that the rest of the batch is failed fast with
329+
* OperationTimeoutExceededException , this also (indirectly) tests that the action counter is
330+
* decremented properly for all actions, see last catch block
331+
*/
332+
@Test
333+
public void testMultiOperationTimoutWithLocationError() throws IOException, InterruptedException {
334+
// Need meta delay > meta scan timeout > operation timeout (with no retries) so that the
335+
// meta scan for resolving region location for the first action times out after the operation
336+
// timeout has been exceeded leaving no time to attempt region location resolution for any
337+
// other actions remaining in the batch
338+
int operationTimeout = 100;
339+
int metaScanTimeout = 150;
340+
DELAY_META_SCAN = 200;
341+
342+
Configuration conf = new Configuration(UTIL.getConfiguration());
343+
conf.setLong(HConstants.HBASE_CLIENT_OPERATION_TIMEOUT, operationTimeout);
344+
conf.setLong(ConnectionConfiguration.HBASE_CLIENT_META_SCANNER_TIMEOUT, metaScanTimeout);
345+
conf.setLong(HConstants.HBASE_CLIENT_RETRIES_NUMBER, 0);
346+
347+
try (Connection specialConnection = ConnectionFactory.createConnection(conf);
348+
Table specialTable = specialConnection.getTable(TABLE_NAME)) {
349+
350+
// Region location resolution for first action should fail due to meta scan timeout and cause
351+
// the batch to exceed the operation timeout, second and third action should then be failed
352+
// fast with OperationTimeoutExceededException
353+
Get firstAction = new Get(Bytes.toBytes(0)).addColumn(FAMILY, QUALIFIER);
354+
Get secondAction = firstAction;
355+
Get thirdAction = new Get(Bytes.toBytes(1)).addColumn(FAMILY, QUALIFIER);
356+
List<Get> gets = Arrays.asList(firstAction, secondAction, thirdAction);
357+
try {
358+
specialTable.batch(gets, new Object[3]);
359+
Assert.fail("Should not reach here");
360+
} catch (RetriesExhaustedWithDetailsException exception) {
361+
byte[] firstExceptionRow = exception.getRow(0).getRow();
362+
Assert.assertEquals(firstAction.getRow(), firstExceptionRow);
363+
364+
// CallTimeout comes from the scan timeout to meta table in locateRegionInMeta
365+
Throwable firstActionCause = exception.getCause(0);
366+
Assert.assertTrue(firstActionCause instanceof RetriesExhaustedException);
367+
Assert.assertTrue(firstActionCause.getCause() instanceof CallTimeoutException);
368+
369+
byte[] secondExceptionRow = exception.getRow(1).getRow();
370+
Assert.assertEquals(secondAction.getRow(), secondExceptionRow);
371+
372+
Throwable secondActionCause = exception.getCause(1);
373+
Assert.assertTrue(secondActionCause instanceof OperationTimeoutExceededException);
374+
375+
byte[] thirdExceptionRow = exception.getRow(2).getRow();
376+
Assert.assertEquals(thirdAction.getRow(), thirdExceptionRow);
377+
378+
Throwable thirdActionCause = exception.getCause(2);
379+
Assert.assertTrue(thirdActionCause instanceof OperationTimeoutExceededException);
380+
}
381+
} catch (SocketTimeoutException ste) {
382+
if (ste.getMessage().contains("time out before the actionsInProgress changed to zero")) {
383+
Assert.fail("Not all actions had action counter decremented: " + ste);
384+
}
385+
throw ste;
386+
}
387+
}
388+
324389
/**
325390
* Tests that scan on a table throws {@link RetriesExhaustedException} when the operation takes
326391
* longer than 'hbase.client.scanner.timeout.period'.

0 commit comments

Comments
 (0)