Skip to content

Commit e1fc3c4

Browse files
authored
[HBASE-24956] ConnectionManager#locateRegionInMeta waits for user region lock indefinitely. (#2415)
Signed-off-by: Bharath Vissapragada <[email protected]> Signed-off-by: Viraj Jasani <[email protected]>
1 parent f0acafc commit e1fc3c4

File tree

2 files changed

+101
-8
lines changed

2 files changed

+101
-8
lines changed

hbase-client/src/main/java/org/apache/hadoop/hbase/client/ConnectionManager.java

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@
7272
import org.apache.hadoop.hbase.client.backoff.ClientBackoffPolicyFactory;
7373
import org.apache.hadoop.hbase.client.coprocessor.Batch;
7474
import org.apache.hadoop.hbase.exceptions.ClientExceptionsUtil;
75+
import org.apache.hadoop.hbase.exceptions.LockTimeoutException;
7576
import org.apache.hadoop.hbase.exceptions.RegionMovedException;
7677
import org.apache.hadoop.hbase.ipc.RpcClient;
7778
import org.apache.hadoop.hbase.ipc.RpcClientFactory;
@@ -1316,13 +1317,15 @@ private RegionLocations locateRegionInMeta(TableName tableName, byte[] row,
13161317

13171318
// Query the meta region
13181319
long pauseBase = this.pause;
1319-
userRegionLock.lock();
1320+
takeUserRegionLock();
13201321
try {
1321-
if (useCache) {// re-check cache after get lock
1322-
RegionLocations locations = getCachedLocation(tableName, row);
1323-
if (locations != null && locations.getRegionLocation(replicaId) != null) {
1324-
return locations;
1325-
}
1322+
// We don't need to check if useCache is enabled or not. Even if useCache is false
1323+
// we already cleared the cache for this row before acquiring userRegion lock so if this
1324+
// row is present in cache that means some other thread has populated it while we were
1325+
// waiting to acquire user region lock.
1326+
RegionLocations locations = getCachedLocation(tableName, row);
1327+
if (locations != null && locations.getRegionLocation(replicaId) != null) {
1328+
return locations;
13261329
}
13271330
Result regionInfoRow = null;
13281331
s.resetMvccReadPoint();
@@ -1339,7 +1342,7 @@ rpcControllerFactory, getMetaLookupPool(),
13391342
}
13401343

13411344
// convert the row result into the HRegionLocation we need!
1342-
RegionLocations locations = MetaTableAccessor.getRegionLocations(regionInfoRow);
1345+
locations = MetaTableAccessor.getRegionLocations(regionInfoRow);
13431346
if (locations == null || locations.getRegionLocation(replicaId) == null) {
13441347
throw new IOException("HRegionInfo was null in " +
13451348
tableName + ", row=" + regionInfoRow);
@@ -1423,6 +1426,19 @@ rpcControllerFactory, getMetaLookupPool(),
14231426
}
14241427
}
14251428

1429+
void takeUserRegionLock() throws IOException {
1430+
try {
1431+
long waitTime = connectionConfig.getMetaOperationTimeout();
1432+
if (!userRegionLock.tryLock(waitTime, TimeUnit.MILLISECONDS)) {
1433+
throw new LockTimeoutException("Failed to get user region lock in"
1434+
+ waitTime + " ms. " + " for accessing meta region server.");
1435+
}
1436+
} catch (InterruptedException ie) {
1437+
LOG.error("Interrupted while waiting for a lock", ie);
1438+
throw ExceptionUtil.asInterrupt(ie);
1439+
}
1440+
}
1441+
14261442
/**
14271443
* Put a newly discovered HRegionLocation into the cache.
14281444
* @param tableName The table name.

hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestMetaCache.java

Lines changed: 78 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
import org.apache.hadoop.hbase.*;
2424

2525
import org.apache.hadoop.hbase.exceptions.ClientExceptionsUtil;
26+
import org.apache.hadoop.hbase.exceptions.LockTimeoutException;
2627
import org.apache.hadoop.hbase.exceptions.RegionOpeningException;
2728
import org.apache.hadoop.hbase.protobuf.generated.ClientProtos;
2829
import org.apache.hadoop.hbase.protobuf.generated.ClientProtos.GetResponse;
@@ -42,6 +43,8 @@
4243
import java.io.IOException;
4344
import java.util.ArrayList;
4445
import java.util.List;
46+
import org.slf4j.Logger;
47+
import org.slf4j.LoggerFactory;
4548

4649
import static junit.framework.Assert.assertEquals;
4750
import static org.junit.Assert.assertNotNull;
@@ -57,6 +60,7 @@ public class TestMetaCache {
5760
private static final byte[] QUALIFIER = Bytes.toBytes("qual");
5861

5962
private static HRegionServer badRS;
63+
private static final Logger LOG = LoggerFactory.getLogger(TestMetaCache.class);
6064

6165
/**
6266
* @throws java.lang.Exception
@@ -356,4 +360,77 @@ public void throwOnScan(FakeRSRpcServices rpcServices, ClientProtos.ScanRequest
356360
throws ServiceException {
357361
}
358362
}
359-
}
363+
364+
@Test
365+
public void testUserRegionLockThrowsException() throws IOException, InterruptedException {
366+
((FakeRSRpcServices)badRS.getRSRpcServices()).setExceptionInjector(new LockSleepInjector());
367+
Configuration conf = new Configuration(TEST_UTIL.getConfiguration());
368+
conf.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, 1);
369+
conf.setLong(HConstants.HBASE_CLIENT_META_OPERATION_TIMEOUT, 2000);
370+
conf.setLong(HConstants.HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD, 2000);
371+
372+
try (ConnectionManager.HConnectionImplementation conn =
373+
(ConnectionManager.HConnectionImplementation) ConnectionFactory.createConnection(conf)) {
374+
ClientThread client1 = new ClientThread(conn);
375+
ClientThread client2 = new ClientThread(conn);
376+
client1.start();
377+
client2.start();
378+
client1.join();
379+
client2.join();
380+
// One thread will get the lock but will sleep in LockExceptionInjector#throwOnScan and
381+
// eventually fail since the sleep time is more than hbase client scanner timeout period.
382+
// Other thread will wait to acquire userRegionLock.
383+
// Have no idea which thread will be scheduled first. So need to check both threads.
384+
385+
// Both the threads will throw exception. One thread will throw exception since after
386+
// acquiring user region lock, it is sleeping for 5 seconds when the scanner time out period
387+
// is 2 seconds.
388+
// Other thread will throw exception since it was not able to get hold of user region lock
389+
// within meta operation timeout period.
390+
assertNotNull(client1.getException());
391+
assertNotNull(client2.getException());
392+
393+
assertTrue(client1.getException() instanceof LockTimeoutException
394+
^ client2.getException() instanceof LockTimeoutException);
395+
}
396+
}
397+
398+
private final class ClientThread extends Thread {
399+
private Exception exception;
400+
private ConnectionManager.HConnectionImplementation connection;
401+
402+
private ClientThread(ConnectionManager.HConnectionImplementation connection) {
403+
this.connection = connection;
404+
}
405+
@Override
406+
public void run() {
407+
byte[] currentKey = HConstants.EMPTY_START_ROW;
408+
try {
409+
connection.getRegionLocation(TABLE_NAME, currentKey, true);
410+
} catch (IOException e) {
411+
LOG.error("Thread id: " + this.getId() + " exception: ", e);
412+
this.exception = e;
413+
}
414+
}
415+
public Exception getException() {
416+
return exception;
417+
}
418+
}
419+
420+
public static class LockSleepInjector extends ExceptionInjector {
421+
@Override
422+
public void throwOnScan(FakeRSRpcServices rpcServices, ClientProtos.ScanRequest request) {
423+
try {
424+
Thread.sleep(5000);
425+
} catch (InterruptedException e) {
426+
LOG.info("Interrupted exception", e);
427+
}
428+
}
429+
430+
@Override
431+
public void throwOnGet(FakeRSRpcServices rpcServices, ClientProtos.GetRequest request) { }
432+
433+
@Override
434+
public void throwOnMutate(FakeRSRpcServices rpcServices, ClientProtos.MutateRequest request) { }
435+
}
436+
}

0 commit comments

Comments
 (0)