Skip to content

Commit 5eb4160

Browse files
HBASE-26190 High rate logging of BucketAllocatorException: Allocation too big
Reduce the frequency of allocation-failure traces by printing them periodically (once per minute). Record the allocation failures in the Bucket Cache stats and let the stats thread dump the cumulative allocation-failure count alongside the other traces it dumps. This change also adds a trace of the table name, column family, and HFile name for the most recent allocation failure within the last minute.
1 parent f000b77 commit 5eb4160

3 files changed

Lines changed: 50 additions & 4 deletions

File tree

hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileBlock.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2029,7 +2029,7 @@ static private byte[] getDummyHeaderForVersion(boolean usesHBaseChecksum) {
20292029
* @return This HFileBlocks fileContext which will a derivative of the
20302030
* fileContext for the file from which this block's data was originally read.
20312031
*/
2032-
HFileContext getHFileContext() {
2032+
public HFileContext getHFileContext() {
20332033
return this.fileContext;
20342034
}
20352035

hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/bucket/BucketCache.java

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,9 +67,11 @@
6767
import org.apache.hadoop.hbase.io.hfile.Cacheable;
6868
import org.apache.hadoop.hbase.io.hfile.CachedBlock;
6969
import org.apache.hadoop.hbase.io.hfile.HFileBlock;
70+
import org.apache.hadoop.hbase.io.hfile.HFileContext;
7071
import org.apache.hadoop.hbase.nio.ByteBuff;
7172
import org.apache.hadoop.hbase.nio.RefCnt;
7273
import org.apache.hadoop.hbase.protobuf.ProtobufMagic;
74+
import org.apache.hadoop.hbase.util.Bytes;
7375
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
7476
import org.apache.hadoop.hbase.util.IdReadWriteLock;
7577
import org.apache.hadoop.hbase.util.IdReadWriteLockStrongRef;
@@ -249,6 +251,10 @@ public class BucketCache implements BlockCache, HeapSize {
249251
* */
250252
private String algorithm;
251253

254+
/* Tracing failed Bucket Cache allocations. */
255+
private long allocFailLogPrevTs; // time of previous log event for allocation failure.
256+
private static final int ALLOCATION_FAIL_LOG_TIME_PERIOD = 60000; // Default 1 minute.
257+
252258
public BucketCache(String ioEngineName, long capacity, int blockSize, int[] bucketSizes,
253259
int writerThreadNum, int writerQLen, String persistencePath) throws IOException {
254260
this(ioEngineName, capacity, blockSize, bucketSizes, writerThreadNum, writerQLen,
@@ -291,6 +297,8 @@ public BucketCache(String ioEngineName, long capacity, int blockSize, int[] buck
291297
this.blockSize = blockSize;
292298
this.ioErrorsTolerationDuration = ioErrorsTolerationDuration;
293299

300+
this.allocFailLogPrevTs = 0;
301+
294302
bucketAllocator = new BucketAllocator(capacity, bucketSizes);
295303
for (int i = 0; i < writerThreads.length; ++i) {
296304
writerQueues.add(new ArrayBlockingQueue<>(writerQLen));
@@ -728,7 +736,8 @@ public void logStats() {
728736
(StringUtils.formatPercent(cacheStats.getHitCachingRatio(), 2)+ ", ")) +
729737
"evictions=" + cacheStats.getEvictionCount() + ", " +
730738
"evicted=" + cacheStats.getEvictedCount() + ", " +
731-
"evictedPerRun=" + cacheStats.evictedPerEviction());
739+
"evictedPerRun=" + cacheStats.evictedPerEviction() + ", " +
740+
"allocationFailCount=" + cacheStats.getAllocationFailCount());
732741
cacheStats.reset();
733742
}
734743

@@ -1036,7 +1045,29 @@ void doDrain(final List<RAMQueueEntry> entries) throws InterruptedException {
10361045
}
10371046
index++;
10381047
} catch (BucketAllocatorException fle) {
1039-
LOG.warn("Failed allocation for " + (re == null ? "" : re.getKey()) + "; " + fle);
1048+
long currTs = System.currentTimeMillis(); // Current time since Epoch in milliseconds.
1049+
cacheStats.allocationFailed(); // Record the warning.
1050+
if (allocFailLogPrevTs == 0 || (currTs - allocFailLogPrevTs) > ALLOCATION_FAIL_LOG_TIME_PERIOD) {
1051+
String failHFileName = "";
1052+
String failColumnFamily = "";
1053+
String failTableName = "";
1054+
if (re != null) {
1055+
Cacheable failData = re.getData();
1056+
if (failData instanceof HFileBlock) {
1057+
HFileBlock failBlock = (HFileBlock) failData;
1058+
HFileContext failFileContext = failBlock.getHFileContext();
1059+
failHFileName = failFileContext.getHFileName();
1060+
failColumnFamily = Bytes.toString(failFileContext.getColumnFamily());
1061+
failTableName = Bytes.toString(failFileContext.getTableName());
1062+
}
1063+
}
1064+
LOG.warn("Most recent failed allocation in " + ALLOCATION_FAIL_LOG_TIME_PERIOD +
1065+
" milliseconds; Key: " + (re == null ? "" : re.getKey()) +
1066+
", TableName = " + failTableName + ", ColumnFamily = " + failColumnFamily +
1067+
", HFileName : " + failHFileName
1068+
, fle);
1069+
allocFailLogPrevTs = currTs;
1070+
}
10401071
// Presume can't add. Too big? Move index on. Entry will be cleared from ramCache below.
10411072
bucketEntries[index] = null;
10421073
index++;

hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/bucket/BucketCacheStats.java

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,14 +35,20 @@ public class BucketCacheStats extends CacheStats {
3535
private static final long NANO_TIME = TimeUnit.MILLISECONDS.toNanos(1);
3636
private long lastLogTime = EnvironmentEdgeManager.currentTime();
3737

38+
/* Tracing failed Bucket Cache allocations. */
39+
private LongAdder allocationFailCount = new LongAdder();
40+
3841
BucketCacheStats() {
3942
super("BucketCache");
43+
44+
allocationFailCount.reset();
4045
}
4146

4247
@Override
4348
public String toString() {
4449
return super.toString() + ", ioHitsPerSecond=" + getIOHitsPerSecond() +
45-
", ioTimePerHit=" + getIOTimePerHit();
50+
", ioTimePerHit=" + getIOTimePerHit() + ", allocationFailCount=" +
51+
getAllocationFailCount();
4652
}
4753

4854
public void ioHit(long time) {
@@ -66,5 +72,14 @@ public double getIOTimePerHit() {
6672
public void reset() {
6773
ioHitCount.reset();
6874
ioHitTime.reset();
75+
allocationFailCount.reset();
76+
}
77+
78+
public long getAllocationFailCount() {
79+
return allocationFailCount.sum();
80+
}
81+
82+
public void allocationFailed () {
83+
allocationFailCount.increment();
6984
}
7085
}

0 commit comments

Comments (0)