Skip to content

Commit 022dd5f

Browse files
committed
HBASE-25720 Sync WAL stuck when prepare flush cache will prevent flush cache and cause OOM
1 parent 446f22f commit 022dd5f

3 files changed

Lines changed: 26 additions & 33 deletions

File tree

hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegion.java

Lines changed: 26 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2842,7 +2842,13 @@ protected PrepareFlushResult internalPrepareFlushCache(WAL wal, long myseqid,
28422842
String s = "Finished memstore snapshotting " + this + ", syncing WAL and waiting on mvcc, " +
28432843
"flushsize=" + totalSizeOfFlushableStores;
28442844
status.setStatus(s);
2845-
doSyncOfUnflushedWALChanges(wal, getRegionInfo());
2845+
2846+
try {
2847+
doSyncOfUnflushedWALChanges(wal, getRegionInfo());
2848+
} catch (Throwable t) {
2849+
status.abort("Sync unflushed WAL changes failed: " + StringUtils.stringifyException(t));
2850+
fatalForFlushCache(t);
2851+
}
28462852
return new PrepareFlushResult(storeFlushCtxs, committedFiles, storeFlushableSize, startTime,
28472853
flushOpSeqId, flushedSeqId, totalSizeOfFlushableStores);
28482854
}
@@ -3032,22 +3038,8 @@ FlushResultImpl internalFlushCacheAndCommit(WAL wal, MonitoredTask status,
30323038
}
30333039
wal.abortCacheFlush(this.getRegionInfo().getEncodedNameAsBytes());
30343040
}
3035-
DroppedSnapshotException dse = new DroppedSnapshotException("region: " +
3036-
Bytes.toStringBinary(getRegionInfo().getRegionName()), t);
30373041
status.abort("Flush failed: " + StringUtils.stringifyException(t));
3038-
3039-
// Callers for flushcache() should catch DroppedSnapshotException and abort the region server.
3040-
// However, since we may have the region read lock, we cannot call close(true) here since
3041-
// we cannot promote to a write lock. Instead we are setting closing so that all other region
3042-
// operations except for close will be rejected.
3043-
this.closing.set(true);
3044-
3045-
if (rsServices != null) {
3046-
// This is a safeguard against the case where the caller fails to explicitly handle aborting
3047-
rsServices.abort("Replay of WAL required. Forcing server shutdown", dse);
3048-
}
3049-
3050-
throw dse;
3042+
fatalForFlushCache(t);
30513043
}
30523044

30533045
// If we get to here, the HStores have been written.
@@ -3093,6 +3085,24 @@ FlushResultImpl internalFlushCacheAndCommit(WAL wal, MonitoredTask status,
30933085
FlushResult.Result.FLUSHED_NO_COMPACTION_NEEDED, flushOpSeqId);
30943086
}
30953087

3088+
private void fatalForFlushCache(Throwable t) throws DroppedSnapshotException {
3089+
DroppedSnapshotException dse = new DroppedSnapshotException("region: " +
3090+
Bytes.toStringBinary(getRegionInfo().getRegionName()), t);
3091+
3092+
// Callers for flushcache() should catch DroppedSnapshotException and abort the region server.
3093+
// However, since we may have the region read lock, we cannot call close(true) here since
3094+
// we cannot promote to a write lock. Instead we are setting closing so that all other region
3095+
// operations except for close will be rejected.
3096+
this.closing.set(true);
3097+
3098+
if (rsServices != null) {
3099+
// This is a safeguard against the case where the caller fails to explicitly handle aborting
3100+
rsServices.abort("Replay of WAL required. Forcing server shutdown", dse);
3101+
}
3102+
3103+
throw dse;
3104+
}
3105+
30963106
/**
30973107
* Method to safely get the next sequence number.
30983108
* @return Next sequence number unassociated with any actual edit.

hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestHRegion.java

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1276,9 +1276,6 @@ public long getSyncedLength() {
12761276
try {
12771277
region.flush(true);
12781278
fail("This should have thrown exception");
1279-
} catch (DroppedSnapshotException unexpected) {
1280-
// this should not be a dropped snapshot exception. Meaning that RS will not abort
1281-
throw unexpected;
12821279
} catch (IOException expected) {
12831280
// expected
12841281
}

hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/wal/TestLogRolling.java

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -535,20 +535,6 @@ public void postLogRoll(Path oldFile, Path newFile) {
535535
assertTrue(loggedRows.contains("row1004"));
536536
assertTrue(loggedRows.contains("row1005"));
537537

538-
// flush all regions
539-
for (HRegion r : server.getOnlineRegionsLocalContext()) {
540-
try {
541-
r.flush(true);
542-
} catch (Exception e) {
543-
// This try/catch was added by HBASE-14317. It is needed
544-
// because this issue tightened up the semantic such that
545-
// a failed append could not be followed by a successful
546-
// sync. What is coming out here is a failed sync, a sync
547-
// that used to 'pass'.
548-
LOG.info(e.toString(), e);
549-
}
550-
}
551-
552538
ResultScanner scanner = table.getScanner(new Scan());
553539
try {
554540
for (int i = 2; i <= 5; i++) {

0 commit comments

Comments
 (0)