Skip to content

Commit d77da02

Browse files
committed
HBASE-29797 Should not create wal directory when creating WAL instance
1 parent a548519 commit d77da02

22 files changed

Lines changed: 186 additions & 52 deletions

hbase-server/pom.xml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -296,6 +296,11 @@
296296
<artifactId>junit-vintage-engine</artifactId>
297297
<scope>test</scope>
298298
</dependency>
299+
<dependency>
300+
<groupId>org.awaitility</groupId>
301+
<artifactId>awaitility</artifactId>
302+
<scope>test</scope>
303+
</dependency>
299304
<dependency>
300305
<groupId>org.mockito</groupId>
301306
<artifactId>mockito-core</artifactId>

hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1799,8 +1799,8 @@ private void setupWALAndReplication() throws IOException {
17991799
throw new RegionServerRunningException(
18001800
"Region server has already created directory at " + this.serverName.toString());
18011801
}
1802-
// Always create wal directory as now we need this when master restarts to find out the live
1803-
// region servers.
1802+
// Create wal directory here and we will never create it again in other places. This is
1803+
// important to make sure that our fencing way takes effect. See HBASE-29797 for more details.
18041804
if (!this.walFs.mkdirs(logDir)) {
18051805
throw new IOException("Can not create wal directory " + logDir);
18061806
}

hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AbstractFSWAL.java

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -536,10 +536,8 @@ protected AbstractFSWAL(final FileSystem fs, final Abortable abortable, final Pa
536536
this.remoteFs = remoteFs;
537537
this.remoteWALDir = remoteWALDir;
538538

539-
if (!fs.exists(walDir) && !fs.mkdirs(walDir)) {
540-
throw new IOException("Unable to mkdir " + walDir);
541-
}
542-
539+
// Here we only crate archive dir, without wal dir. This is to make sure that our fencing way
540+
// takes effect. See HBASE-29797 for more details.
543541
if (!fs.exists(this.walArchiveDir)) {
544542
if (!fs.mkdirs(this.walArchiveDir)) {
545543
throw new IOException("Unable to mkdir " + this.walArchiveDir);

hbase-server/src/main/java/org/apache/hadoop/hbase/wal/WALFactory.java

Lines changed: 26 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
import org.apache.hadoop.hbase.regionserver.wal.ProtobufWALTailingReader;
3737
import org.apache.hadoop.hbase.replication.ReplicationStorageFactory;
3838
import org.apache.hadoop.hbase.util.CancelableProgressable;
39+
import org.apache.hadoop.hbase.util.CommonFSUtils;
3940
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
4041
import org.apache.hadoop.hbase.util.LeaseNotRecoveredException;
4142
import org.apache.hadoop.hbase.wal.WALProvider.Writer;
@@ -211,7 +212,7 @@ static WALProvider createProvider(Class<? extends WALProvider> clazz) throws IOE
211212
public WALFactory(Configuration conf, String factoryId) throws IOException {
212213
// default enableSyncReplicationWALProvider is true, only disable SyncReplicationWALProvider
213214
// for HMaster or HRegionServer which take system table only. See HBASE-19999
214-
this(conf, factoryId, null);
215+
this(conf, factoryId, null, true);
215216
}
216217

217218
/**
@@ -228,17 +229,30 @@ public WALFactory(Configuration conf, String factoryId) throws IOException {
228229
*/
229230
public WALFactory(Configuration conf, ServerName serverName, Abortable abortable)
230231
throws IOException {
231-
this(conf, serverName.toString(), abortable);
232+
this(conf, serverName.toString(), abortable, false);
233+
}
234+
235+
private static void createWALDirectory(Configuration conf, String factoryId) throws IOException {
236+
FileSystem walFs = CommonFSUtils.getWALFileSystem(conf);
237+
Path walRootDir = CommonFSUtils.getWALRootDir(conf);
238+
Path walDir = new Path(walRootDir, AbstractFSWALProvider.getWALDirectoryName(factoryId));
239+
if (!walFs.exists(walDir) && !walFs.mkdirs(walDir)) {
240+
throw new IOException("Can not create wal directory " + walDir);
241+
}
232242
}
233243

234244
/**
235-
* @param conf must not be null, will keep a reference to read params in later reader/writer
236-
* instances.
237-
* @param factoryId a unique identifier for this factory. used i.e. by filesystem implementations
238-
* to make a directory
239-
* @param abortable the server associated with this WAL file
245+
* @param conf must not be null, will keep a reference to read params in later
246+
* reader/writer instances.
247+
* @param factoryId a unique identifier for this factory. used i.e. by filesystem
248+
* implementations to make a directory
249+
* @param abortable the server associated with this WAL file
250+
* @param createWalDirectory pass {@code true} for testing purpose, to create the wal directory
251+
* automatically. In normal code path, we should create it in
252+
* HRegionServer setup.
240253
*/
241-
private WALFactory(Configuration conf, String factoryId, Abortable abortable) throws IOException {
254+
private WALFactory(Configuration conf, String factoryId, Abortable abortable,
255+
boolean createWalDirectory) throws IOException {
242256
// until we've moved reader/writer construction down into providers, this initialization must
243257
// happen prior to provider initialization, in case they need to instantiate a reader/writer.
244258
timeoutMillis = conf.getInt("hbase.hlog.open.timeout", 300000);
@@ -259,6 +273,10 @@ private WALFactory(Configuration conf, String factoryId, Abortable abortable) th
259273
REPLICATION_WAL_PROVIDER, this.abortable);
260274
// end required early initialization
261275
if (conf.getBoolean(WAL_ENABLED, true)) {
276+
if (createWalDirectory) {
277+
// for testing only
278+
createWALDirectory(conf, factoryId);
279+
}
262280
WALProvider provider = createProvider(getProviderClass(WAL_PROVIDER, DEFAULT_WAL_PROVIDER));
263281
provider.init(this, conf, null, this.abortable);
264282
provider.addWALActionsListener(new MetricsWAL());

hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtil.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2194,8 +2194,9 @@ public static WAL createWal(final Configuration conf, final Path rootDir, final
21942194
// The WAL subsystem will use the default rootDir rather than the passed in rootDir
21952195
// unless I pass along via the conf.
21962196
Configuration confForWAL = new Configuration(conf);
2197-
confForWAL.set(HConstants.HBASE_DIR, rootDir.toString());
2198-
return new WALFactory(confForWAL, "hregion-" + RandomStringUtils.randomNumeric(8)).getWAL(hri);
2197+
CommonFSUtils.setRootDir(confForWAL, rootDir);
2198+
return new WALFactory(confForWAL, "hregion-" + RandomStringUtils.insecure().nextNumeric(8))
2199+
.getWAL(hri);
21992200
}
22002201

22012202
/**
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
package org.apache.hadoop.hbase.master;
19+
20+
import static org.awaitility.Awaitility.await;
21+
22+
import java.io.IOException;
23+
import java.time.Duration;
24+
import java.util.Collections;
25+
import org.apache.hadoop.fs.FileStatus;
26+
import org.apache.hadoop.fs.Path;
27+
import org.apache.hadoop.hbase.HBaseTestingUtil;
28+
import org.apache.hadoop.hbase.TableName;
29+
import org.apache.hadoop.hbase.client.RegionInfo;
30+
import org.apache.hadoop.hbase.regionserver.HRegionServer;
31+
import org.apache.hadoop.hbase.regionserver.Region;
32+
import org.apache.hadoop.hbase.testclassification.MasterTests;
33+
import org.apache.hadoop.hbase.testclassification.MediumTests;
34+
import org.apache.hadoop.hbase.util.RecoverLeaseFSUtils;
35+
import org.junit.jupiter.api.AfterAll;
36+
import org.junit.jupiter.api.BeforeAll;
37+
import org.junit.jupiter.api.Tag;
38+
import org.junit.jupiter.api.Test;
39+
40+
/**
41+
* Testcase for HBASE-29797, where the lazy initialized WALProvider may recreate the WAL directory
42+
* and cause our fencing way loses efficacy.
43+
*/
44+
@Tag(MasterTests.TAG)
45+
@Tag(MediumTests.TAG)
46+
public class TestWALFencing {
47+
48+
private static final HBaseTestingUtil UTIL = new HBaseTestingUtil();
49+
50+
@BeforeAll
51+
public static void setUp() throws Exception {
52+
UTIL.startMiniCluster(3);
53+
UTIL.getAdmin().balancerSwitch(false, true);
54+
}
55+
56+
@AfterAll
57+
public static void tearDown() throws IOException {
58+
UTIL.shutdownMiniCluster();
59+
}
60+
61+
@Test
62+
public void testMoveMeta() throws Exception {
63+
HRegionServer metaRs = UTIL.getRSForFirstRegionInTable(TableName.META_TABLE_NAME);
64+
HRegionServer otherRs = UTIL.getOtherRegionServer(metaRs);
65+
// do fencing here, i.e, kill otherRs
66+
Path splittingDir = UTIL.getMiniHBaseCluster().getMaster().getMasterWalManager()
67+
.getLogDirs(Collections.singleton(otherRs.getServerName())).get(0);
68+
for (FileStatus walFile : otherRs.getWALFileSystem().listStatus(splittingDir)) {
69+
RecoverLeaseFSUtils.recoverFileLease(otherRs.getWALFileSystem(), walFile.getPath(),
70+
otherRs.getConfiguration());
71+
}
72+
// move meta region to otherRs, which should fail and crash otherRs, and then master will try to
73+
// assign meta region to another rs
74+
RegionInfo metaRegionInfo = metaRs.getRegions().stream().map(Region::getRegionInfo)
75+
.filter(RegionInfo::isMetaRegion).findAny().get();
76+
UTIL.getAdmin().move(metaRegionInfo.getRegionName(), otherRs.getServerName());
77+
// make sure that meta region is not on otherRs
78+
await().during(Duration.ofSeconds(5)).atMost(Duration.ofSeconds(6))
79+
.until(() -> otherRs.getRegions(TableName.META_TABLE_NAME).isEmpty());
80+
}
81+
}

hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestCompactionPolicy.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ protected void initialize() throws IOException {
104104
.setColumnFamily(familyDescriptor).build();
105105
RegionInfo info = RegionInfoBuilder.newBuilder(tableDescriptor.getTableName()).build();
106106

107+
fs.mkdirs(new Path(basedir, logName));
107108
hlog = new FSHLog(fs, basedir, logName, conf);
108109
hlog.init();
109110
ChunkCreator.initialize(MemStoreLAB.CHUNK_SIZE_DEFAULT, false, 0, 0, 0, null,

hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestFailedAppendAndSync.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,7 @@ public void testLockupAroundBadAssignSync() throws IOException {
197197
// the test.
198198
FileSystem fs = FileSystem.get(CONF);
199199
Path rootDir = new Path(dir + getName());
200+
fs.mkdirs(new Path(rootDir, getName()));
200201
DodgyFSLog dodgyWAL = new DodgyFSLog(fs, (Server) services, rootDir, getName(), CONF);
201202
dodgyWAL.init();
202203
LogRoller logRoller = new LogRoller(services);

hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestHRegion.java

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -371,8 +371,9 @@ protected void doSync(long txid, boolean forceSync) throws IOException {
371371
}
372372

373373
FileSystem fs = FileSystem.get(CONF);
374-
Path rootDir = new Path(dir + "testMemstoreSnapshotSize");
375-
MyFaultyFSLog faultyLog = new MyFaultyFSLog(fs, rootDir, "testMemstoreSnapshotSize", CONF);
374+
Path rootDir = new Path(dir + method);
375+
fs.mkdirs(new Path(rootDir, method));
376+
MyFaultyFSLog faultyLog = new MyFaultyFSLog(fs, rootDir, method, CONF);
376377
faultyLog.init();
377378
region = initHRegion(tableName, null, null, CONF, false, Durability.SYNC_WAL, faultyLog,
378379
COLUMN_FAMILY_BYTES);
@@ -414,10 +415,10 @@ private static WAL createWALCompatibleWithFaultyFileSystem(String callingMethod,
414415

415416
@Test
416417
public void testMemstoreSizeAccountingWithFailedPostBatchMutate() throws IOException {
417-
String testName = "testMemstoreSizeAccountingWithFailedPostBatchMutate";
418418
FileSystem fs = FileSystem.get(CONF);
419-
Path rootDir = new Path(dir + testName);
420-
FSHLog hLog = new FSHLog(fs, rootDir, testName, CONF);
419+
Path rootDir = new Path(dir + method);
420+
fs.mkdirs(new Path(rootDir, method));
421+
FSHLog hLog = new FSHLog(fs, rootDir, method, CONF);
421422
hLog.init();
422423
region = initHRegion(tableName, null, null, CONF, false, Durability.SYNC_WAL, hLog,
423424
COLUMN_FAMILY_BYTES);
@@ -1253,8 +1254,10 @@ public long getSyncedLength() {
12531254
};
12541255
}
12551256
}
1256-
FailAppendFlushMarkerWAL wal = new FailAppendFlushMarkerWAL(FileSystem.get(walConf),
1257-
CommonFSUtils.getRootDir(walConf), method, walConf);
1257+
FileSystem fs = FileSystem.get(walConf);
1258+
Path rootDir = CommonFSUtils.getRootDir(walConf);
1259+
fs.mkdirs(new Path(rootDir, method));
1260+
FailAppendFlushMarkerWAL wal = new FailAppendFlushMarkerWAL(fs, rootDir, method, walConf);
12581261
wal.init();
12591262
this.region = initHRegion(tableName, HConstants.EMPTY_START_ROW, HConstants.EMPTY_END_ROW, CONF,
12601263
false, Durability.USE_DEFAULT, wal, family);
@@ -3355,8 +3358,9 @@ public void testScanner_DeleteOneFamilyNotAnother() throws IOException {
33553358
@Test
33563359
public void testDataInMemoryWithoutWAL() throws IOException {
33573360
FileSystem fs = FileSystem.get(CONF);
3358-
Path rootDir = new Path(dir + "testDataInMemoryWithoutWAL");
3359-
FSHLog hLog = new FSHLog(fs, rootDir, "testDataInMemoryWithoutWAL", CONF);
3361+
Path rootDir = new Path(dir + method);
3362+
fs.mkdirs(new Path(rootDir, method));
3363+
FSHLog hLog = new FSHLog(fs, rootDir, method, CONF);
33603364
hLog.init();
33613365
// This chunk creation is done throughout the code base. Do we want to move it into core?
33623366
// It is missing from this test. W/o it we NPE.

hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestLogRoller.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,12 +61,13 @@ public class TestLogRoller {
6161
private static FileSystem FS;
6262

6363
@Before
64-
public void setup() throws Exception {
64+
public void setUp() throws Exception {
6565
CONF = TEST_UTIL.getConfiguration();
6666
CONF.setInt("hbase.regionserver.logroll.period", LOG_ROLL_PERIOD);
6767
CONF.setInt(HConstants.THREAD_WAKE_FREQUENCY, 300);
6868
ROOT_DIR = TEST_UTIL.getRandomDir();
6969
FS = FileSystem.get(CONF);
70+
FS.mkdirs(new Path(ROOT_DIR, LOG_DIR));
7071
RegionServerServices services = Mockito.mock(RegionServerServices.class);
7172
Mockito.when(services.getConfiguration()).thenReturn(CONF);
7273
ROLLER = new LogRoller(services);
@@ -77,7 +78,7 @@ public void setup() throws Exception {
7778
public void tearDown() throws Exception {
7879
ROLLER.close();
7980
FS.close();
80-
TEST_UTIL.shutdownMiniCluster();
81+
TEST_UTIL.cleanupTestDir();
8182
}
8283

8384
@Test

0 commit comments

Comments
 (0)