Skip to content

Commit 5e98129

Browse files
committed
HBASE-22964 Fix flaky TestClusterRestartFailover and TestClusterRestartFailoverSplitWithoutZk (apache#574)
Signed-off-by: Duo Zhang <[email protected]>
1 parent 4aaf883 commit 5e98129

File tree

2 files changed

+131
-46
lines changed

2 files changed

+131
-46
lines changed

hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1251,6 +1251,13 @@ public void restartHBaseCluster(int servers) throws IOException, InterruptedExce
12511251

12521252
public void restartHBaseCluster(int servers, List<Integer> ports)
12531253
throws IOException, InterruptedException {
1254+
StartMiniClusterOption option =
1255+
StartMiniClusterOption.builder().numRegionServers(servers).rsPorts(ports).build();
1256+
restartHBaseCluster(option);
1257+
}
1258+
1259+
public void restartHBaseCluster(StartMiniClusterOption option)
1260+
throws IOException, InterruptedException {
12541261
if (hbaseAdmin != null) {
12551262
hbaseAdmin.close();
12561263
hbaseAdmin = null;
@@ -1259,7 +1266,9 @@ public void restartHBaseCluster(int servers, List<Integer> ports)
12591266
this.connection.close();
12601267
this.connection = null;
12611268
}
1262-
this.hbaseCluster = new MiniHBaseCluster(this.conf, 1, servers, ports, null, null);
1269+
this.hbaseCluster =
1270+
new MiniHBaseCluster(this.conf, option.getNumMasters(), option.getNumRegionServers(),
1271+
option.getRsPorts(), option.getMasterClass(), option.getRsClass());
12631272
// Don't leave here till we've done a successful scan of the hbase:meta
12641273
Connection conn = ConnectionFactory.createConnection(this.conf);
12651274
Table t = conn.getTable(TableName.META_TABLE_NAME);

hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestClusterRestartFailover.java

Lines changed: 121 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -17,19 +17,33 @@
1717
*/
1818
package org.apache.hadoop.hbase.master;
1919

20+
import static org.junit.Assert.assertFalse;
21+
import static org.junit.Assert.assertNotNull;
22+
import static org.junit.Assert.assertNull;
23+
import static org.junit.Assert.assertTrue;
24+
25+
import java.io.IOException;
2026
import java.util.List;
27+
import java.util.Optional;
28+
import java.util.concurrent.CountDownLatch;
2129
import java.util.stream.Collectors;
30+
31+
import org.apache.hadoop.conf.Configuration;
2232
import org.apache.hadoop.hbase.HBaseClassTestRule;
2333
import org.apache.hadoop.hbase.ServerName;
34+
import org.apache.hadoop.hbase.StartMiniClusterOption;
2435
import org.apache.hadoop.hbase.TableName;
36+
import org.apache.hadoop.hbase.client.RegionInfo;
2537
import org.apache.hadoop.hbase.client.Table;
38+
import org.apache.hadoop.hbase.master.assignment.AssignmentManager;
2639
import org.apache.hadoop.hbase.master.assignment.ServerState;
2740
import org.apache.hadoop.hbase.master.assignment.ServerStateNode;
2841
import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
2942
import org.apache.hadoop.hbase.procedure2.Procedure;
3043
import org.apache.hadoop.hbase.testclassification.LargeTests;
3144
import org.apache.hadoop.hbase.testclassification.MasterTests;
32-
import org.junit.Assert;
45+
import org.apache.hadoop.hbase.util.JVMClusterUtil;
46+
import org.apache.zookeeper.KeeperException;
3347
import org.junit.ClassRule;
3448
import org.junit.Test;
3549
import org.junit.experimental.categories.Category;
@@ -45,6 +59,9 @@ public class TestClusterRestartFailover extends AbstractTestRestartCluster {
4559

4660
private static final Logger LOG = LoggerFactory.getLogger(TestClusterRestartFailover.class);
4761

62+
private static CountDownLatch SCP_LATCH;
63+
private static ServerName SERVER_FOR_TEST;
64+
4865
@Override
4966
protected boolean splitWALCoordinatedByZk() {
5067
return true;
@@ -55,60 +72,119 @@ private ServerStateNode getServerStateNode(ServerName serverName) {
5572
.getServerNode(serverName);
5673
}
5774

75+
/**
76+
* Test for HBASE-22964
77+
*/
5878
@Test
5979
public void test() throws Exception {
60-
UTIL.startMiniCluster(3);
80+
setupCluster();
81+
setupTable();
82+
83+
// Find a server that does not carry hbase:namespace
84+
for (JVMClusterUtil.RegionServerThread thread : UTIL.getHBaseCluster()
85+
.getRegionServerThreads()) {
86+
if (!thread.getRegionServer().getOnlineTables().contains(TableName.NAMESPACE_TABLE_NAME)) {
87+
SERVER_FOR_TEST = thread.getRegionServer().getServerName();
88+
break;
89+
}
90+
}
91+
UTIL.waitFor(60000, () -> getServerStateNode(SERVER_FOR_TEST) != null);
92+
ServerStateNode serverNode = getServerStateNode(SERVER_FOR_TEST);
93+
assertNotNull(serverNode);
94+
assertTrue("serverNode should be ONLINE when cluster runs normally",
95+
serverNode.isInState(ServerState.ONLINE));
96+
97+
SCP_LATCH = new CountDownLatch(1);
98+
99+
// Shutdown cluster and restart
100+
List<Integer> ports =
101+
UTIL.getHBaseCluster().getMaster().getServerManager().getOnlineServersList().stream()
102+
.map(serverName -> serverName.getPort()).collect(Collectors.toList());
103+
LOG.info("Shutting down cluster");
104+
UTIL.getHBaseCluster().killAll();
105+
UTIL.getHBaseCluster().waitUntilShutDown();
106+
LOG.info("Restarting cluster");
107+
UTIL.restartHBaseCluster(StartMiniClusterOption.builder().masterClass(HMasterForTest.class)
108+
.numMasters(1).numRegionServers(3).rsPorts(ports).build());
109+
UTIL.waitFor(60000, () -> UTIL.getHBaseCluster().getMaster().isInitialized());
110+
111+
UTIL.waitFor(60000, () -> getServerStateNode(SERVER_FOR_TEST) != null);
112+
serverNode = getServerStateNode(SERVER_FOR_TEST);
113+
assertFalse("serverNode should not be ONLINE during SCP processing",
114+
serverNode.isInState(ServerState.ONLINE));
115+
Optional<Procedure<?>> procedure = UTIL.getHBaseCluster().getMaster().getProcedures().stream()
116+
.filter(p -> (p instanceof ServerCrashProcedure) &&
117+
((ServerCrashProcedure) p).getServerName().equals(SERVER_FOR_TEST)).findAny();
118+
assertTrue("Should have one SCP for " + SERVER_FOR_TEST, procedure.isPresent());
119+
assertFalse("Submit the SCP for the same serverName " + SERVER_FOR_TEST + " which should fail",
120+
UTIL.getHBaseCluster().getMaster().getServerManager().expireServer(SERVER_FOR_TEST));
121+
122+
// Wait for the SCP to finish
123+
SCP_LATCH.countDown();
124+
UTIL.waitFor(60000, () -> procedure.get().isFinished());
125+
126+
assertFalse("Even when the SCP is finished, the duplicate SCP should not be scheduled for " +
127+
SERVER_FOR_TEST,
128+
UTIL.getHBaseCluster().getMaster().getServerManager().expireServer(SERVER_FOR_TEST));
129+
serverNode = UTIL.getHBaseCluster().getMaster().getAssignmentManager().getRegionStates()
130+
.getServerNode(SERVER_FOR_TEST);
131+
assertNull("serverNode should be deleted after SCP finished", serverNode);
132+
}
133+
134+
private void setupCluster() throws Exception {
135+
UTIL.startMiniCluster(
136+
StartMiniClusterOption.builder().masterClass(HMasterForTest.class).numMasters(1)
137+
.numRegionServers(3).build());
61138
UTIL.waitFor(60000, () -> UTIL.getMiniHBaseCluster().getMaster().isInitialized());
62139
// wait for all SCPs to finish
63140
UTIL.waitFor(60000, () -> UTIL.getHBaseCluster().getMaster().getProcedures().stream()
64-
.noneMatch(p -> p instanceof ServerCrashProcedure));
141+
.noneMatch(p -> p instanceof ServerCrashProcedure));
142+
UTIL.getHBaseCluster().getMaster().balanceSwitch(false);
143+
}
144+
145+
private void setupTable() throws Exception {
65146
TableName tableName = TABLES[0];
66-
ServerName testServer = UTIL.getHBaseCluster().getRegionServer(0).getServerName();
67-
UTIL.waitFor(30000, () -> getServerStateNode(testServer) != null);
68-
ServerStateNode serverNode = getServerStateNode(testServer);
69-
Assert.assertNotNull(serverNode);
70-
Assert.assertTrue("serverNode should be ONLINE when cluster runs normally",
71-
serverNode.isInState(ServerState.ONLINE));
72147
UTIL.createMultiRegionTable(tableName, FAMILY);
73-
UTIL.waitTableEnabled(tableName);
148+
UTIL.waitTableAvailable(tableName);
74149
Table table = UTIL.getConnection().getTable(tableName);
75150
for (int i = 0; i < 100; i++) {
76151
UTIL.loadTable(table, FAMILY);
77152
}
78-
List<Integer> ports =
79-
UTIL.getHBaseCluster().getMaster().getServerManager().getOnlineServersList().stream()
80-
.map(serverName -> serverName.getPort()).collect(Collectors.toList());
81-
LOG.info("Shutting down cluster");
82-
UTIL.getHBaseCluster().killAll();
83-
UTIL.getHBaseCluster().waitUntilShutDown();
84-
LOG.info("Starting cluster the second time");
85-
UTIL.restartHBaseCluster(3, ports);
86-
UTIL.waitFor(30000, () -> UTIL.getHBaseCluster().getMaster().isInitialized());
87-
serverNode = UTIL.getHBaseCluster().getMaster().getAssignmentManager().getRegionStates()
88-
.getServerNode(testServer);
89-
Assert.assertNotNull("serverNode should not be null when restart whole cluster", serverNode);
90-
Assert.assertFalse(serverNode.isInState(ServerState.ONLINE));
91-
LOG.info("start to find the procedure of SCP for the severName we choose");
92-
UTIL.waitFor(60000,
93-
() -> UTIL.getHBaseCluster().getMaster().getProcedures().stream()
94-
.anyMatch(procedure -> (procedure instanceof ServerCrashProcedure) &&
95-
((ServerCrashProcedure) procedure).getServerName().equals(testServer)));
96-
Assert.assertFalse("serverNode should not be ONLINE during SCP processing",
97-
serverNode.isInState(ServerState.ONLINE));
98-
LOG.info("start to submit the SCP for the same serverName {} which should fail", testServer);
99-
Assert
100-
.assertFalse(UTIL.getHBaseCluster().getMaster().getServerManager().expireServer(testServer));
101-
Procedure<?> procedure = UTIL.getHBaseCluster().getMaster().getProcedures().stream()
102-
.filter(p -> (p instanceof ServerCrashProcedure) &&
103-
((ServerCrashProcedure) p).getServerName().equals(testServer))
104-
.findAny().get();
105-
UTIL.waitFor(60000, () -> procedure.isFinished());
106-
LOG.info("even when the SCP is finished, the duplicate SCP should not be scheduled for {}",
107-
testServer);
108-
Assert
109-
.assertFalse(UTIL.getHBaseCluster().getMaster().getServerManager().expireServer(testServer));
110-
serverNode = UTIL.getHBaseCluster().getMaster().getAssignmentManager().getRegionStates()
111-
.getServerNode(testServer);
112-
Assert.assertNull("serverNode should be deleted after SCP finished", serverNode);
153+
}
154+
155+
public static final class HMasterForTest extends HMaster {
156+
157+
public HMasterForTest(Configuration conf) throws IOException, KeeperException {
158+
super(conf);
159+
}
160+
161+
@Override
162+
protected AssignmentManager createAssignmentManager(MasterServices master) {
163+
return new AssignmentManagerForTest(master);
164+
}
165+
}
166+
167+
private static final class AssignmentManagerForTest extends AssignmentManager {
168+
169+
public AssignmentManagerForTest(MasterServices master) {
170+
super(master);
171+
}
172+
173+
@Override
174+
public List<RegionInfo> getRegionsOnServer(ServerName serverName) {
175+
List<RegionInfo> regions = super.getRegionsOnServer(serverName);
176+
// ServerCrashProcedure will call this method, so wait on the CountDownLatch here
177+
if (SCP_LATCH != null && SERVER_FOR_TEST != null && serverName.equals(SERVER_FOR_TEST)) {
178+
try {
179+
LOG.info("ServerCrashProcedure wait the CountDownLatch here");
180+
SCP_LATCH.await();
181+
LOG.info("Continue the ServerCrashProcedure");
182+
SCP_LATCH = null;
183+
} catch (InterruptedException e) {
184+
throw new RuntimeException(e);
185+
}
186+
}
187+
return regions;
188+
}
113189
}
114190
}

0 commit comments

Comments
 (0)