Skip to content

Commit 9f81c78

Browse files
committed
HBASE-27568 ChaosMonkey add support for JournalNodes (#4963)
Signed-off-by: Reid Chan <[email protected]>
1 parent c1e551a commit 9f81c78

File tree

8 files changed

+208
-9
lines changed

8 files changed

+208
-9
lines changed

hbase-it/src/test/java/org/apache/hadoop/hbase/ClusterManager.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,10 @@ interface ClusterManager extends Configurable {
3131
/**
3232
* Type of the service daemon
3333
*/
34-
public static enum ServiceType {
34+
enum ServiceType {
3535
HADOOP_NAMENODE("namenode"),
3636
HADOOP_DATANODE("datanode"),
37+
HADOOP_JOURNALNODE("journalnode"),
3738
HADOOP_JOBTRACKER("jobtracker"),
3839
HADOOP_TASKTRACKER("tasktracker"),
3940
ZOOKEEPER_SERVER("QuorumPeerMain"),

hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java

Lines changed: 34 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ public class DistributedHBaseCluster extends HBaseCluster {
5757
* restarted instances of the same server will have different ServerName and will not coincide
5858
* with past dead ones. So there's no need to cleanup this list.
5959
*/
60-
private Set<ServerName> killedRegionServers = new HashSet<>();
60+
private final Set<ServerName> killedRegionServers = new HashSet<>();
6161

6262
public DistributedHBaseCluster(Configuration conf, ClusterManager clusterManager)
6363
throws IOException {
@@ -248,6 +248,37 @@ public void waitForNameNodeToStop(ServerName serverName, long timeout) throws IO
248248
waitForServiceToStop(ServiceType.HADOOP_NAMENODE, serverName, timeout);
249249
}
250250

251+
@Override
252+
public void startJournalNode(ServerName serverName) throws IOException {
253+
LOG.info("Starting journal node on: {}", serverName.getServerName());
254+
clusterManager.start(ServiceType.HADOOP_JOURNALNODE, serverName.getHostname(),
255+
serverName.getPort());
256+
}
257+
258+
@Override
259+
public void killJournalNode(ServerName serverName) throws IOException {
260+
LOG.info("Aborting journal node on: {}", serverName.getServerName());
261+
clusterManager.kill(ServiceType.HADOOP_JOURNALNODE, serverName.getHostname(),
262+
serverName.getPort());
263+
}
264+
265+
@Override
266+
public void stopJournalNode(ServerName serverName) throws IOException {
267+
LOG.info("Stopping journal node on: {}", serverName.getServerName());
268+
clusterManager.stop(ServiceType.HADOOP_JOURNALNODE, serverName.getHostname(),
269+
serverName.getPort());
270+
}
271+
272+
@Override
273+
public void waitForJournalNodeToStart(ServerName serverName, long timeout) throws IOException {
274+
waitForServiceToStart(ServiceType.HADOOP_JOURNALNODE, serverName, timeout);
275+
}
276+
277+
@Override
278+
public void waitForJournalNodeToStop(ServerName serverName, long timeout) throws IOException {
279+
waitForServiceToStop(ServiceType.HADOOP_JOURNALNODE, serverName, timeout);
280+
}
281+
251282
private void waitForServiceToStop(ServiceType service, ServerName serverName, long timeout)
252283
throws IOException {
253284
LOG.info("Waiting for service: {} to stop: {}", service, serverName.getServerName());
@@ -264,7 +295,7 @@ private void waitForServiceToStop(ServiceType service, ServerName serverName, lo
264295

265296
private void waitForServiceToStart(ServiceType service, ServerName serverName, long timeout)
266297
throws IOException {
267-
LOG.info("Waiting for service: {} to start: ", service, serverName.getServerName());
298+
LOG.info("Waiting for service: {} to start: {}", service, serverName.getServerName());
268299
long start = EnvironmentEdgeManager.currentTime();
269300

270301
while ((EnvironmentEdgeManager.currentTime() - start) < timeout) {
@@ -359,8 +390,7 @@ public boolean restoreClusterMetrics(ClusterMetrics initial) throws IOException
359390
LOG.info("Restoring cluster - started");
360391

361392
// do a best effort restore
362-
boolean success = true;
363-
success = restoreMasters(initial, current) && success;
393+
boolean success = restoreMasters(initial, current);
364394
success = restoreRegionServers(initial, current) && success;
365395
success = restoreAdmin() && success;
366396

hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -260,18 +260,32 @@ protected void startDataNode(ServerName server) throws IOException {
260260
}
261261

262262
protected void killNameNode(ServerName server) throws IOException {
263-
getLogger().info("Killing namenode :-{}", server.getHostname());
263+
getLogger().info("Killing namenode {}", server.getHostname());
264264
cluster.killNameNode(server);
265265
cluster.waitForNameNodeToStop(server, killNameNodeTimeout);
266-
getLogger().info("Killed namenode:{}. Reported num of rs:{}", server,
266+
getLogger().info("Killed namenode {}. Reported num of rs:{}", server,
267267
cluster.getClusterMetrics().getLiveServerMetrics().size());
268268
}
269269

270270
protected void startNameNode(ServerName server) throws IOException {
271-
getLogger().info("Starting Namenode :-{}", server.getHostname());
271+
getLogger().info("Starting namenode {}", server.getHostname());
272272
cluster.startNameNode(server);
273273
cluster.waitForNameNodeToStart(server, startNameNodeTimeout);
274-
getLogger().info("Started namenode:{}", server);
274+
getLogger().info("Started namenode {}", server);
275+
}
276+
277+
protected void killJournalNode(ServerName server) throws IOException {
278+
getLogger().info("Killing journalnode {}", server.getHostname());
279+
cluster.killJournalNode(server);
280+
cluster.waitForJournalNodeToStop(server, killNameNodeTimeout);
281+
getLogger().info("Killed journalnode {}", server);
282+
}
283+
284+
protected void startJournalNode(ServerName server) throws IOException {
285+
getLogger().info("Starting journalnode {}", server.getHostname());
286+
cluster.startJournalNode(server);
287+
cluster.waitForJournalNodeToStart(server, startNameNodeTimeout);
288+
getLogger().info("Started journalnode {}", server);
275289
}
276290

277291
protected void unbalanceRegions(ClusterMetrics clusterStatus, List<ServerName> fromServers,

hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActionBaseAction.java

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,4 +122,17 @@ void restartNameNode(ServerName server, long sleepTime) throws IOException {
122122
getLogger().info("Starting name node: {}", server);
123123
startNameNode(server);
124124
}
125+
126+
void restartJournalNode(ServerName server, long sleepTime) throws IOException {
127+
sleepTime = Math.max(sleepTime, 1000);
128+
// Don't try the kill if we're stopping
129+
if (context.isStopping()) {
130+
return;
131+
}
132+
getLogger().info("Killing journal node: {}", server);
133+
killJournalNode(server);
134+
sleep(sleepTime);
135+
getLogger().info("Starting journal node: {}", server);
136+
startJournalNode(server);
137+
}
125138
}
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
package org.apache.hadoop.hbase.chaos.actions;
19+
20+
import java.util.Arrays;
21+
import org.apache.commons.lang3.StringUtils;
22+
import org.apache.hadoop.conf.Configuration;
23+
import org.apache.hadoop.hbase.ServerName;
24+
import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
25+
import org.apache.hadoop.hbase.net.Address;
26+
import org.apache.hadoop.hdfs.DFSUtil;
27+
import org.apache.hadoop.hdfs.DistributedFileSystem;
28+
import org.apache.hadoop.hdfs.HAUtil;
29+
import org.slf4j.Logger;
30+
import org.slf4j.LoggerFactory;
31+
32+
public class RestartRandomJournalNodeAction extends RestartActionBaseAction {
33+
private static final Logger LOG = LoggerFactory.getLogger(RestartRandomJournalNodeAction.class);
34+
35+
public RestartRandomJournalNodeAction(long sleepTime) {
36+
super(sleepTime);
37+
}
38+
39+
@Override
40+
protected Logger getLogger() {
41+
return LOG;
42+
}
43+
44+
@Override
45+
public void perform() throws Exception {
46+
getLogger().info("Performing action: Restart random JournalNode");
47+
48+
final String qjournal;
49+
try (final DistributedFileSystem dfs = HdfsActionUtils.createDfs(getConf())) {
50+
final Configuration conf = dfs.getConf();
51+
final String nameServiceID = DFSUtil.getNamenodeNameServiceId(conf);
52+
if (!HAUtil.isHAEnabled(conf, nameServiceID)) {
53+
getLogger().info("HA for HDFS is not enabled; skipping");
54+
return;
55+
}
56+
57+
qjournal = conf.get("dfs.namenode.shared.edits.dir");
58+
if (StringUtils.isEmpty(qjournal)) {
59+
getLogger().info("Empty qjournals!");
60+
return;
61+
}
62+
}
63+
64+
final ServerName journalNode =
65+
PolicyBasedChaosMonkey.selectRandomItem(getJournalNodes(qjournal));
66+
restartJournalNode(journalNode, sleepTime);
67+
}
68+
69+
private static ServerName[] getJournalNodes(final String qjournal) {
70+
// WARNING: HDFS internals. qjournal looks like this:
71+
// qjournal://journalnode-0.example.com:8485;...;journalnode-N.example.com:8485/hk8se
72+
// When done, we have an array of journalnodes+ports: e.g.journalnode-0.example.com:8485
73+
final String[] journalNodes =
74+
qjournal.toLowerCase().replaceAll("qjournal:\\/\\/", "").replaceAll("\\/.*$", "").split(";");
75+
return Arrays.stream(journalNodes).map(Address::fromString)
76+
.map(addr -> ServerName.valueOf(addr.getHostName(), addr.getPort()))
77+
.toArray(ServerName[]::new);
78+
}
79+
}

hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerAndDependenciesKillingMonkeyFactory.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
import org.apache.hadoop.hbase.chaos.actions.RestartActiveMasterAction;
2626
import org.apache.hadoop.hbase.chaos.actions.RestartActiveNameNodeAction;
2727
import org.apache.hadoop.hbase.chaos.actions.RestartRandomDataNodeAction;
28+
import org.apache.hadoop.hbase.chaos.actions.RestartRandomJournalNodeAction;
2829
import org.apache.hadoop.hbase.chaos.actions.RestartRandomRsExceptMetaAction;
2930
import org.apache.hadoop.hbase.chaos.actions.RestartRandomZKNodeAction;
3031
import org.apache.hadoop.hbase.chaos.actions.RollingBatchRestartRsAction;
@@ -59,6 +60,7 @@ public ChaosMonkey build() {
5960
new ForceBalancerAction(),
6061
new RestartActiveNameNodeAction(60000),
6162
new RestartRandomDataNodeAction(60000),
63+
new RestartRandomJournalNodeAction(60000),
6264
new RestartRandomZKNodeAction(60000),
6365
new GracefulRollingRestartRsAction(gracefulRollingRestartTSSLeepTime),
6466
new RollingBatchSuspendResumeRsAction(rollingBatchSuspendRSSleepTime,

hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,41 @@ public abstract void waitForNameNodeToStart(ServerName serverName, long timeout)
284284
public abstract void waitForNameNodeToStop(ServerName serverName, long timeout)
285285
throws IOException;
286286

287+
/**
288+
* Starts a new journalnode on the given hostname or if this is a mini/local cluster, silently
289+
* logs warning message.
290+
* @throws IOException if something goes wrong
291+
*/
292+
public abstract void startJournalNode(ServerName serverName) throws IOException;
293+
294+
/**
295+
* Kills the journalnode process if this is a distributed cluster, otherwise, this causes master
296+
* to exit doing basic clean up only.
297+
* @throws IOException if something goes wrong
298+
*/
299+
public abstract void killJournalNode(ServerName serverName) throws IOException;
300+
301+
/**
302+
* Stops the journalnode if this is a distributed cluster, otherwise silently logs warning
303+
* message.
304+
* @throws IOException if something goes wrong
305+
*/
306+
public abstract void stopJournalNode(ServerName serverName) throws IOException;
307+
308+
/**
309+
* Wait for the specified journalnode to join the cluster
310+
* @throws IOException if something goes wrong or timeout occurs
311+
*/
312+
public abstract void waitForJournalNodeToStart(ServerName serverName, long timeout)
313+
throws IOException;
314+
315+
/**
316+
* Wait for the specified journalnode to stop
317+
* @throws IOException if something goes wrong or timeout occurs
318+
*/
319+
public abstract void waitForJournalNodeToStop(ServerName serverName, long timeout)
320+
throws IOException;
321+
287322
/**
288323
* Starts a new master on the given hostname or if this is a mini/local cluster, starts a master
289324
* locally.

hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -377,6 +377,31 @@ public void waitForNameNodeToStop(ServerName serverName, long timeout) throws IO
377377
LOG.warn("Waiting for namenodes to stop on mini cluster is not supported");
378378
}
379379

380+
@Override
381+
public void startJournalNode(ServerName serverName) {
382+
LOG.warn("Starting journalnodes on mini cluster is not supported");
383+
}
384+
385+
@Override
386+
public void killJournalNode(ServerName serverName) {
387+
LOG.warn("Aborting journalnodes on mini cluster is not supported");
388+
}
389+
390+
@Override
391+
public void stopJournalNode(ServerName serverName) {
392+
LOG.warn("Stopping journalnodes on mini cluster is not supported");
393+
}
394+
395+
@Override
396+
public void waitForJournalNodeToStart(ServerName serverName, long timeout) {
397+
LOG.warn("Waiting for journalnodes to start on mini cluster is not supported");
398+
}
399+
400+
@Override
401+
public void waitForJournalNodeToStop(ServerName serverName, long timeout) {
402+
LOG.warn("Waiting for journalnodes to stop on mini cluster is not supported");
403+
}
404+
380405
@Override
381406
public void startMaster(String hostname, int port) throws IOException {
382407
this.startMaster();

0 commit comments

Comments
 (0)