Skip to content

Commit 8c989c9

Browse files
authored
HBASE-28342 Decommissioned hosts should be rejected by the HMaster (#5681)
Signed-off by: Nick Dimiduk <[email protected]>
1 parent 7be588e commit 8c989c9

6 files changed

Lines changed: 215 additions & 23 deletions

File tree

hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1622,6 +1622,20 @@ public enum OperationStatusCode {
16221622
*/
16231623
public final static boolean HBASE_SERVER_USEIP_ENABLED_DEFAULT = false;
16241624

1625+
/**
1626+
* Should the HMaster reject hosts of decommissioned RegionServers, bypass matching their port and
1627+
* startcode parts of their ServerName or not? When True, the HMaster will reject a RegionServer's
1628+
* request to `reportForDuty` if it's hostname exists in the list of decommissioned RegionServers
1629+
* it maintains internally. Added in HBASE-28342.
1630+
*/
1631+
public final static String REJECT_DECOMMISSIONED_HOSTS_KEY =
1632+
"hbase.master.reject.decommissioned.hosts";
1633+
1634+
/**
1635+
* Default value of {@link #REJECT_DECOMMISSIONED_HOSTS_KEY}
1636+
*/
1637+
public final static boolean REJECT_DECOMMISSIONED_HOSTS_DEFAULT = false;
1638+
16251639
private HConstants() {
16261640
// Can't be instantiated with this ctor.
16271641
}
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
package org.apache.hadoop.hbase.master;
19+
20+
import org.apache.hadoop.hbase.HBaseIOException;
21+
import org.apache.yetus.audience.InterfaceAudience;
22+
23+
@InterfaceAudience.Private
24+
public class DecommissionedHostRejectedException extends HBaseIOException {
25+
public DecommissionedHostRejectedException(String message) {
26+
super(message);
27+
}
28+
}

hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -546,7 +546,6 @@ public HMaster(final Configuration conf) throws IOException {
546546
HConstants.DEFAULT_HBASE_MASTER_BALANCER_MAX_RIT_PERCENT);
547547

548548
// Do we publish the status?
549-
550549
boolean shouldPublish =
551550
conf.getBoolean(HConstants.STATUS_PUBLISHED, HConstants.STATUS_PUBLISHED_DEFAULT);
552551
Class<? extends ClusterStatusPublisher.Publisher> publisherClass =
@@ -997,7 +996,10 @@ private void finishActiveMasterInitialization() throws IOException, InterruptedE
997996
masterRegion = MasterRegionFactory.create(this);
998997
rsListStorage = new MasterRegionServerList(masterRegion, this);
999998

999+
// Initialize the ServerManager and register it as a configuration observer
10001000
this.serverManager = createServerManager(this, rsListStorage);
1001+
this.configurationManager.registerObserver(this.serverManager);
1002+
10011003
this.syncReplicationReplayWALManager = new SyncReplicationReplayWALManager(this);
10021004
if (
10031005
!conf.getBoolean(HBASE_SPLIT_WAL_COORDINATED_BY_ZK, DEFAULT_HBASE_SPLIT_COORDINATED_BY_ZK)

hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java

Lines changed: 89 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
import java.util.List;
2929
import java.util.Map;
3030
import java.util.Map.Entry;
31+
import java.util.Objects;
3132
import java.util.Set;
3233
import java.util.concurrent.ConcurrentNavigableMap;
3334
import java.util.concurrent.ConcurrentSkipListMap;
@@ -51,6 +52,7 @@
5152
import org.apache.hadoop.hbase.client.AsyncClusterConnection;
5253
import org.apache.hadoop.hbase.client.AsyncRegionServerAdmin;
5354
import org.apache.hadoop.hbase.client.RegionInfo;
55+
import org.apache.hadoop.hbase.conf.ConfigurationObserver;
5456
import org.apache.hadoop.hbase.ipc.RemoteWithExtrasException;
5557
import org.apache.hadoop.hbase.master.assignment.RegionStates;
5658
import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
@@ -100,7 +102,7 @@
100102
* only after the handler is fully enabled and has completed the handling.
101103
*/
102104
@InterfaceAudience.Private
103-
public class ServerManager {
105+
public class ServerManager implements ConfigurationObserver {
104106
public static final String WAIT_ON_REGIONSERVERS_MAXTOSTART =
105107
"hbase.master.wait.on.regionservers.maxtostart";
106108

@@ -172,6 +174,9 @@ public class ServerManager {
172174
/** Listeners that are called on server events. */
173175
private List<ServerListener> listeners = new CopyOnWriteArrayList<>();
174176

177+
/** Configured value of HConstants.REJECT_DECOMMISSIONED_HOSTS_KEY */
178+
private volatile boolean rejectDecommissionedHostsConfig;
179+
175180
/**
176181
* Constructor.
177182
*/
@@ -183,6 +188,35 @@ public ServerManager(final MasterServices master, RegionServerList storage) {
183188
warningSkew = c.getLong("hbase.master.warningclockskew", 10000);
184189
persistFlushedSequenceId =
185190
c.getBoolean(PERSIST_FLUSHEDSEQUENCEID, PERSIST_FLUSHEDSEQUENCEID_DEFAULT);
191+
rejectDecommissionedHostsConfig = getRejectDecommissionedHostsConfig(c);
192+
}
193+
194+
/**
195+
* Implementation of the ConfigurationObserver interface. We are interested in live-loading the
196+
* configuration value of HConstants.REJECT_DECOMMISSIONED_HOSTS_KEY
197+
* @param conf Server configuration instance
198+
*/
199+
@Override
200+
public void onConfigurationChange(Configuration conf) {
201+
final boolean newValue = getRejectDecommissionedHostsConfig(conf);
202+
if (rejectDecommissionedHostsConfig == newValue) {
203+
// no-op
204+
return;
205+
}
206+
207+
LOG.info("Config Reload for RejectDecommissionedHosts. previous value: {}, new value: {}",
208+
rejectDecommissionedHostsConfig, newValue);
209+
210+
rejectDecommissionedHostsConfig = newValue;
211+
}
212+
213+
/**
214+
* Reads the value of HConstants.REJECT_DECOMMISSIONED_HOSTS_KEY from the config and returns it
215+
* @param conf Configuration instance of the Master
216+
*/
217+
public boolean getRejectDecommissionedHostsConfig(Configuration conf) {
218+
return conf.getBoolean(HConstants.REJECT_DECOMMISSIONED_HOSTS_KEY,
219+
HConstants.REJECT_DECOMMISSIONED_HOSTS_DEFAULT);
186220
}
187221

188222
/**
@@ -227,11 +261,14 @@ ServerName regionServerStartup(RegionServerStartupRequest request, int versionNu
227261
final String hostname =
228262
request.hasUseThisHostnameInstead() ? request.getUseThisHostnameInstead() : isaHostName;
229263
ServerName sn = ServerName.valueOf(hostname, request.getPort(), request.getServerStartCode());
264+
265+
// Check if the host should be rejected based on it's decommissioned status
266+
checkRejectableDecommissionedStatus(sn);
267+
230268
checkClockSkew(sn, request.getServerCurrentTime());
231269
checkIsDead(sn, "STARTUP");
232270
if (!checkAndRecordNewServer(sn, ServerMetricsBuilder.of(sn, versionNumber, version))) {
233-
LOG.warn(
234-
"THIS SHOULD NOT HAPPEN, RegionServerStartup" + " could not record the server: " + sn);
271+
LOG.warn("THIS SHOULD NOT HAPPEN, RegionServerStartup could not record the server: {}", sn);
235272
}
236273
storage.started(sn);
237274
return sn;
@@ -293,6 +330,42 @@ public void regionServerReport(ServerName sn, ServerMetrics sl) throws YouAreDea
293330
updateLastFlushedSequenceIds(sn, sl);
294331
}
295332

333+
/**
334+
* Checks if the Master is configured to reject decommissioned hosts or not. When it's configured
335+
* to do so, any RegionServer trying to join the cluster will have it's host checked against the
336+
* list of hosts of currently decommissioned servers and potentially get prevented from reporting
337+
* for duty; otherwise, we do nothing and we let them pass to the next check. See HBASE-28342 for
338+
* details.
339+
* @param sn The ServerName to check for
340+
* @throws DecommissionedHostRejectedException if the Master is configured to reject
341+
* decommissioned hosts and this host exists in the
342+
* list of the decommissioned servers
343+
*/
344+
private void checkRejectableDecommissionedStatus(ServerName sn)
345+
throws DecommissionedHostRejectedException {
346+
LOG.info("Checking decommissioned status of RegionServer {}", sn.getServerName());
347+
348+
// If the Master is not configured to reject decommissioned hosts, return early.
349+
if (!rejectDecommissionedHostsConfig) {
350+
return;
351+
}
352+
353+
// Look for a match for the hostname in the list of decommissioned servers
354+
for (ServerName server : getDrainingServersList()) {
355+
if (Objects.equals(server.getHostname(), sn.getHostname())) {
356+
// Found a match and master is configured to reject decommissioned hosts, throw exception!
357+
LOG.warn(
358+
"Rejecting RegionServer {} from reporting for duty because Master is configured "
359+
+ "to reject decommissioned hosts and this host was marked as such in the past.",
360+
sn.getServerName());
361+
throw new DecommissionedHostRejectedException(String.format(
362+
"Host %s exists in the list of decommissioned servers and Master is configured to "
363+
+ "reject decommissioned hosts",
364+
sn.getHostname()));
365+
}
366+
}
367+
}
368+
296369
/**
297370
* Check is a server of same host and port already exists, if not, or the existed one got a
298371
* smaller start code, record it.
@@ -647,13 +720,8 @@ public synchronized void moveFromOnlineToDeadServers(final ServerName sn) {
647720
* Remove the server from the drain list.
648721
*/
649722
public synchronized boolean removeServerFromDrainList(final ServerName sn) {
650-
// Warn if the server (sn) is not online. ServerName is of the form:
651-
// <hostname> , <port> , <startcode>
723+
LOG.info("Removing server {} from the draining list.", sn);
652724

653-
if (!this.isServerOnline(sn)) {
654-
LOG.warn("Server " + sn + " is not currently online. "
655-
+ "Removing from draining list anyway, as requested.");
656-
}
657725
// Remove the server from the draining servers lists.
658726
return this.drainingServers.remove(sn);
659727
}
@@ -663,22 +731,23 @@ public synchronized boolean removeServerFromDrainList(final ServerName sn) {
663731
* @return True if the server is added or the server is already on the drain list.
664732
*/
665733
public synchronized boolean addServerToDrainList(final ServerName sn) {
666-
// Warn if the server (sn) is not online. ServerName is of the form:
667-
// <hostname> , <port> , <startcode>
668-
669-
if (!this.isServerOnline(sn)) {
670-
LOG.warn("Server " + sn + " is not currently online. "
671-
+ "Ignoring request to add it to draining list.");
734+
// If master is not rejecting decommissioned hosts, warn if the server (sn) is not online.
735+
// However, we want to add servers even if they're not online if the master is configured
736+
// to reject decommissioned hosts
737+
if (!rejectDecommissionedHostsConfig && !this.isServerOnline(sn)) {
738+
LOG.warn("Server {} is not currently online. Ignoring request to add it to draining list.",
739+
sn);
672740
return false;
673741
}
674-
// Add the server to the draining servers lists, if it's not already in
675-
// it.
742+
743+
// Add the server to the draining servers lists, if it's not already in it.
676744
if (this.drainingServers.contains(sn)) {
677-
LOG.warn("Server " + sn + " is already in the draining server list."
678-
+ "Ignoring request to add it again.");
745+
LOG.warn(
746+
"Server {} is already in the draining server list. Ignoring request to add it again.", sn);
679747
return true;
680748
}
681-
LOG.info("Server " + sn + " added to draining server list.");
749+
750+
LOG.info("Server {} added to draining server list.", sn);
682751
return this.drainingServers.add(sn);
683752
}
684753

hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@
120120
import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException;
121121
import org.apache.hadoop.hbase.ipc.ServerRpcController;
122122
import org.apache.hadoop.hbase.log.HBaseMarkers;
123+
import org.apache.hadoop.hbase.master.DecommissionedHostRejectedException;
123124
import org.apache.hadoop.hbase.mob.MobFileCache;
124125
import org.apache.hadoop.hbase.mob.RSMobFileCleanerChore;
125126
import org.apache.hadoop.hbase.monitoring.TaskMonitor;
@@ -2664,6 +2665,11 @@ private RegionServerStartupResponse reportForDuty() throws IOException {
26642665
LOG.error(HBaseMarkers.FATAL, "Master rejected startup because clock is out of sync", ioe);
26652666
// Re-throw IOE will cause RS to abort
26662667
throw ioe;
2668+
} else if (ioe instanceof DecommissionedHostRejectedException) {
2669+
LOG.error(HBaseMarkers.FATAL,
2670+
"Master rejected startup because the host is considered decommissioned", ioe);
2671+
// Re-throw IOE will cause RS to abort
2672+
throw ioe;
26672673
} else if (ioe instanceof ServerNotRunningYetException) {
26682674
LOG.debug("Master is not running yet");
26692675
} else {

hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerReportForDuty.java

Lines changed: 75 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,16 @@
1717
*/
1818
package org.apache.hadoop.hbase.regionserver;
1919

20-
import static org.junit.Assert.assertEquals;
21-
import static org.junit.Assert.assertTrue;
20+
import static org.hamcrest.CoreMatchers.containsString;
21+
import static org.hamcrest.Matchers.allOf;
22+
import static org.hamcrest.Matchers.hasItem;
23+
import static org.hamcrest.Matchers.is;
24+
import static org.junit.Assert.*;
2225

2326
import java.io.IOException;
2427
import java.io.StringWriter;
28+
import java.util.Arrays;
29+
import java.util.Collections;
2530
import java.util.concurrent.ScheduledThreadPoolExecutor;
2631
import java.util.concurrent.TimeUnit;
2732
import org.apache.commons.lang3.StringUtils;
@@ -30,9 +35,11 @@
3035
import org.apache.hadoop.hbase.HBaseTestingUtil;
3136
import org.apache.hadoop.hbase.HConstants;
3237
import org.apache.hadoop.hbase.LocalHBaseCluster;
38+
import org.apache.hadoop.hbase.MatcherPredicate;
3339
import org.apache.hadoop.hbase.ServerName;
3440
import org.apache.hadoop.hbase.SingleProcessHBaseCluster.MiniHBaseClusterRegionServer;
3541
import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException;
42+
import org.apache.hadoop.hbase.master.DecommissionedHostRejectedException;
3643
import org.apache.hadoop.hbase.master.HMaster;
3744
import org.apache.hadoop.hbase.master.ServerManager;
3845
import org.apache.hadoop.hbase.testclassification.LargeTests;
@@ -241,6 +248,72 @@ public void run() {
241248
waitForClusterOnline(master);
242249
}
243250

251+
/**
252+
* Tests that the RegionServer's reportForDuty gets rejected by the master when the master is
253+
* configured to reject decommissioned hosts and when there is a match for the joining
254+
* RegionServer in the list of decommissioned servers. Test case for HBASE-28342.
255+
*/
256+
@Test
257+
public void testReportForDutyGetsRejectedByMasterWhenConfiguredToRejectDecommissionedHosts()
258+
throws Exception {
259+
// Start a master and wait for it to become the active/primary master.
260+
// Use a random unique port
261+
cluster.getConfiguration().setInt(HConstants.MASTER_PORT, HBaseTestingUtil.randomFreePort());
262+
cluster.getConfiguration().setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 1);
263+
cluster.getConfiguration().setInt(ServerManager.WAIT_ON_REGIONSERVERS_MAXTOSTART, 1);
264+
265+
// Set the cluster to reject decommissioned hosts
266+
cluster.getConfiguration().setBoolean(HConstants.REJECT_DECOMMISSIONED_HOSTS_KEY, true);
267+
268+
master = cluster.addMaster();
269+
rs = cluster.addRegionServer();
270+
master.start();
271+
rs.start();
272+
waitForClusterOnline(master);
273+
274+
// Add a second decommissioned region server to the cluster, wait for it to fail reportForDuty
275+
LogCapturer capturer =
276+
new LogCapturer((org.apache.logging.log4j.core.Logger) org.apache.logging.log4j.LogManager
277+
.getLogger(HRegionServer.class));
278+
279+
rs2 = cluster.addRegionServer();
280+
master.getMaster().decommissionRegionServers(
281+
Collections.singletonList(rs2.getRegionServer().getServerName()), false);
282+
rs2.start();
283+
284+
// Assert that the second regionserver has aborted
285+
testUtil.waitFor(TimeUnit.SECONDS.toMillis(90),
286+
new MatcherPredicate<>(() -> rs2.getRegionServer().isAborted(), is(true)));
287+
288+
// Assert that the log messages for DecommissionedHostRejectedException exist in the logs
289+
capturer.stopCapturing();
290+
291+
assertThat(capturer.getOutput(),
292+
containsString("Master rejected startup because the host is considered decommissioned"));
293+
294+
/**
295+
* Assert that the following log message occurred (one line):
296+
* "org.apache.hadoop.hbase.master.DecommissionedHostRejectedException:
297+
* org.apache.hadoop.hbase.master.DecommissionedHostRejectedException: Host localhost exists in
298+
* the list of decommissioned servers and Master is configured to reject decommissioned hosts"
299+
*/
300+
assertThat(Arrays.asList(capturer.getOutput().split("\n")),
301+
hasItem(allOf(containsString(DecommissionedHostRejectedException.class.getSimpleName()),
302+
containsString(DecommissionedHostRejectedException.class.getSimpleName()),
303+
containsString("Host " + rs2.getRegionServer().getServerName().getHostname()
304+
+ " exists in the list of decommissioned servers and Master is configured to reject"
305+
+ " decommissioned hosts"))));
306+
307+
assertThat(Arrays.asList(capturer.getOutput().split("\n")),
308+
hasItem(
309+
allOf(containsString("ABORTING region server " + rs2.getRegionServer().getServerName()),
310+
containsString("Unhandled"),
311+
containsString(DecommissionedHostRejectedException.class.getSimpleName()),
312+
containsString("Host " + rs2.getRegionServer().getServerName().getHostname()
313+
+ " exists in the list of decommissioned servers and Master is configured to reject"
314+
+ " decommissioned hosts"))));
315+
}
316+
244317
/**
245318
* Tests region sever reportForDuty with a non-default environment edge
246319
*/

0 commit comments

Comments
 (0)