Skip to content

Commit 6f23800

Browse files
taklwuStephen Wu
authored and committed
HBASE-24286: HMaster won't become healthy after cloning or creating a new cluster pointing at the same file system
HBase currently does not handle `Unknown Servers` automatically and requires users to run the hbck2 `scheduleRecoveries` command when they see unknown servers on the HBase report UI. This became a blocker for HBase 2 adoption, especially when a table wasn't disabled before shutting down an HBase cluster on the cloud or in any dynamic environment where hostnames may change frequently. Once the cluster restarts, hbase:meta still holds the old hostnames/IPs of the previous cluster, so those region servers become `Unknown Servers` and are never recycled. Our fix is to trigger a repair immediately after the CatalogJanitor finds any `Unknown Servers`, by submitting an HBCKServerCrashProcedure so that regions on an `Unknown Server` can be reassigned to other online servers. - Also fix a logic error that always deleted the meta table directory whenever InitMetaProcedure#writeFsLayout ran, in particular when the ZNode is fresh but the meta table already exists.
1 parent ce4e692 commit 6f23800

File tree

4 files changed

+252
-7
lines changed

4 files changed

+252
-7
lines changed

hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1610,6 +1610,10 @@ public enum OperationStatusCode {
16101610
"hbase.regionserver.slowlog.systable.enabled";
16111611
public static final boolean DEFAULT_SLOW_LOG_SYS_TABLE_ENABLED_KEY = false;
16121612

1613+
public static final String CATALOGJANITOR_REPAIR_UNKNOWN_SERVERS =
1614+
"hbase.catalogjanitor.repair.unknown.servers";
1615+
public static final boolean DEFAULT_CATALOGJANITOR_REPAIR_UNKNOWN_SERVERS = true;
1616+
16131617
private HConstants() {
16141618
// Can't be instantiated with this ctor.
16151619
}

hbase-server/src/main/java/org/apache/hadoop/hbase/master/CatalogJanitor.java

Lines changed: 28 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -74,17 +74,20 @@
7474
* mode, if we are NOT shutting down, AND if the assignmentmanager is loaded.
7575
* Playing it safe, we will garbage collect no-longer needed region references
7676
* only if there are no regions-in-transition (RIT).
77+
*
78+
* If any unknown servers are found, we schedule an HBCKSCP and reassign regions to online
79+
* servers.
7780
*/
7881
// TODO: Only works with single hbase:meta region currently. Fix.
7982
// TODO: Should it start over every time? Could it continue if runs into problem? Only if
8083
// problem does not mess up 'results'.
81-
// TODO: Do more by way of 'repair'; see note on unknownServers below.
8284
@InterfaceAudience.Private
8385
public class CatalogJanitor extends ScheduledChore {
8486
private static final Logger LOG = LoggerFactory.getLogger(CatalogJanitor.class.getName());
8587
private final AtomicBoolean alreadyRunning = new AtomicBoolean(false);
8688
private final AtomicBoolean enabled = new AtomicBoolean(true);
8789
private final MasterServices services;
90+
private final boolean repairUnknownServers;
8891

8992
/**
9093
* Saved report from last hbase:meta scan to completion. May be stale if having trouble
@@ -96,6 +99,9 @@ public class CatalogJanitor extends ScheduledChore {
9699
super("CatalogJanitor-" + services.getServerName().toShortString(), services,
97100
services.getConfiguration().getInt("hbase.catalogjanitor.interval", 300000));
98101
this.services = services;
102+
repairUnknownServers = services.getConfiguration()
103+
.getBoolean(HConstants.CATALOGJANITOR_REPAIR_UNKNOWN_SERVERS,
104+
HConstants.DEFAULT_CATALOGJANITOR_REPAIR_UNKNOWN_SERVERS);
99105
}
100106

101107
@Override
@@ -138,7 +144,7 @@ protected void chore() {
138144
isMetaLoaded(am)) {
139145
scan();
140146
} else {
141-
LOG.warn("CatalogJanitor is disabled! Enabled=" + getEnabled() +
147+
LOG.warn("CatalogJanitor is disabled! Enabled=" + getEnabled() +
142148
", maintenanceMode=" + this.services.isInMaintenanceMode() + ", am=" + am +
143149
", metaLoaded=" + isMetaLoaded(am) + ", hasRIT=" + isRIT(am) +
144150
" clusterShutDown=" + this.services.getServerManager().isClusterShutdown());
@@ -171,6 +177,8 @@ int scan() throws IOException {
171177
this.lastReport = scanForReport();
172178
if (!this.lastReport.isEmpty()) {
173179
LOG.warn(this.lastReport.toString());
180+
// expires unknown servers
181+
repairUnknownServers();
174182
}
175183

176184
if (isRIT(this.services.getAssignmentManager())) {
@@ -218,6 +226,20 @@ int scan() throws IOException {
218226
}
219227
}
220228

229+
void repairUnknownServers() {
230+
if(repairUnknownServers && !lastReport.unknownServers.isEmpty()) {
231+
// submit HBCKServerCrashProcedure to avoid any corner cases that in-memory region states
232+
// mismatches with hbase:meta. in fact if HBCKSCP finds any in-memory region states,
233+
// HBCKSCP is basically same as SCP.
234+
lastReport.unknownServers.stream().forEach(regionInfoServerNamePair -> {
235+
LOG.info("Submitting HBCKSCP for Unknown Region Server {}",
236+
regionInfoServerNamePair.getSecond());
237+
services.getAssignmentManager()
238+
.submitServerCrash(regionInfoServerNamePair.getSecond(), true, true);
239+
});
240+
}
241+
}
242+
221243
/**
222244
* Scan hbase:meta.
223245
* @return Return generated {@link Report}
@@ -447,9 +469,10 @@ public static class Report {
447469
private final List<Pair<RegionInfo, RegionInfo>> overlaps = new ArrayList<>();
448470

449471
/**
450-
* TODO: If CatalogJanitor finds an 'Unknown Server', it should 'fix' it by queuing
451-
* a {@link org.apache.hadoop.hbase.master.procedure.HBCKServerCrashProcedure} for
452-
* found server for it to clean up meta.
472+
* If CatalogJanitor finds an 'Unknown Server' during each scan/chore on hbase:meta,
473+
* it should be automatically fixed by queuing a
474+
* {@link org.apache.hadoop.hbase.master.procedure.HBCKServerCrashProcedure} for
475+
* found server for it to clean up meta and reassign regions to online servers.
453476
*/
454477
private final List<Pair<RegionInfo, ServerName>> unknownServers = new ArrayList<>();
455478

hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/InitMetaProcedure.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,8 +71,8 @@ private static void writeFsLayout(Path rootDir, Configuration conf) throws IOExc
7171
LOG.info("BOOTSTRAP: creating hbase:meta region");
7272
FileSystem fs = rootDir.getFileSystem(conf);
7373
Path tableDir = CommonFSUtils.getTableDir(rootDir, TableName.META_TABLE_NAME);
74-
if (fs.exists(tableDir) && !fs.delete(tableDir, true)) {
75-
LOG.warn("Can not delete partial created meta table, continue...");
74+
if (fs.exists(tableDir)) {
75+
LOG.info("Meta table directory exists, continue...");
7676
}
7777
// Bootstrapping, make sure blockcache is off. Else, one will be
7878
// created here in bootstrap and it'll need to be cleaned up. Better to
Lines changed: 218 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,218 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
package org.apache.hadoop.hbase.master;
19+
20+
import static org.junit.Assert.assertEquals;
21+
import static org.junit.Assert.assertFalse;
22+
import static org.junit.Assert.assertTrue;
23+
24+
import java.io.IOException;
25+
import java.time.Duration;
26+
import java.util.List;
27+
28+
import org.apache.hadoop.fs.Path;
29+
import org.apache.hadoop.hbase.Cell;
30+
import org.apache.hadoop.hbase.HBaseClassTestRule;
31+
import org.apache.hadoop.hbase.HBaseTestingUtility;
32+
import org.apache.hadoop.hbase.HConstants;
33+
import org.apache.hadoop.hbase.MiniHBaseCluster;
34+
import org.apache.hadoop.hbase.ServerName;
35+
import org.apache.hadoop.hbase.TableName;
36+
import org.apache.hadoop.hbase.client.Get;
37+
import org.apache.hadoop.hbase.client.Put;
38+
import org.apache.hadoop.hbase.client.RegionInfo;
39+
import org.apache.hadoop.hbase.client.Result;
40+
import org.apache.hadoop.hbase.client.Table;
41+
import org.apache.hadoop.hbase.master.region.MasterRegionFactory;
42+
import org.apache.hadoop.hbase.procedure2.store.wal.WALProcedureStore;
43+
import org.apache.hadoop.hbase.regionserver.HRegionServer;
44+
import org.apache.hadoop.hbase.testclassification.LargeTests;
45+
import org.apache.hadoop.hbase.util.Bytes;
46+
import org.apache.hadoop.hbase.util.CommonFSUtils;
47+
import org.apache.hadoop.hbase.zookeeper.ZKUtil;
48+
49+
import org.junit.ClassRule;
50+
import org.junit.Rule;
51+
import org.junit.Test;
52+
import org.junit.experimental.categories.Category;
53+
import org.junit.rules.TestName;
54+
55+
/**
56+
* Test reuse data directory when cluster failover with a set of new region servers with
57+
* different hostnames. For any hbase system table and user table can be assigned normally after
58+
* cluster restart
59+
*/
60+
@Category({ LargeTests.class })
61+
public class TestRecreateCluster {
62+
@ClassRule
63+
public static final HBaseClassTestRule CLASS_RULE =
64+
HBaseClassTestRule.forClass(TestRecreateCluster.class);
65+
66+
@Rule
67+
public TestName name = new TestName();
68+
69+
private static final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
70+
private static final int NUM_RS = 3;
71+
private static final long TIMEOUT_MS = Duration.ofMinutes(2).toMillis();
72+
73+
@Test
74+
public void testRecreateCluster_UserTableDisabled() throws Exception {
75+
TEST_UTIL.startMiniCluster(NUM_RS);
76+
try {
77+
TableName tableName = TableName.valueOf("t1");
78+
prepareDataBeforeRecreate(TEST_UTIL, tableName);
79+
TEST_UTIL.getAdmin().disableTable(tableName);
80+
TEST_UTIL.waitTableDisabled(tableName.getName());
81+
restartHBaseCluster(true);
82+
TEST_UTIL.getAdmin().enableTable(tableName);
83+
validateDataAfterRecreate(TEST_UTIL, tableName);
84+
} finally {
85+
TEST_UTIL.shutdownMiniCluster();
86+
}
87+
}
88+
89+
@Test
90+
public void testRecreateCluster_UserTableEnabled() throws Exception {
91+
validateRecreateClusterWithUserTableEnabled(true);
92+
}
93+
94+
@Test
95+
public void testRecreateCluster_UserTableEnabled_WithoutCleanupWALsAndZNodes() throws Exception {
96+
validateRecreateClusterWithUserTableEnabled(false);
97+
}
98+
99+
private void validateRecreateClusterWithUserTableEnabled(boolean cleanupWALsAndZNodes)
100+
throws Exception {
101+
TEST_UTIL.startMiniCluster(NUM_RS);
102+
try {
103+
TableName tableName = TableName.valueOf("t1");
104+
prepareDataBeforeRecreate(TEST_UTIL, tableName);
105+
restartHBaseCluster(cleanupWALsAndZNodes);
106+
validateDataAfterRecreate(TEST_UTIL, tableName);
107+
} finally {
108+
TEST_UTIL.shutdownMiniCluster();
109+
}
110+
}
111+
112+
private void restartHBaseCluster(boolean cleanUpWALsAndZNodes) throws Exception {
113+
// flush cache so that everything is on disk
114+
TEST_UTIL.getMiniHBaseCluster().flushcache();
115+
116+
List<ServerName> oldServers =
117+
TEST_UTIL.getHBaseCluster().getMaster().getServerManager().getOnlineServersList();
118+
119+
// make sure there is no procedures pending
120+
TEST_UTIL.waitFor(TIMEOUT_MS, () -> TEST_UTIL.getHBaseCluster().getMaster()
121+
.getProcedures().stream().filter(p -> p.isFinished()).findAny().isPresent());
122+
123+
// shutdown and delete data if needed
124+
Path walRootDirPath = TEST_UTIL.getMiniHBaseCluster().getMaster().getWALRootDir();
125+
Path rootDirPath = CommonFSUtils.getRootDir(TEST_UTIL.getConfiguration());
126+
TEST_UTIL.shutdownMiniHBaseCluster();
127+
128+
if (cleanUpWALsAndZNodes) {
129+
TEST_UTIL.getDFSCluster().getFileSystem()
130+
.delete(new Path(rootDirPath, MasterRegionFactory.MASTER_STORE_DIR), true);
131+
TEST_UTIL.getDFSCluster().getFileSystem()
132+
.delete(new Path(walRootDirPath, MasterRegionFactory.MASTER_STORE_DIR), true);
133+
TEST_UTIL.getDFSCluster().getFileSystem()
134+
.delete(new Path(walRootDirPath, WALProcedureStore.MASTER_PROCEDURE_LOGDIR), true);
135+
136+
TEST_UTIL.getDFSCluster().getFileSystem()
137+
.delete(new Path(walRootDirPath, HConstants.HREGION_LOGDIR_NAME), true);
138+
TEST_UTIL.getDFSCluster().getFileSystem()
139+
.delete(new Path(walRootDirPath, HConstants.HREGION_OLDLOGDIR_NAME), true);
140+
// delete all zk data
141+
// we cannot keep ZK data because it will hold the meta region states as open and
142+
// didn't submit a InitMetaProcedure
143+
ZKUtil.deleteChildrenRecursively(TEST_UTIL.getZooKeeperWatcher(),
144+
TEST_UTIL.getZooKeeperWatcher().getZNodePaths().baseZNode);
145+
TEST_UTIL.shutdownMiniZKCluster();
146+
TEST_UTIL.startMiniZKCluster();
147+
}
148+
149+
TEST_UTIL.restartHBaseCluster(NUM_RS);
150+
TEST_UTIL.waitFor(TIMEOUT_MS,
151+
() -> TEST_UTIL.getMiniHBaseCluster().getNumLiveRegionServers() == NUM_RS);
152+
153+
// make sure we have a new set of region servers with different hostnames and ports
154+
List<ServerName> newServers =
155+
TEST_UTIL.getHBaseCluster().getMaster().getServerManager().getOnlineServersList();
156+
assertFalse(newServers.stream().filter(newServer -> oldServers.contains(newServer)).findAny()
157+
.isPresent());
158+
}
159+
160+
private void prepareDataBeforeRecreate(
161+
HBaseTestingUtility testUtil, TableName tableName) throws Exception {
162+
Table table = testUtil.createTable(tableName, "f");
163+
Put put = new Put(Bytes.toBytes("r1"));
164+
put.addColumn(Bytes.toBytes("f"), Bytes.toBytes("c"), Bytes.toBytes("v"));
165+
table.put(put);
166+
167+
ensureTableNotColocatedWithSystemTable(tableName, TableName.NAMESPACE_TABLE_NAME);
168+
}
169+
170+
private void ensureTableNotColocatedWithSystemTable(TableName userTable, TableName systemTable)
171+
throws IOException, InterruptedException {
172+
MiniHBaseCluster hbaseCluster = TEST_UTIL.getHBaseCluster();
173+
assertTrue("Please start more than 1 regionserver",
174+
hbaseCluster.getRegionServerThreads().size() > 1);
175+
176+
int userTableServerNum = getServerNumForTableWithOnlyOneRegion(userTable);
177+
int systemTableServerNum = getServerNumForTableWithOnlyOneRegion(systemTable);
178+
179+
if (userTableServerNum != systemTableServerNum) {
180+
// no-ops if user table and system are already on a different host
181+
return;
182+
}
183+
184+
int destServerNum = (systemTableServerNum + 1) % NUM_RS;
185+
assertTrue(systemTableServerNum != destServerNum);
186+
187+
HRegionServer systemTableServer = hbaseCluster.getRegionServer(systemTableServerNum);
188+
HRegionServer destServer = hbaseCluster.getRegionServer(destServerNum);
189+
assertTrue(!systemTableServer.equals(destServer));
190+
// make sure the dest server is live before moving region
191+
hbaseCluster.waitForRegionServerToStart(destServer.getServerName().getHostname(),
192+
destServer.getServerName().getPort(), TIMEOUT_MS);
193+
// move region of userTable to a different regionserver not co-located with system table
194+
TEST_UTIL.moveRegionAndWait(TEST_UTIL.getAdmin().getRegions(userTable).get(0),
195+
destServer.getServerName());
196+
}
197+
198+
private int getServerNumForTableWithOnlyOneRegion(TableName tableName) throws IOException {
199+
List<RegionInfo> tableRegionInfos = TEST_UTIL.getAdmin().getRegions(tableName);
200+
assertEquals(1, tableRegionInfos.size());
201+
return TEST_UTIL.getHBaseCluster()
202+
.getServerWith(tableRegionInfos.get(0).getRegionName());
203+
}
204+
205+
private void validateDataAfterRecreate(
206+
HBaseTestingUtility testUtil, TableName tableName) throws Exception {
207+
Table t1 = testUtil.getConnection().getTable(tableName);
208+
Get get = new Get(Bytes.toBytes("r1"));
209+
get.addColumn(Bytes.toBytes("f"), Bytes.toBytes("c"));
210+
Result result = t1.get(get);
211+
assertTrue(result.advance());
212+
Cell cell = result.current();
213+
assertEquals("v", Bytes.toString(cell.getValueArray(),
214+
cell.getValueOffset(), cell.getValueLength()));
215+
assertFalse(result.advance());
216+
}
217+
218+
}

0 commit comments

Comments
 (0)