Skip to content

Commit 90ecb15

Browse files
committed
HBASE-22709 Add a chore thread in master to do hbck checking
1 parent a65e72d commit 90ecb15

6 files changed

Lines changed: 418 additions & 25 deletions

File tree

hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/MasterStatusTmpl.jamon

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,8 @@ AssignmentManager assignmentManager = master.getAssignmentManager();
149149
<li class="active"><a href="/master-status">Home</a></li>
150150
<li><a href="/tablesDetailed.jsp">Table Details</a></li>
151151
<%if master.isActiveMaster() %>
152-
<li><a href="/procedures.jsp">Procedures &amp; Locks</a></li>
152+
<li><a href="/procedures.jsp">Procedures &amp; Locks</a></li>
153+
<li><a href="/hbck.jsp">HBCK Report</a></li>
153154
</%if>
154155
<li><a href="/processMaster.jsp">Process Metrics</a></li>
155156
<li><a href="/logs/">Local Logs</a></li>

hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -385,6 +385,7 @@ public void run() {
385385
private ClusterStatusPublisher clusterStatusPublisherChore = null;
386386
private SnapshotCleanerChore snapshotCleanerChore = null;
387387

388+
private HbckChecker hbckChecker;
388389
CatalogJanitor catalogJanitorChore;
389390
private LogCleaner logCleaner;
390391
private HFileCleaner hfileCleaner;
@@ -1108,6 +1109,8 @@ private void finishActiveMasterInitialization(MonitoredTask status) throws IOExc
11081109
getChoreService().scheduleChore(normalizerChore);
11091110
this.catalogJanitorChore = new CatalogJanitor(this);
11101111
getChoreService().scheduleChore(catalogJanitorChore);
1112+
this.hbckChecker = new HbckChecker(this);
1113+
getChoreService().scheduleChore(hbckChecker);
11111114
this.serverManager.startChore();
11121115

11131116
// Only for rolling upgrade, where we need to migrate the data in namespace table to meta table.
@@ -1587,6 +1590,7 @@ private void stopChores() {
15871590
choreService.cancelChore(this.hfileCleaner);
15881591
choreService.cancelChore(this.replicationBarrierCleaner);
15891592
choreService.cancelChore(this.snapshotCleanerChore);
1593+
choreService.cancelChore(this.hbckChecker);
15901594
}
15911595
}
15921596

@@ -3756,4 +3760,8 @@ public Map<String, ReplicationStatus> getWalGroupsReplicationStatus() {
37563760
}
37573761
return super.getWalGroupsReplicationStatus();
37583762
}
3763+
3764+
/**
 * @return the chore that periodically runs the hbck consistency check at master side;
 *         null until the active master finishes initialization, where the chore is
 *         created and scheduled.
 */
public HbckChecker getHbckChecker() {
  return this.hbckChecker;
}
37593767
}
Lines changed: 220 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,220 @@
1+
/**
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
package org.apache.hadoop.hbase.master;
19+
20+
import java.io.IOException;
21+
import java.util.HashMap;
22+
import java.util.LinkedList;
23+
import java.util.List;
24+
import java.util.Map;
25+
import java.util.Set;
26+
27+
import org.apache.hadoop.fs.FileStatus;
28+
import org.apache.hadoop.fs.FileSystem;
29+
import org.apache.hadoop.fs.Path;
30+
import org.apache.hadoop.hbase.ScheduledChore;
31+
import org.apache.hadoop.hbase.ServerName;
32+
import org.apache.hadoop.hbase.client.RegionInfo;
33+
import org.apache.hadoop.hbase.util.FSUtils;
34+
import org.apache.hadoop.hbase.util.HbckRegionInfo;
35+
import org.apache.hadoop.hbase.util.Pair;
36+
import org.apache.yetus.audience.InterfaceAudience;
37+
import org.apache.yetus.audience.InterfaceStability;
38+
import org.slf4j.Logger;
39+
import org.slf4j.LoggerFactory;
40+
41+
import org.apache.hbase.thirdparty.com.google.common.collect.Lists;
42+
43+
/**
44+
* Used to do the hbck checking job at master side.
45+
*/
46+
@InterfaceAudience.Private
47+
@InterfaceStability.Evolving
48+
public class HbckChecker extends ScheduledChore {
49+
private static final Logger LOG = LoggerFactory.getLogger(HbckChecker.class.getName());
50+
51+
private static final String HBCK_CHECKER_INTERVAL = "hbase.master.hbck.checker.interval";
52+
private static final int DEFAULT_HBCK_CHECKER_INTERVAL = 60 * 60 * 1000;
53+
54+
private final MasterServices master;
55+
56+
/**
57+
* This map contains the state of all hbck items. It maps from encoded region
58+
* name to HbckRegionInfo structure. The information contained in HbckRegionInfo is used
59+
* to detect and correct consistency (hdfs/meta/deployment) problems.
60+
*/
61+
private final Map<String, HbckRegionInfo> regionInfoMap = new HashMap<>();
62+
63+
private final Map<String, ServerName> orphanRegionsOnRS = new HashMap<>();
64+
private final List<String> orphanRegionsOnFS = new LinkedList<>();
65+
private final Map<String, Pair<ServerName, List<ServerName>>> inconsistentRegions =
66+
new HashMap<>();
67+
68+
private final Map<String, ServerName> orphanRegionsOnRSSnapshot = new HashMap<>();
69+
private final List<String> orphanRegionsOnFSSnapshot = new LinkedList<>();
70+
private final Map<String, Pair<ServerName, List<ServerName>>> inconsistentRegionsSnapshot =
71+
new HashMap<>();
72+
73+
private volatile boolean running = false;
74+
75+
public HbckChecker(MasterServices master) {
76+
super("HbckChecker-", master,
77+
master.getConfiguration().getInt(HBCK_CHECKER_INTERVAL, DEFAULT_HBCK_CHECKER_INTERVAL));
78+
this.master = master;
79+
}
80+
81+
@Override
82+
protected void chore() {
83+
running = true;
84+
regionInfoMap.clear();
85+
orphanRegionsOnRS.clear();
86+
orphanRegionsOnFS.clear();
87+
inconsistentRegions.clear();
88+
loadRegionsFromInMemoryState();
89+
loadRegionsFromRSReport();
90+
try {
91+
loadRegionsFromFS();
92+
} catch (IOException e) {
93+
LOG.warn("Faile to load the regions from filesystem", e);
94+
}
95+
saveCheckResultToSnapshot();
96+
running = false;
97+
}
98+
99+
private void saveCheckResultToSnapshot() {
100+
synchronized (orphanRegionsOnRSSnapshot) {
101+
orphanRegionsOnRSSnapshot.clear();
102+
orphanRegionsOnRS.entrySet()
103+
.forEach(e -> orphanRegionsOnRSSnapshot.put(e.getKey(), e.getValue()));
104+
}
105+
synchronized (orphanRegionsOnFSSnapshot) {
106+
orphanRegionsOnFSSnapshot.clear();
107+
orphanRegionsOnFSSnapshot.addAll(orphanRegionsOnFS);
108+
}
109+
synchronized (inconsistentRegionsSnapshot) {
110+
inconsistentRegionsSnapshot.clear();
111+
inconsistentRegions.entrySet()
112+
.forEach(e -> inconsistentRegionsSnapshot.put(e.getKey(), e.getValue()));
113+
}
114+
}
115+
116+
private void loadRegionsFromInMemoryState() {
117+
List<RegionState> regionStates =
118+
master.getAssignmentManager().getRegionStates().getRegionStates();
119+
for (RegionState regionState : regionStates) {
120+
RegionInfo regionInfo = regionState.getRegion();
121+
HbckRegionInfo.MetaEntry metaEntry =
122+
new HbckRegionInfo.MetaEntry(regionInfo, regionState.getServerName(),
123+
regionState.getStamp());
124+
regionInfoMap.put(regionInfo.getEncodedName(), new HbckRegionInfo(metaEntry));
125+
}
126+
}
127+
128+
private void loadRegionsFromRSReport() {
129+
Map<ServerName, Set<byte[]>> rsReports = master.getAssignmentManager().getRSReports();
130+
for (Map.Entry<ServerName, Set<byte[]>> entry : rsReports.entrySet()) {
131+
ServerName serverName = entry.getKey();
132+
for (byte[] regionName : entry.getValue()) {
133+
String encodedRegionName = RegionInfo.encodeRegionName(regionName);
134+
HbckRegionInfo hri = regionInfoMap.get(encodedRegionName);
135+
if (hri == null) {
136+
orphanRegionsOnRS.put(encodedRegionName, serverName);
137+
continue;
138+
}
139+
hri.addServer(hri.getMetaEntry(), serverName);
140+
}
141+
}
142+
143+
for (Map.Entry<String, HbckRegionInfo> entry : regionInfoMap.entrySet()) {
144+
String encodedRegionName = entry.getKey();
145+
HbckRegionInfo hri = entry.getValue();
146+
ServerName locationInMeta = hri.getMetaEntry().getRegionServer();
147+
if (hri.getDeployedOn().size() == 0) {
148+
// Master thought this region opened, but no regionserver reported it.
149+
inconsistentRegions.put(encodedRegionName, new Pair<>(locationInMeta, new LinkedList<>()));
150+
} else if (hri.getDeployedOn().size() > 1) {
151+
// More than one regionserver reported opened this region
152+
inconsistentRegions.put(encodedRegionName, new Pair<>(locationInMeta, hri.getDeployedOn()));
153+
} else if (!hri.getDeployedOn().get(0).equals(locationInMeta)) {
154+
// Master thought this region opened on Server1, but regionserver reported Server2
155+
inconsistentRegions.put(encodedRegionName, new Pair<>(locationInMeta, hri.getDeployedOn()));
156+
}
157+
}
158+
}
159+
160+
private void loadRegionsFromFS() throws IOException {
161+
Path rootDir = master.getMasterFileSystem().getRootDir();
162+
FileSystem fs = master.getMasterFileSystem().getFileSystem();
163+
164+
// list all tables from HDFS
165+
List<FileStatus> tableDirs = Lists.newArrayList();
166+
List<Path> paths = FSUtils.getTableDirs(fs, rootDir);
167+
for (Path path : paths) {
168+
tableDirs.add(fs.getFileStatus(path));
169+
}
170+
171+
for (FileStatus tableDir : tableDirs) {
172+
FileStatus[] regionDirs = fs.listStatus(tableDir.getPath());
173+
for (FileStatus regionDir : regionDirs) {
174+
String encodedRegionName = regionDir.getPath().getName();
175+
HbckRegionInfo hri = regionInfoMap.get(encodedRegionName);
176+
if (hri == null) {
177+
orphanRegionsOnFS.add(encodedRegionName);
178+
continue;
179+
}
180+
HbckRegionInfo.HdfsEntry hdfsEntry =
181+
new HbckRegionInfo.HdfsEntry(regionDir.getPath(), regionDir.getModificationTime());
182+
hri.setHdfsEntry(hdfsEntry);
183+
}
184+
}
185+
}
186+
187+
/**
188+
* When running, the HBCK report may be changed later.
189+
*/
190+
public boolean isRunning() {
191+
return running;
192+
}
193+
194+
public Map<String, ServerName> getOrphanRegionsOnRS() {
195+
synchronized (orphanRegionsOnRSSnapshot) {
196+
return this.orphanRegionsOnRSSnapshot;
197+
}
198+
}
199+
200+
public List<String> getOrphanRegionsOnFS() {
201+
synchronized (orphanRegionsOnFSSnapshot) {
202+
return this.orphanRegionsOnFSSnapshot;
203+
}
204+
}
205+
206+
/**
207+
* Found the inconsistent regions. There are three case:
208+
* case 1. Master thought this region opened, but no regionserver reported it.
209+
* case 2. Master thought this region opened on Server1, but regionserver reported Server2
210+
* case 3. More than one regionservers reported opened this region
211+
*
212+
* @return the map of inconsistent regions. Key is the region name. Value is a pair of location in
213+
* meta and the regionservers which reported opened this region.
214+
*/
215+
public Map<String, Pair<ServerName, List<ServerName>>> getInconsistentRegions() {
216+
synchronized (inconsistentRegionsSnapshot) {
217+
return this.inconsistentRegionsSnapshot;
218+
}
219+
}
220+
}

hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1467,6 +1467,12 @@ public long submitServerCrash(ServerName serverName, boolean shouldSplitWal) {
14671467
LOG.info("Skip to add SCP for {} since this server should be OFFLINE already", serverName);
14681468
return -1;
14691469
}
1470+
1471+
// Remove the in-memory rsReports result
1472+
synchronized (rsReports) {
1473+
rsReports.remove(serverName);
1474+
}
1475+
14701476
// we hold the write lock here for fencing on reportRegionStateTransition. Once we set the
14711477
// server state to CRASHED, we will no longer accept the reportRegionStateTransition call from
14721478
// this server. This is used to simplify the implementation for TRSP and SCP, where we can make
@@ -2084,4 +2090,15 @@ public Map<String, Pair<ServerName, Set<ServerName>>> getProblematicRegions() {
20842090

20852091
return problematicRegions;
20862092
}
2093+
2094+
/**
2095+
* @return a snapshot of rsReports
2096+
*/
2097+
public Map<ServerName, Set<byte[]>> getRSReports() {
2098+
Map<ServerName, Set<byte[]>> rsReportsSnapshot = new HashMap<>();
2099+
synchronized (rsReports) {
2100+
rsReports.entrySet().forEach(e -> rsReportsSnapshot.put(e.getKey(), e.getValue()));
2101+
}
2102+
return rsReportsSnapshot;
2103+
}
20872104
}

0 commit comments

Comments
 (0)