Skip to content

Commit f454803

Browse files
committed
HBASE-22709 Add a chore thread in master to do hbck checking
1 parent a65e72d commit f454803

7 files changed

Lines changed: 421 additions & 144 deletions

File tree

hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/AssignmentManagerStatusTmpl.jamon

Lines changed: 0 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -42,84 +42,8 @@ int limit = 100;
4242
<%java>
4343
SortedSet<RegionState> rit = assignmentManager.getRegionStates()
4444
.getRegionsInTransitionOrderedByTimestamp();
45-
Map<String, Pair<ServerName, Set<ServerName>>> problematicRegions = assignmentManager
46-
.getProblematicRegions();
4745
</%java>
4846

49-
<%if !problematicRegions.isEmpty() %>
50-
<%java>
51-
int totalSize = problematicRegions.size();
52-
int sizePerPage = Math.min(10, totalSize);
53-
int numOfPages = (int) Math.ceil(totalSize * 1.0 / sizePerPage);
54-
</%java>
55-
<section>
56-
<h2><a name="problem-regions">Problematic Regions</a></h2>
57-
<p>
58-
<span>
59-
<% problematicRegions.size() %> problematic region(s). There are three case: 1. Master
60-
thought this region opened, but no regionserver reported it. 2. Master thought this
61-
region opened on Server1, but regionserver reported Server2. 3. More than one
62-
regionservers reported opened this region. Notice: the reported online regionservers
63-
may be not right when there are regions in transition. Please check them in
64-
regionserver's web UI.
65-
</span>
66-
</p>
67-
<div class="tabbable">
68-
<div class="tab-content">
69-
<%java int recordItr = 0; %>
70-
<%for Map.Entry<String, Pair<ServerName, Set<ServerName>>> entry : problematicRegions.entrySet() %>
71-
<%if (recordItr % sizePerPage) == 0 %>
72-
<%if recordItr == 0 %>
73-
<div class="tab-pane active" id="tab_prs<% (recordItr / sizePerPage) + 1 %>">
74-
<%else>
75-
<div class="tab-pane" id="tab_prs<% (recordItr / sizePerPage) + 1 %>">
76-
</%if>
77-
<table class="table table-striped" style="margin-bottom:0px;">
78-
<tr>
79-
<th>Region</th>
80-
<th>Location in META</th>
81-
<th>Reported Online Region Servers</th>
82-
</tr>
83-
</%if>
84-
85-
<tr>
86-
<td><% entry.getKey() %></td>
87-
<td><% entry.getValue().getFirst() %></td>
88-
<td><% entry.getValue().getSecond().stream().map(ServerName::getServerName)
89-
.collect(Collectors.joining(", ")) %></td>
90-
</tr>
91-
<%java recordItr++; %>
92-
<%if (recordItr % sizePerPage) == 0 %>
93-
</table>
94-
</div>
95-
</%if>
96-
</%for>
97-
98-
<%if (recordItr % sizePerPage) != 0 %>
99-
<%for ; (recordItr % sizePerPage) != 0 ; recordItr++ %>
100-
<tr><td colspan="3" style="height:61px"></td></tr>
101-
</%for>
102-
</table>
103-
</div>
104-
</%if>
105-
106-
</div>
107-
<nav>
108-
<ul class="nav nav-pills pagination">
109-
<%for int i = 1 ; i <= numOfPages; i++ %>
110-
<%if i == 1 %>
111-
<li class="active">
112-
<%else>
113-
<li>
114-
</%if>
115-
<a href="#tab_prs<% i %>"><% i %></a></li>
116-
</%for>
117-
</ul>
118-
</nav>
119-
</div>
120-
</section>
121-
</%if>
122-
12347
<%if !rit.isEmpty() %>
12448
<%java>
12549
long currentTime = System.currentTimeMillis();

hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/MasterStatusTmpl.jamon

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,8 @@ AssignmentManager assignmentManager = master.getAssignmentManager();
149149
<li class="active"><a href="/master-status">Home</a></li>
150150
<li><a href="/tablesDetailed.jsp">Table Details</a></li>
151151
<%if master.isActiveMaster() %>
152-
<li><a href="/procedures.jsp">Procedures &amp; Locks</a></li>
152+
<li><a href="/procedures.jsp">Procedures &amp; Locks</a></li>
153+
<li><a href="/hbck.jsp">HBCK Report</a></li>
153154
</%if>
154155
<li><a href="/processMaster.jsp">Process Metrics</a></li>
155156
<li><a href="/logs/">Local Logs</a></li>

hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -385,6 +385,7 @@ public void run() {
385385
private ClusterStatusPublisher clusterStatusPublisherChore = null;
386386
private SnapshotCleanerChore snapshotCleanerChore = null;
387387

388+
private HbckChecker hbckChecker;
388389
CatalogJanitor catalogJanitorChore;
389390
private LogCleaner logCleaner;
390391
private HFileCleaner hfileCleaner;
@@ -1108,6 +1109,8 @@ private void finishActiveMasterInitialization(MonitoredTask status) throws IOExc
11081109
getChoreService().scheduleChore(normalizerChore);
11091110
this.catalogJanitorChore = new CatalogJanitor(this);
11101111
getChoreService().scheduleChore(catalogJanitorChore);
1112+
this.hbckChecker = new HbckChecker(this);
1113+
getChoreService().scheduleChore(hbckChecker);
11111114
this.serverManager.startChore();
11121115

11131116
// Only for rolling upgrade, where we need to migrate the data in namespace table to meta table.
@@ -1587,6 +1590,7 @@ private void stopChores() {
15871590
choreService.cancelChore(this.hfileCleaner);
15881591
choreService.cancelChore(this.replicationBarrierCleaner);
15891592
choreService.cancelChore(this.snapshotCleanerChore);
1593+
choreService.cancelChore(this.hbckChecker);
15901594
}
15911595
}
15921596

@@ -3756,4 +3760,8 @@ public Map<String, ReplicationStatus> getWalGroupsReplicationStatus() {
37563760
}
37573761
return super.getWalGroupsReplicationStatus();
37583762
}
3763+
3764+
public HbckChecker getHbckChecker() {
3765+
return this.hbckChecker;
3766+
}
37593767
}
Lines changed: 220 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,220 @@
1+
/**
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
package org.apache.hadoop.hbase.master;
19+
20+
import java.io.IOException;
21+
import java.util.HashMap;
22+
import java.util.LinkedList;
23+
import java.util.List;
24+
import java.util.Map;
25+
import java.util.Set;
26+
27+
import org.apache.hadoop.fs.FileStatus;
28+
import org.apache.hadoop.fs.FileSystem;
29+
import org.apache.hadoop.fs.Path;
30+
import org.apache.hadoop.hbase.ScheduledChore;
31+
import org.apache.hadoop.hbase.ServerName;
32+
import org.apache.hadoop.hbase.client.RegionInfo;
33+
import org.apache.hadoop.hbase.util.FSUtils;
34+
import org.apache.hadoop.hbase.util.HbckRegionInfo;
35+
import org.apache.hadoop.hbase.util.Pair;
36+
import org.apache.yetus.audience.InterfaceAudience;
37+
import org.apache.yetus.audience.InterfaceStability;
38+
import org.slf4j.Logger;
39+
import org.slf4j.LoggerFactory;
40+
41+
import org.apache.hbase.thirdparty.com.google.common.collect.Lists;
42+
43+
/**
44+
* Used to do the hbck checking job at master side.
45+
*/
46+
@InterfaceAudience.Private
47+
@InterfaceStability.Evolving
48+
public class HbckChecker extends ScheduledChore {
49+
private static final Logger LOG = LoggerFactory.getLogger(HbckChecker.class.getName());
50+
51+
private static final String HBCK_CHECKER_INTERVAL = "hbase.master.hbck.checker.interval";
52+
private static final int DEFAULT_HBCK_CHECKER_INTERVAL = 60 * 60 * 1000;
53+
54+
private final MasterServices master;
55+
56+
/**
57+
* This map contains the state of all hbck items. It maps from encoded region
58+
* name to HbckRegionInfo structure. The information contained in HbckRegionInfo is used
59+
* to detect and correct consistency (hdfs/meta/deployment) problems.
60+
*/
61+
private final Map<String, HbckRegionInfo> regionInfoMap = new HashMap<>();
62+
63+
private final Map<String, ServerName> orphanRegionsOnRS = new HashMap<>();
64+
private final List<String> orphanRegionsOnFS = new LinkedList<>();
65+
private final Map<String, Pair<ServerName, List<ServerName>>> inconsistentRegions =
66+
new HashMap<>();
67+
68+
private final Map<String, ServerName> orphanRegionsOnRSSnapshot = new HashMap<>();
69+
private final List<String> orphanRegionsOnFSSnapshot = new LinkedList<>();
70+
private final Map<String, Pair<ServerName, List<ServerName>>> inconsistentRegionsSnapshot =
71+
new HashMap<>();
72+
73+
private volatile boolean running = false;
74+
75+
public HbckChecker(MasterServices master) {
76+
super("HbckChecker-", master,
77+
master.getConfiguration().getInt(HBCK_CHECKER_INTERVAL, DEFAULT_HBCK_CHECKER_INTERVAL));
78+
this.master = master;
79+
}
80+
81+
@Override
82+
protected void chore() {
83+
running = true;
84+
regionInfoMap.clear();
85+
orphanRegionsOnRS.clear();
86+
orphanRegionsOnFS.clear();
87+
inconsistentRegions.clear();
88+
loadRegionsFromInMemoryState();
89+
loadRegionsFromRSReport();
90+
try {
91+
loadRegionsFromFS();
92+
} catch (IOException e) {
93+
LOG.warn("Faile to load the regions from filesystem", e);
94+
}
95+
saveCheckResultToSnapshot();
96+
running = false;
97+
}
98+
99+
private void saveCheckResultToSnapshot() {
100+
synchronized (orphanRegionsOnRSSnapshot) {
101+
orphanRegionsOnRSSnapshot.clear();
102+
orphanRegionsOnRS.entrySet()
103+
.forEach(e -> orphanRegionsOnRSSnapshot.put(e.getKey(), e.getValue()));
104+
}
105+
synchronized (orphanRegionsOnFSSnapshot) {
106+
orphanRegionsOnFSSnapshot.clear();
107+
orphanRegionsOnFSSnapshot.addAll(orphanRegionsOnFS);
108+
}
109+
synchronized (inconsistentRegionsSnapshot) {
110+
inconsistentRegionsSnapshot.clear();
111+
inconsistentRegions.entrySet()
112+
.forEach(e -> inconsistentRegionsSnapshot.put(e.getKey(), e.getValue()));
113+
}
114+
}
115+
116+
private void loadRegionsFromInMemoryState() {
117+
List<RegionState> regionStates =
118+
master.getAssignmentManager().getRegionStates().getRegionStates();
119+
for (RegionState regionState : regionStates) {
120+
RegionInfo regionInfo = regionState.getRegion();
121+
HbckRegionInfo.MetaEntry metaEntry =
122+
new HbckRegionInfo.MetaEntry(regionInfo, regionState.getServerName(),
123+
regionState.getStamp());
124+
regionInfoMap.put(regionInfo.getEncodedName(), new HbckRegionInfo(metaEntry));
125+
}
126+
}
127+
128+
private void loadRegionsFromRSReport() {
129+
Map<ServerName, Set<byte[]>> rsReports = master.getAssignmentManager().getRSReports();
130+
for (Map.Entry<ServerName, Set<byte[]>> entry : rsReports.entrySet()) {
131+
ServerName serverName = entry.getKey();
132+
for (byte[] regionName : entry.getValue()) {
133+
String encodedRegionName = RegionInfo.encodeRegionName(regionName);
134+
HbckRegionInfo hri = regionInfoMap.get(encodedRegionName);
135+
if (hri == null) {
136+
orphanRegionsOnRS.put(encodedRegionName, serverName);
137+
continue;
138+
}
139+
hri.addServer(hri.getMetaEntry(), serverName);
140+
}
141+
}
142+
143+
for (Map.Entry<String, HbckRegionInfo> entry : regionInfoMap.entrySet()) {
144+
String encodedRegionName = entry.getKey();
145+
HbckRegionInfo hri = entry.getValue();
146+
ServerName locationInMeta = hri.getMetaEntry().getRegionServer();
147+
if (hri.getDeployedOn().size() == 0) {
148+
// Master thought this region opened, but no regionserver reported it.
149+
inconsistentRegions.put(encodedRegionName, new Pair<>(locationInMeta, new LinkedList<>()));
150+
} else if (hri.getDeployedOn().size() > 1) {
151+
// More than one regionserver reported opened this region
152+
inconsistentRegions.put(encodedRegionName, new Pair<>(locationInMeta, hri.getDeployedOn()));
153+
} else if (!hri.getDeployedOn().get(0).equals(locationInMeta)) {
154+
// Master thought this region opened on Server1, but regionserver reported Server2
155+
inconsistentRegions.put(encodedRegionName, new Pair<>(locationInMeta, hri.getDeployedOn()));
156+
}
157+
}
158+
}
159+
160+
private void loadRegionsFromFS() throws IOException {
161+
Path rootDir = master.getMasterFileSystem().getRootDir();
162+
FileSystem fs = master.getMasterFileSystem().getFileSystem();
163+
164+
// list all tables from HDFS
165+
List<FileStatus> tableDirs = Lists.newArrayList();
166+
List<Path> paths = FSUtils.getTableDirs(fs, rootDir);
167+
for (Path path : paths) {
168+
tableDirs.add(fs.getFileStatus(path));
169+
}
170+
171+
for (FileStatus tableDir : tableDirs) {
172+
FileStatus[] regionDirs = fs.listStatus(tableDir.getPath());
173+
for (FileStatus regionDir : regionDirs) {
174+
String encodedRegionName = regionDir.getPath().getName();
175+
HbckRegionInfo hri = regionInfoMap.get(encodedRegionName);
176+
if (hri == null) {
177+
orphanRegionsOnFS.add(encodedRegionName);
178+
continue;
179+
}
180+
HbckRegionInfo.HdfsEntry hdfsEntry =
181+
new HbckRegionInfo.HdfsEntry(regionDir.getPath(), regionDir.getModificationTime());
182+
hri.setHdfsEntry(hdfsEntry);
183+
}
184+
}
185+
}
186+
187+
/**
188+
* When running, the HBCK report may be changed later.
189+
*/
190+
public boolean isRunning() {
191+
return running;
192+
}
193+
194+
public Map<String, ServerName> getOrphanRegionsOnRS() {
195+
synchronized (orphanRegionsOnRSSnapshot) {
196+
return this.orphanRegionsOnRSSnapshot;
197+
}
198+
}
199+
200+
public List<String> getOrphanRegionsOnFS() {
201+
synchronized (orphanRegionsOnFSSnapshot) {
202+
return this.orphanRegionsOnFSSnapshot;
203+
}
204+
}
205+
206+
/**
207+
* Found the inconsistent regions. There are three case:
208+
* case 1. Master thought this region opened, but no regionserver reported it.
209+
* case 2. Master thought this region opened on Server1, but regionserver reported Server2
210+
* case 3. More than one regionservers reported opened this region
211+
*
212+
* @return the map of inconsistent regions. Key is the region name. Value is a pair of location in
213+
* meta and the regionservers which reported opened this region.
214+
*/
215+
public Map<String, Pair<ServerName, List<ServerName>>> getInconsistentRegions() {
216+
synchronized (inconsistentRegionsSnapshot) {
217+
return this.inconsistentRegionsSnapshot;
218+
}
219+
}
220+
}

0 commit comments

Comments
 (0)