1+ /**
2+ * Licensed to the Apache Software Foundation (ASF) under one
3+ * or more contributor license agreements. See the NOTICE file
4+ * distributed with this work for additional information
5+ * regarding copyright ownership. The ASF licenses this file
6+ * to you under the Apache License, Version 2.0 (the
7+ * "License"); you may not use this file except in compliance
8+ * with the License. You may obtain a copy of the License at
9+ *
10+ * http://www.apache.org/licenses/LICENSE-2.0
11+ *
12+ * Unless required by applicable law or agreed to in writing, software
13+ * distributed under the License is distributed on an "AS IS" BASIS,
14+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+ * See the License for the specific language governing permissions and
16+ * limitations under the License.
17+ */
18+ package org .apache .hadoop .hbase .master ;
19+
20+ import java .io .IOException ;
21+ import java .util .HashMap ;
22+ import java .util .LinkedList ;
23+ import java .util .List ;
24+ import java .util .Map ;
25+ import java .util .Set ;
26+
27+ import org .apache .hadoop .fs .FileStatus ;
28+ import org .apache .hadoop .fs .FileSystem ;
29+ import org .apache .hadoop .fs .Path ;
30+ import org .apache .hadoop .hbase .ScheduledChore ;
31+ import org .apache .hadoop .hbase .ServerName ;
32+ import org .apache .hadoop .hbase .client .RegionInfo ;
33+ import org .apache .hadoop .hbase .util .FSUtils ;
34+ import org .apache .hadoop .hbase .util .HbckRegionInfo ;
35+ import org .apache .hadoop .hbase .util .Pair ;
36+ import org .apache .yetus .audience .InterfaceAudience ;
37+ import org .apache .yetus .audience .InterfaceStability ;
38+ import org .slf4j .Logger ;
39+ import org .slf4j .LoggerFactory ;
40+
41+ import org .apache .hbase .thirdparty .com .google .common .collect .Lists ;
42+
43+ /**
44+ * Used to do the hbck checking job at master side.
45+ */
46+ @ InterfaceAudience .Private
47+ @ InterfaceStability .Evolving
48+ public class HbckChecker extends ScheduledChore {
49+ private static final Logger LOG = LoggerFactory .getLogger (HbckChecker .class .getName ());
50+
51+ private static final String HBCK_CHECKER_INTERVAL = "hbase.master.hbck.checker.interval" ;
52+ private static final int DEFAULT_HBCK_CHECKER_INTERVAL = 60 * 60 * 1000 ;
53+
54+ private final MasterServices master ;
55+
56+ /**
57+ * This map contains the state of all hbck items. It maps from encoded region
58+ * name to HbckRegionInfo structure. The information contained in HbckRegionInfo is used
59+ * to detect and correct consistency (hdfs/meta/deployment) problems.
60+ */
61+ private final Map <String , HbckRegionInfo > regionInfoMap = new HashMap <>();
62+
63+ private final Map <String , ServerName > orphanRegionsOnRS = new HashMap <>();
64+ private final List <String > orphanRegionsOnFS = new LinkedList <>();
65+ private final Map <String , Pair <ServerName , List <ServerName >>> inconsistentRegions =
66+ new HashMap <>();
67+
68+ private final Map <String , ServerName > orphanRegionsOnRSSnapshot = new HashMap <>();
69+ private final List <String > orphanRegionsOnFSSnapshot = new LinkedList <>();
70+ private final Map <String , Pair <ServerName , List <ServerName >>> inconsistentRegionsSnapshot =
71+ new HashMap <>();
72+
73+ private volatile boolean running = false ;
74+
75+ public HbckChecker (MasterServices master ) {
76+ super ("HbckChecker-" , master ,
77+ master .getConfiguration ().getInt (HBCK_CHECKER_INTERVAL , DEFAULT_HBCK_CHECKER_INTERVAL ));
78+ this .master = master ;
79+ }
80+
81+ @ Override
82+ protected void chore () {
83+ running = true ;
84+ regionInfoMap .clear ();
85+ orphanRegionsOnRS .clear ();
86+ orphanRegionsOnFS .clear ();
87+ inconsistentRegions .clear ();
88+ loadRegionsFromInMemoryState ();
89+ loadRegionsFromRSReport ();
90+ try {
91+ loadRegionsFromFS ();
92+ } catch (IOException e ) {
93+ LOG .warn ("Faile to load the regions from filesystem" , e );
94+ }
95+ saveCheckResultToSnapshot ();
96+ running = false ;
97+ }
98+
99+ private void saveCheckResultToSnapshot () {
100+ synchronized (orphanRegionsOnRSSnapshot ) {
101+ orphanRegionsOnRSSnapshot .clear ();
102+ orphanRegionsOnRS .entrySet ()
103+ .forEach (e -> orphanRegionsOnRSSnapshot .put (e .getKey (), e .getValue ()));
104+ }
105+ synchronized (orphanRegionsOnFSSnapshot ) {
106+ orphanRegionsOnFSSnapshot .clear ();
107+ orphanRegionsOnFSSnapshot .addAll (orphanRegionsOnFS );
108+ }
109+ synchronized (inconsistentRegionsSnapshot ) {
110+ inconsistentRegionsSnapshot .clear ();
111+ inconsistentRegions .entrySet ()
112+ .forEach (e -> inconsistentRegionsSnapshot .put (e .getKey (), e .getValue ()));
113+ }
114+ }
115+
116+ private void loadRegionsFromInMemoryState () {
117+ List <RegionState > regionStates =
118+ master .getAssignmentManager ().getRegionStates ().getRegionStates ();
119+ for (RegionState regionState : regionStates ) {
120+ RegionInfo regionInfo = regionState .getRegion ();
121+ HbckRegionInfo .MetaEntry metaEntry =
122+ new HbckRegionInfo .MetaEntry (regionInfo , regionState .getServerName (),
123+ regionState .getStamp ());
124+ regionInfoMap .put (regionInfo .getEncodedName (), new HbckRegionInfo (metaEntry ));
125+ }
126+ }
127+
128+ private void loadRegionsFromRSReport () {
129+ Map <ServerName , Set <byte []>> rsReports = master .getAssignmentManager ().getRSReports ();
130+ for (Map .Entry <ServerName , Set <byte []>> entry : rsReports .entrySet ()) {
131+ ServerName serverName = entry .getKey ();
132+ for (byte [] regionName : entry .getValue ()) {
133+ String encodedRegionName = RegionInfo .encodeRegionName (regionName );
134+ HbckRegionInfo hri = regionInfoMap .get (encodedRegionName );
135+ if (hri == null ) {
136+ orphanRegionsOnRS .put (encodedRegionName , serverName );
137+ continue ;
138+ }
139+ hri .addServer (hri .getMetaEntry (), serverName );
140+ }
141+ }
142+
143+ for (Map .Entry <String , HbckRegionInfo > entry : regionInfoMap .entrySet ()) {
144+ String encodedRegionName = entry .getKey ();
145+ HbckRegionInfo hri = entry .getValue ();
146+ ServerName locationInMeta = hri .getMetaEntry ().getRegionServer ();
147+ if (hri .getDeployedOn ().size () == 0 ) {
148+ // Master thought this region opened, but no regionserver reported it.
149+ inconsistentRegions .put (encodedRegionName , new Pair <>(locationInMeta , new LinkedList <>()));
150+ } else if (hri .getDeployedOn ().size () > 1 ) {
151+ // More than one regionserver reported opened this region
152+ inconsistentRegions .put (encodedRegionName , new Pair <>(locationInMeta , hri .getDeployedOn ()));
153+ } else if (!hri .getDeployedOn ().get (0 ).equals (locationInMeta )) {
154+ // Master thought this region opened on Server1, but regionserver reported Server2
155+ inconsistentRegions .put (encodedRegionName , new Pair <>(locationInMeta , hri .getDeployedOn ()));
156+ }
157+ }
158+ }
159+
160+ private void loadRegionsFromFS () throws IOException {
161+ Path rootDir = master .getMasterFileSystem ().getRootDir ();
162+ FileSystem fs = master .getMasterFileSystem ().getFileSystem ();
163+
164+ // list all tables from HDFS
165+ List <FileStatus > tableDirs = Lists .newArrayList ();
166+ List <Path > paths = FSUtils .getTableDirs (fs , rootDir );
167+ for (Path path : paths ) {
168+ tableDirs .add (fs .getFileStatus (path ));
169+ }
170+
171+ for (FileStatus tableDir : tableDirs ) {
172+ FileStatus [] regionDirs = fs .listStatus (tableDir .getPath ());
173+ for (FileStatus regionDir : regionDirs ) {
174+ String encodedRegionName = regionDir .getPath ().getName ();
175+ HbckRegionInfo hri = regionInfoMap .get (encodedRegionName );
176+ if (hri == null ) {
177+ orphanRegionsOnFS .add (encodedRegionName );
178+ continue ;
179+ }
180+ HbckRegionInfo .HdfsEntry hdfsEntry =
181+ new HbckRegionInfo .HdfsEntry (regionDir .getPath (), regionDir .getModificationTime ());
182+ hri .setHdfsEntry (hdfsEntry );
183+ }
184+ }
185+ }
186+
187+ /**
188+ * When running, the HBCK report may be changed later.
189+ */
190+ public boolean isRunning () {
191+ return running ;
192+ }
193+
194+ public Map <String , ServerName > getOrphanRegionsOnRS () {
195+ synchronized (orphanRegionsOnRSSnapshot ) {
196+ return this .orphanRegionsOnRSSnapshot ;
197+ }
198+ }
199+
200+ public List <String > getOrphanRegionsOnFS () {
201+ synchronized (orphanRegionsOnFSSnapshot ) {
202+ return this .orphanRegionsOnFSSnapshot ;
203+ }
204+ }
205+
206+ /**
207+ * Found the inconsistent regions. There are three case:
208+ * case 1. Master thought this region opened, but no regionserver reported it.
209+ * case 2. Master thought this region opened on Server1, but regionserver reported Server2
210+ * case 3. More than one regionservers reported opened this region
211+ *
212+ * @return the map of inconsistent regions. Key is the region name. Value is a pair of location in
213+ * meta and the regionservers which reported opened this region.
214+ */
215+ public Map <String , Pair <ServerName , List <ServerName >>> getInconsistentRegions () {
216+ synchronized (inconsistentRegionsSnapshot ) {
217+ return this .inconsistentRegionsSnapshot ;
218+ }
219+ }
220+ }
0 commit comments