1717 */
1818package org .apache .hadoop .hbase .master ;
1919
20+ import static org .junit .Assert .assertFalse ;
21+ import static org .junit .Assert .assertNotNull ;
22+ import static org .junit .Assert .assertNull ;
23+ import static org .junit .Assert .assertTrue ;
24+
25+ import java .io .IOException ;
2026import java .util .List ;
27+ import java .util .Optional ;
28+ import java .util .concurrent .CountDownLatch ;
2129import java .util .stream .Collectors ;
30+
31+ import org .apache .hadoop .conf .Configuration ;
2232import org .apache .hadoop .hbase .HBaseClassTestRule ;
2333import org .apache .hadoop .hbase .ServerName ;
34+ import org .apache .hadoop .hbase .StartMiniClusterOption ;
2435import org .apache .hadoop .hbase .TableName ;
36+ import org .apache .hadoop .hbase .client .RegionInfo ;
2537import org .apache .hadoop .hbase .client .Table ;
38+ import org .apache .hadoop .hbase .master .assignment .AssignmentManager ;
2639import org .apache .hadoop .hbase .master .assignment .ServerState ;
2740import org .apache .hadoop .hbase .master .assignment .ServerStateNode ;
2841import org .apache .hadoop .hbase .master .procedure .ServerCrashProcedure ;
2942import org .apache .hadoop .hbase .procedure2 .Procedure ;
3043import org .apache .hadoop .hbase .testclassification .LargeTests ;
3144import org .apache .hadoop .hbase .testclassification .MasterTests ;
32- import org .junit .Assert ;
45+ import org .apache .hadoop .hbase .util .JVMClusterUtil ;
46+ import org .apache .zookeeper .KeeperException ;
3347import org .junit .ClassRule ;
3448import org .junit .Test ;
3549import org .junit .experimental .categories .Category ;
@@ -45,6 +59,9 @@ public class TestClusterRestartFailover extends AbstractTestRestartCluster {
4559
4660 private static final Logger LOG = LoggerFactory .getLogger (TestClusterRestartFailover .class );
4761
62+ private static CountDownLatch SCP_LATCH ;
63+ private static ServerName SERVER_FOR_TEST ;
64+
4865 @ Override
4966 protected boolean splitWALCoordinatedByZk () {
5067 return true ;
@@ -55,60 +72,119 @@ private ServerStateNode getServerStateNode(ServerName serverName) {
5572 .getServerNode (serverName );
5673 }
5774
75+ /**
76+ * Test for HBASE-22964
77+ */
5878 @ Test
5979 public void test () throws Exception {
60- UTIL .startMiniCluster (3 );
80+ setupCluster ();
81+ setupTable ();
82+
83+ // Find the server which not carry hbase:namespace
84+ for (JVMClusterUtil .RegionServerThread thread : UTIL .getHBaseCluster ()
85+ .getRegionServerThreads ()) {
86+ if (!thread .getRegionServer ().getOnlineTables ().contains (TableName .NAMESPACE_TABLE_NAME )) {
87+ SERVER_FOR_TEST = thread .getRegionServer ().getServerName ();
88+ break ;
89+ }
90+ }
91+ UTIL .waitFor (60000 , () -> getServerStateNode (SERVER_FOR_TEST ) != null );
92+ ServerStateNode serverNode = getServerStateNode (SERVER_FOR_TEST );
93+ assertNotNull (serverNode );
94+ assertTrue ("serverNode should be ONLINE when cluster runs normally" ,
95+ serverNode .isInState (ServerState .ONLINE ));
96+
97+ SCP_LATCH = new CountDownLatch (1 );
98+
99+ // Shutdown cluster and restart
100+ List <Integer > ports =
101+ UTIL .getHBaseCluster ().getMaster ().getServerManager ().getOnlineServersList ().stream ()
102+ .map (serverName -> serverName .getPort ()).collect (Collectors .toList ());
103+ LOG .info ("Shutting down cluster" );
104+ UTIL .getHBaseCluster ().killAll ();
105+ UTIL .getHBaseCluster ().waitUntilShutDown ();
106+ LOG .info ("Restarting cluster" );
107+ UTIL .restartHBaseCluster (StartMiniClusterOption .builder ().masterClass (HMasterForTest .class )
108+ .numMasters (1 ).numRegionServers (3 ).rsPorts (ports ).build ());
109+ UTIL .waitFor (60000 , () -> UTIL .getHBaseCluster ().getMaster ().isInitialized ());
110+
111+ UTIL .waitFor (60000 , () -> getServerStateNode (SERVER_FOR_TEST ) != null );
112+ serverNode = getServerStateNode (SERVER_FOR_TEST );
113+ assertFalse ("serverNode should not be ONLINE during SCP processing" ,
114+ serverNode .isInState (ServerState .ONLINE ));
115+ Optional <Procedure <?>> procedure = UTIL .getHBaseCluster ().getMaster ().getProcedures ().stream ()
116+ .filter (p -> (p instanceof ServerCrashProcedure ) &&
117+ ((ServerCrashProcedure ) p ).getServerName ().equals (SERVER_FOR_TEST )).findAny ();
118+ assertTrue ("Should have one SCP for " + SERVER_FOR_TEST , procedure .isPresent ());
119+ assertFalse ("Submit the SCP for the same serverName " + SERVER_FOR_TEST + " which should fail" ,
120+ UTIL .getHBaseCluster ().getMaster ().getServerManager ().expireServer (SERVER_FOR_TEST ));
121+
122+ // Wait the SCP to finish
123+ SCP_LATCH .countDown ();
124+ UTIL .waitFor (60000 , () -> procedure .get ().isFinished ());
125+
126+ assertFalse ("Even when the SCP is finished, the duplicate SCP should not be scheduled for " +
127+ SERVER_FOR_TEST ,
128+ UTIL .getHBaseCluster ().getMaster ().getServerManager ().expireServer (SERVER_FOR_TEST ));
129+ serverNode = UTIL .getHBaseCluster ().getMaster ().getAssignmentManager ().getRegionStates ()
130+ .getServerNode (SERVER_FOR_TEST );
131+ assertNull ("serverNode should be deleted after SCP finished" , serverNode );
132+ }
133+
134+ private void setupCluster () throws Exception {
135+ UTIL .startMiniCluster (
136+ StartMiniClusterOption .builder ().masterClass (HMasterForTest .class ).numMasters (1 )
137+ .numRegionServers (3 ).build ());
61138 UTIL .waitFor (60000 , () -> UTIL .getMiniHBaseCluster ().getMaster ().isInitialized ());
62139 // wait for all SCPs finished
63140 UTIL .waitFor (60000 , () -> UTIL .getHBaseCluster ().getMaster ().getProcedures ().stream ()
64- .noneMatch (p -> p instanceof ServerCrashProcedure ));
141+ .noneMatch (p -> p instanceof ServerCrashProcedure ));
142+ UTIL .getHBaseCluster ().getMaster ().balanceSwitch (false );
143+ }
144+
145+ private void setupTable () throws Exception {
65146 TableName tableName = TABLES [0 ];
66- ServerName testServer = UTIL .getHBaseCluster ().getRegionServer (0 ).getServerName ();
67- UTIL .waitFor (30000 , () -> getServerStateNode (testServer ) != null );
68- ServerStateNode serverNode = getServerStateNode (testServer );
69- Assert .assertNotNull (serverNode );
70- Assert .assertTrue ("serverNode should be ONLINE when cluster runs normally" ,
71- serverNode .isInState (ServerState .ONLINE ));
72147 UTIL .createMultiRegionTable (tableName , FAMILY );
73- UTIL .waitTableEnabled (tableName );
148+ UTIL .waitTableAvailable (tableName );
74149 Table table = UTIL .getConnection ().getTable (tableName );
75150 for (int i = 0 ; i < 100 ; i ++) {
76151 UTIL .loadTable (table , FAMILY );
77152 }
78- List <Integer > ports =
79- UTIL .getHBaseCluster ().getMaster ().getServerManager ().getOnlineServersList ().stream ()
80- .map (serverName -> serverName .getPort ()).collect (Collectors .toList ());
81- LOG .info ("Shutting down cluster" );
82- UTIL .getHBaseCluster ().killAll ();
83- UTIL .getHBaseCluster ().waitUntilShutDown ();
84- LOG .info ("Starting cluster the second time" );
85- UTIL .restartHBaseCluster (3 , ports );
86- UTIL .waitFor (30000 , () -> UTIL .getHBaseCluster ().getMaster ().isInitialized ());
87- serverNode = UTIL .getHBaseCluster ().getMaster ().getAssignmentManager ().getRegionStates ()
88- .getServerNode (testServer );
89- Assert .assertNotNull ("serverNode should not be null when restart whole cluster" , serverNode );
90- Assert .assertFalse (serverNode .isInState (ServerState .ONLINE ));
91- LOG .info ("start to find the procedure of SCP for the severName we choose" );
92- UTIL .waitFor (60000 ,
93- () -> UTIL .getHBaseCluster ().getMaster ().getProcedures ().stream ()
94- .anyMatch (procedure -> (procedure instanceof ServerCrashProcedure ) &&
95- ((ServerCrashProcedure ) procedure ).getServerName ().equals (testServer )));
96- Assert .assertFalse ("serverNode should not be ONLINE during SCP processing" ,
97- serverNode .isInState (ServerState .ONLINE ));
98- LOG .info ("start to submit the SCP for the same serverName {} which should fail" , testServer );
99- Assert
100- .assertFalse (UTIL .getHBaseCluster ().getMaster ().getServerManager ().expireServer (testServer ));
101- Procedure <?> procedure = UTIL .getHBaseCluster ().getMaster ().getProcedures ().stream ()
102- .filter (p -> (p instanceof ServerCrashProcedure ) &&
103- ((ServerCrashProcedure ) p ).getServerName ().equals (testServer ))
104- .findAny ().get ();
105- UTIL .waitFor (60000 , () -> procedure .isFinished ());
106- LOG .info ("even when the SCP is finished, the duplicate SCP should not be scheduled for {}" ,
107- testServer );
108- Assert
109- .assertFalse (UTIL .getHBaseCluster ().getMaster ().getServerManager ().expireServer (testServer ));
110- serverNode = UTIL .getHBaseCluster ().getMaster ().getAssignmentManager ().getRegionStates ()
111- .getServerNode (testServer );
112- Assert .assertNull ("serverNode should be deleted after SCP finished" , serverNode );
153+ }
154+
155+ public static final class HMasterForTest extends HMaster {
156+
157+ public HMasterForTest (Configuration conf ) throws IOException , KeeperException {
158+ super (conf );
159+ }
160+
161+ @ Override
162+ protected AssignmentManager createAssignmentManager (MasterServices master ) {
163+ return new AssignmentManagerForTest (master );
164+ }
165+ }
166+
167+ private static final class AssignmentManagerForTest extends AssignmentManager {
168+
169+ public AssignmentManagerForTest (MasterServices master ) {
170+ super (master );
171+ }
172+
173+ @ Override
174+ public List <RegionInfo > getRegionsOnServer (ServerName serverName ) {
175+ List <RegionInfo > regions = super .getRegionsOnServer (serverName );
176+ // ServerCrashProcedure will call this method, so wait the CountDownLatch here
177+ if (SCP_LATCH != null && SERVER_FOR_TEST != null && serverName .equals (SERVER_FOR_TEST )) {
178+ try {
179+ LOG .info ("ServerCrashProcedure wait the CountDownLatch here" );
180+ SCP_LATCH .await ();
181+ LOG .info ("Continue the ServerCrashProcedure" );
182+ SCP_LATCH = null ;
183+ } catch (InterruptedException e ) {
184+ throw new RuntimeException (e );
185+ }
186+ }
187+ return regions ;
188+ }
113189 }
114190}
0 commit comments