From da1200662540b9cdd7a2a5552b338b0e976bf940 Mon Sep 17 00:00:00 2001 From: Viraj Jasani Date: Tue, 15 Apr 2025 22:42:44 -0700 Subject: [PATCH 1/7] HBASE-29251 Procedure gets stuck if the procedure state cannot be persisted --- .../hbase/master/region/MasterRegion.java | 21 ++- .../hbase/util/TestMasterRegionMutation.java | 158 ++++++++++++++++++ 2 files changed, 176 insertions(+), 3 deletions(-) create mode 100644 hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestMasterRegionMutation.java diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/region/MasterRegion.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/region/MasterRegion.java index d465b110948e..dfacfb72da6b 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/region/MasterRegion.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/region/MasterRegion.java @@ -118,6 +118,11 @@ public final class MasterRegion { private MasterRegionWALRoller walRoller; + /** + * This is only for test purpose. + */ + private boolean updateFailForTest = false; + private MasterRegion(Server server, HRegion region, WALFactory walFactory, MasterRegionFlusherAndCompactor flusherAndCompactor, MasterRegionWALRoller walRoller) { this.server = server; @@ -143,13 +148,23 @@ private void shutdownWAL() { } } + @RestrictedApi(explanation = "Should only be called in tests", link = "", + allowedOnPath = ".*/src/test/.*") + public void setTestFailure() { + this.updateFailForTest = true; + } + public void update(UpdateMasterRegion action) throws IOException { try { + if (updateFailForTest) { + // test for HBASE-29251 + throw new IOException("Update failed"); + } action.update(region); flusherAndCompactor.onUpdate(); - } catch (WALSyncTimeoutIOException e) { - LOG.error(HBaseMarkers.FATAL, "WAL sync timeout. Aborting server."); - server.abort("WAL sync timeout", e); + } catch (IOException e) { + LOG.error(HBaseMarkers.FATAL, "MasterRegion mutation is not successful. Aborting server."); + server.abort("MasterRegion mutation is not successful", e); throw e; } } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestMasterRegionMutation.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestMasterRegionMutation.java new file mode 100644 index 000000000000..7fce7674b40b --- /dev/null +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestMasterRegionMutation.java @@ -0,0 +1,158 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.util; + +import org.apache.hadoop.hbase.HBaseClassTestRule; +import org.apache.hadoop.hbase.HBaseTestingUtil; +import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.SingleProcessHBaseCluster; +import org.apache.hadoop.hbase.StartTestingClusterOption; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder; +import org.apache.hadoop.hbase.client.TableDescriptor; +import org.apache.hadoop.hbase.client.TableDescriptorBuilder; +import org.apache.hadoop.hbase.master.HMaster; +import org.apache.hadoop.hbase.master.hbck.HbckChore; +import org.apache.hadoop.hbase.master.hbck.HbckReport; +import org.apache.hadoop.hbase.master.region.MasterRegion; +import org.apache.hadoop.hbase.regionserver.HRegionServer; +import org.apache.hadoop.hbase.testclassification.LargeTests; +import org.apache.hadoop.hbase.testclassification.MiscTests; +import org.junit.AfterClass; +import org.junit.Assert; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.junit.rules.TestName; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos; + +@Category({ MiscTests.class, LargeTests.class }) +public class TestMasterRegionMutation { + + private static final Logger LOG = LoggerFactory.getLogger(TestMasterRegionMutation.class); + + @ClassRule + public static final HBaseClassTestRule CLASS_RULE = + HBaseClassTestRule.forClass(TestMasterRegionMutation.class); + + @Rule + public TestName name = new TestName(); + + private static final HBaseTestingUtil TEST_UTIL = new HBaseTestingUtil(); + private static ServerName rs0; + + @BeforeClass + public static void setUpBeforeClass() throws Exception { + StartTestingClusterOption.Builder builder = StartTestingClusterOption.builder(); + builder.numMasters(2).numRegionServers(3); + TEST_UTIL.startMiniCluster(builder.build()); + SingleProcessHBaseCluster cluster = TEST_UTIL.getHBaseCluster(); + rs0 = cluster.getRegionServer(0).getServerName(); + TEST_UTIL.getAdmin().balancerSwitch(false, true); + } + + @AfterClass + public static void tearDownAfterClass() throws Exception { + TEST_UTIL.shutdownMiniCluster(); + } + + @Before + public void setUp() throws Exception { + final TableName tableName = TableName.valueOf(name.getMethodName()); + TableDescriptor tableDesc = TableDescriptorBuilder.newBuilder(tableName) + .setColumnFamily(ColumnFamilyDescriptorBuilder.of("fam1")).build(); + int startKey = 0; + int endKey = 80000; + TEST_UTIL.getAdmin().createTable(tableDesc, Bytes.toBytes(startKey), Bytes.toBytes(endKey), 9); + } + + @Test + public void testMasterRegionMutations() throws Exception { + HbckChore hbckChore = new HbckChore(TEST_UTIL.getHBaseCluster().getMaster()); + SingleProcessHBaseCluster cluster = TEST_UTIL.getHBaseCluster(); + + HRegionServer hRegionServer0 = cluster.getRegionServer(0); + HRegionServer hRegionServer1 = cluster.getRegionServer(1); + HRegionServer hRegionServer2 = cluster.getRegionServer(2); + int numRegions0 = hRegionServer0.getNumberOfOnlineRegions(); + int numRegions1 = hRegionServer1.getNumberOfOnlineRegions(); + int numRegions2 = hRegionServer2.getNumberOfOnlineRegions(); + + hbckChore.choreForTesting(); + HbckReport hbckReport = hbckChore.getLastReport(); + Assert.assertEquals(0, hbckReport.getInconsistentRegions().size()); + Assert.assertEquals(0, hbckReport.getOrphanRegionsOnFS().size()); + Assert.assertEquals(0, hbckReport.getOrphanRegionsOnRS().size()); + + MasterRegion masterRegion = cluster.getMaster().getMasterRegion(); + masterRegion.setTestFailure(); + + // move one region from server 1 to server 0 + TEST_UTIL.getAdmin() + .move(hRegionServer1.getRegions().get(0).getRegionInfo().getEncodedNameAsBytes(), rs0); + + // move one region from server 2 to server 0 + TEST_UTIL.getAdmin() + .move(hRegionServer2.getRegions().get(0).getRegionInfo().getEncodedNameAsBytes(), rs0); + + HMaster master = TEST_UTIL.getHBaseCluster().getMaster(); + + // Ensure: + // 1. num of regions before and after master abort remain same + // 2. all procedures are successfully completed + TEST_UTIL.waitFor(5000, 1000, () -> { + LOG.info("numRegions0: {} , numRegions1: {} , numRegions2: {}", numRegions0, numRegions1, + numRegions2); + LOG.info("Online regions - server0 : {} , server1: {} , server2: {}", + cluster.getRegionServer(0).getNumberOfOnlineRegions(), + cluster.getRegionServer(1).getNumberOfOnlineRegions(), + cluster.getRegionServer(2).getNumberOfOnlineRegions()); + LOG.info("Num of successfully completed procedures: {} , num of all procedures: {}", + master.getMasterProcedureExecutor().getProcedures().stream() + .filter(masterProcedureEnvProcedure -> masterProcedureEnvProcedure.getState() + == ProcedureProtos.ProcedureState.SUCCESS) + .count(), + master.getMasterProcedureExecutor().getProcedures().size()); + return (numRegions0 + numRegions1 + numRegions2) + == (cluster.getRegionServer(0).getNumberOfOnlineRegions() + + cluster.getRegionServer(1).getNumberOfOnlineRegions() + + cluster.getRegionServer(2).getNumberOfOnlineRegions()) + && master.getMasterProcedureExecutor().getProcedures().stream() + .filter(masterProcedureEnvProcedure -> masterProcedureEnvProcedure.getState() + == ProcedureProtos.ProcedureState.SUCCESS) + .count() == master.getMasterProcedureExecutor().getProcedures().size(); + }); + + // Ensure we have no inconsistent regions + TEST_UTIL.waitFor(5000, 1000, () -> { + HbckChore hbck = new HbckChore(TEST_UTIL.getHBaseCluster().getMaster()); + hbck.choreForTesting(); + HbckReport report = hbck.getLastReport(); + return report.getInconsistentRegions().isEmpty() && report.getOrphanRegionsOnFS().isEmpty() + && report.getOrphanRegionsOnRS().isEmpty(); + }); + + } + +} From e77927a3323c03361ad23467b66a6a2ff93db857 Mon Sep 17 00:00:00 2001 From: Viraj Jasani Date: Thu, 17 Apr 2025 11:58:36 -0700 Subject: [PATCH 2/7] use custom region for test --- .../hbase/master/region/MasterRegion.java | 19 +----- .../hbase/util/TestMasterRegionMutation.java | 63 +++++++++++++++++-- 2 files changed, 60 insertions(+), 22 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/region/MasterRegion.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/region/MasterRegion.java index dfacfb72da6b..0d621710f262 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/region/MasterRegion.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/region/MasterRegion.java @@ -118,11 +118,6 @@ public final class MasterRegion { private MasterRegionWALRoller walRoller; - /** - * This is only for test purpose. - */ - private boolean updateFailForTest = false; - private MasterRegion(Server server, HRegion region, WALFactory walFactory, MasterRegionFlusherAndCompactor flusherAndCompactor, MasterRegionWALRoller walRoller) { this.server = server; @@ -148,22 +143,14 @@ private void shutdownWAL() { } } - @RestrictedApi(explanation = "Should only be called in tests", link = "", - allowedOnPath = ".*/src/test/.*") - public void setTestFailure() { - this.updateFailForTest = true; - } - public void update(UpdateMasterRegion action) throws IOException { try { - if (updateFailForTest) { - // test for HBASE-29251 - throw new IOException("Update failed"); - } action.update(region); flusherAndCompactor.onUpdate(); } catch (IOException e) { - LOG.error(HBaseMarkers.FATAL, "MasterRegion mutation is not successful. Aborting server."); + LOG.error(HBaseMarkers.FATAL, + "MasterRegion mutation is not successful. Aborting server to let new active master " + + "resume failed proc store update."); server.abort("MasterRegion mutation is not successful", e); throw e; } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestMasterRegionMutation.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestMasterRegionMutation.java index 7fce7674b40b..976c9af86565 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestMasterRegionMutation.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestMasterRegionMutation.java @@ -17,22 +17,35 @@ */ package org.apache.hadoop.hbase.util; +import java.io.IOException; +import java.util.concurrent.atomic.AtomicBoolean; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.HBaseClassTestRule; import org.apache.hadoop.hbase.HBaseTestingUtil; +import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.SingleProcessHBaseCluster; import org.apache.hadoop.hbase.StartTestingClusterOption; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder; +import org.apache.hadoop.hbase.client.Mutation; +import org.apache.hadoop.hbase.client.RegionInfo; import org.apache.hadoop.hbase.client.TableDescriptor; import org.apache.hadoop.hbase.client.TableDescriptorBuilder; import org.apache.hadoop.hbase.master.HMaster; import org.apache.hadoop.hbase.master.hbck.HbckChore; import org.apache.hadoop.hbase.master.hbck.HbckReport; -import org.apache.hadoop.hbase.master.region.MasterRegion; +import org.apache.hadoop.hbase.master.region.MasterRegionFactory; +import org.apache.hadoop.hbase.regionserver.HRegion; +import org.apache.hadoop.hbase.regionserver.HRegionFileSystem; import org.apache.hadoop.hbase.regionserver.HRegionServer; +import org.apache.hadoop.hbase.regionserver.OperationStatus; +import org.apache.hadoop.hbase.regionserver.RegionServerServices; import org.apache.hadoop.hbase.testclassification.LargeTests; -import org.apache.hadoop.hbase.testclassification.MiscTests; +import org.apache.hadoop.hbase.testclassification.MasterTests; +import org.apache.hadoop.hbase.wal.WAL; import org.junit.AfterClass; import org.junit.Assert; import org.junit.Before; @@ -47,7 +60,11 @@ import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos; -@Category({ MiscTests.class, LargeTests.class }) +/** + * MasterRegion related test that ensures the operations continue even when Procedure state update + * encounters IO errors. + */ +@Category({ MasterTests.class, LargeTests.class }) public class TestMasterRegionMutation { private static final Logger LOG = LoggerFactory.getLogger(TestMasterRegionMutation.class); @@ -62,10 +79,13 @@ public class TestMasterRegionMutation { private static final HBaseTestingUtil TEST_UTIL = new HBaseTestingUtil(); private static ServerName rs0; + private static final AtomicBoolean ERROR_OUT = new AtomicBoolean(false); + @BeforeClass public static void setUpBeforeClass() throws Exception { + TEST_UTIL.getConfiguration().setClass(HConstants.REGION_IMPL, TestRegion.class, HRegion.class); StartTestingClusterOption.Builder builder = StartTestingClusterOption.builder(); - builder.numMasters(2).numRegionServers(3); + builder.numMasters(4).numRegionServers(3); TEST_UTIL.startMiniCluster(builder.build()); SingleProcessHBaseCluster cluster = TEST_UTIL.getHBaseCluster(); rs0 = cluster.getRegionServer(0).getServerName(); @@ -105,13 +125,18 @@ public void testMasterRegionMutations() throws Exception { Assert.assertEquals(0, hbckReport.getOrphanRegionsOnFS().size()); Assert.assertEquals(0, hbckReport.getOrphanRegionsOnRS().size()); - MasterRegion masterRegion = cluster.getMaster().getMasterRegion(); - masterRegion.setTestFailure(); + // procedure state store update failure needs to trigger active master abort and hence master + // failover + ERROR_OUT.set(true); // move one region from server 1 to server 0 TEST_UTIL.getAdmin() .move(hRegionServer1.getRegions().get(0).getRegionInfo().getEncodedNameAsBytes(), rs0); + // procedure state store update failure needs to trigger active master abort and hence master + // failover + ERROR_OUT.set(true); + // move one region from server 2 to server 0 TEST_UTIL.getAdmin() .move(hRegionServer2.getRegions().get(0).getRegionInfo().getEncodedNameAsBytes(), rs0); @@ -155,4 +180,30 @@ public void testMasterRegionMutations() throws Exception { } + public static class TestRegion extends HRegion { + + public TestRegion(Path tableDir, WAL wal, FileSystem fs, Configuration confParam, + RegionInfo regionInfo, TableDescriptor htd, RegionServerServices rsServices) { + super(tableDir, wal, fs, confParam, regionInfo, htd, rsServices); + } + + public TestRegion(HRegionFileSystem fs, WAL wal, Configuration confParam, TableDescriptor htd, + RegionServerServices rsServices) { + super(fs, wal, confParam, htd, rsServices); + } + + @Override + public OperationStatus[] batchMutate(Mutation[] mutations, boolean atomic, long nonceGroup, + long nonce) throws IOException { + if ( + MasterRegionFactory.TABLE_NAME.equals(getTableDescriptor().getTableName()) + && ERROR_OUT.get() + ) { + ERROR_OUT.set(false); + throw new IOException("test error"); + } + return super.batchMutate(mutations, atomic, nonceGroup, nonce); + } + } + } From c510142548a7bd8d165167ba5a88e6bffbb6d80a Mon Sep 17 00:00:00 2001 From: Viraj Jasani Date: Thu, 17 Apr 2025 14:12:54 -0700 Subject: [PATCH 3/7] minor change --- .../org/apache/hadoop/hbase/master/region/MasterRegion.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/region/MasterRegion.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/region/MasterRegion.java index 0d621710f262..09bb3014925b 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/region/MasterRegion.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/region/MasterRegion.java @@ -149,9 +149,9 @@ public void update(UpdateMasterRegion action) throws IOException { flusherAndCompactor.onUpdate(); } catch (IOException e) { LOG.error(HBaseMarkers.FATAL, - "MasterRegion mutation is not successful. Aborting server to let new active master " + "MasterRegion update is not successful. Aborting server to let new active master " + "resume failed proc store update."); - server.abort("MasterRegion mutation is not successful", e); + server.abort("MasterRegion update is not successful", e); throw e; } } From c9dd767c05d1cae12d413bee1127119575d6e016 Mon Sep 17 00:00:00 2001 From: Viraj Jasani Date: Thu, 17 Apr 2025 21:31:38 -0700 Subject: [PATCH 4/7] add comments --- .../org/apache/hadoop/hbase/master/region/MasterRegion.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/region/MasterRegion.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/region/MasterRegion.java index 09bb3014925b..31eeaecc5322 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/region/MasterRegion.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/region/MasterRegion.java @@ -148,6 +148,11 @@ public void update(UpdateMasterRegion action) throws IOException { action.update(region); flusherAndCompactor.onUpdate(); } catch (IOException e) { + // We catch IOException here to ensure that if the mutation is not successful + // even after the internal retries done within AbstractFSWAL, we better abort + // the active master so that the new active master can take care of resuming + // the procedure state which could not be persisted successfully by previously + // aborted master. Refer to Jira: HBASE-29251. LOG.error(HBaseMarkers.FATAL, "MasterRegion update is not successful. Aborting server to let new active master " + "resume failed proc store update."); From f4920793e66d5d58909ed4cb3ff9438d65502fd7 Mon Sep 17 00:00:00 2001 From: Viraj Jasani Date: Fri, 18 Apr 2025 15:18:48 -0700 Subject: [PATCH 5/7] add retriable error handling --- .../hbase/master/region/MasterRegion.java | 62 +++++++--- .../TestMasterRegionMutation1.java} | 53 ++++++--- .../master/TestMasterRegionMutation2.java | 110 ++++++++++++++++++ 3 files changed, 192 insertions(+), 33 deletions(-) rename hbase-server/src/test/java/org/apache/hadoop/hbase/{util/TestMasterRegionMutation.java => master/TestMasterRegionMutation1.java} (80%) create mode 100644 hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterRegionMutation2.java diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/region/MasterRegion.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/region/MasterRegion.java index 31eeaecc5322..997a4b0a023f 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/region/MasterRegion.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/region/MasterRegion.java @@ -27,9 +27,11 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.HBaseIOException; +import org.apache.hadoop.hbase.RegionTooBusyException; import org.apache.hadoop.hbase.Server; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor; +import org.apache.hadoop.hbase.client.ConnectionUtils; import org.apache.hadoop.hbase.client.Get; import org.apache.hadoop.hbase.client.RegionInfo; import org.apache.hadoop.hbase.client.RegionInfoBuilder; @@ -54,6 +56,7 @@ import org.apache.hadoop.hbase.util.FSUtils; import org.apache.hadoop.hbase.util.HFileArchiveUtil; import org.apache.hadoop.hbase.util.RecoverLeaseFSUtils; +import org.apache.hadoop.hbase.util.Threads; import org.apache.hadoop.hbase.wal.AbstractFSWALProvider; import org.apache.hadoop.hbase.wal.WAL; import org.apache.hadoop.hbase.wal.WALFactory; @@ -118,6 +121,10 @@ public final class MasterRegion { private MasterRegionWALRoller walRoller; + private final int maxRetriesForRegionUpdates; + + private final long regionUpdateRetryPauseTime; + private MasterRegion(Server server, HRegion region, WALFactory walFactory, MasterRegionFlusherAndCompactor flusherAndCompactor, MasterRegionWALRoller walRoller) { this.server = server; @@ -125,6 +132,10 @@ private MasterRegion(Server server, HRegion region, WALFactory walFactory, this.walFactory = walFactory; this.flusherAndCompactor = flusherAndCompactor; this.walRoller = walRoller; + this.maxRetriesForRegionUpdates = + server.getConfiguration().getInt("hbase.master.region.update.max.retries", 9); + this.regionUpdateRetryPauseTime = + server.getConfiguration().getLong("hbase.master.region.update.retry.pause", 100); } private void closeRegion(boolean abort) { @@ -143,24 +154,47 @@ private void shutdownWAL() { } } + /** + * Performs the mutation to the master region using UpdateMasterRegion update action. + * @param action Update region action. + * @throws IOException IO errors that causes active master to abort. + */ public void update(UpdateMasterRegion action) throws IOException { - try { - action.update(region); - flusherAndCompactor.onUpdate(); - } catch (IOException e) { - // We catch IOException here to ensure that if the mutation is not successful - // even after the internal retries done within AbstractFSWAL, we better abort - // the active master so that the new active master can take care of resuming - // the procedure state which could not be persisted successfully by previously - // aborted master. Refer to Jira: HBASE-29251. - LOG.error(HBaseMarkers.FATAL, - "MasterRegion update is not successful. Aborting server to let new active master " - + "resume failed proc store update."); - server.abort("MasterRegion update is not successful", e); - throw e; + for (int tries = 0; tries < maxRetriesForRegionUpdates; tries++) { + try { + // If the update is successful, return immediately. + action.update(region); + flusherAndCompactor.onUpdate(); + return; + } catch (RegionTooBusyException e) { + // RegionTooBusyException is the type of IOException for which we can retry + // for few times before aborting the active master. The master region might + // have genuine case for delayed flushes and/or some procedure bug causing + // heavy pressure on the memstore. + if (tries == (maxRetriesForRegionUpdates - 1)) { + abortServer(e); + } + LOG.info("Master region is too busy... retry attempt: {}", tries); + Threads.sleep(ConnectionUtils.getPauseTime(regionUpdateRetryPauseTime, tries)); + } catch (IOException e) { + // We catch IOException here to ensure that if the mutation is not successful + // even after the internal retries done within AbstractFSWAL, we better abort + // the active master so that the new active master can take care of resuming + // the procedure state which could not be persisted successfully by previously + // aborted master. Refer to Jira: HBASE-29251. + abortServer(e); + } } } + private void abortServer(IOException e) throws IOException { + LOG.error(HBaseMarkers.FATAL, + "MasterRegion update is not successful. Aborting server to let new active master " + + "resume failed proc store update."); + server.abort("MasterRegion update is not successful", e); + throw e; + } + public Result get(Get get) throws IOException { return region.get(get); } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestMasterRegionMutation.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterRegionMutation1.java similarity index 80% rename from hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestMasterRegionMutation.java rename to hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterRegionMutation1.java index 976c9af86565..5013450a807b 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestMasterRegionMutation.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterRegionMutation1.java @@ -15,16 +15,18 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.hadoop.hbase.util; +package org.apache.hadoop.hbase.master; import java.io.IOException; import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.HBaseClassTestRule; import org.apache.hadoop.hbase.HBaseTestingUtil; import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.RegionTooBusyException; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.SingleProcessHBaseCluster; import org.apache.hadoop.hbase.StartTestingClusterOption; @@ -34,7 +36,6 @@ import org.apache.hadoop.hbase.client.RegionInfo; import org.apache.hadoop.hbase.client.TableDescriptor; import org.apache.hadoop.hbase.client.TableDescriptorBuilder; -import org.apache.hadoop.hbase.master.HMaster; import org.apache.hadoop.hbase.master.hbck.HbckChore; import org.apache.hadoop.hbase.master.hbck.HbckReport; import org.apache.hadoop.hbase.master.region.MasterRegionFactory; @@ -45,6 +46,7 @@ import org.apache.hadoop.hbase.regionserver.RegionServerServices; import org.apache.hadoop.hbase.testclassification.LargeTests; import org.apache.hadoop.hbase.testclassification.MasterTests; +import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.wal.WAL; import org.junit.AfterClass; import org.junit.Assert; @@ -65,27 +67,30 @@ * encounters IO errors. */ @Category({ MasterTests.class, LargeTests.class }) -public class TestMasterRegionMutation { +public class TestMasterRegionMutation1 { - private static final Logger LOG = LoggerFactory.getLogger(TestMasterRegionMutation.class); + private static final Logger LOG = LoggerFactory.getLogger(TestMasterRegionMutation1.class); @ClassRule public static final HBaseClassTestRule CLASS_RULE = - HBaseClassTestRule.forClass(TestMasterRegionMutation.class); + HBaseClassTestRule.forClass(TestMasterRegionMutation1.class); @Rule public TestName name = new TestName(); - private static final HBaseTestingUtil TEST_UTIL = new HBaseTestingUtil(); - private static ServerName rs0; + protected static final HBaseTestingUtil TEST_UTIL = new HBaseTestingUtil(); + protected static ServerName rs0; - private static final AtomicBoolean ERROR_OUT = new AtomicBoolean(false); + protected static final AtomicBoolean ERROR_OUT = new AtomicBoolean(false); + private static final AtomicInteger ERROR_COUNTER = new AtomicInteger(0); + private static final AtomicBoolean FIRST_TIME_ERROR = new AtomicBoolean(true); @BeforeClass public static void setUpBeforeClass() throws Exception { - TEST_UTIL.getConfiguration().setClass(HConstants.REGION_IMPL, TestRegion.class, HRegion.class); + TEST_UTIL.getConfiguration().setClass(HConstants.REGION_IMPL, TestRegion2.class, HRegion.class); StartTestingClusterOption.Builder builder = StartTestingClusterOption.builder(); - builder.numMasters(4).numRegionServers(3); + // 1 master is expected to be aborted with this test + builder.numMasters(2).numRegionServers(3); TEST_UTIL.startMiniCluster(builder.build()); SingleProcessHBaseCluster cluster = TEST_UTIL.getHBaseCluster(); rs0 = cluster.getRegionServer(0).getServerName(); @@ -125,16 +130,15 @@ public void testMasterRegionMutations() throws Exception { Assert.assertEquals(0, hbckReport.getOrphanRegionsOnFS().size()); Assert.assertEquals(0, hbckReport.getOrphanRegionsOnRS().size()); - // procedure state store update failure needs to trigger active master abort and hence master - // failover + // procedure state store update encounters retriable error, master abort is not required ERROR_OUT.set(true); // move one region from server 1 to server 0 TEST_UTIL.getAdmin() .move(hRegionServer1.getRegions().get(0).getRegionInfo().getEncodedNameAsBytes(), rs0); - // procedure state store update failure needs to trigger active master abort and hence master - // failover + // procedure state store update encounters retriable error, however all retries are exhausted. + // This leads to the trigger of active master abort and hence master failover. ERROR_OUT.set(true); // move one region from server 2 to server 0 @@ -180,14 +184,14 @@ public void testMasterRegionMutations() throws Exception { } - public static class TestRegion extends HRegion { + public static class TestRegion2 extends HRegion { - public TestRegion(Path tableDir, WAL wal, FileSystem fs, Configuration confParam, + public TestRegion2(Path tableDir, WAL wal, FileSystem fs, Configuration confParam, RegionInfo regionInfo, TableDescriptor htd, RegionServerServices rsServices) { super(tableDir, wal, fs, confParam, regionInfo, htd, rsServices); } - public TestRegion(HRegionFileSystem fs, WAL wal, Configuration confParam, TableDescriptor htd, + public TestRegion2(HRegionFileSystem fs, WAL wal, Configuration confParam, TableDescriptor htd, RegionServerServices rsServices) { super(fs, wal, confParam, htd, rsServices); } @@ -199,8 +203,19 @@ public OperationStatus[] batchMutate(Mutation[] mutations, boolean atomic, long MasterRegionFactory.TABLE_NAME.equals(getTableDescriptor().getTableName()) && ERROR_OUT.get() ) { - ERROR_OUT.set(false); - throw new IOException("test error"); + // First time errors are recovered with enough retries + if (FIRST_TIME_ERROR.get() && ERROR_COUNTER.getAndIncrement() == 5) { + ERROR_OUT.set(false); + ERROR_COUNTER.set(0); + FIRST_TIME_ERROR.set(false); + return super.batchMutate(mutations, atomic, nonceGroup, nonce); + } + // Second time errors are not recovered with enough retries, leading to master abort + if (!FIRST_TIME_ERROR.get() && ERROR_COUNTER.getAndIncrement() == 8) { + ERROR_OUT.set(false); + ERROR_COUNTER.set(0); + } + throw new RegionTooBusyException("test error..."); } return super.batchMutate(mutations, atomic, nonceGroup, nonce); } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterRegionMutation2.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterRegionMutation2.java new file mode 100644 index 000000000000..4e89129b4b9d --- /dev/null +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterRegionMutation2.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.master; + +import java.io.IOException; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.HBaseClassTestRule; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.SingleProcessHBaseCluster; +import org.apache.hadoop.hbase.StartTestingClusterOption; +import org.apache.hadoop.hbase.client.Mutation; +import org.apache.hadoop.hbase.client.RegionInfo; +import org.apache.hadoop.hbase.client.TableDescriptor; +import org.apache.hadoop.hbase.master.region.MasterRegionFactory; +import org.apache.hadoop.hbase.regionserver.HRegion; +import org.apache.hadoop.hbase.regionserver.HRegionFileSystem; +import org.apache.hadoop.hbase.regionserver.OperationStatus; +import org.apache.hadoop.hbase.regionserver.RegionServerServices; +import org.apache.hadoop.hbase.testclassification.LargeTests; +import org.apache.hadoop.hbase.testclassification.MasterTests; +import org.apache.hadoop.hbase.wal.WAL; +import org.junit.AfterClass; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.ClassRule; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +/** + * MasterRegion related test that ensures the operations continue even when Procedure state update + * encounters non-retriable IO errors. + */ +@Category({ MasterTests.class, LargeTests.class }) +public class TestMasterRegionMutation2 extends TestMasterRegionMutation1 { + + @ClassRule + public static final HBaseClassTestRule CLASS_RULE = + HBaseClassTestRule.forClass(TestMasterRegionMutation2.class); + + @BeforeClass + public static void setUpBeforeClass() throws Exception { + TEST_UTIL.getConfiguration().setClass(HConstants.REGION_IMPL, TestRegion1.class, HRegion.class); + StartTestingClusterOption.Builder builder = StartTestingClusterOption.builder(); + // 2 masters are expected to be aborted with this test + builder.numMasters(3).numRegionServers(3); + TEST_UTIL.startMiniCluster(builder.build()); + SingleProcessHBaseCluster cluster = TEST_UTIL.getHBaseCluster(); + rs0 = cluster.getRegionServer(0).getServerName(); + TEST_UTIL.getAdmin().balancerSwitch(false, true); + } + + @AfterClass + public static void tearDownAfterClass() throws Exception { + TEST_UTIL.shutdownMiniCluster(); + } + + @Before + public void setUp() throws Exception { + super.setUp(); + } + + @Test + public void testMasterRegionMutations() throws Exception { + super.testMasterRegionMutations(); + } + + public static class TestRegion1 extends HRegion { + + public TestRegion1(Path tableDir, WAL wal, FileSystem fs, Configuration confParam, + RegionInfo regionInfo, TableDescriptor htd, RegionServerServices rsServices) { + super(tableDir, wal, fs, confParam, regionInfo, htd, rsServices); + } + + public TestRegion1(HRegionFileSystem fs, WAL wal, Configuration confParam, TableDescriptor htd, + RegionServerServices rsServices) { + super(fs, wal, confParam, htd, rsServices); + } + + @Override + public OperationStatus[] batchMutate(Mutation[] mutations, boolean atomic, long nonceGroup, + long nonce) throws IOException { + if ( + MasterRegionFactory.TABLE_NAME.equals(getTableDescriptor().getTableName()) + && ERROR_OUT.get() + ) { + ERROR_OUT.set(false); + throw new IOException("test error"); + } + return super.batchMutate(mutations, atomic, nonceGroup, nonce); + } + } + +} From 7e66d31603aced161e0a4643c127924a75ebf837 Mon Sep 17 00:00:00 2001 From: Viraj Jasani Date: Sun, 20 Apr 2025 11:08:13 -0700 Subject: [PATCH 6/7] minor nits --- .../hadoop/hbase/master/region/MasterRegion.java | 11 +++++++++-- .../hbase/master/TestMasterRegionMutation1.java | 10 +++++----- .../hbase/master/TestMasterRegionMutation2.java | 8 ++++---- 3 files changed, 18 insertions(+), 11 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/region/MasterRegion.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/region/MasterRegion.java index 997a4b0a023f..fac66feb2eb8 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/region/MasterRegion.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/region/MasterRegion.java @@ -157,7 +157,7 @@ private void shutdownWAL() { /** * Performs the mutation to the master region using UpdateMasterRegion update action. * @param action Update region action. - * @throws IOException IO errors that causes active master to abort. + * @throws IOException IO error that causes active master to abort. */ public void update(UpdateMasterRegion action) throws IOException { for (int tries = 0; tries < maxRetriesForRegionUpdates; tries++) { @@ -174,7 +174,7 @@ public void update(UpdateMasterRegion action) throws IOException { if (tries == (maxRetriesForRegionUpdates - 1)) { abortServer(e); } - LOG.info("Master region is too busy... retry attempt: {}", tries); + LOG.info("Master region {} is too busy... retry attempt: {}", region, tries); Threads.sleep(ConnectionUtils.getPauseTime(regionUpdateRetryPauseTime, tries)); } catch (IOException e) { // We catch IOException here to ensure that if the mutation is not successful @@ -187,6 +187,13 @@ public void update(UpdateMasterRegion action) throws IOException { } } + /** + * Log the error and abort the master daemon immediately. Use this utility only when procedure + * state store update fails and the only way to recover is by terminating the active master so + * that new failed-over active master can resume the procedure execution. + * @param e IO error that causes active master to abort. + * @throws IOException IO error that causes active master to abort. + */ private void abortServer(IOException e) throws IOException { LOG.error(HBaseMarkers.FATAL, "MasterRegion update is not successful. Aborting server to let new active master " diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterRegionMutation1.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterRegionMutation1.java index 5013450a807b..337cd3deeee0 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterRegionMutation1.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterRegionMutation1.java @@ -64,7 +64,7 @@ /** * MasterRegion related test that ensures the operations continue even when Procedure state update - * encounters IO errors. + * encounters retriable IO errors. */ @Category({ MasterTests.class, LargeTests.class }) public class TestMasterRegionMutation1 { @@ -87,7 +87,7 @@ public class TestMasterRegionMutation1 { @BeforeClass public static void setUpBeforeClass() throws Exception { - TEST_UTIL.getConfiguration().setClass(HConstants.REGION_IMPL, TestRegion2.class, HRegion.class); + TEST_UTIL.getConfiguration().setClass(HConstants.REGION_IMPL, TestRegion.class, HRegion.class); StartTestingClusterOption.Builder builder = StartTestingClusterOption.builder(); // 1 master is expected to be aborted with this test builder.numMasters(2).numRegionServers(3); @@ -184,14 +184,14 @@ public void testMasterRegionMutations() throws Exception { } - public static class TestRegion2 extends HRegion { + public static class TestRegion extends HRegion { - public TestRegion2(Path tableDir, WAL wal, FileSystem fs, Configuration confParam, + public TestRegion(Path tableDir, WAL wal, FileSystem fs, Configuration confParam, RegionInfo regionInfo, TableDescriptor htd, RegionServerServices rsServices) { super(tableDir, wal, fs, confParam, regionInfo, htd, rsServices); } - public TestRegion2(HRegionFileSystem fs, WAL wal, Configuration confParam, TableDescriptor htd, + public TestRegion(HRegionFileSystem fs, WAL wal, Configuration confParam, TableDescriptor htd, RegionServerServices rsServices) { super(fs, wal, confParam, htd, rsServices); } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterRegionMutation2.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterRegionMutation2.java index 4e89129b4b9d..dc918889503c 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterRegionMutation2.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterRegionMutation2.java @@ -56,7 +56,7 @@ public class TestMasterRegionMutation2 extends TestMasterRegionMutation1 { @BeforeClass public static void setUpBeforeClass() throws Exception { - TEST_UTIL.getConfiguration().setClass(HConstants.REGION_IMPL, TestRegion1.class, HRegion.class); + TEST_UTIL.getConfiguration().setClass(HConstants.REGION_IMPL, TestRegion.class, HRegion.class); StartTestingClusterOption.Builder builder = StartTestingClusterOption.builder(); // 2 masters are expected to be aborted with this test builder.numMasters(3).numRegionServers(3); @@ -81,14 +81,14 @@ public void testMasterRegionMutations() throws Exception { super.testMasterRegionMutations(); } - public static class TestRegion1 extends HRegion { + public static class TestRegion extends HRegion { - public TestRegion1(Path tableDir, WAL wal, FileSystem fs, Configuration confParam, + public TestRegion(Path tableDir, WAL wal, FileSystem fs, Configuration confParam, RegionInfo regionInfo, TableDescriptor htd, RegionServerServices rsServices) { super(tableDir, wal, fs, confParam, regionInfo, htd, rsServices); } - public TestRegion1(HRegionFileSystem fs, WAL wal, Configuration confParam, TableDescriptor htd, + public TestRegion(HRegionFileSystem fs, WAL wal, Configuration confParam, TableDescriptor htd, RegionServerServices rsServices) { super(fs, wal, confParam, htd, rsServices); } From 47edd9ace2b40679b030cf884fb086c8fa44ca65 Mon Sep 17 00:00:00 2001 From: Viraj Jasani Date: Mon, 21 Apr 2025 13:25:02 -0700 Subject: [PATCH 7/7] use flusherAndCompactor.onUpdate() and comment for backoff --- .../org/apache/hadoop/hbase/master/region/MasterRegion.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/region/MasterRegion.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/region/MasterRegion.java index fac66feb2eb8..f8d7e7a18237 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/region/MasterRegion.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/region/MasterRegion.java @@ -171,10 +171,14 @@ public void update(UpdateMasterRegion action) throws IOException { // for few times before aborting the active master. The master region might // have genuine case for delayed flushes and/or some procedure bug causing // heavy pressure on the memstore. + flusherAndCompactor.onUpdate(); if (tries == (maxRetriesForRegionUpdates - 1)) { abortServer(e); } LOG.info("Master region {} is too busy... retry attempt: {}", region, tries); + // Exponential backoff is performed by ConnectionUtils.getPauseTime(). + // It uses HConstants.RETRY_BACKOFF array for the backoff multiplier, the + // same array is used as backoff multiplier with RPC retries. Threads.sleep(ConnectionUtils.getPauseTime(regionUpdateRetryPauseTime, tries)); } catch (IOException e) { // We catch IOException here to ensure that if the mutation is not successful