Skip to content

Commit 522a471

Browse files
Umeshkumar9414 and ukumawat authored
HBASE-28690 added masterStartCode as fencing token for remote procedures (#1)
* HBASE-28690 added masterStartCode as fencing token for remote procedures
* HBASE-28690 comments updated
* HBASE-28690 add masterStartCode for RemoteProcedureRequest
* HBASE-28690 used master active time for fencing and review comments
* HBASE-28690 minor comment addition
* HBASE-28690 spotless apply
* HBASE-28690 reduce log line length for checkstyle

---------

Co-authored-by: ukumawat <ukumawat@salesforce.com>
1 parent edbb145 commit 522a471

15 files changed

Lines changed: 137 additions & 44 deletions

File tree

hbase-client/src/main/java/org/apache/hadoop/hbase/shaded/protobuf/ProtobufUtil.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3097,10 +3097,12 @@ public static CloseRegionRequest buildCloseRegionRequest(ServerName server, byte
30973097
}
30983098

30993099
public static CloseRegionRequest buildCloseRegionRequest(ServerName server, byte[] regionName,
3100-
ServerName destinationServer, long closeProcId, boolean evictCache) {
3100+
ServerName destinationServer, long closeProcId, boolean evictCache,
3101+
long initiatingMasterActiveTime) {
31013102
CloseRegionRequest.Builder builder =
31023103
getBuilder(server, regionName, destinationServer, closeProcId);
31033104
builder.setEvictCache(evictCache);
3105+
builder.setInitiatingMasterActiveTime(initiatingMasterActiveTime);
31043106
return builder.build();
31053107
}
31063108

hbase-protocol-shaded/src/main/protobuf/server/master/RegionServerStatus.proto

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,9 @@ message RegionStateTransition {
9797
optional uint64 open_seq_num = 3;
9898

9999
repeated int64 proc_id = 4;
100+
101+
// Master active time as fencing token
102+
optional int64 initiating_master_active_time = 5;
100103
enum TransitionCode {
101104
OPENED = 0;
102105
FAILED_OPEN = 1;
@@ -155,6 +158,8 @@ message RemoteProcedureResult {
155158
}
156159
required Status status = 2;
157160
optional ForeignExceptionMessage error = 3;
161+
// Master active time as fencing token
162+
optional int64 initiating_master_active_time = 4;
158163
}
159164
message ReportProcedureDoneRequest {
160165
repeated RemoteProcedureResult result = 1;

hbase-protocol-shaded/src/main/protobuf/server/region/Admin.proto

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,8 @@ message OpenRegionRequest {
8080
repeated RegionOpenInfo open_info = 1;
8181
// the intended server for this RPC.
8282
optional uint64 serverStartCode = 2;
83+
// Master active time as fencing token
84+
optional int64 initiating_master_active_time = 3;
8385
// wall clock time from master
8486
optional uint64 master_system_time = 5;
8587

@@ -123,6 +125,8 @@ message CloseRegionRequest {
123125
optional uint64 serverStartCode = 5;
124126
optional int64 close_proc_id = 6 [default = -1];
125127
optional bool evict_cache = 7 [default = false];
128+
// Master active time as fencing token
129+
optional int64 initiating_master_active_time = 8;
126130
}
127131

128132
message CloseRegionResponse {
@@ -272,6 +276,8 @@ message RemoteProcedureRequest {
272276
required uint64 proc_id = 1;
273277
required string proc_class = 2;
274278
optional bytes proc_data = 3;
279+
// Master active time as fencing token
280+
optional int64 initiating_master_active_time = 4;
275281
}
276282

277283
message ExecuteProceduresRequest {

hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
import org.apache.hadoop.hbase.DoNotRetryIOException;
4040
import org.apache.hadoop.hbase.HBaseRpcServicesBase;
4141
import org.apache.hadoop.hbase.HConstants;
42+
import org.apache.hadoop.hbase.MasterNotRunningException;
4243
import org.apache.hadoop.hbase.MetaTableAccessor;
4344
import org.apache.hadoop.hbase.NamespaceDescriptor;
4445
import org.apache.hadoop.hbase.ServerMetrics;
@@ -64,7 +65,6 @@
6465
import org.apache.hadoop.hbase.ipc.QosPriority;
6566
import org.apache.hadoop.hbase.ipc.RpcServer;
6667
import org.apache.hadoop.hbase.ipc.RpcServer.BlockingServiceAndInterface;
67-
import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException;
6868
import org.apache.hadoop.hbase.ipc.ServerRpcController;
6969
import org.apache.hadoop.hbase.master.assignment.AssignmentManager;
7070
import org.apache.hadoop.hbase.master.assignment.RegionStateNode;
@@ -396,6 +396,7 @@
396396
import org.apache.hadoop.hbase.shaded.protobuf.generated.RSGroupAdminProtos.UpdateRSGroupConfigRequest;
397397
import org.apache.hadoop.hbase.shaded.protobuf.generated.RSGroupAdminProtos.UpdateRSGroupConfigResponse;
398398
import org.apache.hadoop.hbase.shaded.protobuf.generated.RecentLogs;
399+
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos;
399400
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.FileArchiveNotificationRequest;
400401
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.FileArchiveNotificationResponse;
401402
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.GetLastFlushedSequenceIdRequest;
@@ -1854,6 +1855,15 @@ public ReportRegionStateTransitionResponse reportRegionStateTransition(RpcContro
18541855
ReportRegionStateTransitionRequest req) throws ServiceException {
18551856
try {
18561857
server.checkServiceStarted();
1858+
for (RegionServerStatusProtos.RegionStateTransition transition : req.getTransitionList()) {
1859+
long procId =
1860+
transition.getProcIdCount() > 0 ? transition.getProcId(0) : Procedure.NO_PROC_ID;
1861+
// -1 (token absent) is below any master active time, so such reports are never rejected
1862+
long initiatingMasterActiveTime = transition.hasInitiatingMasterActiveTime()
1863+
? transition.getInitiatingMasterActiveTime()
1864+
: -1;
1865+
throwOnOldMasterStartCode(procId, initiatingMasterActiveTime);
1866+
}
18571867
return server.getAssignmentManager().reportRegionStateTransition(req);
18581868
} catch (IOException ioe) {
18591869
throw new ServiceException(ioe);
@@ -2553,8 +2563,14 @@ public ReportProcedureDoneResponse reportProcedureDone(RpcController controller,
25532563
// Check that the Master is up and ready for duty before progressing. Remote side will keep trying.
25542564
try {
25552565
this.server.checkServiceStarted();
2556-
} catch (ServerNotRunningYetException snrye) {
2557-
throw new ServiceException(snrye);
2566+
for (RemoteProcedureResult result : request.getResultList()) {
2567+
// -1 (token absent) is below any master active time, so such reports are never rejected
2568+
long initiatingMasterActiveTime =
2569+
result.hasInitiatingMasterActiveTime() ? result.getInitiatingMasterActiveTime() : -1;
2570+
throwOnOldMasterStartCode(result.getProcId(), initiatingMasterActiveTime);
2571+
}
2572+
} catch (IOException ioe) {
2573+
throw new ServiceException(ioe);
25582574
}
25592575
request.getResultList().forEach(result -> {
25602576
if (result.getStatus() == RemoteProcedureResult.Status.SUCCESS) {
@@ -2567,6 +2583,18 @@ public ReportProcedureDoneResponse reportProcedureDone(RpcController controller,
25672583
return ReportProcedureDoneResponse.getDefaultInstance();
25682584
}
25692585

2586+
private void throwOnOldMasterStartCode(long procId, long initiatingMasterActiveTime)
2587+
throws MasterNotRunningException {
2588+
if (initiatingMasterActiveTime > server.getMasterActiveTime()) {
2589+
// procedure is initiated by new active master but report received on master with older active
2590+
// time
2591+
LOG.warn(
2592+
"Report for procId: {} and initiatingMasterAT {} received on master with activeTime {}",
2593+
procId, initiatingMasterActiveTime, server.getMasterActiveTime());
2594+
throw new MasterNotRunningException("Another master is active");
2595+
}
2596+
}
2597+
25702598
@Override
25712599
public FileArchiveNotificationResponse reportFileArchival(RpcController controller,
25722600
FileArchiveNotificationRequest request) throws ServiceException {

hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
import org.apache.hadoop.hbase.client.RegionInfo;
3131
import org.apache.hadoop.hbase.ipc.RpcConnectionConstants;
3232
import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException;
33+
import org.apache.hadoop.hbase.master.HMaster;
3334
import org.apache.hadoop.hbase.master.MasterServices;
3435
import org.apache.hadoop.hbase.master.ServerListener;
3536
import org.apache.hadoop.hbase.master.ServerManager;
@@ -416,13 +417,16 @@ public void dispatchOpenRequests(final MasterProcedureEnv env,
416417
public void dispatchCloseRequests(final MasterProcedureEnv env,
417418
final List<RegionCloseOperation> operations) {
418419
for (RegionCloseOperation op : operations) {
419-
request.addCloseRegion(op.buildCloseRegionRequest(getServerName()));
420+
request.addCloseRegion(op.buildCloseRegionRequest(getServerName(),
421+
((HMaster) env.getMasterServices()).getMasterActiveTime()));
420422
}
421423
}
422424

423425
@Override
424426
public void dispatchServerOperations(MasterProcedureEnv env, List<ServerOperation> operations) {
425-
operations.stream().map(o -> o.buildRequest()).forEachOrdered(request::addProc);
427+
operations.stream()
428+
.map(o -> o.buildRequest(((HMaster) env.getMasterServices()).getMasterActiveTime()))
429+
.forEachOrdered(request::addProc);
426430
}
427431

428432
// will be overridden in test.
@@ -441,7 +445,9 @@ protected final void remoteCallFailed(final MasterProcedureEnv env, final IOExce
441445
private static OpenRegionRequest buildOpenRegionRequest(final MasterProcedureEnv env,
442446
final ServerName serverName, final List<RegionOpenOperation> operations) {
443447
final OpenRegionRequest.Builder builder = OpenRegionRequest.newBuilder();
444-
builder.setServerStartCode(serverName.getStartcode());
448+
builder.setServerStartCode(serverName.getStartCode());
449+
builder
450+
.setInitiatingMasterActiveTime(((HMaster) env.getMasterServices()).getMasterActiveTime());
445451
builder.setMasterSystemTime(EnvironmentEdgeManager.currentTime());
446452
for (RegionOpenOperation op : operations) {
447453
builder.addOpenInfo(op.buildRegionOpenInfoRequest(env));
@@ -471,9 +477,10 @@ public ServerOperation(RemoteProcedure remoteProcedure, long procId, Class<?> rs
471477
this.rsProcData = rsProcData;
472478
}
473479

474-
public RemoteProcedureRequest buildRequest() {
480+
public RemoteProcedureRequest buildRequest(long initiatingMasterActiveTime) {
475481
return RemoteProcedureRequest.newBuilder().setProcId(procId)
476-
.setProcClass(rsProcClass.getName()).setProcData(ByteString.copyFrom(rsProcData)).build();
482+
.setProcClass(rsProcClass.getName()).setProcData(ByteString.copyFrom(rsProcData))
483+
.setInitiatingMasterActiveTime(initiatingMasterActiveTime).build();
477484
}
478485
}
479486

@@ -517,9 +524,10 @@ public ServerName getDestinationServer() {
517524
return destinationServer;
518525
}
519526

520-
public CloseRegionRequest buildCloseRegionRequest(final ServerName serverName) {
527+
public CloseRegionRequest buildCloseRegionRequest(final ServerName serverName,
528+
long initiatingMasterActiveTime) {
521529
return ProtobufUtil.buildCloseRegionRequest(serverName, regionInfo.getRegionName(),
522-
getDestinationServer(), procId, evictCache);
530+
getDestinationServer(), procId, evictCache, initiatingMasterActiveTime);
523531

524532
}
525533
}

hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2231,6 +2231,7 @@ public void postOpenDeployTasks(final PostOpenDeployContext context) throws IOEx
22312231
HRegion r = context.getRegion();
22322232
long openProcId = context.getOpenProcId();
22332233
long masterSystemTime = context.getMasterSystemTime();
2234+
long initiatingMasterActiveTime = context.getInitiatingMasterActiveTime();
22342235
rpcServices.checkOpen();
22352236
LOG.info("Post open deploy tasks for {}, pid={}, masterSystemTime={}",
22362237
r.getRegionInfo().getRegionNameAsString(), openProcId, masterSystemTime);
@@ -2254,7 +2255,7 @@ public void postOpenDeployTasks(final PostOpenDeployContext context) throws IOEx
22542255
// Notify master
22552256
if (
22562257
!reportRegionStateTransition(new RegionStateTransitionContext(TransitionCode.OPENED,
2257-
openSeqNum, openProcId, masterSystemTime, r.getRegionInfo()))
2258+
openSeqNum, openProcId, masterSystemTime, r.getRegionInfo(), initiatingMasterActiveTime))
22582259
) {
22592260
throw new IOException(
22602261
"Failed to report opened region to master: " + r.getRegionInfo().getRegionNameAsString());
@@ -2315,6 +2316,7 @@ private boolean skipReportingTransition(final RegionStateTransitionContext conte
23152316
for (long procId : procIds) {
23162317
transition.addProcId(procId);
23172318
}
2319+
transition.setInitiatingMasterActiveTime(context.getInitiatingMasterActiveTime());
23182320

23192321
return builder.build();
23202322
}
@@ -3533,12 +3535,15 @@ public boolean reportFileArchivalForQuotas(TableName tableName,
35333535
return true;
35343536
}
35353537

3536-
void executeProcedure(long procId, RSProcedureCallable callable) {
3537-
executorService.submit(new RSProcedureHandler(this, procId, callable));
3538+
void executeProcedure(long procId, long initiatingMasterActiveTime,
3539+
RSProcedureCallable callable) {
3540+
executorService
3541+
.submit(new RSProcedureHandler(this, procId, initiatingMasterActiveTime, callable));
35383542
}
35393543

3540-
public void remoteProcedureComplete(long procId, Throwable error) {
3541-
procedureResultReporter.complete(procId, error);
3544+
public void remoteProcedureComplete(long procId, long initiatingMasterActiveTime,
3545+
Throwable error) {
3546+
procedureResultReporter.complete(procId, initiatingMasterActiveTime, error);
35423547
}
35433548

35443549
void reportProcedureDone(ReportProcedureDoneRequest request) throws IOException {

hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RSRpcServices.java

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3859,6 +3859,8 @@ public ClearRegionBlockCacheResponse clearRegionBlockCache(RpcController control
38593859
private void executeOpenRegionProcedures(OpenRegionRequest request,
38603860
Map<TableName, TableDescriptor> tdCache) {
38613861
long masterSystemTime = request.hasMasterSystemTime() ? request.getMasterSystemTime() : -1;
3862+
long initiatingMasterActiveTime =
3863+
request.hasInitiatingMasterActiveTime() ? request.getInitiatingMasterActiveTime() : -1;
38623864
for (RegionOpenInfo regionOpenInfo : request.getOpenInfoList()) {
38633865
RegionInfo regionInfo = ProtobufUtil.toRegionInfo(regionOpenInfo.getRegion());
38643866
TableName tableName = regionInfo.getTable();
@@ -3884,14 +3886,16 @@ private void executeOpenRegionProcedures(OpenRegionRequest request,
38843886
}
38853887
long procId = regionOpenInfo.getOpenProcId();
38863888
if (server.submitRegionProcedure(procId)) {
3887-
server.getExecutorService().submit(
3888-
AssignRegionHandler.create(server, regionInfo, procId, tableDesc, masterSystemTime));
3889+
server.getExecutorService().submit(AssignRegionHandler.create(server, regionInfo, procId,
3890+
tableDesc, masterSystemTime, initiatingMasterActiveTime));
38893891
}
38903892
}
38913893
}
38923894

38933895
private void executeCloseRegionProcedures(CloseRegionRequest request) {
38943896
String encodedName;
3897+
long initiatingMasterActiveTime =
3898+
request.hasInitiatingMasterActiveTime() ? request.getInitiatingMasterActiveTime() : -1;
38953899
try {
38963900
encodedName = ProtobufUtil.getRegionEncodedName(request.getRegion());
38973901
} catch (DoNotRetryIOException e) {
@@ -3903,8 +3907,8 @@ private void executeCloseRegionProcedures(CloseRegionRequest request) {
39033907
long procId = request.getCloseProcId();
39043908
boolean evictCache = request.getEvictCache();
39053909
if (server.submitRegionProcedure(procId)) {
3906-
server.getExecutorService().submit(
3907-
UnassignRegionHandler.create(server, encodedName, procId, false, destination, evictCache));
3910+
server.getExecutorService().submit(UnassignRegionHandler.create(server, encodedName, procId,
3911+
false, destination, evictCache, initiatingMasterActiveTime));
39083912
}
39093913
}
39103914

@@ -3916,12 +3920,13 @@ private void executeProcedures(RemoteProcedureRequest request) {
39163920
} catch (Exception e) {
39173921
LOG.warn("Failed to instantiating remote procedure {}, pid={}", request.getProcClass(),
39183922
request.getProcId(), e);
3919-
server.remoteProcedureComplete(request.getProcId(), e);
3923+
server.remoteProcedureComplete(request.getProcId(), request.getInitiatingMasterActiveTime(),
3924+
e);
39203925
return;
39213926
}
39223927
callable.init(request.getProcData().toByteArray(), server);
39233928
LOG.debug("Executing remote procedure {}, pid={}", callable.getClass(), request.getProcId());
3924-
server.executeProcedure(request.getProcId(), callable);
3929+
server.executeProcedure(request.getProcId(), request.getInitiatingMasterActiveTime(), callable);
39253930
}
39263931

39273932
@Override

hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RegionServerServices.java

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -93,11 +93,14 @@ class PostOpenDeployContext {
9393
private final HRegion region;
9494
private final long openProcId;
9595
private final long masterSystemTime;
96+
private final long initiatingMasterActiveTime;
9697

97-
public PostOpenDeployContext(HRegion region, long openProcId, long masterSystemTime) {
98+
public PostOpenDeployContext(HRegion region, long openProcId, long masterSystemTime,
99+
long initiatingMasterActiveTime) {
98100
this.region = region;
99101
this.openProcId = openProcId;
100102
this.masterSystemTime = masterSystemTime;
103+
this.initiatingMasterActiveTime = initiatingMasterActiveTime;
101104
}
102105

103106
public HRegion getRegion() {
@@ -111,6 +114,10 @@ public long getOpenProcId() {
111114
public long getMasterSystemTime() {
112115
return masterSystemTime;
113116
}
117+
118+
public long getInitiatingMasterActiveTime() {
119+
return initiatingMasterActiveTime;
120+
}
114121
}
115122

116123
/**
@@ -123,23 +130,26 @@ class RegionStateTransitionContext {
123130
private final TransitionCode code;
124131
private final long openSeqNum;
125132
private final long masterSystemTime;
133+
private final long initiatingMasterActiveTime;
126134
private final long[] procIds;
127135
private final RegionInfo[] hris;
128136

129137
public RegionStateTransitionContext(TransitionCode code, long openSeqNum, long masterSystemTime,
130-
RegionInfo... hris) {
138+
long initiatingMasterActiveTime, RegionInfo... hris) {
131139
this.code = code;
132140
this.openSeqNum = openSeqNum;
133141
this.masterSystemTime = masterSystemTime;
142+
this.initiatingMasterActiveTime = initiatingMasterActiveTime;
134143
this.hris = hris;
135144
this.procIds = new long[hris.length];
136145
}
137146

138147
public RegionStateTransitionContext(TransitionCode code, long openSeqNum, long procId,
139-
long masterSystemTime, RegionInfo hri) {
148+
long masterSystemTime, RegionInfo hri, long initiatingMasterActiveTime) {
140149
this.code = code;
141150
this.openSeqNum = openSeqNum;
142151
this.masterSystemTime = masterSystemTime;
152+
this.initiatingMasterActiveTime = initiatingMasterActiveTime;
143153
this.hris = new RegionInfo[] { hri };
144154
this.procIds = new long[] { procId };
145155
}
@@ -163,6 +173,10 @@ public RegionInfo[] getHris() {
163173
public long[] getProcIds() {
164174
return procIds;
165175
}
176+
177+
public long getInitiatingMasterActiveTime() {
178+
return initiatingMasterActiveTime;
179+
}
166180
}
167181

168182
/**

hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RemoteProcedureResultReporter.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,9 @@ public RemoteProcedureResultReporter(HRegionServer server) {
5151
this.server = server;
5252
}
5353

54-
public void complete(long procId, Throwable error) {
55-
RemoteProcedureResult.Builder builder = RemoteProcedureResult.newBuilder().setProcId(procId);
54+
public void complete(long procId, long initiatingMasterActiveTime, Throwable error) {
55+
RemoteProcedureResult.Builder builder = RemoteProcedureResult.newBuilder().setProcId(procId)
56+
.setInitiatingMasterActiveTime(initiatingMasterActiveTime);
5657
if (error != null) {
5758
LOG.debug("Failed to complete execution of pid={}", procId, error);
5859
builder.setStatus(RemoteProcedureResult.Status.ERROR).setError(

hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/SplitRequest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ private void requestRegionSplit() {
8181
// are created just to pass the information to the reportRegionStateTransition().
8282
if (
8383
!server.reportRegionStateTransition(new RegionStateTransitionContext(
84-
TransitionCode.READY_TO_SPLIT, HConstants.NO_SEQNUM, -1, parent, hri_a, hri_b))
84+
TransitionCode.READY_TO_SPLIT, HConstants.NO_SEQNUM, -1, -1, parent, hri_a, hri_b))
8585
) {
8686
LOG.error("Unable to ask master to split " + parent.getRegionNameAsString());
8787
}

0 commit comments

Comments
 (0)