Skip to content

Commit 827ac96

Browse files
ddupginfraio
authored and committed
HBASE-24684 Fetch ReplicationSink servers list from HMaster instead of ZooKeeper (#2077)
Signed-off-by: Wellington Chevreuil <[email protected]>
1 parent 17621c8 commit 827ac96

15 files changed

Lines changed: 337 additions & 19 deletions

File tree

hbase-protocol-shaded/src/main/protobuf/server/master/Master.proto

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -717,6 +717,13 @@ message BalancerDecisionsResponse {
717717
repeated BalancerDecision balancer_decision = 1;
718718
}
719719

720+
message ListReplicationSinkServersRequest {
721+
}
722+
723+
message ListReplicationSinkServersResponse {
724+
repeated ServerName server_name = 1;
725+
}
726+
720727
service MasterService {
721728
/** Used by the client to get the number of regions that have received the updated schema */
722729
rpc GetSchemaAlterStatus(GetSchemaAlterStatusRequest)
@@ -1146,10 +1153,13 @@ service MasterService {
11461153
returns (RenameRSGroupResponse);
11471154

11481155
rpc UpdateRSGroupConfig(UpdateRSGroupConfigRequest)
1149-
returns (UpdateRSGroupConfigResponse);
1156+
returns (UpdateRSGroupConfigResponse);
11501157

11511158
rpc GetLogEntries(LogRequest)
11521159
returns(LogEntry);
1160+
1161+
rpc ListReplicationSinkServers(ListReplicationSinkServersRequest)
1162+
returns (ListReplicationSinkServersResponse);
11531163
}
11541164

11551165
// HBCK Service definitions.

hbase-server/src/main/java/org/apache/hadoop/hbase/coprocessor/MasterObserver.java

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1782,4 +1782,20 @@ default void preHasUserPermissions(ObserverContext<MasterCoprocessorEnvironment>
17821782
default void postHasUserPermissions(ObserverContext<MasterCoprocessorEnvironment> ctx,
17831783
String userName, List<Permission> permissions) throws IOException {
17841784
}
1785+
1786+
/**
1787+
* Called before getting servers for replication sink.
1788+
* @param ctx the coprocessor instance's environment
1789+
*/
1790+
default void preListReplicationSinkServers(ObserverContext<MasterCoprocessorEnvironment> ctx)
1791+
throws IOException {
1792+
}
1793+
1794+
/**
1795+
* Called after getting servers for replication sink.
1796+
* @param ctx the coprocessor instance's environment
1797+
*/
1798+
default void postListReplicationSinkServers(ObserverContext<MasterCoprocessorEnvironment> ctx)
1799+
throws IOException {
1800+
}
17851801
}

hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3900,4 +3900,9 @@ public MetaRegionLocationCache getMetaRegionLocationCache() {
39003900
public RSGroupInfoManager getRSGroupInfoManager() {
39013901
return rsGroupInfoManager;
39023902
}
3903+
3904+
@Override
3905+
public List<ServerName> listReplicationSinkServers() throws IOException {
3906+
return this.serverManager.getOnlineServersList();
3907+
}
39033908
}

hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterCoprocessorHost.java

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2038,4 +2038,22 @@ public void call(MasterObserver observer) throws IOException {
20382038
}
20392039
});
20402040
}
2041+
2042+
public void preListReplicationSinkServers() throws IOException {
2043+
execOperation(coprocEnvironments.isEmpty() ? null : new MasterObserverOperation() {
2044+
@Override
2045+
public void call(MasterObserver observer) throws IOException {
2046+
observer.preListReplicationSinkServers(this);
2047+
}
2048+
});
2049+
}
2050+
2051+
public void postListReplicationSinkServers() throws IOException {
2052+
execOperation(coprocEnvironments.isEmpty() ? null : new MasterObserverOperation() {
2053+
@Override
2054+
public void call(MasterObserver observer) throws IOException {
2055+
observer.postListReplicationSinkServers(this);
2056+
}
2057+
});
2058+
}
20412059
}

hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -263,6 +263,8 @@
263263
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.ListNamespaceDescriptorsResponse;
264264
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.ListNamespacesRequest;
265265
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.ListNamespacesResponse;
266+
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.ListReplicationSinkServersRequest;
267+
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.ListReplicationSinkServersResponse;
266268
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.ListTableDescriptorsByNamespaceRequest;
267269
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.ListTableDescriptorsByNamespaceResponse;
268270
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.ListTableNamesByNamespaceRequest;
@@ -3375,4 +3377,23 @@ private MasterProtos.BalancerDecisionsResponse getBalancerDecisions(
33753377
.addAllBalancerDecision(balancerDecisions).build();
33763378
}
33773379

3380+
public ListReplicationSinkServersResponse listReplicationSinkServers(
3381+
RpcController controller, ListReplicationSinkServersRequest request)
3382+
throws ServiceException {
3383+
ListReplicationSinkServersResponse.Builder builder =
3384+
ListReplicationSinkServersResponse.newBuilder();
3385+
try {
3386+
if (master.getMasterCoprocessorHost() != null) {
3387+
master.getMasterCoprocessorHost().preListReplicationSinkServers();
3388+
}
3389+
builder.addAllServerName(master.listReplicationSinkServers().stream()
3390+
.map(ProtobufUtil::toServerName).collect(Collectors.toList()));
3391+
if (master.getMasterCoprocessorHost() != null) {
3392+
master.getMasterCoprocessorHost().postListReplicationSinkServers();
3393+
}
3394+
} catch (IOException e) {
3395+
throw new ServiceException(e);
3396+
}
3397+
return builder.build();
3398+
}
33783399
}

hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterServices.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -553,4 +553,10 @@ default SplitWALManager getSplitWALManager(){
553553
* @return The state of the load balancer, or false if the load balancer isn't defined.
554554
*/
555555
boolean isBalancerOn();
556+
557+
/**
558+
* Get a list of servers' addresses for replication sink.
559+
* @return a list of servers' address
560+
*/
561+
List<ServerName> listReplicationSinkServers() throws IOException;
556562
}

hbase-server/src/main/java/org/apache/hadoop/hbase/replication/HBaseReplicationEndpoint.java

Lines changed: 133 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@
1818

1919
package org.apache.hadoop.hbase.replication;
2020

21+
import static org.apache.hadoop.hbase.HConstants.DEFAULT_HBASE_RPC_SHORTOPERATION_TIMEOUT;
22+
import static org.apache.hadoop.hbase.HConstants.HBASE_RPC_SHORTOPERATION_TIMEOUT_KEY;
23+
2124
import java.io.IOException;
2225
import java.util.ArrayList;
2326
import java.util.Collections;
@@ -28,21 +31,26 @@
2831

2932
import org.apache.hadoop.conf.Configuration;
3033
import org.apache.hadoop.fs.Path;
34+
import org.apache.hadoop.hbase.Abortable;
3135
import org.apache.hadoop.hbase.HBaseConfiguration;
36+
import org.apache.hadoop.hbase.ChoreService;
3237
import org.apache.hadoop.hbase.client.AsyncClusterConnection;
3338
import org.apache.hadoop.hbase.client.AsyncRegionServerAdmin;
3439
import org.apache.hadoop.hbase.client.AsyncReplicationServerAdmin;
3540
import org.apache.hadoop.hbase.client.ClusterConnectionFactory;
3641
import org.apache.hadoop.hbase.protobuf.ReplicationProtobufUtil;
42+
import org.apache.hadoop.hbase.ScheduledChore;
43+
import org.apache.hadoop.hbase.Server;
44+
import org.apache.hadoop.hbase.ServerName;
3745
import org.apache.hadoop.hbase.security.User;
46+
import org.apache.hadoop.hbase.security.UserProvider;
47+
import org.apache.hadoop.hbase.util.FutureUtils;
3848
import org.apache.hadoop.hbase.wal.WAL;
39-
import org.apache.hadoop.hbase.zookeeper.ZKListener;
40-
import org.apache.yetus.audience.InterfaceAudience;
41-
import org.apache.hadoop.hbase.Abortable;
42-
import org.apache.hadoop.hbase.ServerName;
4349
import org.apache.hadoop.hbase.zookeeper.ZKClusterId;
50+
import org.apache.hadoop.hbase.zookeeper.ZKListener;
4451
import org.apache.hadoop.hbase.zookeeper.ZKUtil;
4552
import org.apache.hadoop.hbase.zookeeper.ZKWatcher;
53+
import org.apache.yetus.audience.InterfaceAudience;
4654
import org.apache.zookeeper.KeeperException;
4755
import org.apache.zookeeper.KeeperException.AuthFailedException;
4856
import org.apache.zookeeper.KeeperException.ConnectionLossException;
@@ -52,6 +60,12 @@
5260

5361
import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;
5462
import org.apache.hbase.thirdparty.com.google.common.collect.Maps;
63+
import org.apache.hbase.thirdparty.com.google.protobuf.ServiceException;
64+
65+
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
66+
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.ListReplicationSinkServersRequest;
67+
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.ListReplicationSinkServersResponse;
68+
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.MasterService;
5569

5670
/**
5771
* A {@link BaseReplicationEndpoint} for replication endpoints whose
@@ -63,6 +77,13 @@ public abstract class HBaseReplicationEndpoint extends BaseReplicationEndpoint
6377

6478
private static final Logger LOG = LoggerFactory.getLogger(HBaseReplicationEndpoint.class);
6579

80+
public static final String FETCH_SERVERS_USE_ZK_CONF_KEY =
81+
"hbase.replication.fetch.servers.usezk";
82+
83+
public static final String FETCH_SERVERS_INTERVAL_CONF_KEY =
84+
"hbase.replication.fetch.servers.interval";
85+
public static final int DEFAULT_FETCH_SERVERS_INTERVAL = 10 * 60 * 1000; // 10 mins
86+
6687
private ZKWatcher zkw = null;
6788
private final Object zkwLock = new Object();
6889

@@ -94,6 +115,11 @@ public abstract class HBaseReplicationEndpoint extends BaseReplicationEndpoint
94115

95116
private List<ServerName> sinkServers = new ArrayList<>(0);
96117

118+
private AsyncClusterConnection peerConnection;
119+
private boolean fetchServersUseZk = false;
120+
private FetchServersChore fetchServersChore;
121+
private int shortOperationTimeout;
122+
97123
/*
98124
* Some implementations of HBaseInterClusterReplicationEndpoint may require instantiate different
99125
* Connection implementations, or initialize it in a different way, so defining createConnection
@@ -129,6 +155,19 @@ protected void disconnect() {
129155
LOG.warn("{} Failed to close the connection", ctx.getPeerId());
130156
}
131157
}
158+
if (fetchServersChore != null) {
159+
ChoreService choreService = ctx.getServer().getChoreService();
160+
if (null != choreService) {
161+
choreService.cancelChore(fetchServersChore);
162+
}
163+
}
164+
if (peerConnection != null) {
165+
try {
166+
peerConnection.close();
167+
} catch (IOException e) {
168+
LOG.warn("Attempt to close peerConnection failed.", e);
169+
}
170+
}
132171
}
133172

134173
/**
@@ -159,8 +198,27 @@ public void stop() {
159198
}
160199

161200
@Override
162-
protected void doStart() {
201+
protected synchronized void doStart() {
202+
this.shortOperationTimeout = ctx.getLocalConfiguration().getInt(
203+
HBASE_RPC_SHORTOPERATION_TIMEOUT_KEY, DEFAULT_HBASE_RPC_SHORTOPERATION_TIMEOUT);
163204
try {
205+
if (ctx.getLocalConfiguration().getBoolean(FETCH_SERVERS_USE_ZK_CONF_KEY, false)) {
206+
fetchServersUseZk = true;
207+
} else {
208+
try {
209+
if (ReplicationUtils.isPeerClusterSupportReplicationOffload(getPeerConnection())) {
210+
fetchServersChore = new FetchServersChore(ctx.getServer(), this);
211+
ctx.getServer().getChoreService().scheduleChore(fetchServersChore);
212+
fetchServersUseZk = false;
213+
} else {
214+
fetchServersUseZk = true;
215+
}
216+
} catch (Throwable t) {
217+
fetchServersUseZk = true;
218+
LOG.warn("Peer {} try to fetch servers by admin failed. Using zk impl.",
219+
ctx.getPeerId(), t);
220+
}
221+
}
164222
reloadZkWatcher();
165223
connectPeerCluster();
166224
notifyStarted();
@@ -203,7 +261,9 @@ private void reloadZkWatcher() throws IOException {
203261
}
204262
zkw = new ZKWatcher(ctx.getConfiguration(),
205263
"connection to cluster: " + ctx.getPeerId(), this);
206-
zkw.registerListener(new PeerRegionServerListener(this));
264+
if (fetchServersUseZk) {
265+
zkw.registerListener(new PeerRegionServerListener(this));
266+
}
207267
}
208268
}
209269

@@ -228,12 +288,47 @@ public boolean isAborted() {
228288
return false;
229289
}
230290

291+
/**
292+
* Get the connection to peer cluster
293+
* @return connection to peer cluster
294+
* @throws IOException If anything goes wrong connecting
295+
*/
296+
private synchronized AsyncClusterConnection getPeerConnection() throws IOException {
297+
if (peerConnection == null) {
298+
Configuration conf = ctx.getConfiguration();
299+
peerConnection = ClusterConnectionFactory.createAsyncClusterConnection(conf, null,
300+
UserProvider.instantiate(conf).getCurrent());
301+
}
302+
return peerConnection;
303+
}
304+
305+
/**
306+
* Get the list of all the servers that are responsible for replication sink
307+
* from the specified peer master
308+
* @return list of server addresses or an empty list if the slave is unavailable
309+
*/
310+
protected List<ServerName> fetchSlavesAddresses() {
311+
try {
312+
AsyncClusterConnection peerConn = getPeerConnection();
313+
ServerName master = FutureUtils.get(peerConn.getAdmin().getMaster());
314+
MasterService.BlockingInterface masterStub = MasterService.newBlockingStub(
315+
peerConn.getRpcClient()
316+
.createBlockingRpcChannel(master, User.getCurrent(), shortOperationTimeout));
317+
ListReplicationSinkServersResponse resp = masterStub
318+
.listReplicationSinkServers(null, ListReplicationSinkServersRequest.newBuilder().build());
319+
return ProtobufUtil.toServerNameList(resp.getServerNameList());
320+
} catch (ServiceException | IOException e) {
321+
LOG.error("Peer {} fetches servers failed", ctx.getPeerId(), e);
322+
}
323+
return Collections.emptyList();
324+
}
325+
231326
/**
232327
* Get the list of all the region servers from the specified peer
233328
*
234329
* @return list of region server addresses or an empty list if the slave is unavailable
235330
*/
236-
protected List<ServerName> fetchSlavesAddresses() {
331+
protected List<ServerName> fetchSlavesAddressesByZK() {
237332
List<String> children = null;
238333
try {
239334
synchronized (zkwLock) {
@@ -256,7 +351,12 @@ protected List<ServerName> fetchSlavesAddresses() {
256351
}
257352

258353
protected synchronized void chooseSinks() {
259-
List<ServerName> slaveAddresses = fetchSlavesAddresses();
354+
List<ServerName> slaveAddresses = Collections.emptyList();
355+
if (fetchServersUseZk) {
356+
slaveAddresses = fetchSlavesAddressesByZK();
357+
} else {
358+
slaveAddresses = fetchSlavesAddresses();
359+
}
260360
if (slaveAddresses.isEmpty()) {
261361
LOG.warn("No sinks available at peer. Will not be able to replicate");
262362
}
@@ -287,6 +387,14 @@ protected synchronized SinkPeer getReplicationSink() throws IOException {
287387
return createSinkPeer(serverName);
288388
}
289389

390+
private SinkPeer createSinkPeer(ServerName serverName) throws IOException {
391+
if (ReplicationUtils.isPeerClusterSupportReplicationOffload(conn)) {
392+
return new ReplicationServerSinkPeer(serverName, conn.getReplicationServerAdmin(serverName));
393+
} else {
394+
return new RegionServerSinkPeer(serverName, conn.getRegionServerAdmin(serverName));
395+
}
396+
}
397+
290398
/**
291399
* Report a {@code SinkPeer} as being bad (i.e. an attempt to replicate to it
292400
* failed). If a single SinkPeer is reported as bad more than
@@ -396,11 +504,23 @@ public void replicateWALEntry(WAL.Entry[] entries, String replicationClusterId,
396504
}
397505
}
398506

399-
private SinkPeer createSinkPeer(ServerName serverName) throws IOException {
400-
if (ReplicationUtils.isPeerClusterSupportReplicationOffload(conn)) {
401-
return new ReplicationServerSinkPeer(serverName, conn.getReplicationServerAdmin(serverName));
402-
} else {
403-
return new RegionServerSinkPeer(serverName, conn.getRegionServerAdmin(serverName));
507+
/**
508+
* Chore that will fetch the list of servers from peer master.
509+
*/
510+
public static class FetchServersChore extends ScheduledChore {
511+
512+
private HBaseReplicationEndpoint endpoint;
513+
514+
public FetchServersChore(Server server, HBaseReplicationEndpoint endpoint) {
515+
super("Peer-" + endpoint.ctx.getPeerId() + "-FetchServersChore", server,
516+
server.getConfiguration()
517+
.getInt(FETCH_SERVERS_INTERVAL_CONF_KEY, DEFAULT_FETCH_SERVERS_INTERVAL));
518+
this.endpoint = endpoint;
519+
}
520+
521+
@Override
522+
protected void chore() {
523+
endpoint.chooseSinks();
404524
}
405525
}
406526
}

hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -343,9 +343,9 @@ private void tryStartNewShipper(String walGroupId, PriorityBlockingQueue<Path> q
343343
Threads.setDaemonThreadRunning(
344344
walReader, Thread.currentThread().getName()
345345
+ ".replicationSource.wal-reader." + walGroupId + "," + queueId,
346-
(t,e) -> this.uncaughtException(t, e, this.manager, this.getPeerId()));
346+
(t,e) -> this.uncaughtException(t, e, null, this.getPeerId()));
347347
worker.setWALReader(walReader);
348-
worker.startup((t,e) -> this.uncaughtException(t, e, this.manager, this.getPeerId()));
348+
worker.startup((t,e) -> this.uncaughtException(t, e, null, this.getPeerId()));
349349
return worker;
350350
}
351351
});

hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceShipper.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -369,6 +369,6 @@ void clearWALEntryBatch() {
369369

370370
LOG.trace("Decrementing totalBufferUsed by {}B while stopping Replication WAL Readers.",
371371
totalToDecrement.longValue());
372-
source.getSourceManager().getTotalBufferUsed().addAndGet(-totalToDecrement.longValue());
372+
source.controller.getTotalBufferUsed().addAndGet(-totalToDecrement.longValue());
373373
}
374374
}

0 commit comments

Comments (0)