Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
syntax = "proto2";

package hbase.pb;

option java_package = "org.apache.hadoop.hbase.shaded.protobuf.generated";
option java_outer_classname = "ReplicationServerStatusProtos";
option java_generic_services = true;
option java_generate_equals_and_hash = true;
option optimize_for = SPEED;

import "server/master/RegionServerStatus.proto";

// Status-reporting service for replication servers.
service ReplicationServerStatusService {

// Report RPC: takes a RegionServerReportRequest and returns a
// RegionServerReportResponse.
// NOTE(review): both message types are presumably declared in the imported
// server/master/RegionServerStatus.proto -- confirm.
rpc ReplicationServerReport(RegionServerReportRequest)
returns(RegionServerReportResponse);
}
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,8 @@ public void run() {
// manager of assignment nodes in zookeeper
private AssignmentManager assignmentManager;

// server manager to deal with replication server info
private ReplicationServerManager replicationServerManager;

/**
* Cache for the meta region replica's locations. Also tracks their changes to avoid stale
Expand Down Expand Up @@ -963,6 +965,8 @@ private void finishActiveMasterInitialization(MonitoredTask status) throws IOExc
.collect(Collectors.toList());
this.assignmentManager.setupRIT(ritList);

this.replicationServerManager = new ReplicationServerManager(this);

// Start RegionServerTracker with listing of servers found with exiting SCPs -- these should
// be registered in the deadServers set -- and with the list of servernames out on the
// filesystem that COULD BE 'alive' (we'll schedule SCPs for each and let SCP figure it out).
Expand Down Expand Up @@ -1131,6 +1135,7 @@ private void finishActiveMasterInitialization(MonitoredTask status) throws IOExc
this.hbckChore = new HbckChore(this);
getChoreService().scheduleChore(hbckChore);
this.serverManager.startChore();
this.replicationServerManager.startChore();

// Only for rolling upgrade, where we need to migrate the data in namespace table to meta table.
if (!waitForNamespaceOnline()) {
Expand Down Expand Up @@ -1389,6 +1394,11 @@ public ServerManager getServerManager() {
return this.serverManager;
}

/**
 * @return the {@link ReplicationServerManager} held by this master. The field is assigned in
 *   {@code finishActiveMasterInitialization}, so it is still {@code null} before that runs.
 */
@Override
public ReplicationServerManager getReplicationServerManager() {
return this.replicationServerManager;
}

@Override
public MasterFileSystem getMasterFileSystem() {
return this.fileSystemManager;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -401,6 +401,7 @@
import org.apache.hadoop.hbase.shaded.protobuf.generated.ReplicationProtos.TransitReplicationPeerSyncReplicationStateResponse;
import org.apache.hadoop.hbase.shaded.protobuf.generated.ReplicationProtos.UpdateReplicationPeerConfigRequest;
import org.apache.hadoop.hbase.shaded.protobuf.generated.ReplicationProtos.UpdateReplicationPeerConfigResponse;
import org.apache.hadoop.hbase.shaded.protobuf.generated.ReplicationServerStatusProtos.ReplicationServerStatusService;
import org.apache.hadoop.hbase.shaded.protobuf.generated.SnapshotProtos.SnapshotDescription;
import org.apache.hadoop.hbase.shaded.protobuf.generated.VisibilityLabelsProtos.VisibilityLabelsService;

Expand All @@ -412,7 +413,7 @@
public class MasterRpcServices extends RSRpcServices implements
MasterService.BlockingInterface, RegionServerStatusService.BlockingInterface,
LockService.BlockingInterface, HbckService.BlockingInterface,
ClientMetaService.BlockingInterface {
ClientMetaService.BlockingInterface, ReplicationServerStatusService.BlockingInterface {

private static final Logger LOG = LoggerFactory.getLogger(MasterRpcServices.class.getName());
private static final Logger AUDITLOG =
Expand Down Expand Up @@ -546,7 +547,7 @@ boolean synchronousBalanceSwitch(final boolean b) throws IOException {
*/
@Override
protected List<BlockingServiceAndInterface> getServices() {
List<BlockingServiceAndInterface> bssi = new ArrayList<>(5);
List<BlockingServiceAndInterface> bssi = new ArrayList<>(6);
bssi.add(new BlockingServiceAndInterface(
MasterService.newReflectiveBlockingService(this),
MasterService.BlockingInterface.class));
Expand All @@ -559,6 +560,9 @@ protected List<BlockingServiceAndInterface> getServices() {
HbckService.BlockingInterface.class));
bssi.add(new BlockingServiceAndInterface(ClientMetaService.newReflectiveBlockingService(this),
ClientMetaService.BlockingInterface.class));
bssi.add(new BlockingServiceAndInterface(
ReplicationServerStatusService.newReflectiveBlockingService(this),
ReplicationServerStatusService.BlockingInterface.class));
bssi.addAll(super.getServices());
return bssi;
}
Expand Down Expand Up @@ -3402,4 +3406,33 @@ public ListReplicationSinkServersResponse listReplicationSinkServers(
}
return builder.build();
}

/**
 * Handles a status report sent by a replication server.
 * <p>
 * Converts the reported load into a {@link ServerMetrics}, hands it to the master's
 * replication server manager and bumps the master request metric by the number of requests
 * seen since the previous report of the same server.
 *
 * @param controller the RPC controller (not used here)
 * @param request the report, carrying the reporting server and its load
 * @return an empty response
 * @throws ServiceException wrapping any {@link IOException} raised while handling the report,
 *   e.g. by {@code master.checkServiceStarted()}
 */
@Override
public RegionServerReportResponse replicationServerReport(RpcController controller,
    RegionServerReportRequest request) throws ServiceException {
  try {
    master.checkServiceStarted();

    // Fall back to a placeholder version when the caller's version info is unavailable.
    final VersionInfo clientVersion = VersionInfoUtil.getCurrentClientVersionInfo();
    final String version;
    final int versionNumber;
    if (clientVersion == null) {
      version = "0.0.0";
      versionNumber = 0;
    } else {
      version = clientVersion.getVersion();
      versionNumber = VersionInfoUtil.getVersionNumber(clientVersion);
    }

    final ClusterStatusProtos.ServerLoad load = request.getLoad();
    final ServerName reporter = ProtobufUtil.toServerName(request.getServer());

    // Capture the previous metrics before they are overwritten by this report.
    final ServerMetrics previous =
        master.getReplicationServerManager().getServerMetrics(reporter);
    final ServerMetrics current =
        ServerMetricsBuilder.toServerMetrics(reporter, versionNumber, version, load);
    master.getReplicationServerManager().serverReport(reporter, current);

    if (load != null && master.metricsMaster != null) {
      // Up our metrics by the delta since the last report.
      final long alreadyCounted = previous != null ? previous.getRequestCount() : 0;
      master.metricsMaster.incrementRequests(load.getTotalNumberOfRequests() - alreadyCounted);
    }
  } catch (IOException e) {
    throw new ServiceException(e);
  }
  return RegionServerReportResponse.newBuilder().build();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,11 @@ public interface MasterServices extends Server {
*/
ServerManager getServerManager();

/**
* @return Master's {@link ReplicationServerManager} instance.
*/
ReplicationServerManager getReplicationServerManager();

/**
* @return Master's instance of {@link ExecutorService}
*/
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.master;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentNavigableMap;
import java.util.concurrent.ConcurrentSkipListMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.ScheduledChore;
import org.apache.hadoop.hbase.ServerMetrics;
import org.apache.hadoop.hbase.ServerName;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* The ReplicationServerManager class manages info about replication servers.
* <p>
* Maintains the map of online replication servers and their latest reported metrics.
* <p>
* Servers are distinguished in two different ways. A given server has a
* location, specified by hostname and port, and of which there can only be one
* online at any given time. A server instance is specified by the location
* (hostname and port) as well as the startcode (timestamp from when the server
* was started). This is used to differentiate a restarted instance of a given
* server from the original instance.
*/
@InterfaceAudience.Private
public class ReplicationServerManager {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it make sense to extend ServerManager, or maybe both this one and ServerManager could implement a common interface? It seems there is a common workflow for server managers.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, but we want to do this refactor after we have finished this work, because ReplicationServerManager may gain new features that ServerManager does not need. It is not yet easy to decide whether to extend ServerManager or to implement a common interface. Thanks.


private static final Logger LOG = LoggerFactory.getLogger(ReplicationServerManager.class);

/** Configuration key for the refresh interval read in {@code startChore()}. */
public static final String ONLINE_SERVER_REFRESH_INTERVAL =
"hbase.master.replication.server.refresh.interval";
/** Default refresh interval: 60 * 1000, i.e. one minute when interpreted as milliseconds. */
public static final int ONLINE_SERVER_REFRESH_INTERVAL_DEFAULT = 60 * 1000; // 1 minute

/** Master services; used for the configuration and for scheduling the refresh chore. */
private final MasterServices master;

/** Map of registered servers to their current load */
private final ConcurrentNavigableMap<ServerName, ServerMetrics> onlineServers =
new ConcurrentSkipListMap<>();

/** Chore that expires stale servers; stays null until {@code startChore()} is called. */
private OnlineServerRefresher onlineServerRefresher;
/**
 * Refresh interval loaded in {@code startChore()}. It is passed to the chore as its period and
 * is also added to a server's report timestamp to decide whether the server is stale.
 */
private int refreshPeriod;

/**
 * Creates a manager bound to the given master services. No chore is scheduled here; that
 * happens in {@code startChore()}.
 *
 * @param master the master services this manager reads configuration and the chore service
 *   from
 */
public ReplicationServerManager(final MasterServices master) {
this.master = master;
}

/**
 * Starts the chore of this ReplicationServerManager.
 * <p>
 * Loads the refresh interval from the master's configuration (key
 * {@link #ONLINE_SERVER_REFRESH_INTERVAL}, default
 * {@link #ONLINE_SERVER_REFRESH_INTERVAL_DEFAULT}), creates the online-server refresher with
 * that period and schedules it on the master's chore service.
 */
public void startChore() {
  this.refreshPeriod = master.getConfiguration()
      .getInt(ONLINE_SERVER_REFRESH_INTERVAL, ONLINE_SERVER_REFRESH_INTERVAL_DEFAULT);
  this.onlineServerRefresher =
      new OnlineServerRefresher("ReplicationServerRefresher", this.refreshPeriod);
  master.getChoreService().scheduleChore(this.onlineServerRefresher);
}

/**
 * Stops this ReplicationServerManager by cancelling the online-server refresher chore.
 * Does nothing when the chore was never started.
 */
public void stop() {
  if (onlineServerRefresher == null) {
    // startChore() was never called, so there is nothing to cancel.
    return;
  }
  onlineServerRefresher.cancel();
}

/**
 * Records a report from a replication server.
 * <p>
 * If the server is already online its metrics are replaced in place; otherwise the server is
 * treated as new and goes through {@code checkAndRecordNewServer}. A report that cannot be
 * recorded is logged and dropped.
 *
 * @param sn the reporting server
 * @param sl the metrics carried by the report
 */
public void serverReport(ServerName sn, ServerMetrics sl) {
  // replace() returns null only when sn had no mapping, i.e. the server is not yet registered.
  final boolean alreadyOnline = this.onlineServers.replace(sn, sl) != null;
  if (alreadyOnline) {
    return;
  }
  final boolean recorded = checkAndRecordNewServer(sn, sl);
  if (!recorded) {
    LOG.info("ReplicationServerReport ignored, could not record the server: {}", sn);
  }
}

/**
 * Registers a server unless a newer instance at the same host and port is already online.
 * <p>
 * Outcomes, all decided while holding the lock on {@code onlineServers}:
 * <ul>
 * <li>no server with the same host and port: the server is recorded;</li>
 * <li>the registered one has a larger start code: the report is rejected;</li>
 * <li>the registered one has a smaller start code: the new server is recorded and the
 * registered one is expired;</li>
 * <li>equal start codes: the server is recorded and nothing is expired.</li>
 * </ul>
 *
 * @param serverName the server to check and record
 * @param sl the server load on the server
 * @return true if the server is recorded, otherwise, false
 */
private boolean checkAndRecordNewServer(final ServerName serverName, final ServerMetrics sl) {
  synchronized (this.onlineServers) {
    final ServerName registered = findServerWithSameHostnamePort(serverName);
    if (registered == null) {
      recordNewServer(serverName, sl);
      return true;
    }

    final long registeredStartcode = registered.getStartcode();
    final long reportedStartcode = serverName.getStartcode();

    if (registeredStartcode > reportedStartcode) {
      LOG.info("ReplicationServer serverName={} rejected; we already have {} registered with "
          + "same hostname and port", serverName, registered);
      return false;
    }

    recordNewServer(serverName, sl);
    // Note that we assume that same ts means same server, and don't expire in that case.
    if (registeredStartcode < reportedStartcode) {
      LOG.info("Triggering server recovery; existingServer {} looks stale, new server: {}",
          registered, serverName);
      expireServer(registered);
    }
    return true;
  }
}

/**
 * Looks up an online server that shares the host and port of the given one.
 * <p>
 * Assumes onlineServers is locked.
 *
 * @param serverName the server whose hostname and port are searched for
 * @return the online ServerName with matching hostname and port, or null if there is none
 */
private ServerName findServerWithSameHostnamePort(final ServerName serverName) {
  // Probe key with the largest possible start code for this host and port.
  final ServerName upperBound =
      ServerName.valueOf(serverName.getHostname(), serverName.getPort(), Long.MAX_VALUE);
  final ServerName candidate = onlineServers.lowerKey(upperBound);

  final boolean sameAddress =
      candidate != null && ServerName.isSameAddress(candidate, serverName);
  return sameAddress ? candidate : null;
}

/**
 * Logs the registration and puts the server with its metrics into the online servers map,
 * overwriting any existing entry for the same ServerName.
 * <p>
 * Assumes onlineServers is locked.
 *
 * @param serverName the server to register
 * @param sl the metrics to store for the server
 */
private void recordNewServer(final ServerName serverName, final ServerMetrics sl) {
LOG.info("Registering ReplicationServer={}", serverName);
this.onlineServers.put(serverName, sl);
}

/**
 * Expire the passed server: log it and remove it from the map of online servers. Removing a
 * server that is not in the map is a no-op apart from the log line.
 * <p>
 * The callers inside this class invoke it while synchronized on onlineServers.
 * NOTE(review): the method is public and does not take that lock itself; confirm whether
 * outside callers are expected to hold it.
 *
 * @param serverName the server to expire
 */
public void expireServer(final ServerName serverName) {
LOG.info("Expiring ReplicationServer={}", serverName);
onlineServers.remove(serverName);
}

/**
 * Returns an unmodifiable wrapper around the internal online servers map. The wrapper is a
 * view, not a copy: servers registered or expired later are visible through it.
 *
 * @return Read-only map of servers to serverinfo
 */
public Map<ServerName, ServerMetrics> getOnlineServers() {
// Presumption is that iterating the returned Map is OK.
// The lock is only held while the wrapper is created, not while callers use it.
synchronized (this.onlineServers) {
return Collections.unmodifiableMap(this.onlineServers);
}
}

/**
 * Copies the current keys of the online servers map into a new list; later changes to the map
 * do not affect the returned list.
 *
 * @return A copy of the internal list of online servers.
 */
public List<ServerName> getOnlineServersList() {
return new ArrayList<>(this.onlineServers.keySet());
}

/**
 * Looks up the metrics currently stored for a server in the online servers map.
 *
 * @param serverName server name
 * @return ServerMetrics if serverName is known else null
 */
public ServerMetrics getServerMetrics(final ServerName serverName) {
return this.onlineServers.get(serverName);
}

/**
 * Chore that expires replication servers whose last report is older than
 * {@code refreshPeriod}.
 */
private class OnlineServerRefresher extends ScheduledChore {

  /**
   * @param name name of the chore
   * @param p period of the chore, passed through to {@link ScheduledChore}
   */
  public OnlineServerRefresher(String name, int p) {
    super(name, master, p, 60 * 1000); // delay one minute before first execute
  }

  /**
   * Walks the online servers and expires every one whose report timestamp plus
   * {@code refreshPeriod} lies before the current time.
   */
  @Override
  protected void chore() {
    synchronized (onlineServers) {
      // Read the clock once so every server is judged against the same instant.
      final long now = System.currentTimeMillis();
      // Iterate the entries directly instead of copying the keys and looking each one up
      // again: the second lookup could return null (and NPE) if the server was removed in
      // between, e.g. through the public, unsynchronized expireServer(). The concurrent
      // map's iterator tolerates the removals done by expireServer() during iteration.
      for (Map.Entry<ServerName, ServerMetrics> entry : onlineServers.entrySet()) {
        if (entry.getValue().getReportTimestamp() + refreshPeriod < now) {
          expireServer(entry.getKey());
        }
      }
    }
  }
}
}
Loading