Skip to content

Commit a0f0872

Browse files
authored
HDDS-11444. Make Datanode Command metrics consistent across all commands (#7191)
1 parent d3b63c6 commit a0f0872

16 files changed

Lines changed: 419 additions & 86 deletions

hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/CommandHandlerMetrics.java

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -34,6 +34,7 @@
3434
import static org.apache.hadoop.ozone.container.common.helpers.CommandHandlerMetrics.CommandMetricsMetricsInfo.TotalRunTimeMs;
3535
import static org.apache.hadoop.ozone.container.common.helpers.CommandHandlerMetrics.CommandMetricsMetricsInfo.QueueWaitingTaskCount;
3636
import static org.apache.hadoop.ozone.container.common.helpers.CommandHandlerMetrics.CommandMetricsMetricsInfo.InvocationCount;
37+
import static org.apache.hadoop.ozone.container.common.helpers.CommandHandlerMetrics.CommandMetricsMetricsInfo.AvgRunTimeMs;
3738
import static org.apache.hadoop.ozone.container.common.helpers.CommandHandlerMetrics.CommandMetricsMetricsInfo.ThreadPoolActivePoolSize;
3839
import static org.apache.hadoop.ozone.container.common.helpers.CommandHandlerMetrics.CommandMetricsMetricsInfo.ThreadPoolMaxPoolSize;
3940
import static org.apache.hadoop.ozone.container.common.helpers.CommandHandlerMetrics.CommandMetricsMetricsInfo.CommandReceivedCount;
@@ -46,6 +47,7 @@ public final class CommandHandlerMetrics implements MetricsSource {
4647
enum CommandMetricsMetricsInfo implements MetricsInfo {
4748
Command("The type of the SCM command"),
4849
TotalRunTimeMs("The total runtime of the command handler in milliseconds"),
50+
AvgRunTimeMs("Average run time of the command handler in milliseconds"),
4951
QueueWaitingTaskCount("The number of queued tasks waiting for execution"),
5052
InvocationCount("The number of times the command handler has been invoked"),
5153
ThreadPoolActivePoolSize("The number of active threads in the thread pool"),
@@ -108,6 +110,7 @@ public void getMetrics(MetricsCollector collector, boolean all) {
108110
commandHandler.getCommandType().name());
109111

110112
builder.addGauge(TotalRunTimeMs, commandHandler.getTotalRunTime());
113+
builder.addGauge(AvgRunTimeMs, commandHandler.getAverageRunTime());
111114
builder.addGauge(QueueWaitingTaskCount, commandHandler.getQueuedCount());
112115
builder.addGauge(InvocationCount, commandHandler.getInvocationCount());
113116
int activePoolSize = commandHandler.getThreadPoolActivePoolSize();

hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/CloseContainerCommandHandler.java

Lines changed: 9 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -31,6 +31,8 @@
3131
import org.apache.hadoop.hdds.protocol.proto
3232
.StorageContainerDatanodeProtocolProtos.CloseContainerCommandProto;
3333
import org.apache.hadoop.hdds.tracing.TracingUtil;
34+
import org.apache.hadoop.metrics2.lib.MetricsRegistry;
35+
import org.apache.hadoop.metrics2.lib.MutableRate;
3436
import org.apache.hadoop.ozone.container.common.interfaces.Container;
3537
import org.apache.hadoop.ozone.container.common.statemachine
3638
.SCMConnectionManager;
@@ -58,7 +60,7 @@ public class CloseContainerCommandHandler implements CommandHandler {
5860
private final AtomicLong invocationCount = new AtomicLong(0);
5961
private final AtomicInteger queuedCount = new AtomicInteger(0);
6062
private final ThreadPoolExecutor executor;
61-
private long totalTime;
63+
private final MutableRate opsLatencyMs;
6264

6365
/**
6466
* Constructs a close container command handler.
@@ -72,6 +74,9 @@ public CloseContainerCommandHandler(
7274
new ThreadFactoryBuilder()
7375
.setNameFormat(threadNamePrefix + "CloseContainerThread-%d")
7476
.build());
77+
MetricsRegistry registry = new MetricsRegistry(
78+
CloseContainerCommandHandler.class.getSimpleName());
79+
this.opsLatencyMs = registry.newRate(SCMCommandProto.Type.closeContainerCommand + "Ms");
7580
}
7681

7782
/**
@@ -155,7 +160,7 @@ public void handle(SCMCommand command, OzoneContainer ozoneContainer,
155160
LOG.error("Can't close container #{}", containerId, e);
156161
} finally {
157162
long endTime = Time.monotonicNow();
158-
totalTime += endTime - startTime;
163+
this.opsLatencyMs.add(endTime - startTime);
159164
}
160165
}, executor).whenComplete((v, e) -> queuedCount.decrementAndGet());
161166
}
@@ -204,15 +209,12 @@ public int getInvocationCount() {
204209
*/
205210
@Override
206211
public long getAverageRunTime() {
207-
if (invocationCount.get() > 0) {
208-
return totalTime / invocationCount.get();
209-
}
210-
return 0;
212+
return (long) this.opsLatencyMs.lastStat().mean();
211213
}
212214

213215
@Override
214216
public long getTotalRunTime() {
215-
return totalTime;
217+
return (long) this.opsLatencyMs.lastStat().total();
216218
}
217219

218220
@Override

hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/ClosePipelineCommandHandler.java

Lines changed: 9 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,8 @@
2424
import org.apache.hadoop.hdds.ratis.RatisHelper;
2525
import org.apache.hadoop.hdds.scm.client.HddsClientUtils;
2626
import org.apache.hadoop.hdds.scm.pipeline.PipelineID;
27+
import org.apache.hadoop.metrics2.lib.MetricsRegistry;
28+
import org.apache.hadoop.metrics2.lib.MutableRate;
2729
import org.apache.hadoop.ozone.container.common.statemachine
2830
.SCMConnectionManager;
2931
import org.apache.hadoop.ozone.container.common.statemachine.StateContext;
@@ -60,9 +62,9 @@ public class ClosePipelineCommandHandler implements CommandHandler {
6062

6163
private final AtomicLong invocationCount = new AtomicLong(0);
6264
private final AtomicInteger queuedCount = new AtomicInteger(0);
63-
private long totalTime;
6465
private final Executor executor;
6566
private final BiFunction<RaftPeer, GrpcTlsConfig, RaftClient> newRaftClient;
67+
private final MutableRate opsLatencyMs;
6668

6769
/**
6870
* Constructs a closePipelineCommand handler.
@@ -80,6 +82,9 @@ public ClosePipelineCommandHandler(
8082
Executor executor) {
8183
this.newRaftClient = newRaftClient;
8284
this.executor = executor;
85+
MetricsRegistry registry = new MetricsRegistry(
86+
ClosePipelineCommandHandler.class.getSimpleName());
87+
this.opsLatencyMs = registry.newRate(SCMCommandProto.Type.closePipelineCommand + "Ms");
8388
}
8489

8590
/**
@@ -155,7 +160,7 @@ public void handle(SCMCommand command, OzoneContainer ozoneContainer,
155160
}
156161
} finally {
157162
long endTime = Time.monotonicNow();
158-
totalTime += endTime - startTime;
163+
this.opsLatencyMs.add(endTime - startTime);
159164
}
160165
}, executor).whenComplete((v, e) -> queuedCount.decrementAndGet());
161166
}
@@ -187,15 +192,12 @@ public int getInvocationCount() {
187192
*/
188193
@Override
189194
public long getAverageRunTime() {
190-
if (invocationCount.get() > 0) {
191-
return totalTime / invocationCount.get();
192-
}
193-
return 0;
195+
return (long) this.opsLatencyMs.lastStat().mean();
194196
}
195197

196198
@Override
197199
public long getTotalRunTime() {
198-
return totalTime;
200+
return (long) this.opsLatencyMs.lastStat().total();
199201
}
200202

201203
@Override

hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/CreatePipelineCommandHandler.java

Lines changed: 9 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -30,6 +30,8 @@
3030
import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.SCMCommandProto;
3131
import org.apache.hadoop.hdds.ratis.RatisHelper;
3232
import org.apache.hadoop.hdds.scm.pipeline.PipelineID;
33+
import org.apache.hadoop.metrics2.lib.MetricsRegistry;
34+
import org.apache.hadoop.metrics2.lib.MutableRate;
3335
import org.apache.hadoop.ozone.container.common.statemachine.SCMConnectionManager;
3436
import org.apache.hadoop.ozone.container.common.statemachine.StateContext;
3537
import org.apache.hadoop.ozone.container.common.transport.server.XceiverServerSpi;
@@ -59,8 +61,8 @@ public class CreatePipelineCommandHandler implements CommandHandler {
5961
private final AtomicInteger queuedCount = new AtomicInteger(0);
6062
private final BiFunction<RaftPeer, GrpcTlsConfig, RaftClient> newRaftClient;
6163

62-
private long totalTime;
6364
private final Executor executor;
65+
private final MutableRate opsLatencyMs;
6466

6567
/**
6668
* Constructs a createPipelineCommand handler.
@@ -75,6 +77,9 @@ public CreatePipelineCommandHandler(ConfigurationSource conf,
7577
Executor executor) {
7678
this.newRaftClient = newRaftClient;
7779
this.executor = executor;
80+
MetricsRegistry registry = new MetricsRegistry(
81+
CreatePipelineCommandHandler.class.getSimpleName());
82+
this.opsLatencyMs = registry.newRate(SCMCommandProto.Type.createPipelineCommand + "Ms");
7883
}
7984

8085
/**
@@ -135,7 +140,7 @@ public void handle(SCMCommand command, OzoneContainer ozoneContainer,
135140
}
136141
} finally {
137142
long endTime = Time.monotonicNow();
138-
totalTime += endTime - startTime;
143+
this.opsLatencyMs.add(endTime - startTime);
139144
}
140145
}, executor).whenComplete((v, e) -> queuedCount.decrementAndGet());
141146
}
@@ -167,15 +172,12 @@ public int getInvocationCount() {
167172
*/
168173
@Override
169174
public long getAverageRunTime() {
170-
if (invocationCount.get() > 0) {
171-
return totalTime / invocationCount.get();
172-
}
173-
return 0;
175+
return (long) this.opsLatencyMs.lastStat().mean();
174176
}
175177

176178
@Override
177179
public long getTotalRunTime() {
178-
return totalTime;
180+
return (long) this.opsLatencyMs.lastStat().total();
179181
}
180182

181183
@Override

hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/DeleteBlocksCommandHandler.java

Lines changed: 9 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -32,6 +32,8 @@
3232
import org.apache.hadoop.hdds.protocol.proto
3333
.StorageContainerDatanodeProtocolProtos.DeletedBlocksTransaction;
3434
import org.apache.hadoop.hdds.utils.db.Table;
35+
import org.apache.hadoop.metrics2.lib.MetricsRegistry;
36+
import org.apache.hadoop.metrics2.lib.MutableRate;
3537
import org.apache.hadoop.ozone.container.common.helpers.BlockData;
3638
import org.apache.hadoop.ozone.container.common.helpers.BlockDeletingServiceMetrics;
3739
import org.apache.hadoop.ozone.container.common.helpers.ChunkInfoList;
@@ -91,14 +93,14 @@ public class DeleteBlocksCommandHandler implements CommandHandler {
9193
private final ContainerSet containerSet;
9294
private final ConfigurationSource conf;
9395
private int invocationCount;
94-
private long totalTime;
9596
private final ThreadPoolExecutor executor;
9697
private final LinkedBlockingQueue<DeleteCmdInfo> deleteCommandQueues;
9798
private final Daemon handlerThread;
9899
private final OzoneContainer ozoneContainer;
99100
private final BlockDeletingServiceMetrics blockDeleteMetrics;
100101
private final long tryLockTimeoutMs;
101102
private final Map<String, SchemaHandler> schemaHandlers;
103+
private final MutableRate opsLatencyMs;
102104

103105
public DeleteBlocksCommandHandler(OzoneContainer container,
104106
ConfigurationSource conf, DatanodeConfiguration dnConf,
@@ -121,6 +123,9 @@ public DeleteBlocksCommandHandler(OzoneContainer container,
121123
dnConf.getBlockDeleteThreads(), threadFactory);
122124
this.deleteCommandQueues =
123125
new LinkedBlockingQueue<>(dnConf.getBlockDeleteQueueLimit());
126+
MetricsRegistry registry = new MetricsRegistry(
127+
DeleteBlocksCommandHandler.class.getSimpleName());
128+
this.opsLatencyMs = registry.newRate(SCMCommandProto.Type.deleteBlocksCommand + "Ms");
124129
long interval = dnConf.getBlockDeleteCommandWorkerInterval().toMillis();
125130
handlerThread = new Daemon(new DeleteCmdWorker(interval));
126131
handlerThread.start();
@@ -403,7 +408,7 @@ private void processCmd(DeleteCmdInfo cmd) {
403408
};
404409
updateCommandStatus(cmd.getContext(), cmd.getCmd(), statusUpdater, LOG);
405410
long endTime = Time.monotonicNow();
406-
totalTime += endTime - startTime;
411+
this.opsLatencyMs.add(endTime - startTime);
407412
invocationCount++;
408413
}
409414
}
@@ -666,15 +671,12 @@ public int getInvocationCount() {
666671

667672
@Override
668673
public long getAverageRunTime() {
669-
if (invocationCount > 0) {
670-
return totalTime / invocationCount;
671-
}
672-
return 0;
674+
return (long) this.opsLatencyMs.lastStat().mean();
673675
}
674676

675677
@Override
676678
public long getTotalRunTime() {
677-
return totalTime;
679+
return (long) this.opsLatencyMs.lastStat().total();
678680
}
679681

680682
@Override

hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/DeleteContainerCommandHandler.java

Lines changed: 9 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,8 @@
2222
import java.util.concurrent.RejectedExecutionException;
2323
import org.apache.hadoop.hdds.protocol.proto
2424
.StorageContainerDatanodeProtocolProtos.SCMCommandProto;
25+
import org.apache.hadoop.metrics2.lib.MetricsRegistry;
26+
import org.apache.hadoop.metrics2.lib.MutableRate;
2527
import org.apache.hadoop.ozone.container.common.statemachine
2628
.SCMConnectionManager;
2729
import org.apache.hadoop.ozone.container.common.statemachine.StateContext;
@@ -39,7 +41,6 @@
3941
import java.util.concurrent.ThreadPoolExecutor;
4042
import java.util.concurrent.TimeUnit;
4143
import java.util.concurrent.atomic.AtomicInteger;
42-
import java.util.concurrent.atomic.AtomicLong;
4344

4445
/**
4546
* Handler to process the DeleteContainerCommand from SCM.
@@ -51,10 +52,10 @@ public class DeleteContainerCommandHandler implements CommandHandler {
5152

5253
private final AtomicInteger invocationCount = new AtomicInteger(0);
5354
private final AtomicInteger timeoutCount = new AtomicInteger(0);
54-
private final AtomicLong totalTime = new AtomicLong(0);
5555
private final ThreadPoolExecutor executor;
5656
private final Clock clock;
5757
private int maxQueueSize;
58+
private final MutableRate opsLatencyMs;
5859

5960
public DeleteContainerCommandHandler(
6061
int threadPoolSize, Clock clock, int queueSize, String threadNamePrefix) {
@@ -73,6 +74,9 @@ protected DeleteContainerCommandHandler(Clock clock,
7374
this.executor = executor;
7475
this.clock = clock;
7576
maxQueueSize = queueSize;
77+
MetricsRegistry registry = new MetricsRegistry(
78+
DeleteContainerCommandHandler.class.getSimpleName());
79+
this.opsLatencyMs = registry.newRate(SCMCommandProto.Type.deleteContainerCommand + "Ms");
7680
}
7781
@Override
7882
public void handle(final SCMCommand command,
@@ -124,7 +128,7 @@ private void handleInternal(SCMCommand command, StateContext context,
124128
} catch (IOException e) {
125129
LOG.error("Exception occurred while deleting the container.", e);
126130
} finally {
127-
totalTime.getAndAdd(Time.monotonicNow() - startTime);
131+
this.opsLatencyMs.add(Time.monotonicNow() - startTime);
128132
}
129133
}
130134

@@ -149,14 +153,12 @@ public int getTimeoutCount() {
149153

150154
@Override
151155
public long getAverageRunTime() {
152-
final int invocations = invocationCount.get();
153-
return invocations == 0 ?
154-
0 : totalTime.get() / invocations;
156+
return (long) this.opsLatencyMs.lastStat().mean();
155157
}
156158

157159
@Override
158160
public long getTotalRunTime() {
159-
return totalTime.get();
161+
return (long) this.opsLatencyMs.lastStat().total();
160162
}
161163

162164
@Override

hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/FinalizeNewLayoutVersionCommandHandler.java

Lines changed: 9 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,8 @@
2020
import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.FinalizeNewLayoutVersionCommandProto;
2121
import org.apache.hadoop.hdds.protocol.proto
2222
.StorageContainerDatanodeProtocolProtos.SCMCommandProto;
23+
import org.apache.hadoop.metrics2.lib.MetricsRegistry;
24+
import org.apache.hadoop.metrics2.lib.MutableRate;
2325
import org.apache.hadoop.ozone.container.common.statemachine.DatanodeStateMachine;
2426
import org.apache.hadoop.ozone.container.common.statemachine
2527
.SCMConnectionManager;
@@ -42,12 +44,15 @@ public class FinalizeNewLayoutVersionCommandHandler implements CommandHandler {
4244
LoggerFactory.getLogger(FinalizeNewLayoutVersionCommandHandler.class);
4345

4446
private AtomicLong invocationCount = new AtomicLong(0);
45-
private long totalTime;
47+
private final MutableRate opsLatencyMs;
4648

4749
/**
4850
* Constructs a FinalizeNewLayoutVersionCommandHandler.
4951
*/
5052
public FinalizeNewLayoutVersionCommandHandler() {
53+
MetricsRegistry registry = new MetricsRegistry(
54+
FinalizeNewLayoutVersionCommandHandler.class.getSimpleName());
55+
this.opsLatencyMs = registry.newRate(SCMCommandProto.Type.finalizeNewLayoutVersionCommand + "Ms");
5156
}
5257

5358
/**
@@ -82,7 +87,7 @@ public void handle(SCMCommand command, OzoneContainer ozoneContainer,
8287
LOG.error("Exception during finalization.", e);
8388
} finally {
8489
long endTime = Time.monotonicNow();
85-
totalTime += endTime - startTime;
90+
this.opsLatencyMs.add(endTime - startTime);
8691
}
8792
}
8893

@@ -113,15 +118,12 @@ public int getInvocationCount() {
113118
*/
114119
@Override
115120
public long getAverageRunTime() {
116-
if (invocationCount.get() > 0) {
117-
return totalTime / invocationCount.get();
118-
}
119-
return 0;
121+
return (long) this.opsLatencyMs.lastStat().mean();
120122
}
121123

122124
@Override
123125
public long getTotalRunTime() {
124-
return totalTime;
126+
return (long) this.opsLatencyMs.lastStat().total();
125127
}
126128

127129
@Override

0 commit comments

Comments
 (0)