Description
Hi, I recently benchmarked inference of a pretrained ResNet50 model on an AWS EC2 P3.2xlarge instance. The CPU benchmark results are pretty close to Python, but there is a regression on GPU:
|     | TF Java 0.2.0 | Python (TF 2.3.1) |
|-----|---------------|-------------------|
| P50 | 4.76 ms       | 3.24 ms           |
| P90 | 6.47 ms       | 4.59 ms           |
I am not sure why the CPU results are very close while the GPU results are noticeably apart (about a 20% difference).
System information
- OS Platform and Distribution (e.g., Linux Ubuntu 16.04): AWS DL AMI (Ubuntu 18.04 based)
- CUDA/cuDNN version: CUDA 10.1
- GPU model and memory: Tesla V100 16GB
Steps to reproduce
You can take the pretrained Keras ResNet50 model and export it to the SavedModel format; see the export sketch below.
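For reference, a minimal export sketch, assuming the stock `tf.keras.applications.ResNet50` ImageNet weights and the same `model_path` directory that the benchmark code below loads:

```python
import tensorflow as tf

# Pretrained ResNet50 from Keras applications (ImageNet weights).
model = tf.keras.applications.ResNet50(weights="imagenet")

# Export to the SavedModel format; "model_path" is the directory
# the Java and Python benchmarks below load with the "serve" tag.
model.save("model_path", save_format="tf")
```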
Java

```java
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import org.tensorflow.SavedModelBundle;
import org.tensorflow.Session;
import org.tensorflow.Tensor;
import org.tensorflow.ndarray.Shape;
import org.tensorflow.types.TFloat32;

public class Example {

    public static void main(String[] args) {
        int ITERATION = 1000;
        String dir = "model_path";
        SavedModelBundle.Loader loader = SavedModelBundle.loader(dir).withTags("serve");
        SavedModelBundle bundle = loader.load();
        Session session = bundle.session();
        List<Long> timeCollector = new ArrayList<>();
        for (int i = 0; i < ITERATION; i++) {
            long start = System.nanoTime();
            forward(session);
            timeCollector.add(System.nanoTime() - start);
        }
        Collections.sort(timeCollector);
        System.out.println("P50: " + percentile(timeCollector, 50) + "ms");
        System.out.println("P90: " + percentile(timeCollector, 90) + "ms");
        System.out.println("P99: " + percentile(timeCollector, 99) + "ms");
    }

    public static double percentile(List<Long> times, int percentile) {
        int index = times.size() * percentile / 100;
        // nanoseconds -> milliseconds
        return times.get(index) / 1_000_000f;
    }

    public static void forward(Session session) {
        Session.Runner runner = session.runner();
        // zero-filled input tensor of shape (1, 224, 224, 3)
        try (Tensor<?> tensor = Tensor.of(TFloat32.DTYPE, Shape.of(1, 224, 224, 3))) {
            runner.feed("serving_default_input_1:0", tensor);
            runner.fetch("StatefulPartitionedCall:0");
            List<Tensor<?>> result = runner.run();
            // close the fetched tensors to release native memory
            result.forEach(Tensor::close);
        }
    }
}
```
Python

```python
import sys
import time

import numpy as np
import tensorflow as tf

if __name__ == "__main__":
    if len(sys.argv) != 4:
        print("usage: python3 benchmark.py <model_name> <model_dir> <num_iterations>")
        exit(1)
    model_name = sys.argv[1]
    model_path = sys.argv[2]
    iterations = int(sys.argv[3])
    print("#############################################")
    print("start testing Model: " + model_name)
    begin = time.time()
    # load model
    model = tf.saved_model.load(model_path)
    latencies = []
    for _ in range(iterations):
        inputs = tf.zeros((1, 224, 224, 3))
        start = time.time()
        result = model(inputs)
        # convert seconds to milliseconds
        latencies.append((time.time() - start) * 1000)
        result.numpy()
    elapsed = (time.time() - begin) * 1000
    throughput = iterations / elapsed * 1000
    p50 = np.percentile(latencies, 50)
    p90 = np.percentile(latencies, 90)
    p99 = np.percentile(latencies, 99)
    print("Model: {}".format(model_name))
    print("Iterations: {:d}".format(iterations))
    print("Throughput: {:.2f}".format(throughput))
    print("Elapsed: {:.3f} ms.".format(elapsed))
    print("P50: {:.3f} ms".format(p50))
    print("P90: {:.3f} ms".format(p90))
    print("P99: {:.3f} ms".format(p99))
```