
Commit 16ff6cc

Author: Aleksandr Malyshev (committed)

Merge branch 'cl/fix-navi-build' of https://github.com/ROCm/vllm into
cl/fix-navi-build pulling latest and greatest from main

2 parents (415ebac + 67dab67), commit 16ff6cc

15 files changed: +464 -554 lines

15 files changed

+464
-554
lines changed

benchmarks/benchmark_latency.py

Lines changed: 32 additions & 21 deletions
@@ -19,27 +19,30 @@ def main(args: argparse.Namespace):
 
     # NOTE(woosuk): If the request cannot be processed in a single batch,
     # the engine will automatically process the request in multiple batches.
-    llm = LLM(model=args.model,
-              speculative_model=args.speculative_model,
-              num_speculative_tokens=args.num_speculative_tokens,
-              tokenizer=args.tokenizer,
-              quantization=args.quantization,
-              quantized_weights_path=args.quantized_weights_path,
-              tensor_parallel_size=args.tensor_parallel_size,
-              trust_remote_code=args.trust_remote_code,
-              dtype=args.dtype,
-              enforce_eager=args.enforce_eager,
-              kv_cache_dtype=args.kv_cache_dtype,
-              quantization_param_path=args.quantization_param_path,
-              device=args.device,
-              ray_workers_use_nsight=args.ray_workers_use_nsight,
-              worker_use_ray=args.worker_use_ray,
-              use_v2_block_manager=args.use_v2_block_manager,
-              enable_chunked_prefill=args.enable_chunked_prefill,
-              download_dir=args.download_dir,
-              block_size=args.block_size,
-              disable_custom_all_reduce=args.disable_custom_all_reduce,
-              gpu_memory_utilization=args.gpu_memory_utilization)
+    llm = LLM(
+        model=args.model,
+        speculative_model=args.speculative_model,
+        num_speculative_tokens=args.num_speculative_tokens,
+        tokenizer=args.tokenizer,
+        quantization=args.quantization,
+        quantized_weights_path=args.quantized_weights_path,
+        tensor_parallel_size=args.tensor_parallel_size,
+        trust_remote_code=args.trust_remote_code,
+        dtype=args.dtype,
+        enforce_eager=args.enforce_eager,
+        kv_cache_dtype=args.kv_cache_dtype,
+        quantization_param_path=args.quantization_param_path,
+        device=args.device,
+        ray_workers_use_nsight=args.ray_workers_use_nsight,
+        worker_use_ray=args.worker_use_ray,
+        use_v2_block_manager=args.use_v2_block_manager,
+        enable_chunked_prefill=args.enable_chunked_prefill,
+        download_dir=args.download_dir,
+        block_size=args.block_size,
+        disable_custom_all_reduce=args.disable_custom_all_reduce,
+        gpu_memory_utilization=args.gpu_memory_utilization,
+        distributed_executor_backend=args.distributed_executor_backend,
+    )
 
     sampling_params = SamplingParams(
         n=args.n,

@@ -237,5 +240,13 @@ def run_to_completion(profile_dir: Optional[str] = None):
                         help='the fraction of GPU memory to be used for '
                         'the model executor, which can range from 0 to 1.'
                         'If unspecified, will use the default value of 0.9.')
+    parser.add_argument(
+        '--distributed-executor-backend',
+        choices=['ray', 'mp', 'torchrun'],
+        default=None,
+        help='Backend to use for distributed serving. When more than 1 GPU '
+        'is used, on CUDA this will be automatically set to "ray" if '
+        'installed or "mp" (multiprocessing) otherwise. On ROCm, this is '
+        'instead set to torchrun by default.')
     args = parser.parse_args()
     main(args)
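
Note: the help text above describes platform-dependent defaults. Below is a minimal sketch of that selection logic, for illustration only; the real resolution happens inside vLLM's engine configuration, and the helper function here is hypothetical.

# Illustration only: sketch of the default-selection behaviour described in
# the --distributed-executor-backend help text above.
import importlib.util
from typing import Optional

import torch


def resolve_executor_backend(requested: Optional[str],
                             tensor_parallel_size: int) -> Optional[str]:
    if requested is not None or tensor_parallel_size <= 1:
        # An explicit choice wins; a single GPU needs no distributed executor.
        return requested
    if torch.version.hip is not None:
        # ROCm builds default to torchrun.
        return "torchrun"
    # CUDA builds prefer Ray when it is installed, otherwise multiprocessing.
    return "ray" if importlib.util.find_spec("ray") is not None else "mp"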

benchmarks/benchmark_throughput.py

Lines changed: 13 additions & 2 deletions
@@ -79,6 +79,7 @@ def run_vllm(
     enable_prefix_caching: bool,
     enable_chunked_prefill: bool,
     max_num_batched_tokens: int,
+    distributed_executor_backend: Optional[str],
     gpu_memory_utilization: float = 0.9,
     worker_use_ray: bool = False,
     download_dir: Optional[str] = None,

@@ -104,6 +105,7 @@ def run_vllm(
         download_dir=download_dir,
         enable_chunked_prefill=enable_chunked_prefill,
         max_num_batched_tokens=max_num_batched_tokens,
+        distributed_executor_backend=distributed_executor_backend,
     )
 
     # Add the requests to the engine.

@@ -229,8 +231,9 @@ def main(args: argparse.Namespace):
             args.max_model_len, args.enforce_eager, args.kv_cache_dtype,
             args.quantization_param_path, args.device,
             args.enable_prefix_caching, args.enable_chunked_prefill,
-            args.max_num_batched_tokens, args.gpu_memory_utilization,
-            args.worker_use_ray, args.download_dir)
+            args.max_num_batched_tokens, args.distributed_executor_backend,
+            args.gpu_memory_utilization, args.worker_use_ray,
+            args.download_dir)
     elif args.backend == "hf":
         assert args.tensor_parallel_size == 1
         elapsed_time = run_hf(requests, args.model, tokenizer, args.n,

@@ -384,6 +387,14 @@ def main(args: argparse.Namespace):
         type=str,
         default=None,
         help='Path to save the throughput results in JSON format.')
+    parser.add_argument(
+        '--distributed-executor-backend',
+        choices=['ray', 'mp', 'torchrun'],
+        default=None,
+        help='Backend to use for distributed serving. When more than 1 GPU '
+        'is used, on CUDA this will be automatically set to "ray" if '
+        'installed or "mp" (multiprocessing) otherwise. On ROCm, this is '
+        'instead set to torchrun by default.')
     args = parser.parse_args()
     if args.tokenizer is None:
         args.tokenizer = args.model
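
Note: the new argument is threaded through run_vllm into the LLM constructor, just as in benchmark_latency.py above. A minimal, hypothetical usage sketch follows; the model name and backend value are placeholders, and only parameters that appear in this diff are used.

from vllm import LLM

# Hypothetical example: force the multiprocessing executor for a 2-GPU run.
llm = LLM(
    model="facebook/opt-125m",            # placeholder model
    tensor_parallel_size=2,               # backend only matters with >1 GPU
    distributed_executor_backend="mp",    # or "ray" / "torchrun" / None
)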

csrc/ops.h

Lines changed: 3 additions & 7 deletions
@@ -117,13 +117,9 @@ void convert_fp8(torch::Tensor& dst_data, torch::Tensor& src_data,
                  torch::Tensor& scale);
 
 #ifdef USE_ROCM
-torch::Tensor fp8_gemm(torch::Tensor& a, torch::Tensor& b,
-                       torch::Tensor& scaleA, torch::Tensor& scaleB,
-                       torch::Tensor& scaleD, int algo_idx);
-
-torch::Tensor fp8_gemm_16(torch::Tensor& a, torch::Tensor& b,
-                          torch::Tensor& scaleA, torch::Tensor& scaleB,
-                          int algo_idx);
+void fp8_mm(torch::Tensor& a, torch::Tensor& b, torch::Tensor& result,
+            torch::Tensor& scale_a, torch::Tensor& scale_b,
+            const c10::optional<torch::Tensor>& scale_result, int64_t algo_idx);
 
 void create_workspace();
 #endif

csrc/pybind.cpp

Lines changed: 1 addition & 2 deletions
@@ -70,8 +70,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
           "Convert the key and value cache to fp8 data type");
 
 #ifdef USE_ROCM
-  ops.def("fp8_gemm", &fp8_gemm, "fp8 GEMM with fp8 output");
-  ops.def("fp8_gemm_16", &fp8_gemm_16, "fp8 GEMM with fp16 output");
+  ops.def("fp8_mm", &fp8_mm, "fp8 GEMM with fp8 fp16 bf16 output type");
   ops.def("create_workspace", &create_workspace,
           "Create workspace for fp8 GEMM");
 #endif
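
Note: the two ROCm-only bindings are collapsed into a single fp8_mm op that writes into a caller-provided result tensor, so the output type (fp8, fp16, or bf16, per the binding description) is presumably selected by the dtype of that buffer. A rough call sketch follows; the Python module path, dtypes, and shapes are assumptions for illustration, not part of this commit.

import torch

from vllm import _C  # compiled ROCm extension; exact import path is an assumption

m, k, n = 128, 256, 64
a = torch.randn(m, k, device="cuda").to(torch.float8_e4m3fnuz)
b = torch.randn(k, n, device="cuda").to(torch.float8_e4m3fnuz)
scale_a = torch.ones(1, device="cuda")
scale_b = torch.ones(1, device="cuda")

_C.ops.create_workspace()  # workspace for fp8 GEMM (see binding above)

# The output dtype follows the pre-allocated result buffer: fp16 here,
# bf16 or fp8 would work the same way.
result = torch.empty(m, n, dtype=torch.float16, device="cuda")
_C.ops.fp8_mm(a, b, result, scale_a, scale_b, None, 0)  # algo_idx=0, no result scale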
