
Commit 16ff6cc

Author: Aleksandr Malyshev (committed)

Merge branch 'cl/fix-navi-build' of https://github.com/ROCm/vllm into
cl/fix-navi-build pulling latest and greatest from main

2 parents (415ebac + 67dab67), commit 16ff6cc

15 files changed: +464 -554 lines

15 files changed

+464
-554
lines changed

benchmarks/benchmark_latency.py

Lines changed: 32 additions & 21 deletions
@@ -19,27 +19,30 @@ def main(args: argparse.Namespace):
 
     # NOTE(woosuk): If the request cannot be processed in a single batch,
     # the engine will automatically process the request in multiple batches.
-    llm = LLM(model=args.model,
-              speculative_model=args.speculative_model,
-              num_speculative_tokens=args.num_speculative_tokens,
-              tokenizer=args.tokenizer,
-              quantization=args.quantization,
-              quantized_weights_path=args.quantized_weights_path,
-              tensor_parallel_size=args.tensor_parallel_size,
-              trust_remote_code=args.trust_remote_code,
-              dtype=args.dtype,
-              enforce_eager=args.enforce_eager,
-              kv_cache_dtype=args.kv_cache_dtype,
-              quantization_param_path=args.quantization_param_path,
-              device=args.device,
-              ray_workers_use_nsight=args.ray_workers_use_nsight,
-              worker_use_ray=args.worker_use_ray,
-              use_v2_block_manager=args.use_v2_block_manager,
-              enable_chunked_prefill=args.enable_chunked_prefill,
-              download_dir=args.download_dir,
-              block_size=args.block_size,
-              disable_custom_all_reduce=args.disable_custom_all_reduce,
-              gpu_memory_utilization=args.gpu_memory_utilization)
+    llm = LLM(
+        model=args.model,
+        speculative_model=args.speculative_model,
+        num_speculative_tokens=args.num_speculative_tokens,
+        tokenizer=args.tokenizer,
+        quantization=args.quantization,
+        quantized_weights_path=args.quantized_weights_path,
+        tensor_parallel_size=args.tensor_parallel_size,
+        trust_remote_code=args.trust_remote_code,
+        dtype=args.dtype,
+        enforce_eager=args.enforce_eager,
+        kv_cache_dtype=args.kv_cache_dtype,
+        quantization_param_path=args.quantization_param_path,
+        device=args.device,
+        ray_workers_use_nsight=args.ray_workers_use_nsight,
+        worker_use_ray=args.worker_use_ray,
+        use_v2_block_manager=args.use_v2_block_manager,
+        enable_chunked_prefill=args.enable_chunked_prefill,
+        download_dir=args.download_dir,
+        block_size=args.block_size,
+        disable_custom_all_reduce=args.disable_custom_all_reduce,
+        gpu_memory_utilization=args.gpu_memory_utilization,
+        distributed_executor_backend=args.distributed_executor_backend,
+    )
 
     sampling_params = SamplingParams(
         n=args.n,

@@ -237,5 +240,13 @@ def run_to_completion(profile_dir: Optional[str] = None):
                         help='the fraction of GPU memory to be used for '
                         'the model executor, which can range from 0 to 1.'
                         'If unspecified, will use the default value of 0.9.')
+    parser.add_argument(
+        '--distributed-executor-backend',
+        choices=['ray', 'mp', 'torchrun'],
+        default=None,
+        help='Backend to use for distributed serving. When more than 1 GPU '
+        'is used, on CUDA this will be automatically set to "ray" if '
+        'installed or "mp" (multiprocessing) otherwise. On ROCm, this is '
+        'instead set to torchrun by default.')
     args = parser.parse_args()
     main(args)
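
Note: the help text above describes platform-dependent defaults. Below is a minimal sketch of that selection logic, for illustration only; the real resolution happens inside vLLM's engine configuration, and the helper function here is hypothetical.

# Illustration only: sketch of the default-selection behaviour described in
# the --distributed-executor-backend help text above.
import importlib.util
from typing import Optional

import torch


def resolve_executor_backend(requested: Optional[str],
                             tensor_parallel_size: int) -> Optional[str]:
    if requested is not None or tensor_parallel_size <= 1:
        # An explicit choice wins; a single GPU needs no distributed executor.
        return requested
    if torch.version.hip is not None:
        # ROCm builds default to torchrun.
        return "torchrun"
    # CUDA builds prefer Ray when it is installed, otherwise multiprocessing.
    return "ray" if importlib.util.find_spec("ray") is not None else "mp"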

benchmarks/benchmark_throughput.py

Lines changed: 13 additions & 2 deletions
@@ -79,6 +79,7 @@ def run_vllm(
     enable_prefix_caching: bool,
     enable_chunked_prefill: bool,
     max_num_batched_tokens: int,
+    distributed_executor_backend: Optional[str],
     gpu_memory_utilization: float = 0.9,
     worker_use_ray: bool = False,
     download_dir: Optional[str] = None,

@@ -104,6 +105,7 @@ def run_vllm(
         download_dir=download_dir,
         enable_chunked_prefill=enable_chunked_prefill,
         max_num_batched_tokens=max_num_batched_tokens,
+        distributed_executor_backend=distributed_executor_backend,
     )
 
     # Add the requests to the engine.

@@ -229,8 +231,9 @@ def main(args: argparse.Namespace):
             args.max_model_len, args.enforce_eager, args.kv_cache_dtype,
             args.quantization_param_path, args.device,
             args.enable_prefix_caching, args.enable_chunked_prefill,
-            args.max_num_batched_tokens, args.gpu_memory_utilization,
-            args.worker_use_ray, args.download_dir)
+            args.max_num_batched_tokens, args.distributed_executor_backend,
+            args.gpu_memory_utilization, args.worker_use_ray,
+            args.download_dir)
     elif args.backend == "hf":
         assert args.tensor_parallel_size == 1
         elapsed_time = run_hf(requests, args.model, tokenizer, args.n,

@@ -384,6 +387,14 @@ def main(args: argparse.Namespace):
         type=str,
         default=None,
         help='Path to save the throughput results in JSON format.')
+    parser.add_argument(
+        '--distributed-executor-backend',
+        choices=['ray', 'mp', 'torchrun'],
+        default=None,
+        help='Backend to use for distributed serving. When more than 1 GPU '
+        'is used, on CUDA this will be automatically set to "ray" if '
+        'installed or "mp" (multiprocessing) otherwise. On ROCm, this is '
+        'instead set to torchrun by default.')
     args = parser.parse_args()
     if args.tokenizer is None:
         args.tokenizer = args.model
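
Note: the new argument is threaded through run_vllm into the LLM constructor, just as in benchmark_latency.py above. A minimal, hypothetical usage sketch follows; the model name and backend value are placeholders, and only parameters that appear in this diff are used.

from vllm import LLM

# Hypothetical example: force the multiprocessing executor for a 2-GPU run.
llm = LLM(
    model="facebook/opt-125m",            # placeholder model
    tensor_parallel_size=2,               # backend only matters with >1 GPU
    distributed_executor_backend="mp",    # or "ray" / "torchrun" / None
)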

csrc/ops.h

Lines changed: 3 additions & 7 deletions
@@ -117,13 +117,9 @@ void convert_fp8(torch::Tensor& dst_data, torch::Tensor& src_data,
                  torch::Tensor& scale);
 
 #ifdef USE_ROCM
-torch::Tensor fp8_gemm(torch::Tensor& a, torch::Tensor& b,
-                       torch::Tensor& scaleA, torch::Tensor& scaleB,
-                       torch::Tensor& scaleD, int algo_idx);
-
-torch::Tensor fp8_gemm_16(torch::Tensor& a, torch::Tensor& b,
-                          torch::Tensor& scaleA, torch::Tensor& scaleB,
-                          int algo_idx);
+void fp8_mm(torch::Tensor& a, torch::Tensor& b, torch::Tensor& result,
+            torch::Tensor& scale_a, torch::Tensor& scale_b,
+            const c10::optional<torch::Tensor>& scale_result, int64_t algo_idx);
 
 void create_workspace();
 #endif

csrc/pybind.cpp

Lines changed: 1 addition & 2 deletions
@@ -70,8 +70,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
           "Convert the key and value cache to fp8 data type");
 
 #ifdef USE_ROCM
-  ops.def("fp8_gemm", &fp8_gemm, "fp8 GEMM with fp8 output");
-  ops.def("fp8_gemm_16", &fp8_gemm_16, "fp8 GEMM with fp16 output");
+  ops.def("fp8_mm", &fp8_mm, "fp8 GEMM with fp8 fp16 bf16 output type");
   ops.def("create_workspace", &create_workspace,
           "Create workspace for fp8 GEMM");
 #endif
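
Note: the two ROCm-only bindings are collapsed into a single fp8_mm op that writes into a caller-provided result tensor, so the output type (fp8, fp16, or bf16, per the binding description) is presumably selected by the dtype of that buffer. A rough call sketch follows; the Python module path, dtypes, and shapes are assumptions for illustration, not part of this commit.

import torch

from vllm import _C  # compiled ROCm extension; exact import path is an assumption

m, k, n = 128, 256, 64
a = torch.randn(m, k, device="cuda").to(torch.float8_e4m3fnuz)
b = torch.randn(k, n, device="cuda").to(torch.float8_e4m3fnuz)
scale_a = torch.ones(1, device="cuda")
scale_b = torch.ones(1, device="cuda")

_C.ops.create_workspace()  # workspace for fp8 GEMM (see binding above)

# The output dtype follows the pre-allocated result buffer: fp16 here,
# bf16 or fp8 would work the same way.
result = torch.empty(m, n, dtype=torch.float16, device="cuda")
_C.ops.fp8_mm(a, b, result, scale_a, scale_b, None, 0)  # algo_idx=0, no result scale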
