Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 20 additions & 7 deletions python/sglang/bench_one_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@
from sglang.srt.server_args import PortArgs, ServerArgs
from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
from sglang.srt.utils import (
DeepEPMode,
configure_logger,
get_bool_env_var,
kill_process_tree,
Expand Down Expand Up @@ -275,6 +276,9 @@ def _maybe_prepare_dp_attn_batch(batch: ScheduleBatch, model_runner):
disable_cuda_graph=model_runner.server_args.disable_cuda_graph,
spec_algorithm=SpeculativeAlgorithm.NONE,
speculative_num_draft_tokens=None,
enable_two_batch_overlap=model_runner.server_args.enable_two_batch_overlap,
enable_deepep_moe=model_runner.server_args.enable_deepep_moe,
deepep_mode=DeepEPMode[model_runner.server_args.deepep_mode],
Comment on lines +279 to +281
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

These three arguments (`enable_two_batch_overlap`, `enable_deepep_moe`, `deepep_mode`) forward the corresponding `server_args` settings into `prepare_dp_attn_batch_raw`, so the benchmark's DP-attention batch preparation matches the server's actual two-batch-overlap / DeepEP MoE configuration instead of silently using defaults.

)


Expand Down Expand Up @@ -339,6 +343,7 @@ def latency_test_run_once(
log_decode_step,
profile,
profile_filename_prefix,
dp_size,
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Passing `dp_size` into `latency_test_run_once` lets the reported batch size and throughput account for all data-parallel replicas, not just the single rank running this function.

):
max_batch_size = model_runner.max_total_num_tokens // (input_len + output_len)
if batch_size > max_batch_size:
Expand All @@ -353,7 +358,7 @@ def latency_test_run_once(

measurement_results = {
"run_name": run_name,
"batch_size": batch_size,
"batch_size": batch_size * dp_size,
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Multiplying `batch_size` by `dp_size` records the global batch size (across all DP ranks) in the results, since `batch_size` here is the per-rank sub-batch after the division in `main`.

"input_len": input_len,
"output_len": output_len,
}
Expand All @@ -378,7 +383,7 @@ def latency_test_run_once(
synchronize(device)
prefill_latency = time.perf_counter() - tic
tot_latency += prefill_latency
throughput = input_len * batch_size / prefill_latency
throughput = input_len * batch_size * dp_size / prefill_latency
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The prefill throughput now counts tokens processed by all `dp_size` replicas, giving the aggregate token/s rather than a single rank's share.

rank_print(
f"Prefill. latency: {prefill_latency:6.5f} s, throughput: {throughput:9.2f} token/s"
)
Expand All @@ -394,16 +399,16 @@ def latency_test_run_once(
synchronize(device)
latency = time.perf_counter() - tic
tot_latency += latency
throughput = batch_size / latency
throughput = batch_size * dp_size / latency
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The throughput calculation here correctly accounts for the dp_size during the decode step.

decode_latencies.append(latency)
if i < 5 or (log_decode_step > 0 and i % log_decode_step == 0):
rank_print(
f"Decode {i}. Batch size: {batch_size}, latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
f"Decode {i}. Batch size: {batch_size * dp_size}, latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
)

if profile:
profiler.stop()
profile_filename = f"{profile_filename_prefix}_batch{batch_size}_input{input_len}_output{output_len}.trace.json.gz"
profile_filename = f"{profile_filename_prefix}_batch{batch_size * dp_size}_input{input_len}_output{output_len}.trace.json.gz"
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The profiling filename now includes the dp_size in the batch size, which is helpful for distinguishing profiling results in DP attention scenarios.

parent_dir = os.path.dirname(os.path.abspath(profile_filename))
os.makedirs(parent_dir, exist_ok=True)
profiler.export_chrome_trace(profile_filename)
Expand All @@ -412,14 +417,14 @@ def latency_test_run_once(
# Record decode timing from 2nd output
if output_len > 1:
med_decode_latency = np.median(decode_latencies)
med_decode_throughput = batch_size / med_decode_latency
med_decode_throughput = batch_size * dp_size / med_decode_latency
rank_print(
f"Decode. median latency: {med_decode_latency:6.5f} s, median throughput: {med_decode_throughput:9.2f} token/s"
)
measurement_results["median_decode_latency"] = med_decode_latency
measurement_results["median_decode_throughput"] = med_decode_throughput

throughput = (input_len + output_len) * batch_size / tot_latency
throughput = (input_len + output_len) * batch_size * dp_size / tot_latency
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The end-to-end (prefill + decode) throughput now also scales by `dp_size`, keeping it consistent with the per-phase numbers above.

rank_print(
f"Total. latency: {tot_latency:6.3f} s, throughput: {throughput:9.2f} token/s"
)
Expand Down Expand Up @@ -464,6 +469,7 @@ def latency_test(
log_decode_step=0,
profile=False,
profile_filename_prefix="", # not used
dp_size=1 if not server_args.enable_dp_attention else server_args.dp_size,
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Passing dp_size to latency_test_run_once ensures that the warmup phase also uses the correct batch size.

)

rank_print("Benchmark ...")
Expand All @@ -486,6 +492,7 @@ def latency_test(
bench_args.log_decode_step,
bench_args.profile if tp_rank == 0 else None,
bench_args.profile_filename_prefix,
1 if not server_args.enable_dp_attention else server_args.dp_size,
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Passing dp_size to latency_test_run_once ensures that the benchmark phase also uses the correct batch size.

)
if ret is not None:
result_list.append(ret)
Expand All @@ -502,6 +509,12 @@ def latency_test(

def main(server_args, bench_args):
server_args.cuda_graph_max_bs = max(bench_args.batch_size)
if server_args.enable_dp_attention:
sub_batch_size = []
for i in range(len(bench_args.batch_size)):
assert bench_args.batch_size[i] % server_args.dp_size == 0
sub_batch_size.append(bench_args.batch_size[i] // server_args.dp_size)
bench_args.batch_size = tuple(sub_batch_size)
Comment on lines 511 to +517
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Dividing each requested batch size by `dp_size` when DP attention is enabled keeps the *global* batch size equal to what the user asked for, so TP vs. TP+DP runs are comparable. Two follow-ups worth confirming: (1) the `assert bench_args.batch_size[i] % server_args.dp_size == 0` is stripped under `python -O` — raising a `ValueError` with a clear message would be more robust for a user-facing check; (2) `server_args.cuda_graph_max_bs` is set from the global batch size *before* the per-rank division — verify whether it should instead use the per-rank sub-batch size to avoid capturing oversized CUDA graphs.


_set_envs_and_config(server_args)

Expand Down