Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
4b53bc9
upd
BBuf Mar 30, 2025
953a000
upd
BBuf Mar 30, 2025
500e3e2
Merge branch 'main' into support_r1_shared_expers_fusion
zhyncs Mar 30, 2025
5dac1c2
upd
BBuf Mar 31, 2025
5cff889
Merge branch 'support_r1_shared_expers_fusion' of github.com:sgl-proj…
BBuf Mar 31, 2025
e771349
fix acc bug
BBuf Mar 31, 2025
c69675a
upd
BBuf Mar 31, 2025
4180d63
fix circular import
BBuf Mar 31, 2025
128480e
upd
BBuf Mar 31, 2025
3d223ba
upd
BBuf Apr 1, 2025
3fd2706
upd
BBuf Apr 1, 2025
6c33e52
upd
BBuf Apr 1, 2025
5d892f8
Merge branch 'main' into support_r1_shared_expers_fusion
zhyncs Apr 1, 2025
4bf262a
upd
BBuf Apr 1, 2025
42678e1
upd
BBuf Apr 1, 2025
3bfa90a
upd
BBuf Apr 1, 2025
e3c5c3d
fix amd
BBuf Apr 1, 2025
2a5af12
fix ci
BBuf Apr 1, 2025
2f185f9
refine
BBuf Apr 1, 2025
3bb4fc1
Merge branch 'main' into support_r1_shared_expers_fusion
BBuf Apr 2, 2025
99abc77
refine
BBuf Apr 2, 2025
f8c8c70
refine
BBuf Apr 2, 2025
2a4bc93
ud
BBuf Apr 2, 2025
9c797af
ud
BBuf Apr 2, 2025
0261301
ud
BBuf Apr 2, 2025
cd3782d
upd
BBuf Apr 2, 2025
3d8a840
refine
BBuf Apr 2, 2025
2419300
upd
BBuf Apr 2, 2025
90ee3f8
Merge branch 'main' into support_r1_shared_expers_fusion
BBuf Apr 2, 2025
1df1bee
Merge branch 'main' into support_r1_shared_expers_fusion
BBuf Apr 3, 2025
40dc2c6
fix ci
BBuf Apr 3, 2025
fb5b17a
lint
BBuf Apr 3, 2025
943e986
add warmup for bench_serving tools
BBuf Apr 3, 2025
dab29e2
refine
BBuf Apr 3, 2025
1bba429
Merge branch 'main' into support_r1_shared_expers_fusion
BBuf Apr 3, 2025
556cba3
Merge branch 'main' into support_r1_shared_expers_fusion
BBuf Apr 3, 2025
64f261e
Merge branch 'main' into support_r1_shared_expers_fusion
BBuf Apr 3, 2025
9a6832a
fix bug
BBuf Apr 3, 2025
dbcae93
Merge branch 'main' into support_r1_shared_expers_fusion
BBuf Apr 4, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py
Original file line number Diff line number Diff line change
Expand Up @@ -399,7 +399,12 @@ def main(args: argparse.Namespace):
intermediate_size = config.moe_intermediate_size
shard_intermediate_size = 2 * intermediate_size // args.tp_size
elif config.architectures[0] in ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"]:
E = config.n_routed_experts
n_share_fusion_experts = args.n_share_experts_fusion
E = (
config.n_routed_experts + n_share_fusion_experts
if config.architectures[0] in ["DeepseekV3ForCausalLM"]
else config.n_routed_experts
)
topk = config.num_experts_per_tok
intermediate_size = config.moe_intermediate_size
shard_intermediate_size = 2 * intermediate_size // args.tp_size
Expand Down Expand Up @@ -559,6 +564,12 @@ def _distribute(method: str, inputs: List[Any]) -> List[Any]:
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--batch-size", type=int, required=False)
parser.add_argument("--tune", action="store_true")
parser.add_argument(
"--n-share-experts-fusion",
type=int,
default=0,
help="The number of shared_experts need to be replica to fuse with normal experts in deepseek v3/r1",
)
args = parser.parse_args()

main(args)
37 changes: 31 additions & 6 deletions python/sglang/bench_serving.py
Original file line number Diff line number Diff line change
Expand Up @@ -993,13 +993,16 @@ async def limited_request_func(request_func_input, pbar):
return await request_func(request_func_input=request_func_input, pbar=pbar)

# Warmup
print("Starting initial single prompt test run...")
print(f"Starting warmup with {args.warmup_requests} sequences...")

# Use the first request for all warmup iterations
test_prompt, test_prompt_len, test_output_len = input_requests[0]
if lora_names != None and len(lora_names) != 0:
lora_name = lora_names[0]
else:
lora_name = None

# Create the test input once
test_input = RequestFuncInput(
model=model_id,
prompt=test_prompt,
Expand All @@ -1009,14 +1012,26 @@ async def limited_request_func(request_func_input, pbar):
lora_name=lora_name,
extra_request_body=extra_request_body,
)
test_output = await request_func(request_func_input=test_input)
if not test_output.success:

# Run warmup requests
warmup_tasks = []
for _ in range(args.warmup_requests):
warmup_tasks.append(
asyncio.create_task(request_func(request_func_input=test_input))
)

warmup_outputs = await asyncio.gather(*warmup_tasks)

# Check if at least one warmup request succeeded
if not any(output.success for output in warmup_outputs):
raise ValueError(
"Initial test run failed - Please make sure benchmark arguments "
f"are correctly specified. Error: {test_output.error}"
"Warmup failed - Please make sure benchmark arguments "
f"are correctly specified. Error: {warmup_outputs[0].error}"
)
else:
print("Initial test run completed. Starting main benchmark run...")
print(
f"Warmup completed with {args.warmup_requests} sequences. Starting main benchmark run..."
)

# Flush cache
if ("sglang" in backend and _get_bool_env_var("SGLANG_IS_IN_CI")) or flush_cache:
Expand Down Expand Up @@ -1253,6 +1268,10 @@ def run_benchmark(args_: argparse.Namespace):
if not hasattr(args, "max_concurrency"):
args.max_concurrency = None

# Set default value for warmup_requests if not present
if not hasattr(args, "warmup_requests"):
args.warmup_requests = 1

print(f"benchmark_args={args}")

# Set global environments
Expand Down Expand Up @@ -1560,6 +1579,12 @@ def __call__(self, parser, namespace, values, option_string=None):
action="store_true",
help="Flush the cache before running the benchmark",
)
parser.add_argument(
"--warmup-requests",
type=int,
default=1,
help="Number of warmup requests to run before the benchmark",
)

group = parser.add_argument_group("generated-shared-prefix dataset arguments")
group.add_argument(
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
{
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 4
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 4
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 4
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 4
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"24": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 4
},
"32": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 5
},
"48": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 4
},
"64": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 4
},
"96": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"128": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 3
},
"256": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"512": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 4
},
"1024": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 4
},
"1536": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 4
},
"2048": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 4
},
"3072": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 4
},
"4096": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 4
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
{
"1": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 4
},
"2": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"4": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 4
},
"8": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 3
},
"16": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"24": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"32": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 3
},
"48": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 3
},
"64": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"96": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"128": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"256": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"512": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"1024": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 3
},
"1536": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 3
},
"2048": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"3072": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 3
},
"4096": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
}
}
Loading
Loading