sgl-project
diff --git a/benchmark/kernels/fused_moe_triton/benchmark_torch_compile_fused_moe.py b/benchmark/kernels/fused_moe_triton/benchmark_torch_compile_fused_moe.py
Lines changed: 10 additions & 1 deletion
diff --git a/benchmark/kernels/fused_moe_triton/benchmark_vllm_vs_sglang_fused_moe_triton.py b/benchmark/kernels/fused_moe_triton/benchmark_vllm_vs_sglang_fused_moe_triton.py
Lines changed: 9 additions & 0 deletions
diff --git a/benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py b/benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py
Lines changed: 9 additions & 0 deletions
diff --git a/docs/backend/native_api.ipynb b/docs/backend/native_api.ipynb
Lines changed: 2 additions & 3 deletions
diff --git a/docs/backend/server_arguments.md b/docs/backend/server_arguments.md
Lines changed: 0 additions & 1 deletion
diff --git a/python/pyproject.toml b/python/pyproject.toml
Lines changed: 4 additions & 1 deletion
diff --git a/python/sglang/README.md b/python/sglang/README.md
Lines changed: 3 additions & 1 deletion
diff --git a/python/sglang/bench_offline_throughput.py b/python/sglang/bench_offline_throughput.py
Lines changed: 19 additions & 0 deletions
@@ -30,11 +30,20 @@ def get_model_config(model_name: str, tp_size: int):
         topk = config.num_experts_per_tok
         intermediate_size = config.moe_intermediate_size
         shard_intermediate_size = 2 * intermediate_size // tp_size
+    elif config.architectures[0] in [
+        "Grok1ForCausalLM",
+        "Grok1ImgGen",
+        "Grok1AForCausalLM",
+    ]:
+        E = config.num_local_experts
+        topk = config.num_experts_per_tok
+        intermediate_size = config.moe_intermediate_size
+        shard_intermediate_size = 2 * intermediate_size // tp_size
     elif config.architectures[0] in ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"]:
         E = config.n_routed_experts
         topk = config.num_experts_per_tok
         intermediate_size = config.intermediate_size
-        shard_intermediate_size = 2 * intermediate_size // args.tp_size
+        shard_intermediate_size = 2 * intermediate_size // tp_size
     else:
         # Default: Mixtral
         E = config.num_local_experts
 
@@ -35,6 +35,15 @@ def get_model_config(model_name: str, tp_size: int):
         topk = config.num_experts_per_tok
         intermediate_size = config.moe_intermediate_size
         shard_intermediate_size = 2 * intermediate_size // tp_size
+    elif config.architectures[0] in [
+        "Grok1ForCausalLM",
+        "Grok1ImgGen",
+        "Grok1AForCausalLM",
+    ]:
+        E = config.num_local_experts
+        topk = config.num_experts_per_tok
+        intermediate_size = config.moe_intermediate_size
+        shard_intermediate_size = 2 * intermediate_size // tp_size
     else:
         # Default: Mixtral
         E = config.num_local_experts
 
@@ -397,6 +397,15 @@ def main(args: argparse.Namespace):
         topk = config.num_experts_per_tok
         intermediate_size = config.moe_intermediate_size
         shard_intermediate_size = 2 * intermediate_size // args.tp_size
+    elif config.architectures[0] in [
+        "Grok1ForCausalLM",
+        "Grok1ImgGen",
+        "Grok1AForCausalLM",
+    ]:
+        E = config.num_local_experts
+        topk = config.num_experts_per_tok
+        intermediate_size = config.moe_intermediate_size
+        shard_intermediate_size = 2 * intermediate_size // args.tp_size
     else:
         # Default: Mixtral
         E = config.num_local_experts
 
@@ -210,8 +210,7 @@
     "response = requests.post(url, json=data)\n",
     "print_highlight(response.text)\n",
     "assert response.json()[\"success\"] is True\n",
-    "assert response.json()[\"message\"] == \"Succeeded to update model weights.\"\n",
-    "assert response.json().keys() == {\"success\", \"message\"}"
+    "assert response.json()[\"message\"] == \"Succeeded to update model weights.\""
    ]
   },
   {
@@ -411,7 +410,7 @@
     "    },\n",
     ")\n",
     "output = response.json()\n",
-    "output_tokens = output[\"token_ids\"]\n",
+    "output_tokens = output[\"output_ids\"]\n",
     "\n",
     "output_text = tokenizer.decode(output_tokens, skip_special_tokens=False)\n",
     "print_highlight(f\"Tokenized Output: {output_tokens}\")\n",
 
@@ -96,7 +96,6 @@ Please consult the documentation below to learn more about the parameters you ma
 * `schedule_policy`: The scheduling policy to control the processing order of waiting prefill requests in a single engine.
 * `schedule_conservativeness`: Can be used to decrease/increase the conservativeness of the server when taking new requests. Highly conservative behavior leads to starvation, but low conservativeness leads to slowed-down performance.
 * `cpu_offload_gb`: Reserve this amount of RAM in GB for offloading of model parameters to the CPU.
-* `prefill_only_one_req`: When this flag is turned on, the engine prefills only one request at a time.
 
 ## Other runtime options
 
 
@@ -96,7 +96,10 @@ dev_cpu = ["sglang[all_cpu]", "sglang[test]"]
 "Bug Tracker" = "https://github.com/sgl-project/sglang/issues"
 
 [tool.setuptools.package-data]
-"sglang" = ["srt/layers/moe/fused_moe_triton/configs/*.json", "srt/layers/quantization/configs/*.json"]
+"sglang" = [
+    "srt/layers/moe/fused_moe_triton/configs/*.json",
+    "srt/layers/quantization/configs/*.json",
+]
 
 [tool.setuptools.packages.find]
 exclude = [
 
@@ -8,8 +8,10 @@
 - `bench_one_batch.py`: Benchmark the latency of running a single static batch without a server.
 - `bench_one_batch_server.py`: Benchmark the latency of running a single batch with a server.
 - `bench_serving.py`: Benchmark online serving with dynamic requests.
-- `check_env.py`: Check the environment variables.
+- `check_env.py`: Check the environment variables and dependencies.
 - `global_config.py`: The global configs and constants.
 - `launch_server.py`: The entry point for launching the local server.
 - `llama3_eval.py`: Evaluation of Llama 3 using the Meta Llama dataset.
+- `profiler.py`: Profile a running server.
 - `utils.py`: Common utilities.
+- `version.py`: Version info.
@@ -56,6 +56,7 @@ class BenchArgs:
     profile: bool = False
     skip_warmup: bool = False
     do_not_exit: bool = False
+    prompt_suffix: str = ""
 
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
@@ -177,6 +178,12 @@ def add_cli_args(parser: argparse.ArgumentParser):
             action="store_true",
             help="Do not exit the program. This is useful for nsys profile with --duration and --delay.",
         )
+        parser.add_argument(
+            "--prompt-suffix",
+            type=str,
+            default="",
+            help="Suffix applied to the end of all user prompts, followed by assistant prompt suffix.",
+        )
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
@@ -216,6 +223,10 @@ def throughput_test_once(
     ]
 
     if profile:
+        assert (
+            "SGLANG_TORCH_PROFILER_DIR" in os.environ
+        ), "Please set SGLANG_TORCH_PROFILER_DIR."
+        os.makedirs(os.environ["SGLANG_TORCH_PROFILER_DIR"], exist_ok=True)
         backend.start_profile()
 
     st = time.perf_counter()
@@ -229,6 +240,8 @@ def throughput_test_once(
     if backend_name == "runtime":
         gen_out = json.loads(gen_out)
 
+    server_info = backend.get_server_info()
+
     measurement_results["total_latency"] = latency
     measurement_results["total_output_tokens"] = sum(
         o["meta_info"]["completion_tokens"] for o in gen_out
@@ -246,6 +259,7 @@ def throughput_test_once(
         measurement_results["total_input_tokens"]
         + measurement_results["total_output_tokens"]
     ) / latency
+    measurement_results["last_gen_throughput"] = server_info["last_gen_throughput"]
 
     return measurement_results
 
@@ -361,6 +375,11 @@ def throughput_test(
     print(
         "{:<40} {:<10}".format("Total generated tokens:", result["total_output_tokens"])
     )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Last generation throughput (tok/s):", result["last_gen_throughput"]
+        )
+    )
     print(
         "{:<40} {:<10.2f}".format(
             "Request throughput (req/s):", result["request_throughput"]