sgl-project · JustinTong0323 · Jul 26, 2025 · Jun 9, 2025 · Jun 9, 2025 · Jun 9, 2025
diff --git a/benchmark/mmmu/bench_sglang.py b/benchmark/mmmu/bench_sglang.py
@@ -118,7 +118,6 @@ async def eval_mmmu(args) -> None:
     client = openai.AsyncOpenAI(
         api_key="sk", base_url=f"http://127.0.0.1:{args.port}/v1"
     )
-    semaphore = asyncio.Semaphore(args.concurrency)
     start = time.perf_counter()
     base_url = f"http://127.0.0.1:{args.port}"
 
@@ -132,14 +131,22 @@ async def eval_mmmu(args) -> None:
 
         samples = samples[: args.profile_number]
 
-    tasks = [
-        process_sample_with_semaphore(semaphore, client, sample, sampling_params)
-        for sample in samples
-    ]
-
-    for coro in tqdm(asyncio.as_completed(tasks), total=len(tasks)):
-        sample, response = await coro
-        process_result(response, sample, answer_dict, out_samples)
+    if args.concurrency == 1:
+        # For concurrency == 1, run sequentially to ensure a consistent order;
+        # this is mainly for profiling.
+        for sample in tqdm(samples):
+            _, response = await process_sample(client, sample, sampling_params)
+            process_result(response, sample, answer_dict, out_samples)
+    else:
+        semaphore = asyncio.Semaphore(args.concurrency)
+        tasks = [
+            process_sample_with_semaphore(semaphore, client, sample, sampling_params)
+            for sample in samples
+        ]
+
+        for coro in tqdm(asyncio.as_completed(tasks), total=len(tasks)):
+            sample, response = await coro
+            process_result(response, sample, answer_dict, out_samples)
 
     if args.profile:
         print("Stopping profiler...")

diff --git a/benchmark/mmmu/eval_utils.py b/benchmark/mmmu/eval_utils.py
@@ -28,7 +28,7 @@ class EvalArgs:
     seed: int = 42
     split: str = "validation"
     # Default setting to make the benchmark available on A100 for most 7B models
-    image_pixels_limit: int = 4300000
+    image_pixels_limit: int = -1
     result_filename: str = ""
     prompt_format_file: str = "prompt_format.yaml"
     dataset_path: str = "MMMU/MMMU"
@@ -193,6 +193,8 @@ def process_sample(i, sample):
             elif sample:
                 samples.append(sample)
 
+    samples.sort(key=lambda x: x["final_input_prompt"])
+
     print(
         f"Skipping {skip_count} samples with large images, {round((float(skip_count) / len(dataset)) * 100, 2)}% of dataset"
     )