vllm-project · hsliuustc0106 · Feb 9, 2026 · Feb 9, 2026 · Feb 9, 2026 · Feb 9, 2026
@@ -123,7 +123,7 @@ steps:
     timeout_in_minutes: 20
     depends_on: image-build
     commands:
-      - pytest -s -v tests/e2e/offline_inference/test_sequence_parallel.py
+      - pytest -s -v tests/e2e/offline_inference/test_sequence_parallel.py -m core_model
     agents:
       queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
     plugins:

@@ -56,7 +56,7 @@ steps:
     - export GPU_ARCHS=gfx942
     - export VLLM_LOGGING_LEVEL=DEBUG
     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-    - pytest -s -v tests/e2e/offline_inference/test_sequence_parallel.py
+    - pytest -s -v tests/e2e/offline_inference/test_sequence_parallel.py -m core_model
 
 - label: "Diffusion Tensor Parallelism Test"
   timeout_in_minutes: 20

diff --git a/tests/e2e/offline_inference/test_sequence_parallel.py b/tests/e2e/offline_inference/test_sequence_parallel.py
@@ -145,7 +145,15 @@ def _run_inference(
 # SP configurations: (ulysses_degree, ring_degree, height, width, warmup, is_perf_test)
 # - warmup: whether to run warmup for this SP config
 # - is_perf_test: whether this is a performance test (show speedup metrics)
-SP_CONFIGS = [
+SP_CONFIGS_L2 = [
+    # Ulysses-2 - performance test
+    (2, 1, DEFAULT_HEIGHT, DEFAULT_WIDTH, True, True),
+    (1, 2, DEFAULT_HEIGHT, DEFAULT_WIDTH, True, True),  # Ring-2 - performance test
+    # Hybrid - correctness only
+    (2, 2, DEFAULT_HEIGHT, DEFAULT_WIDTH, False, False),
+]
+
+SP_CONFIGS_L3 = [
     # Ulysses-2 - performance test
     (2, 1, DEFAULT_HEIGHT, DEFAULT_WIDTH, True, True),
     (1, 2, DEFAULT_HEIGHT, DEFAULT_WIDTH, True, True),  # Ring-2 - performance test
@@ -168,7 +176,7 @@ def _get_sp_mode(ulysses_degree: int, ring_degree: int) -> str:
 @pytest.mark.core_model
 @pytest.mark.diffusion
 @pytest.mark.parallel
-@hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards={"cuda": 4, "rocm": 2})
+@hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards={"cuda": 2, "rocm": 2})
 @pytest.mark.parametrize("model_name", MODELS)
 def test_sp_correctness(model_name: str):
     """Test that SP inference produces correct outputs and measure performance.
@@ -191,7 +199,130 @@ def test_sp_correctness(model_name: str):
     print(f"Available GPUs: {device_count}")
     print("=" * 70)
 
-    for ulysses_degree, ring_degree, height, width, sp_warmup, is_perf_test in SP_CONFIGS:
+    for ulysses_degree, ring_degree, height, width, sp_warmup, is_perf_test in SP_CONFIGS_L2:
+        sp_size = ulysses_degree * ring_degree
+        sp_mode = _get_sp_mode(ulysses_degree, ring_degree)
+
+        if device_count < sp_size:
+            print(f"\n[{sp_mode}] SKIPPED (requires {sp_size} GPUs)")
+            continue
+
+        # Determine baseline warmup: only for default size (performance tests)
+        cache_key = (height, width)
+        baseline_warmup = height == DEFAULT_HEIGHT and width == DEFAULT_WIDTH
+
+        # Get or compute baseline for this (height, width)
+        if cache_key not in baseline_cache:
+            print(f"\n--- Running baseline {height}x{width} (warmup={baseline_warmup}) ---")
+            baseline = _run_inference(
+                model_name,
+                torch.bfloat16,
+                "sdpa",
+                height=height,
+                width=width,
+                warmup=baseline_warmup,
+            )
+            assert len(baseline.images) == 1
+            baseline_cache[cache_key] = baseline
+            print(f"[baseline] {height}x{width}: {baseline.elapsed_ms:.0f}ms")
+        else:
+            baseline = baseline_cache[cache_key]
+
+        # Run SP
+        print(f"\n--- Running {sp_mode} (warmup={sp_warmup}) ---")
+        sp_result = _run_inference(
+            model_name,
+            torch.bfloat16,
+            "sdpa",
+            ulysses_degree=ulysses_degree,
+            ring_degree=ring_degree,
+            height=height,
+            width=width,
+            warmup=sp_warmup,
+        )
+        assert len(sp_result.images) == 1
+
+        # Compare outputs (correctness)
+        mean_diff, max_diff = _diff_metrics(baseline.images[0], sp_result.images[0])
+
+        # Build result entry
+        result = {
+            "mode": sp_mode,
+            "sp_size": sp_size,
+            "height": height,
+            "width": width,
+            "baseline_ms": baseline.elapsed_ms,
+            "sp_ms": sp_result.elapsed_ms,
+            "mean_diff": mean_diff,
+            "max_diff": max_diff,
+            "is_perf_test": is_perf_test,
+        }
+        results.append(result)
+
+        # Output based on test type
+        if is_perf_test:
+            speedup = baseline.elapsed_ms / sp_result.elapsed_ms if sp_result.elapsed_ms > 0 else 0
+            result["speedup"] = speedup
+            print(
+                f"[{sp_mode}] {sp_size} GPUs | "
+                f"baseline: {baseline.elapsed_ms:.0f}ms, sp: {sp_result.elapsed_ms:.0f}ms, "
+                f"speedup: {speedup:.2f}x"
+            )
+        else:
+            print(f"[{sp_mode}] {sp_size} GPUs | sp: {sp_result.elapsed_ms:.0f}ms (correctness only)")
+
+        print(f"[{sp_mode}] diff: mean={mean_diff:.6e}, max={max_diff:.6e}")
+
+        # Assert correctness
+        assert mean_diff <= DIFF_MEAN_THRESHOLD and max_diff <= DIFF_MAX_THRESHOLD, (
+            f"[{sp_mode}] SP output differs from baseline: mean={mean_diff:.6e}, max={max_diff:.6e}"
+        )
+
+    # Summary
+    print("\n" + "=" * 70)
+    print("SUMMARY")
+    print("=" * 70)
+    print(f"{'Mode':<15} {'GPUs':<6} {'Size':<10} {'Baseline':<12} {'SP':<12} {'Speedup':<10} {'Status'}")
+    print("-" * 70)
+    for r in results:
+        speedup_str = f"{r['speedup']:.2f}x" if r.get("speedup") else "N/A"
+        baseline_str = f"{r['baseline_ms']:.0f}ms" if r["is_perf_test"] else "N/A"
+        status = "PASS" if r["mean_diff"] <= DIFF_MEAN_THRESHOLD else "FAIL"
+        print(
+            f"{r['mode']:<15} {r['sp_size']:<6} {r['height']}x{r['width']:<5} "
+            f"{baseline_str:<12} {r['sp_ms']:.0f}ms{'':<7} {speedup_str:<10} {status}"
+        )
+    print("=" * 70)
+
+
+# TODO: After PR#1272 is merged, add markers
+# @pytest.mark.advanced_model
+@pytest.mark.diffusion
+@pytest.mark.parallel
+@hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards={"cuda": 4, "rocm": 2})
+@pytest.mark.parametrize("model_name", MODELS)
+def test_sp_correctness_advanced(model_name: str):
+    """Test that SP inference produces correct outputs and measure performance.
+
+    Runs baseline once per unique (height, width), then tests every config in SP_CONFIGS_L3.
+
+    Note: Run with `pytest -v -s` to see detailed output.
+    """
+    device_count = current_omni_platform.get_device_count()
+
+    # Cache baseline results by (height, width)
+    # Key: (height, width), Value: the baseline InferenceResult for that size
+    baseline_cache: dict[tuple[int, int], InferenceResult] = {}
+
+    # Collect results for summary
+    results: list[dict] = []
+
+    print("\n" + "=" * 70)
+    print(f"Sequence Parallel Test - Model: {model_name}")
+    print(f"Available GPUs: {device_count}")
+    print("=" * 70)
+
+    for ulysses_degree, ring_degree, height, width, sp_warmup, is_perf_test in SP_CONFIGS_L3:
         sp_size = ulysses_degree * ring_degree
         sp_mode = _get_sp_mode(ulysses_degree, ring_degree)