@@ -62,6 +62,16 @@
     {
         "speculative_model": "JackFram/llama-68m",
         "num_speculative_tokens": 5,
+        "enable_chunked_prefill": False,
+    },
+    {
+        # Chunked prefill enabled with small value
+        # to make sure we get mixed batches.
+        "speculative_model": "JackFram/llama-68m",
+        "num_speculative_tokens": 5,
+        "enable_chunked_prefill": True,
+        "max_num_batched_tokens": 4,
+        "max_num_seqs": 4
     },
     {
         # Verify the detokenizer assertions in the test work when spec
@@ -141,6 +151,14 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator,
     {
         "speculative_model": "JackFram/llama-68m",
         "num_speculative_tokens": 5,
+        "enable_chunked_prefill": False,
+    },
+    {
+        "speculative_model": "JackFram/llama-68m",
+        "num_speculative_tokens": 5,
+        "enable_chunked_prefill": True,
+        "max_num_batched_tokens": 4,
+        "max_num_seqs": 4,
     },
 ])
 @pytest.mark.parametrize(
@@ -204,6 +222,14 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
     {
         "speculative_model": "JackFram/llama-68m",
         "num_speculative_tokens": 5,
+        "enable_chunked_prefill": False,
+    },
+    {
+        "speculative_model": "JackFram/llama-68m",
+        "num_speculative_tokens": 5,
+        "enable_chunked_prefill": True,
+        "max_num_batched_tokens": 4,
+        "max_num_seqs": 4
     },
 ])
 @pytest.mark.parametrize(
@@ -255,6 +281,14 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs(
     {
         "speculative_model": "JackFram/llama-68m",
         "num_speculative_tokens": 5,
+        "enable_chunked_prefill": False,
+    },
+    {
+        "speculative_model": "JackFram/llama-68m",
+        "num_speculative_tokens": 5,
+        "enable_chunked_prefill": True,
+        "max_num_batched_tokens": 4,
+        "max_num_seqs": 4
     },
 ])
 @pytest.mark.parametrize("max_output_len", [
@@ -300,6 +334,14 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len(
     {
         "speculative_model": "JackFram/llama-68m",
         "num_speculative_tokens": 5,
+        "enable_chunked_prefill": False,
+    },
+    {
+        "speculative_model": "JackFram/llama-68m",
+        "num_speculative_tokens": 5,
+        "enable_chunked_prefill": True,
+        "max_num_batched_tokens": 4,
+        "max_num_seqs": 4
     },
 ])
 @pytest.mark.parametrize("batch_size", [1])
@@ -347,6 +389,14 @@ def test_spec_decode_e2e_greedy_correctness_real_model_bs1(
     {
         "speculative_model": "JackFram/llama-68m",
         "num_speculative_tokens": 5,
+        "enable_chunked_prefill": False,
+    },
+    {
+        "speculative_model": "JackFram/llama-68m",
+        "num_speculative_tokens": 5,
+        "enable_chunked_prefill": True,
+        "max_num_batched_tokens": 4,
+        "max_num_seqs": 4
     },
 ])
 @pytest.mark.parametrize("batch_size", [32])
@@ -397,6 +447,14 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(
     {
         "speculative_model": "JackFram/llama-68m",
         "num_speculative_tokens": 5,
+        "enable_chunked_prefill": False,
+    },
+    {
+        "speculative_model": "JackFram/llama-68m",
+        "num_speculative_tokens": 5,
+        "enable_chunked_prefill": True,
+        "max_num_batched_tokens": 4,
+        "max_num_seqs": 4
     },
 ])
 @pytest.mark.parametrize(
@@ -454,6 +512,14 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption(
     {
         "speculative_model": "JackFram/llama-68m",
         "num_speculative_tokens": 5,
+        "enable_chunked_prefill": False,
+    },
+    {
+        "speculative_model": "JackFram/llama-68m",
+        "num_speculative_tokens": 5,
+        "enable_chunked_prefill": True,
+        "max_num_batched_tokens": 4,
+        "max_num_seqs": 4
     },
 ])
 @pytest.mark.parametrize("batch_size", [2])
@@ -503,6 +569,15 @@ def test_spec_decode_different_block_size(vllm_runner, common_llm_kwargs,
         # Artificially limit the draft model max model len; this forces vLLM
         # to skip speculation once the sequences grow beyond 32-k tokens.
         "speculative_max_model_len": 32,
+        "enable_chunked_prefill": False,
+    },
+    {
+        "speculative_model": "JackFram/llama-68m",
+        "num_speculative_tokens": 5,
+        "enable_chunked_prefill": True,
+        "max_num_batched_tokens": 4,
+        "max_num_seqs": 4,
+        "speculative_max_model_len": 32,
     },
 ])
 @pytest.mark.parametrize("batch_size", [8])
@@ -551,6 +626,15 @@ def test_skip_speculation(vllm_runner, common_llm_kwargs,
         "speculative_model": "JackFram/llama-68m",
         "num_speculative_tokens": 5,
         "speculative_disable_by_batch_size": 2,
+        "enable_chunked_prefill": False,
+    },
+    {
+        "speculative_model": "JackFram/llama-68m",
+        "num_speculative_tokens": 5,
+        "speculative_disable_by_batch_size": 2,
+        "enable_chunked_prefill": True,
+        "max_num_batched_tokens": 4,
+        "max_num_seqs": 4,
     },
 ])
 @pytest.mark.parametrize("batch_size", [8])
@@ -590,10 +674,17 @@ def test_disable_speculation(vllm_runner, common_llm_kwargs,
     {
         "speculative_model": "JackFram/llama-68m",
         "num_speculative_tokens": k,
+        "enable_chunked_prefill": False,
     }
     # Try a range of common k, as well as large speculation.
     for k in [1, 2, 3, 4, 5, 6, 7, 8, 9, 63]
-])
+] + [{
+    "speculative_model": "JackFram/llama-68m",
+    "num_speculative_tokens": k,
+    "enable_chunked_prefill": True,
+    "max_num_batched_tokens": 4,
+    "max_num_seqs": 4,
+} for k in [1, 2, 3, 4, 5, 6, 7, 8, 9, 63]])
 @pytest.mark.parametrize("batch_size", [2])
 @pytest.mark.parametrize(
     "output_len",
@@ -636,11 +727,19 @@ def test_many_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
     {
         "speculative_model": "JackFram/llama-68m",
         "num_speculative_tokens": k,
-        "spec_decoding_acceptance_method": "typical_acceptance_sampler"
+        "spec_decoding_acceptance_method": "typical_acceptance_sampler",
+        "enable_chunked_prefill": False
     }
     # Try a range of common k.
     for k in [1, 2, 3]
-])
+] + [{
+    "speculative_model": "JackFram/llama-68m",
+    "num_speculative_tokens": k,
+    "spec_decoding_acceptance_method": "typical_acceptance_sampler",
+    "enable_chunked_prefill": True,
+    "max_num_batched_tokens": 4,
+    "max_num_seqs": 4
+} for k in [1, 2, 3]])
 @pytest.mark.parametrize("batch_size", [1, 32])
 @pytest.mark.parametrize(
     "output_len",
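As a side note for reviewers, here is a minimal sketch of how one of the added chunked-prefill variants maps onto vLLM's `LLM` entry point, for poking at the configuration outside the test harness. The target model and prompt are assumptions for illustration only; in the suite the target model comes from `common_llm_kwargs` and the dicts above are fed in through the runner fixtures.

```python
# Minimal sketch, not part of this diff. The target model and prompt
# are illustrative assumptions; the tests supply their own via fixtures.
from vllm import LLM, SamplingParams

llm = LLM(
    model="JackFram/llama-160m",  # assumed target model
    # Draft model and speculation depth, as parametrized in the tests.
    speculative_model="JackFram/llama-68m",
    num_speculative_tokens=5,
    # Chunked prefill with a deliberately tiny token budget so that
    # prefill chunks and decode steps land in the same (mixed) batches.
    enable_chunked_prefill=True,
    max_num_batched_tokens=4,
    max_num_seqs=4,
)

outputs = llm.generate(["The future of AI is"],
                       SamplingParams(temperature=0.0, max_tokens=32))
print(outputs[0].outputs[0].text)
```

The `max_num_batched_tokens=4` / `max_num_seqs=4` pairing is what makes these cases interesting: the scheduler can only move four tokens per step, so prefills are split into chunks and interleaved with decodes, which is exactly the mixed-batch path the new configs exercise.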
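One more note, on the `speculative_max_model_len` hunk: the new chunked-prefill variant keeps `"speculative_max_model_len": 32`, and the in-test comment's "beyond 32-k tokens" reads as 32 minus k. With `num_speculative_tokens = k`, proposing k draft tokens for a sequence of length L requires L + k <= 32 on the draft model, so speculation gets skipped once L passes 32 - k (past 27 tokens at the parametrized k = 5). That is a reading of the comment, not a claim about the exact condition vLLM's scheduler checks.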