File tree Expand file tree Collapse file tree 2 files changed +13
-5
lines changed
Expand file tree Collapse file tree 2 files changed +13
-5
lines changed Original file line number Diff line number Diff line change @@ -116,6 +116,9 @@ def test_models_with_fp8_kv_cache(
116116 pytest .skip (
117117 "#7378: CUDA illegal memory access (undiagnosed) facebook/opt-125m"
118118 )
119+ if ((model , kv_cache_dtype , chunked_prefill_token_size ) == (
120+ "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V" , "fp8_e4m3" , 4 )):
121+ pytest .skip ("flakey test, see: #7874 #8051" )
119122
120123 max_num_seqs = chunked_prefill_token_size
121124 max_num_batched_tokens = chunked_prefill_token_size
Original file line number Diff line number Diff line change @@ -1027,16 +1027,21 @@ def _schedule_chunked_prefill(self) -> SchedulerOutputs:
10271027
10281028 # Update waiting requests.
10291029 self .waiting .extendleft (running_scheduled .preempted )
1030+
10301031 # Update new running requests.
1031- self .running .extend ([s .seq_group for s in prefills .seq_groups ])
1032- self .running .extend (
1033- [s .seq_group for s in running_scheduled .decode_seq_groups ])
1034- self .running .extend (
1035- [s .seq_group for s in running_scheduled .prefill_seq_groups ])
1032+ # By default, vLLM scheduler prioritizes prefills.
1033+ # Once chunked prefill is enabled,
1034+ # the policy is changed to prioritize decode requests.
10361035 self .running .extend (
10371036 [s .seq_group for s in swapped_in .decode_seq_groups ])
10381037 self .running .extend (
10391038 [s .seq_group for s in swapped_in .prefill_seq_groups ])
1039+ self .running .extend (
1040+ [s .seq_group for s in running_scheduled .decode_seq_groups ])
1041+ self .running .extend (
1042+ [s .seq_group for s in running_scheduled .prefill_seq_groups ])
1043+ self .running .extend ([s .seq_group for s in prefills .seq_groups ])
1044+
10401045 # Update swapped requests.
10411046 self .swapped .extend (running_scheduled .swapped_out )
10421047 return SchedulerOutputs (
You can’t perform that action at this time.
0 commit comments