@@ -562,3 +562,42 @@ def test_chunked_prefill_max_seqs():
562562 assert len (get_sequence_groups (out )) == max_seqs
563563 assert not running [0 ].is_prefill ()
564564 assert not running [1 ].is_prefill ()
565+
566+
def test_perfix_caching():
    """Verify that only whole blocks are allocated when prefix caching is on.

    Two 50-token prompts are queued with a 64-token batch budget. The first
    prompt is scheduled in full; the second must be chunked, and with prefix
    caching enabled the chunk is rounded down to a block-size multiple.
    """
    # NOTE(review): "perfix" looks like a typo for "prefix"; the name is kept
    # unchanged so the test keeps its pytest discovery name.
    block_size = 4
    max_num_seqs = 10
    model_len_cap = 80
    token_budget = 64
    scheduler_config = SchedulerConfig(token_budget,
                                       max_num_seqs,
                                       model_len_cap,
                                       enable_chunked_prefill=True)
    cache_config = CacheConfig(block_size,
                               1.0,
                               1,
                               "auto",
                               enable_prefix_caching=True)
    cache_config.num_cpu_blocks = 0
    cache_config.num_gpu_blocks = 32
    scheduler = Scheduler(scheduler_config, cache_config, None)

    # Queue two identical 50-token prompts.
    queued: List[SequenceGroup] = []
    for request_id in range(2):
        _, group = create_dummy_prompt(str(request_id),
                                       block_size=block_size,
                                       prompt_length=50)
        scheduler.add_seq_group(group)
        queued.append(group)

    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
    assert set(get_sequence_groups(out)) == set(queued)
    # The first prompt fits within the 64-token budget, so it is unchunked.
    assert seq_group_meta[0].token_chunk_size == 50
    # The second prompt is chunked: the leftover budget is 64 - 50 = 14, but
    # prefix caching allocates only full blocks, so 4 * (14 // 4) = 12 tokens
    # are scheduled.
    assert seq_group_meta[1].token_chunk_size == 12
    assert out.num_prefill_groups == 2
    assert out.num_batched_tokens == 62
0 commit comments