Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions python/sglang/srt/managers/cache_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,7 @@ def write(
if host_indices is None:
return None
self.mem_pool_host.protect_write(host_indices)
torch.cuda.current_stream().synchronize()
self.write_queue.put(
CacheOperation(host_indices, device_indices, node_id, priority)
)
Expand Down
5 changes: 4 additions & 1 deletion python/sglang/srt/model_executor/model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -377,6 +377,7 @@ def model_specific_adjustment(self):
is_hopper_with_cuda_12_3()
and is_no_spec_infer_or_topk_one(server_args)
and is_fa3_default_architecture(self.model_config.hf_config)
and (not server_args.enable_hierarchical_cache)
):
server_args.attention_backend = "fa3"
elif _is_hip:
Expand All @@ -389,7 +390,9 @@ def model_specific_adjustment(self):
)
else:
# MLA architecture
if is_hopper_with_cuda_12_3():
if is_hopper_with_cuda_12_3() and (
not server_args.enable_hierarchical_cache
):
server_args.attention_backend = "fa3"
elif is_sm100_supported():
server_args.attention_backend = "flashinfer"
Expand Down
Loading