From 815d53715ab9cf1d6524231467ff176652217ab1 Mon Sep 17 00:00:00 2001
From: Zhiqiang Xie <xiezhq@stanford.edu>
Date: Wed, 23 Jul 2025 00:21:43 -0700
Subject: [PATCH 1/2] set default attention backend for hicache to be
 flashinfer

---
 python/sglang/srt/model_executor/model_runner.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py
index 4f0b1d64ce8a..ff8713c0f443 100644
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -377,6 +377,7 @@ def model_specific_adjustment(self):
                     is_hopper_with_cuda_12_3()
                     and is_no_spec_infer_or_topk_one(server_args)
                     and is_fa3_default_architecture(self.model_config.hf_config)
+                    and (not server_args.enable_hierarchical_cache)
                 ):
                     server_args.attention_backend = "fa3"
                 elif _is_hip:
@@ -389,7 +390,9 @@ def model_specific_adjustment(self):
                     )
             else:
                 # MLA architecture
-                if is_hopper_with_cuda_12_3():
+                if is_hopper_with_cuda_12_3() and (
+                    not server_args.enable_hierarchical_cache
+                ):
                     server_args.attention_backend = "fa3"
                 elif is_sm100_supported():
                     server_args.attention_backend = "flashinfer"

From 85d6ee733897ad37212b7bedd1c72bdf134ec882 Mon Sep 17 00:00:00 2001
From: Zhiqiang Xie <xiezhq@stanford.edu>
Date: Wed, 23 Jul 2025 00:24:44 -0700
Subject: [PATCH 2/2] fix data race of multi-stream

Co-authored-by: pansicheng <sicheng.pan.chn@gmail.com>

---
 python/sglang/srt/managers/cache_controller.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/sglang/srt/managers/cache_controller.py b/python/sglang/srt/managers/cache_controller.py
index 5f43a5e9a033..a94fdec78c32 100644
--- a/python/sglang/srt/managers/cache_controller.py
+++ b/python/sglang/srt/managers/cache_controller.py
@@ -358,6 +358,7 @@ def write(
         if host_indices is None:
             return None
         self.mem_pool_host.protect_write(host_indices)
+        torch.cuda.current_stream().synchronize()
         self.write_queue.put(
             CacheOperation(host_indices, device_indices, node_id, priority)
         )