
Commit 6c4e0e8

Merge pull request vllm-project#2 from vllm-project/fix-2-tilelang
Fixes for support_materials/2-tilelang/
2 parents 9a317f7 + 4729f66 · commit 6c4e0e8

File tree: 2 files changed (+5, -6 lines)


support_materials/2-tilelang/1.index_attn_dynamic_qpack_varlen_fp8.py

Lines changed: 4 additions & 5 deletions
@@ -220,12 +220,10 @@ def index_attn_return_logits_interface(q,
                                        weights,
                                        cu_seqlen_ks,
                                        cu_seqlen_ke,
-                                       clean_logits=True):
+                                       should_clean_logits=True):
     seq_len, heads, index_dim = q.shape
     seq_len_kv = kv.shape[0]
 
-    clean_logits_kernel = clean_logits()
-
     index_attn_return_logits_kernel = index_attn_return_logits(
         heads=heads, index_dim=index_dim)
     logits = torch.empty([seq_len, seq_len_kv],
@@ -239,7 +237,8 @@ def index_attn_return_logits_interface(q,
         cu_seqlen_ks,
         cu_seqlen_ke,
     )
-    if clean_logits:
+    if should_clean_logits:
+        clean_logits_kernel = clean_logits()
         clean_logits_kernel(logits, cu_seqlen_ks, cu_seqlen_ke)
     return logits
 
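
The rename matters because the original keyword argument `clean_logits` shadowed the `clean_logits()` kernel factory inside the function body, and the kernel was built even when it was never used. A minimal sketch of the pitfall, using a placeholder factory (the real `clean_logits()` is the TileLang kernel defined elsewhere in this file):

# Placeholder for the compiled kernel factory defined elsewhere in the file.
def clean_logits():
    return lambda logits, ks, ke: logits

def broken(logits, ks, ke, clean_logits=True):
    # The bool parameter shadows the factory above, so this raises
    # TypeError: 'bool' object is not callable.
    kernel = clean_logits()
    kernel(logits, ks, ke)

def fixed(logits, ks, ke, should_clean_logits=True):
    if should_clean_logits:
        kernel = clean_logits()  # factory stays reachable and is only built when needed
        kernel(logits, ks, ke)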

@@ -262,7 +261,7 @@ def ref_fp8_mqa_logits(
                < cu_seqlen_ke[:, None])
     mask = mask_lo & mask_hi
 
-    score = torch.einsum("mhd,and->hmn", q, k)
+    score = torch.einsum("mhd,nd->hmn", q, k)
     logits = (score.relu() * weights.unsqueeze(-1).transpose(0, 1)).sum(dim=0)
     logits = logits.masked_fill(~mask, float("-inf"))
 
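
The corrected subscripts follow from the shapes used in the reference: `q` is 3-D `[seq_len, heads, dim]` and `k` is 2-D `[seq_len_kv, dim]`, so the spec needs exactly two labels for `k`; the stray `a` in `"mhd,and->hmn"` declares three subscripts for the 2-D `k`, which PyTorch rejects. A quick shape check under those assumed shapes (toy sizes):

import torch

m, h, d, n = 4, 2, 8, 6  # seq_len, heads, index_dim, seq_len_kv (toy sizes)
q = torch.randn(m, h, d)
k = torch.randn(n, d)
weights = torch.randn(m, h)

score = torch.einsum("mhd,nd->hmn", q, k)  # [heads, seq_len, seq_len_kv]
logits = (score.relu() * weights.unsqueeze(-1).transpose(0, 1)).sum(dim=0)  # [seq_len, seq_len_kv]
assert logits.shape == (m, n)

# The old spec fails because "and" declares 3 subscripts for the 2-D k:
# torch.einsum("mhd,and->hmn", q, k)  # RuntimeError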

support_materials/2-tilelang/2.sparse_attn_mla.py

Lines changed: 1 addition & 1 deletion
@@ -314,7 +314,7 @@ def test_sparse_attn_mla_fwd():
     def fn():
         return sparse_attention_fwd_interface(q, kv, indices)
 
-    from tilelang.testing import do_bench
+    from tilelang.profiler import do_bench
 
     ms = do_bench(
         fn,
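
For reference, `do_bench` lives in `tilelang.profiler`, not `tilelang.testing`. A hedged usage sketch, assuming only what the test above shows, that `do_bench(fn)` runs the callable and returns a latency in milliseconds, and that the extra keyword arguments passed in the test have defaults and can be omitted; the matmul below is a stand-in workload for the real `sparse_attention_fwd_interface(q, kv, indices)` call:

import torch
from tilelang.profiler import do_bench  # corrected import path from this commit

# Requires a CUDA device, as the original test does.
a = torch.randn(1024, 1024, device="cuda", dtype=torch.float16)
b = torch.randn(1024, 1024, device="cuda", dtype=torch.float16)

def fn():
    return a @ b  # stand-in workload for sparse_attention_fwd_interface(q, kv, indices)

ms = do_bench(fn)
print(f"latency: {ms:.3f} ms")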
