Commit ccba9d5

Kristian Sikiric (ksikiric) authored
Aiter round mode control (#590)
* Adding AITER round mode control to USP attention calls.
* Added guarding so that the AITER call does not break for earlier AITER commits that do not support changing the round mode.
* Added a more robust way of checking whether the round mode is available; also removed code duplication in the round-mode availability check when calling AITER flash attention.
* Changed aiter.ops.mha.flash_attn_func back to aiter.flash_attn_func, as this was a mistake and should not have been changed in the first place.
* Indentation fix.

Co-authored-by: Kristian Sikiric <[email protected]>
1 parent 78cb759 commit ccba9d5

File tree

xfuser/model_executor/layers (1 file changed: +17, −4 lines)

xfuser/model_executor/layers/usp.py

Lines changed: 17 additions & 4 deletions
```diff
@@ -30,6 +30,14 @@
 HAS_AITER = env_info["has_aiter"]
 if HAS_AITER:
     import aiter
+    import inspect
+    try:
+        HAS_ROUND_MODE = inspect.signature(aiter.flash_attn_func).parameters.get("how_v3_bf16_cvt") is not None
+    except (AttributeError, TypeError):
+        HAS_ROUND_MODE = False
+    if HAS_ROUND_MODE:
+        import os
+        HOW_V3_BF16_CVT = int(os.environ.get("HOW_V3_BF16_CVT", "2"))
 
 aten = torch.ops.aten
 
```
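Since `HOW_V3_BF16_CVT` is read once at module import time, the rounding mode has to be in the environment before usp.py is imported. A minimal usage sketch, assuming the module path shown in the file tree above; the value `1` is purely illustrative (`2` stays the default when the variable is unset):

```python
import os

# Choose the bf16 conversion rounding mode for the AITER flash attention
# call. This must happen before usp.py is imported, because the module
# reads HOW_V3_BF16_CVT exactly once at import time (default "2").
os.environ["HOW_V3_BF16_CVT"] = "1"  # illustrative value

# Import after setting the variable so the override takes effect.
from xfuser.model_executor.layers import usp  # noqa: E402
```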

```diff
@@ -175,14 +183,19 @@ def _aiter_attn_call(query, key, value, dropout_p, is_causal):
     query = torch.permute(query, [0, 2, 1, 3]).contiguous()
     key = torch.permute(key, [0, 2, 1, 3]).contiguous()
     value = torch.permute(value, [0, 2, 1, 3]).contiguous()
+    attn_kwargs = {
+        "dropout_p": dropout_p,
+        "causal": is_causal,
+        "return_attn_probs": False,
+        "return_lse": True,
+    }
+    if HAS_ROUND_MODE:
+        attn_kwargs["how_v3_bf16_cvt"] = HOW_V3_BF16_CVT
     output, softmax_lse = aiter.flash_attn_func(
         query,
         key,
         value,
-        dropout_p=dropout_p,
-        causal=is_causal,
-        return_attn_probs=False,
-        return_lse=True
+        **attn_kwargs
     )
     output = torch.permute(output, [0, 2, 1, 3])
     return output, softmax_lse
```
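The guard pattern generalizes beyond AITER: probe a callable's signature for an optional keyword and forward it only when it is explicitly declared, so older library builds keep working unchanged. A minimal standalone sketch of the same idea; `supports_kwarg` is a hypothetical helper, not part of the commit (the commit also catches AttributeError, which covers `aiter.flash_attn_func` itself being absent on old builds):

```python
import inspect

def supports_kwarg(fn, name: str) -> bool:
    """Return True if `fn` explicitly declares a parameter called `name`.

    inspect.signature can raise TypeError (unsupported callables) or
    ValueError (no signature found, e.g. some C extensions); both are
    treated as "keyword not supported", matching the commit's
    conservative guard. Keywords only reachable via **kwargs are also
    reported as unsupported, which errs on the safe side.
    """
    try:
        return name in inspect.signature(fn).parameters
    except (TypeError, ValueError):
        return False

# Example: forward the rounding-mode flag only when the installed
# kernel accepts it, mirroring the attn_kwargs construction above.
#
#   if supports_kwarg(aiter.flash_attn_func, "how_v3_bf16_cvt"):
#       attn_kwargs["how_v3_bf16_cvt"] = HOW_V3_BF16_CVT
```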
