Skip to content
7 changes: 5 additions & 2 deletions flashinfer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,8 +100,11 @@
from .norm import gemma_rmsnorm as gemma_rmsnorm
from .norm import rmsnorm as rmsnorm

from .norm import rmsnorm_fp4quant as rmsnorm_fp4quant
from .norm import add_rmsnorm_fp4quant as add_rmsnorm_fp4quant
try:
    # FP4-quantized RMSNorm variants are optional: they require the
    # nvidia-cutlass-dsl package, so failure to import is not an error.
    from .norm import (
        rmsnorm_fp4quant as rmsnorm_fp4quant,
        add_rmsnorm_fp4quant as add_rmsnorm_fp4quant,
    )
except (ImportError, AttributeError):
    pass
Comment on lines +112 to +116
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Expose quantized norm APIs at the package level.

Line 97-101 exports rmsnorm and fused_add_rmsnorm, but the new quantized variants (rmsnorm_quant, fused_add_rmsnorm_quant) from flashinfer.norm are still missing at the top level. Consider exporting them here so flashinfer.rmsnorm_quant works consistently.

✅ Suggested export additions
 from .norm import fused_add_rmsnorm as fused_add_rmsnorm
+from .norm import fused_add_rmsnorm_quant as fused_add_rmsnorm_quant
 from .norm import layernorm as layernorm
 from .norm import gemma_fused_add_rmsnorm as gemma_fused_add_rmsnorm
 from .norm import gemma_rmsnorm as gemma_rmsnorm
 from .norm import rmsnorm as rmsnorm
+from .norm import rmsnorm_quant as rmsnorm_quant

As per coding guidelines: Export new operations in `flashinfer/__init__.py` to make them available at package level.

🤖 Prompt for AI Agents
In `@flashinfer/__init__.py` around lines 103 - 107, The package-level exports for
the quantized norm variants are missing: import rmsnorm_fp4quant and
add_rmsnorm_fp4quant from flashinfer.norm (already attempted in the try block)
and then assign them to the public names used elsewhere (e.g., expose
rmsnorm_fp4quant as rmsnorm_quant and add_rmsnorm_fp4quant as
fused_add_rmsnorm_quant) so flashinfer.rmsnorm_quant and
flashinfer.fused_add_rmsnorm_quant resolve; update the try block in
flashinfer.__init__.py to perform these assignments (keep the existing
ImportError/AttributeError handling).

from .page import append_paged_kv_cache as append_paged_kv_cache
from .page import append_paged_mla_kv_cache as append_paged_mla_kv_cache
from .page import get_batch_indices_positions as get_batch_indices_positions
Expand Down
53 changes: 47 additions & 6 deletions flashinfer/cute_dsl/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,19 @@
This module provides high-performance GPU kernels implemented using NVIDIA CuTe-DSL.
"""

from .utils import is_cute_dsl_available, make_ptr, get_cutlass_dtype, get_num_sm
import importlib.util

# Conditionally import CuTe-DSL kernels

def is_cute_dsl_available() -> bool:
    """Return ``True`` when the optional CuTe-DSL (``cutlass``) package is importable.

    Checks ``cutlass`` before ``cutlass.cute`` so the submodule probe is
    skipped entirely when the parent package is absent.
    """
    for module_name in ("cutlass", "cutlass.cute"):
        if importlib.util.find_spec(module_name) is None:
            return False
    return True
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Decorate and cache is_cute_dsl_available().

Line 24-28 defines a public API (exported in __all__) but it isn’t logged or cached. Adding @flashinfer_api and @functools.cache avoids repeated module discovery and aligns with the API logging/caching policy.

✅ Suggested update
-import importlib.util
+import functools
+import importlib.util
+
+from ..api_logging import flashinfer_api
@@
-def is_cute_dsl_available() -> bool:
+@functools.cache
+@flashinfer_api
+def is_cute_dsl_available() -> bool:
     return (
         importlib.util.find_spec("cutlass") is not None
         and importlib.util.find_spec("cutlass.cute") is not None
     )

As per coding guidelines: Python API functions should use @functools.cache decorator for module caching to avoid recompilation; Use @flashinfer_api decorator on Python functions for API logging with crash-safe input capture before execution.

📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
import importlib.util
# Conditionally import CuTe-DSL kernels
def is_cute_dsl_available() -> bool:
return (
importlib.util.find_spec("cutlass") is not None
and importlib.util.find_spec("cutlass.cute") is not None
)
import functools
import importlib.util
from ..api_logging import flashinfer_api
@functools.cache
@flashinfer_api
def is_cute_dsl_available() -> bool:
return (
importlib.util.find_spec("cutlass") is not None
and importlib.util.find_spec("cutlass.cute") is not None
)
🤖 Prompt for AI Agents
In `@flashinfer/cute_dsl/__init__.py` around lines 21 - 28, The
is_cute_dsl_available() function should be decorated with `@flashinfer_api` and
`@functools.cache` to enable API logging and cache the module discovery; update
the top imports to import functools (or functools.cache) and import the
flashinfer_api decorator (or from its module) so you can apply `@flashinfer_api`
and `@functools.cache` directly above def is_cute_dsl_available to avoid repeated
find_spec calls and ensure API logging.



# Conditionally import CuTe-DSL kernels (including utils which requires cutlass)
if is_cute_dsl_available():
from .utils import make_ptr, get_cutlass_dtype, get_num_sm
from .blockscaled_gemm import (
grouped_gemm_nt_masked,
Sm100BlockScaledPersistentDenseGemmKernel,
Expand All @@ -36,16 +45,35 @@
AddRMSNormFP4QuantKernel,
)

# Backwards-compatible re-exports from flashinfer.norm.kernels submodule
from ..norm.kernels import (
# Kernel classes
RMSNormKernel,
QKRMSNormKernel,
RMSNormQuantKernel,
FusedAddRMSNormKernel,
FusedAddRMSNormQuantKernel,
LayerNormKernel,
# Python API functions
rmsnorm_cute,
qk_rmsnorm_cute,
rmsnorm_quant_cute,
fused_add_rmsnorm_cute,
fused_add_rmsnorm_quant_cute,
layernorm_cute,
)

__all__ = [
# Utils (always available)
# Always available
"is_cute_dsl_available",
"make_ptr",
"get_cutlass_dtype",
"get_num_sm",
]

if is_cute_dsl_available():
__all__ += [
# Utils (require cutlass)
"make_ptr",
"get_cutlass_dtype",
"get_num_sm",
# Blockscaled GEMM
"grouped_gemm_nt_masked",
"Sm100BlockScaledPersistentDenseGemmKernel",
Expand All @@ -56,4 +84,17 @@
# Add + RMSNorm + FP4 Quantization
"add_rmsnorm_fp4quant",
"AddRMSNormFP4QuantKernel",
# Norm kernels (CuTe DSL) - backwards-compatible re-exports
"RMSNormKernel",
"QKRMSNormKernel",
"RMSNormQuantKernel",
"FusedAddRMSNormKernel",
"FusedAddRMSNormQuantKernel",
"LayerNormKernel",
"rmsnorm_cute",
"qk_rmsnorm_cute",
"rmsnorm_quant_cute",
"fused_add_rmsnorm_cute",
"fused_add_rmsnorm_quant_cute",
"layernorm_cute",
]
155 changes: 125 additions & 30 deletions flashinfer/norm.py → flashinfer/norm/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,21 +12,53 @@
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

FlashInfer Normalization Kernels
================================

This package provides high-performance normalization kernels:

- RMSNorm: Root Mean Square Normalization
- LayerNorm: Layer Normalization
- Fused Add + RMSNorm: Combined residual add and RMSNorm
- Quantized variants with FP8/FP4 output
"""

import functools
import os
from typing import Optional

import torch

from .api_logging import flashinfer_api
from .jit.norm import gen_norm_module
from .utils import device_support_pdl, register_custom_op, register_fake_op
from ..api_logging import flashinfer_api
from ..utils import device_support_pdl, register_custom_op, register_fake_op

# Always import gen_norm_module for JIT warmup and CUDA fallback
from ..jit.norm import gen_norm_module

# Use CUDA JIT implementation instead of CuTe DSL (for debugging/fallback)
# Also fallback to CUDA JIT if nvidia-cutlass-dsl is not installed
_USE_CUDA_NORM = os.environ.get("FLASHINFER_USE_CUDA_NORM", "0") == "1"

@functools.cache
def get_norm_module():
return gen_norm_module().build_and_load()
if not _USE_CUDA_NORM:
try:
from .kernels import (
rmsnorm_cute,
qk_rmsnorm_cute,
rmsnorm_quant_cute,
fused_add_rmsnorm_cute,
fused_add_rmsnorm_quant_cute,
layernorm_cute,
)
except (ImportError, AttributeError):
# nvidia-cutlass-dsl not installed or incompatible version
_USE_CUDA_NORM = True

if _USE_CUDA_NORM:

@functools.cache
def get_norm_module():
return gen_norm_module().build_and_load()


@flashinfer_api
Expand Down Expand Up @@ -60,16 +92,14 @@ def rmsnorm(
output: torch.Tensor
Normalized tensor, 2D shape (batch_size, hidden_size) or 3D shape (batch_size, num_heads, hidden_size).
"""
if enable_pdl is None:
enable_pdl = device_support_pdl(input.device)
if out is None:
out = torch.empty_like(input)
_rmsnorm(out, input, weight, eps, enable_pdl)
_rmsnorm_impl(out, input, weight, eps, enable_pdl)
return out


@register_custom_op("flashinfer::rmsnorm", mutates_args=("out",))
def _rmsnorm(
def _rmsnorm_impl(
out: torch.Tensor,
input: torch.Tensor,
weight: torch.Tensor,
Expand All @@ -78,11 +108,21 @@ def _rmsnorm(
) -> None:
if enable_pdl is None:
enable_pdl = device_support_pdl(input.device)
get_norm_module().rmsnorm(out, input, weight, eps, enable_pdl)
if _USE_CUDA_NORM:
get_norm_module().rmsnorm(out, input, weight, eps, enable_pdl)
else:
if input.dim() == 3:
qk_rmsnorm_cute(
input, weight, out, eps, weight_bias=0.0, enable_pdl=enable_pdl
)
else:
rmsnorm_cute(
input, weight, out, eps, weight_bias=0.0, enable_pdl=enable_pdl
)


@register_fake_op("flashinfer::rmsnorm")
def _rmsnorm_fake(
def _rmsnorm_impl_fake(
out: torch.Tensor,
input: torch.Tensor,
weight: torch.Tensor,
Expand Down Expand Up @@ -129,7 +169,12 @@ def rmsnorm_quant(
"""
if enable_pdl is None:
enable_pdl = device_support_pdl(input.device)
get_norm_module().rmsnorm_quant(out, input, weight, scale, eps, enable_pdl)
if _USE_CUDA_NORM:
get_norm_module().rmsnorm_quant(out, input, weight, scale, eps, enable_pdl)
else:
rmsnorm_quant_cute(
out, input, weight, scale, eps, weight_bias=0.0, enable_pdl=enable_pdl
)


@register_fake_op("flashinfer::rmsnorm_quant")
Expand Down Expand Up @@ -177,7 +222,12 @@ def fused_add_rmsnorm(
"""
if enable_pdl is None:
enable_pdl = device_support_pdl(input.device)
get_norm_module().fused_add_rmsnorm(input, residual, weight, eps, enable_pdl)
if _USE_CUDA_NORM:
get_norm_module().fused_add_rmsnorm(input, residual, weight, eps, enable_pdl)
else:
fused_add_rmsnorm_cute(
input, residual, weight, eps, weight_bias=0.0, enable_pdl=enable_pdl
)


@register_fake_op("flashinfer::fused_add_rmsnorm")
Expand Down Expand Up @@ -232,9 +282,21 @@ def fused_add_rmsnorm_quant(
"""
if enable_pdl is None:
enable_pdl = device_support_pdl(input.device)
get_norm_module().fused_add_rmsnorm_quant(
out, input, residual, weight, scale, eps, enable_pdl
)
if _USE_CUDA_NORM:
get_norm_module().fused_add_rmsnorm_quant(
out, input, residual, weight, scale, eps, enable_pdl
)
else:
fused_add_rmsnorm_quant_cute(
out,
input,
residual,
weight,
scale,
eps,
weight_bias=0.0,
enable_pdl=enable_pdl,
)


@register_fake_op("flashinfer::fused_add_rmsnorm_quant")
Expand Down Expand Up @@ -281,16 +343,14 @@ def gemma_rmsnorm(
output: torch.Tensor
Gemma Normalized tensor, shape (batch_size, hidden_size).
"""
if enable_pdl is None:
enable_pdl = device_support_pdl(input.device)
if out is None:
out = torch.empty_like(input)
_gemma_rmsnorm(out, input, weight, eps, enable_pdl)
_gemma_rmsnorm_impl(out, input, weight, eps, enable_pdl)
return out


@register_custom_op("flashinfer::gemma_rmsnorm", mutates_args=("out",))
def _gemma_rmsnorm(
def _gemma_rmsnorm_impl(
out: torch.Tensor,
input: torch.Tensor,
weight: torch.Tensor,
Expand All @@ -299,11 +359,21 @@ def _gemma_rmsnorm(
) -> None:
if enable_pdl is None:
enable_pdl = device_support_pdl(input.device)
get_norm_module().gemma_rmsnorm(out, input, weight, eps, enable_pdl)
if _USE_CUDA_NORM:
get_norm_module().gemma_rmsnorm(out, input, weight, eps, enable_pdl)
else:
if input.dim() == 3:
qk_rmsnorm_cute(
input, weight, out, eps, weight_bias=1.0, enable_pdl=enable_pdl
)
else:
rmsnorm_cute(
input, weight, out, eps, weight_bias=1.0, enable_pdl=enable_pdl
)


@register_fake_op("flashinfer::gemma_rmsnorm")
def _gemma_rmsnorm_fake(
def _gemma_rmsnorm_impl_fake(
out: torch.Tensor,
input: torch.Tensor,
weight: torch.Tensor,
Expand Down Expand Up @@ -348,7 +418,14 @@ def gemma_fused_add_rmsnorm(
"""
if enable_pdl is None:
enable_pdl = device_support_pdl(input.device)
get_norm_module().gemma_fused_add_rmsnorm(input, residual, weight, eps, enable_pdl)
if _USE_CUDA_NORM:
get_norm_module().gemma_fused_add_rmsnorm(
input, residual, weight, eps, enable_pdl
)
else:
fused_add_rmsnorm_cute(
input, residual, weight, eps, weight_bias=1.0, enable_pdl=enable_pdl
)


@register_fake_op("flashinfer::gemma_fused_add_rmsnorm")
Expand Down Expand Up @@ -388,7 +465,10 @@ def layernorm(
Layer Normalized tensor, shape (batch_size, hidden_size). Same dtype as input.
"""
out = torch.empty_like(input)
get_norm_module().layernorm(out, input, gemma, beta, eps)
if _USE_CUDA_NORM:
get_norm_module().layernorm(out, input, gemma, beta, eps)
else:
layernorm_cute(out, input, gemma, beta, eps)
return out


Expand All @@ -404,10 +484,25 @@ def _layernorm_fake(


# CuTe-DSL fused RMSNorm + FP4 Quantization kernels
# These require CuTe-DSL to be available and SM100+ (Blackwell) GPUs
# These require SM100+ (Blackwell) GPUs and nvidia-cutlass-dsl
try:
from .cute_dsl import rmsnorm_fp4quant, add_rmsnorm_fp4quant
from ..cute_dsl import rmsnorm_fp4quant as rmsnorm_fp4quant
from ..cute_dsl import add_rmsnorm_fp4quant as add_rmsnorm_fp4quant
except ImportError:
# CuTe-DSL not available
rmsnorm_fp4quant = None # type: ignore[misc,assignment]
add_rmsnorm_fp4quant = None # type: ignore[misc,assignment]
# nvidia-cutlass-dsl not installed, these functions will not be available
pass


# Public API exports
__all__ = [
# JIT module generator (always available)
"gen_norm_module",
# Public APIs
"rmsnorm",
"rmsnorm_quant",
"fused_add_rmsnorm",
"fused_add_rmsnorm_quant",
"gemma_rmsnorm",
"gemma_fused_add_rmsnorm",
"layernorm",
]
Loading
Loading