
Commit 2b29c3d

[triton_kernels] decouple split-k reduction from inter-expert reductions in matmul (#8483)
1 parent 3c2e6f8 commit 2b29c3d

11 files changed, +262 −398 lines


python/triton_kernels/bench/distributed.py

Lines changed: 2 additions & 1 deletion
@@ -277,7 +277,8 @@ def distributed_run(rank, world_size, batch, dim1, dim2, n_expts_tot, n_expts_ac
 
     # precision configs
     pcg = PrecisionConfig(flex_ctx=FlexCtx(rhs_data=wg_flex), weight_scale=wg_scale)
-    act = FusedActivation(FnSpecs("swiglu", triton_kernels.swiglu.swiglu_fn, ("alpha", "limit")), (1.0, 1.0), 2)
+    act = FusedActivation(FnSpecs("swiglu", triton_kernels.swiglu.swiglu_fn, ("alpha", "limit"), reduction_n=2),
+                          (1.0, 1.0))
     pc1 = PrecisionConfig(flex_ctx=FlexCtx(rhs_data=w1_flex), weight_scale=w1_scale)
     pc2 = PrecisionConfig(flex_ctx=FlexCtx(rhs_data=w2_flex), weight_scale=w2_scale)
     if rank == 0:
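For reference, the hunk above reflects the API change made throughout this commit: the output-reduction factor moves out of the FusedActivation constructor and into FnSpecs. A minimal before/after sketch, using only names that appear in this diff:

    # before: reduction factor passed as a trailing positional argument to FusedActivation
    act = FusedActivation(
        FnSpecs("swiglu", triton_kernels.swiglu.swiglu_fn, ("alpha", "limit")),
        (1.0, 1.0),  # fn_args: (alpha, limit)
        2,           # reduction factor
    )

    # after: the factor is carried by the function spec itself
    act = FusedActivation(
        FnSpecs("swiglu", triton_kernels.swiglu.swiglu_fn, ("alpha", "limit"), reduction_n=2),
        (1.0, 1.0),  # fn_args: (alpha, limit)
    )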

python/triton_kernels/tests/test_matmul.py

Lines changed: 4 additions & 4 deletions
@@ -130,8 +130,8 @@ def init_precision(out_dtype, act_use_flexpoint, weight_dtype, weight_mxfp, mode
         ) if weight_use_flexpoint else InFlexData(),
         out_data=OutFlexData(
             dtype=out_dtype,
-            expected_scale=make(4.00, 5.00, mode == "batched" or expt_is_inner),
-            actual_scale=make(0, 0, mode == "batched" or expt_is_inner),
+            expected_scale=make_scalar(4.00),
+            actual_scale=make_scalar(0),
             checksum_scale=None,
         ) if act_use_flexpoint else OutFlexData(),
     )
@@ -776,8 +776,8 @@ def test_fused_act(m, n, k, mode, split_k, do_gather, do_scatter, fused_scatter,
                               precision_config=SwiGLUPrecisionConfig(swiglu_limit))
        b = matmul_ogs(
            x, w, bias, rdata, gindx, sindx, precision_opt,
-            fused_activation=FusedActivation(FnSpecs("swiglu", swiglu_fn, ("alpha", "limit")),
-                                             (swiglu_alpha, swiglu_limit), 2))
+            fused_activation=FusedActivation(FnSpecs("swiglu", swiglu_fn, ("alpha", "limit"), reduction_n=2),
+                                             (swiglu_alpha, swiglu_limit)))
    except opt_flags.InapplicableConstraint:
        pytest.skip("inapplicable constraint")
 
python/triton_kernels/tests/test_reduce.py

Lines changed: 9 additions & 6 deletions
@@ -5,6 +5,7 @@
 from triton_kernels.numerics_details.mxfp import upcast_from_mxfp_torch, downcast_to_mxfp_torch
 from triton_kernels.numerics import InFlexData, OutFlexData
 import triton
+import triton.language as tl
 
 
 def init_mask(mask_mode, B, M, N, device):
@@ -30,8 +31,9 @@ def dtype_str_to_torch(dtype_str: str) -> torch.dtype:
 
 
 @triton.jit
-def plus_a(x, a):
-    return x + a
+def plus_a_reduce(x, a):
+    y = x + a
+    return tl.sum(y.reshape([x.shape[0], x.shape[1] // 2, 2]), axis=2)
 
 
 @pytest.mark.parametrize("B, M, N, postprocess_fn", [
@@ -84,14 +86,15 @@ def test_op(B, M, N, dtype_str, dim, mask_mode, postprocess_fn):
         reduce(x, dim=dim, mask=mask, x_mxscale=x_mscale)
         return
     if postprocess_fn == "plus_ten":
-        postprocess_fn_tri = PostprocessFn(specs=FnSpecs("plus_a", plus_a, ("a", )), fn_args=(10, ))
-        postprocess_fn_ref = lambda x: x + 10
+        postprocess_fn_tri = PostprocessFn(specs=FnSpecs("plus_a", plus_a_reduce, ("a", ), reduction_n=2),
+                                           fn_args=(10, ))
+        postprocess_fn_ref = lambda x: (x + 10).reshape([x.shape[0], x.shape[1] // 2, 2]).sum(dim=2)
     else:
         postprocess_fn_tri = postprocess_fn_ref = None
     y_tri, y_tri_mxscale = reduce(x, dim=dim, mask=mask, x_mxscale=x_mscale, x_flex=x_flex, y_flex=y_flex_tri,
-                                  postprocess_fn=postprocess_fn_tri)
+                                  postprocess_fn1=postprocess_fn_tri)
     y_ref, y_ref_mxscale = reduce_torch(x, dim=dim, mask=mask, x_mxscale=x_mscale, x_flex=x_flex, y_flex=y_flex_ref,
-                                        postprocess_fn=postprocess_fn_ref)
+                                        postprocess_fn1=postprocess_fn_ref)
     if is_mx:
         y_ref = upcast_from_mxfp_torch(y_ref, y_ref_mxscale, torch.float16, axis=-1)
         y_tri = upcast_from_mxfp_torch(y_tri, y_tri_mxscale, torch.float16, axis=-1)
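The reference lambda above spells out what reduction_n=2 means for the postprocess hook: the elementwise function is applied first, then adjacent pairs along the last dimension are summed, halving N. A standalone PyTorch sketch of that reference behaviour, assuming a 2D input as in the test's non-batched case:

    import torch

    def plus_a_reduce_ref(x: torch.Tensor, a: float) -> torch.Tensor:
        # elementwise op followed by a pairwise sum over the last dimension,
        # mirroring postprocess_fn_ref in the test above
        y = x + a
        return y.reshape([y.shape[0], y.shape[1] // 2, 2]).sum(dim=2)

    x = torch.arange(8, dtype=torch.float32).reshape(2, 4)
    print(plus_a_reduce_ref(x, 10.0).shape)  # torch.Size([2, 2])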

python/triton_kernels/triton_kernels/matmul_ogs.py

Lines changed: 97 additions & 185 deletions
Large diffs are not rendered by default.

python/triton_kernels/triton_kernels/matmul_ogs_details/_common.py

Lines changed: 1 addition & 7 deletions
@@ -238,18 +238,12 @@ def matmul_launch_metadata(grid, kernel, args):
     fM = M if M is not None else n_tokens
     ret[f"flops{nbits}"] = 2.0 * fM * N * K * (1 if expt_is_inner else batch_size)
 
-    dst = args.get("GatherDstIndx", None)
     # sindx = args.get("WriteBackIndx", None)
     n_x_bytes = X.numel() * X.element_size()
     n_y_bytes = Y.numel() * Y.element_size()
     if hist is not None:
         assert n_tokens is not None
-        n_expts_act = args["N_EXPTS_ACT"]
-
-        if (dst is not None) and launch_metadata_allow_sync():
-            n_read_rows = (dst.view((-1, n_expts_act)) != -1).any(dim=1).sum()
-        else:
-            n_read_rows = n_tokens
+        n_read_rows = n_tokens
 
         if expt_is_inner:
             n_x_bytes = n_read_rows * X.shape[-2] * X.element_size()

python/triton_kernels/triton_kernels/matmul_ogs_details/_matmul_ogs.py

Lines changed: 5 additions & 4 deletions
@@ -71,7 +71,7 @@ def _matmul_ogs(
             # epilogue transform
             EPILOGUE_FN: tl.constexpr, epilogue_fn_args,
             # MoE config
-            N_EXPTS_TOT: tl.constexpr, N_EXPTS_ACT: tl.constexpr,
+            N_EXPTS_TOT: tl.constexpr,
             # precision config
             MAX_NUM_IMPRECISE_ACC: tl.constexpr, ALLOW_TF32: tl.constexpr,
             FLEXPOINT_SATURATE_INF: tl.constexpr,
@@ -81,6 +81,7 @@ def _matmul_ogs(
             # optimization config
             BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
             GROUP_M: tl.constexpr, XCD_SWIZZLE: tl.constexpr,
+            INIT_OUTPUT_TO_ZERO: tl.constexpr,
             # One of ["HOPPER", "BLACKWELL", None]
             SWIZZLE_MX_VALUE: tl.constexpr,
             # One of ["HOPPER", "BLACKWELL", None]
@@ -198,7 +199,7 @@ def _matmul_ogs(
     # We are tiling Y here, so the tiling is independent of matmul (where we
     # tile X & W and scatter to different rows of Y).
     # TODO: refactor (same code in _p_matmul_ogs)
-    if HAS_FUSED_SCATTER and N_EXPTS_ACT == 1:
+    if HAS_FUSED_SCATTER and INIT_OUTPUT_TO_ZERO:
         tl.device_assert(batch_size == 1)
     pid_mnk = pid
     if XCD_SWIZZLE != 1:
@@ -241,7 +242,7 @@ def _matmul_ogs(
         else:
             GatherIndx += start_m
             # no needs to bounds-check here because `offs_x_m` wraps around M dim
-            offs_x_m = tl.load(GatherIndx + offs_x_m) // N_EXPTS_ACT
+            offs_x_m = tl.load(GatherIndx + offs_x_m)
         offs_k = off_k_x + tl.arange(0, BLOCK_K)
         XPtrs = X + offs_x_m.to(index_type)[:, None] * stride_x_m + offs_k.to(index_type)[None, :] * stride_x_k
 
@@ -455,7 +456,7 @@ def _matmul_ogs(
                 YActualScale += start_m * stride_y_mx_m
                 YActualScalePtrs = YActualScale + offs_y_m.to(index_type)[:, None] * stride_y_mx_m + offs_y_n_scale.to(index_type)[None, :] * stride_y_mx_n
             else:
-                YActualScalePtrs = YActualScale + (offs_y_m - num_idxs // N_EXPTS_ACT).to(index_type)[:, None] * stride_y_mx_m + offs_y_n_scale.to(index_type)[None, :] * stride_y_mx_n
+                YActualScalePtrs = YActualScale + offs_y_m.to(index_type)[:, None] * stride_y_mx_m + offs_y_n_scale.to(index_type)[None, :] * stride_y_mx_n
             tl.store(YActualScalePtrs, out_scale, mask=mask_m[:, None] & mask_n_scale[None, :])
         else:
             if PER_BATCH_OUT_SCALE:
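A note on the gather-index change above: the kernel previously divided each loaded GatherIndx entry by N_EXPTS_ACT to recover the source row, i.e. the indices encoded one entry per (token, expert-slot) pair; after this commit the kernel consumes the entries as row indices directly, so any such division has to happen before the kernel is launched. A purely illustrative, hypothetical sketch of that convention shift (toy values, not code from this commit):

    import torch

    # hypothetical example: 3 tokens, 2 active experts per token (n_expts_act = 2)
    n_tokens, n_expts_act = 3, 2
    slot_indices = torch.arange(n_tokens * n_expts_act)  # one entry per (token, slot)
    # old convention: kernel recovered rows via `slot_indices // n_expts_act` at load time
    # new convention: row indices are materialized up front and loaded as-is
    row_indices = slot_indices // n_expts_act
    print(row_indices.tolist())  # [0, 0, 1, 1, 2, 2]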

python/triton_kernels/triton_kernels/matmul_ogs_details/_p_matmul_ogs.py

Lines changed: 5 additions & 5 deletions
@@ -80,7 +80,7 @@ def _p_matmul_ogs(
             # epilogue transform
             EPILOGUE_FN: tl.constexpr, epilogue_fn_args,
             # MoE config
-            N_EXPTS_TOT: tl.constexpr, N_EXPTS_ACT: tl.constexpr,
+            N_EXPTS_TOT: tl.constexpr,
             # precision config
             MAX_NUM_IMPRECISE_ACC: tl.constexpr, ALLOW_TF32: tl.constexpr,
             FLEXPOINT_SATURATE_INF: tl.constexpr,
@@ -90,6 +90,7 @@ def _p_matmul_ogs(
             # optimization config
             BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
             GROUP_M: tl.constexpr, XCD_SWIZZLE: tl.constexpr,
+            INIT_OUTPUT_TO_ZERO: tl.constexpr,
             # NYI: Must be None
             SWIZZLE_MX_VALUE: tl.constexpr,
             # One of ["BLACKWELL", None]
@@ -172,7 +173,7 @@ def _p_matmul_ogs(
     yN = N // ACTIVATION_REDUCTION_N
 
     # set masked out rows to 0
-    if HAS_SCATTER and N_EXPTS_ACT == 1:
+    if HAS_SCATTER and INIT_OUTPUT_TO_ZERO:
         # Iterate with reversed pids so that later pids will get more tiles if the number of
         # tiles isn't evenly divisible by the number of SMs.
         # The main loop after this iterates in the forward direction such that earlier
@@ -233,15 +234,14 @@ def _p_matmul_ogs(
             offs_x_m += start_z * (stride_x_z // stride_x_m)
             offs_x_m = tl.where(mask_m, offs_x_m, -1)
         else:
-            offs_x_m = tl.load(GatherIndx + start_m.to(index_type) + offs_m,
-                               mask=mask_m, other=-N_EXPTS_ACT) // N_EXPTS_ACT
+            offs_x_m = tl.load(GatherIndx + start_m.to(index_type) + offs_m, mask=mask_m, other=-1)
     elif X_TMA_MODE is None or is_x_microscaled:
         offs_m = off_m + tl.arange(0, BLOCK_M)
         offs_m = tl.max_contiguous(tl.multiple_of(offs_m % eM, BLOCK_M), BLOCK_M)
         # no needs to bounds-check here because `offs_m` wraps around M dim
         if GatherIndx is not None:
             tl.static_assert(HAS_GATHER)
-            offs_m = tl.load(GatherIndx + start_m.to(index_type) + offs_m) // N_EXPTS_ACT
+            offs_m = tl.load(GatherIndx + start_m.to(index_type) + offs_m)
         offs_x_m = offs_m.to(index_type)[:, None] * stride_x_m
 
     if is_x_microscaled:

python/triton_kernels/triton_kernels/matmul_ogs_details/_reduce_grouped.py

Lines changed: 0 additions & 102 deletions
This file was deleted.

python/triton_kernels/triton_kernels/matmul_ogs_details/opt_flags.py

Lines changed: 0 additions & 4 deletions
@@ -28,10 +28,6 @@ class OptFlags:
     arch: str
     target_kernel_kwargs: dict
 
-    def __post_init__(self):
-        if self.fused_scatter and self.split_k != 1:
-            raise ValueError("Not supported")
-
 
 def max_allowable_mn(
     max_mn: int,
