microsoft
diff --git a/‎MANIFEST.in‎
Lines changed: 1 addition & 0 deletions b/‎MANIFEST.in‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎setup.py‎
Lines changed: 1 addition & 0 deletions b/‎setup.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎tutel/custom/antares_ops.h‎
Lines changed: 1 addition & 1 deletion b/‎tutel/custom/antares_ops.h‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎tutel/custom/custom_kernel.cpp‎
Lines changed: 540 additions & 84 deletions b/‎tutel/custom/custom_kernel.cpp‎
Lines changed: 540 additions & 84 deletions
diff --git a/‎tutel/examples/nccl_allreduce_perf.py‎
Lines changed: 32 additions & 0 deletions b/‎tutel/examples/nccl_allreduce_perf.py‎
Lines changed: 32 additions & 0 deletions
diff --git a/‎tutel/ops/__init__.py‎
Lines changed: 8 additions & 0 deletions b/‎tutel/ops/__init__.py‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎tutel/ops/deepseek_r1_sigmoid_top_k_f32.mod‎
75.5 KB b/‎tutel/ops/deepseek_r1_sigmoid_top_k_f32.mod‎
75.5 KB
diff --git a/‎tutel/ops/deepseek_r1_sigmoid_top_k_routed_scaled_f32.mod‎
75.6 KB b/‎tutel/ops/deepseek_r1_sigmoid_top_k_routed_scaled_f32.mod‎
75.6 KB
diff --git a/‎tutel/ops/fused_silu_mul_bf16.mod‎
7.73 KB b/‎tutel/ops/fused_silu_mul_bf16.mod‎
7.73 KB
diff --git a/‎tutel/ops/gemm_down_weight_sum_bf16xf8_s.mod‎
7.11 KB b/‎tutel/ops/gemm_down_weight_sum_bf16xf8_s.mod‎
7.11 KB
@@ -0,0 +1 @@
+include tutel/ops/*
@@ -113,6 +113,7 @@ def install(use_cuda, use_nccl):
         install_requires=[
             "numpy",
         ],
+        include_package_data=True,
         zip_safe=False,
         extras_require={
             'test': [
 
@@ -181,7 +181,7 @@ at::Tensor call(const void *key, const std::vector<at::Tensor> &ts, const std::v
       static std::unordered_map<std::string, decltype(torch::kInt8)> key_to_dtype = {
         {"int8", torch::kInt8}, {"int16", torch::kInt16}, {"int32", torch::kInt32}, {"int64", torch::kInt64},
         {"bfloat8", at::kFloat8_e5m2}, {"float8", at::kFloat8_e4m3fn}, {"bfloat16", torch::kBFloat16}, {"float16", torch::kFloat16}, {"float32", torch::kFloat32}, {"float64", torch::kFloat64},
-        {"bfloat2x16", at::kComplexHalf}, {"float2x16", at::kComplexHalf}, {"float2x32", at::kComplexFloat},
+        {"bfloat2x16", torch::kInt32}, {"float2x16", torch::kInt32}, {"float2x32", torch::kInt64},
       };
 
       auto dtype_it = key_to_dtype.find(o_type[1]);
 
@@ -0,0 +1,52 @@
+#!/usr/bin/env python3
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""Benchmark driver for `torch.ops.tutel_ops.test_allreduce_bf16`.
+
+Runs `--warmup` untimed iterations, then `--loop` timed iterations, and prints
+the average wall-clock time per timed iteration.
+"""
+
+import torch
+import time
+import argparse
+
+from tutel import system, net
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu')
+parser.add_argument('--count', type=int, default=229376)
+parser.add_argument('--loop', type=int, default=50)
+parser.add_argument('--warmup', type=int, default=5, help='Number of warmup iterations')
+args = parser.parse_args()
+
+# Use the NCCL backend only when the requested device is CUDA; otherwise use gloo.
+parallel_env = system.init_data_model_parallel(backend='nccl' if args.device == 'cuda' else 'gloo')
+local_device = parallel_env.local_device
+
+# NOTE(review): `x` is not referenced again in this script -- confirm whether it is still needed.
+x = torch.randn([args.count], device=local_device, dtype=torch.float32)
+
+# `wait()` returns a timestamp. On CUDA it synchronizes first (synchronize() returns
+# None, so `or` falls through to the clock) so queued GPU work is included in the timing.
+if args.device == 'cuda':
+  wait = lambda: torch.cuda.synchronize() or time.perf_counter()
+else:
+  wait = lambda: time.perf_counter()
+
+with torch.no_grad():
+  # Warmup phase: runs before the first timestamp, so it is excluded from the measurement.
+  for _ in range(args.warmup):
+    torch.ops.tutel_ops.test_allreduce_bf16(args.count)
+
+  # Timed phase: only these `--loop` iterations sit between the two timestamps.
+  t_start = wait()
+  for _ in range(args.loop):
+    torch.ops.tutel_ops.test_allreduce_bf16(args.count)
+  t_stop = wait()
+
+# Skip the report when nothing was timed (also avoids dividing by zero for `--loop 0`).
+if args.loop > 0:
+  print('test_allreduce_bf16: count = %d, loop = %d, avg = %.3f ms/iter' % (args.count, args.loop, (t_stop - t_start) * 1e3 / args.loop))
+
@@ -0,0 +1,8 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+import os
+import tutel_custom_kernel
+
+# Default OP_LOADER to '<directory containing this file>/.'; a value already set by the caller is kept.
+os.environ.setdefault('OP_LOADER', os.path.join(os.path.dirname(os.path.abspath(__file__)), '.'))