@@ -31,18 +31,35 @@ def benchmark_config(
     topk: int,
     dtype: torch.dtype,
     use_fp8: bool,
+    use_int8: bool,
     num_iters: int = 100,
 ) -> float:
     init_dtype = torch.float16 if use_fp8 else dtype
     x = torch.randn(num_tokens, hidden_size, dtype=dtype)
-    w1 = torch.randn(num_experts,
-                     shard_intermediate_size,
-                     hidden_size,
-                     dtype=init_dtype)
-    w2 = torch.randn(num_experts,
-                     hidden_size,
-                     shard_intermediate_size // 2,
-                     dtype=init_dtype)
+    if use_int8:
+        w1 = torch.randint(-127,
+                           127, (
+                               num_experts,
+                               shard_intermediate_size,
+                               hidden_size,
+                           ),
+                           dtype=torch.int8)
+        w2 = torch.randint(-127,
+                           127, (
+                               num_experts,
+                               hidden_size,
+                               shard_intermediate_size // 2,
+                           ),
+                           dtype=torch.int8)
+    else:
+        w1 = torch.randn(num_experts,
+                         shard_intermediate_size,
+                         hidden_size,
+                         dtype=init_dtype)
+        w2 = torch.randn(num_experts,
+                         hidden_size,
+                         shard_intermediate_size // 2,
+                         dtype=init_dtype)
     gating_output = torch.randn(num_iters,
                                 num_tokens,
                                 num_experts,
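The int8 branch above fills the expert weights with random values in [-127, 127]; the next hunk pairs them with float32 scales. For orientation only, here is a minimal sketch of the symmetric dequantization such a weight/scale pair stands for; the helper name and the choice of channel axis are illustrative assumptions, not part of this patch.

```python
import torch

def dequantize_int8(w_q: torch.Tensor, scale: torch.Tensor, dim: int) -> torch.Tensor:
    """Illustrative symmetric dequantization: w ~= w_q * scale, broadcast along `dim`."""
    shape = [1] * w_q.ndim
    shape[dim] = -1
    return w_q.to(torch.float32) * scale.reshape(shape)

# Shapes mirror w1 above (small placeholder sizes): one scale per output channel.
w_q = torch.randint(-127, 127, (8, 64, 32), dtype=torch.int8)  # (E, N, K)
scale = torch.rand(64, dtype=torch.float32)                    # hypothetical per-channel scale
w = dequantize_int8(w_q, scale, dim=1)
print(w.shape)  # torch.Size([8, 64, 32])
```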
@@ -52,6 +69,10 @@ def benchmark_config(
     w2_scale = None
     a1_scale = None
     a2_scale = None
+    if use_int8:
+        w1_scale = torch.randn((num_experts, 2 * shard_intermediate_size),
+                               dtype=torch.float32)
+        w2_scale = torch.randn((hidden_size, num_experts), dtype=torch.float32)
     if use_fp8:
         w1_scale = torch.randn(num_experts, dtype=torch.float32)
         w2_scale = torch.randn(num_experts, dtype=torch.float32)
@@ -77,6 +98,7 @@ def run():
             inplace=True,
             override_config=config,
             use_fp8=use_fp8,
+            use_int8=use_int8,
             w1_scale=w1_scale,
             w2_scale=w2_scale,
             a1_scale=a1_scale,
@@ -156,10 +178,11 @@ def benchmark(
         topk: int,
         dtype: torch.dtype,
         use_fp8: bool,
+        use_int8: bool,
     ) -> Tuple[Dict[str, int], float]:
         torch.cuda.manual_seed_all(self.seed)

-        dtype_str = "float8" if use_fp8 else None
+        dtype_str = "float8" if use_fp8 else ("int8" if use_int8 else None)
         # NOTE(woosuk): The current naming convention uses w2.shape[2], which
         # is the intermediate size after silu_and_mul.
         op_config = get_moe_configs(num_experts, shard_intermediate_size // 2,
@@ -173,7 +196,7 @@ def benchmark(
                                key=lambda x: abs(x - num_tokens))]
         kernel_time = benchmark_config(config, num_tokens, num_experts,
                                        shard_intermediate_size, hidden_size,
-                                       topk, dtype, use_fp8)
+                                       topk, dtype, use_fp8, use_int8)
         return config, kernel_time

     def tune(
@@ -185,8 +208,9 @@ def tune(
         topk: int,
         dtype: torch.dtype,
         use_fp8: bool,
-        search_space: List[BenchmarkConfig],
-    ) -> BenchmarkConfig:
+        use_int8: bool,
+        search_space: List[Dict[str, int]],
+    ) -> Dict[str, int]:
         best_config = None
         best_time = float("inf")
         for config in tqdm(search_space):
@@ -199,6 +223,7 @@ def tune(
                                                topk,
                                                dtype,
                                                use_fp8,
+                                               use_int8,
                                                num_iters=10)
             except triton.runtime.autotuner.OutOfResources:
                 # Some configurations may be invalid and fail to compile.
@@ -224,20 +249,15 @@ def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:
     }


-def save_configs(
-    configs: Dict[int, BenchmarkConfig],
-    num_experts: int,
-    shard_intermediate_size: int,
-    hidden_size: int,
-    topk: int,
-    dtype: torch.dtype,
-    use_fp8: bool,
-) -> None:
-    dtype_str = "float8" if use_fp8 else None
+def save_configs(configs: Dict[int, BenchmarkConfig], num_experts: int,
+                 shard_intermediate_size: int, hidden_size: int, topk: int,
+                 dtype: torch.dtype, use_fp8: bool, use_int8: bool) -> None:
+    dtype_str = "float8" if use_fp8 else "int8" if use_int8 else None
     # NOTE(woosuk): The current naming convention uses w2.shape[2], which
     # is the intermediate size after silu_and_mul.
     filename = get_config_file_name(num_experts, shard_intermediate_size // 2,
                                     dtype_str)
+
     print(f"Writing best config to {filename}...")
     with open(filename, "w") as f:
         json.dump(configs, f, indent=4)
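For orientation, the filename written by save_configs comes from get_config_file_name(num_experts, shard_intermediate_size // 2, dtype_str), which can now receive "int8" as the dtype string. The snippet below shows a call with Mixtral-8x7B-style numbers; the import path and the exact on-disk filename format (which also encodes the GPU name) are assumptions based on vLLM's fused_moe utilities and are not shown in this diff.

```python
# Import path assumed; in this script get_config_file_name is already in scope.
from vllm.model_executor.layers.fused_moe.fused_moe import get_config_file_name

# Mixtral-8x7B with tp_size=2: E=8, shard_intermediate_size = 2 * 14336 // 2 = 14336.
filename = get_config_file_name(8, 14336 // 2, "int8")
print(filename)  # e.g. "E=8,N=7168,device_name=...,dtype=int8.json" (format assumed)
```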
@@ -253,6 +273,11 @@ def main(args: argparse.Namespace):
         topk = config.ffn_config.moe_top_k
         intermediate_size = config.ffn_config.ffn_hidden_size
         shard_intermediate_size = 2 * intermediate_size // args.tp_size
+    elif config.architectures[0] == "JambaForCausalLM":
+        E = config.num_experts
+        topk = config.num_experts_per_tok
+        intermediate_size = config.intermediate_size
+        shard_intermediate_size = 2 * intermediate_size // args.tp_size
     else:
         # Default: Mixtral.
         E = config.num_local_experts
@@ -263,6 +288,7 @@ def main(args: argparse.Namespace):
     hidden_size = config.hidden_size
     dtype = config.torch_dtype
     use_fp8 = args.dtype == "fp8"
+    use_int8 = args.dtype == "int8"

     if args.batch_size is None:
         batch_sizes = [
@@ -294,20 +320,20 @@ def _distribute(method: str, inputs: List[Any]) -> List[Any]:
         start = time.time()
         configs = _distribute(
             "tune", [(batch_size, E, shard_intermediate_size, hidden_size,
-                      topk, dtype, use_fp8, search_space)
+                      topk, dtype, use_fp8, use_int8, search_space)
                      for batch_size in batch_sizes])
         best_configs = {
             M: sort_config(config)
             for M, config in zip(batch_sizes, configs)
         }
         save_configs(best_configs, E, shard_intermediate_size, hidden_size,
-                     topk, dtype, use_fp8)
+                     topk, dtype, use_fp8, use_int8)
         end = time.time()
         print(f"Tuning took {end - start:.2f} seconds")
     else:
         outputs = _distribute("benchmark",
                               [(batch_size, E, shard_intermediate_size,
-                                hidden_size, topk, dtype, use_fp8)
+                                hidden_size, topk, dtype, use_fp8, use_int8)
                                for batch_size in batch_sizes])

         for batch_size, (config, kernel_time) in zip(batch_sizes, outputs):
@@ -323,7 +349,7 @@ def _distribute(method: str, inputs: List[Any]) -> List[Any]:
     parser.add_argument("--tp-size", "-tp", type=int, default=2)
     parser.add_argument("--dtype",
                         type=str,
-                        choices=["auto", "fp8"],
+                        choices=["auto", "fp8", "int8"],
                         default="auto")
     parser.add_argument("--seed", type=int, default=0)
     parser.add_argument("--batch-size", type=int, required=False)
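With "int8" registered as a --dtype choice, the new path is selected from the command line the same way as fp8 (for example by passing --dtype int8 alongside --tp-size). A minimal sketch of how the flag maps onto the two booleans used throughout the script, using only argument names that appear in this diff:

```python
import argparse

# Illustrative only: reproduces the flag-to-boolean mapping added in this diff.
parser = argparse.ArgumentParser()
parser.add_argument("--dtype", type=str, choices=["auto", "fp8", "int8"], default="auto")
args = parser.parse_args(["--dtype", "int8"])

use_fp8 = args.dtype == "fp8"    # False
use_int8 = args.dtype == "int8"  # True
```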