Update sgl-kernel UTs for activation/topk/norm/rope kernels#6452
Update sgl-kernel UTs for activation/topk/norm/rope kernels#6452zhyncs merged 13 commits intosgl-project:mainfrom
Conversation
mingfeima
left a comment
There was a problem hiding this comment.
merge test_grouped_topk.py and test_biased_grouped_topk.py into one file, named test_topk.py. We only focus on DS v2 and v3 right now; later on we need to add more types of topk kernels.
test/srt/cpu/test_rope.py
Outdated
| def _forward_ref(self, positions, query, key, cos_sin_cache, offsets=None): | ||
| self.rotary_dim = 64 | ||
| self.head_size = 64 | ||
| self.is_neox_style = False |
There was a problem hiding this comment.
do we support is_neox_style in the C++ kernels?
test/srt/cpu/test_activation.py
Outdated
| def test_activation(self): | ||
| self._run_single_test([128, 22016], torch.bfloat16, "cpu") | ||
| self._run_single_test([129, 22016], torch.float16, "cpu") |
There was a problem hiding this comment.
This test case won't take too much time, suggest to use itertools.product to cover more combinations.
test/srt/cpu/test_norm.py
Outdated
| def _run_single_test(self, shape, dtype, device="cuda"): | ||
|
|
||
| x = torch.randn(shape, dtype=dtype).to(device=device) | ||
| hidden_size = x.size(-1) | ||
| weight = torch.randn(hidden_size, dtype=dtype).to(device=device) |
There was a problem hiding this comment.
| def _run_single_test(self, shape, dtype, device="cuda"): | |
| x = torch.randn(shape, dtype=dtype).to(device=device) | |
| hidden_size = x.size(-1) | |
| weight = torch.randn(hidden_size, dtype=dtype).to(device=device) | |
| def _run_single_test(self, shape, dtype): | |
| x = torch.randn(shape, dtype=dtype) | |
| hidden_size = x.size(-1) | |
| weight = torch.randn(hidden_size, dtype=dtype) |
test/srt/cpu/test_norm.py
Outdated
| # TEST: fused_add_rmsnorm | ||
| # flashinfer writes x and residual inplaced |
There was a problem hiding this comment.
| # TEST: fused_add_rmsnorm | |
| # flashinfer writes x and residual inplaced |
test/srt/cpu/test_norm.py
Outdated
| residual = torch.randn(shape, dtype=dtype).to(device=device) | ||
| ref_residual = residual.clone() |
There was a problem hiding this comment.
| residual = torch.randn(shape, dtype=dtype).to(device=device) | |
| ref_residual = residual.clone() | |
| residual = torch.randn(shape, dtype=dtype) | |
| ref_residual = residual.clone() |
test/srt/cpu/test_norm.py
Outdated
| def test_norm(self): | ||
| self._run_single_test([4096, 4096], torch.bfloat16, "cpu") | ||
| self._run_single_test([1024, 4096], torch.bfloat16, "cpu") | ||
| self._run_single_test([1024, 4096 + 13], torch.float16, "cpu") |
There was a problem hiding this comment.
use itertools.product and remove 'cpu'
test/srt/cpu/test_rope.py
Outdated
| self.rotary_dim = 64 | ||
| self.head_size = 64 | ||
| self.is_neox_style = False |
There was a problem hiding this comment.
make these input parameters: rotary_dim, head_size and is_neox_style
|
|
||
|
|
||
| # This is used by the Deepseek-V2 model | ||
| class TestGroupedTopK(CustomTestCase): |
There was a problem hiding this comment.
can we import that native impl from https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/layers/moe/topk.py
test/srt/cpu/test_topk.py
Outdated
| def _biased_grouped_topk( | ||
| self, | ||
| hidden_states: torch.Tensor, | ||
| gating_output: torch.Tensor, | ||
| correction_bias: torch.Tensor, | ||
| topk: int, | ||
| renormalize: bool, | ||
| num_expert_group: int = 0, | ||
| topk_group: int = 0, | ||
| ): |
There was a problem hiding this comment.
same as above, can we import the native impl from https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/layers/moe/topk.py
test/srt/cpu/test_rope.py
Outdated
| def _forward_ref( | ||
| self, | ||
| positions, | ||
| query, | ||
| key, | ||
| cos_sin_cache, | ||
| rotary_dim, | ||
| head_size, | ||
| is_neox_style, | ||
| offsets=None, | ||
| ): | ||
| query_rot = query[..., :rotary_dim] | ||
| key_rot = key[..., :rotary_dim] | ||
| if rotary_dim < head_size: | ||
| query_pass = query[..., rotary_dim:] | ||
| key_pass = key[..., rotary_dim:] | ||
|
|
||
| cos_sin = cos_sin_cache[ | ||
| torch.add(positions, offsets) if offsets is not None else positions | ||
| ] | ||
| cos, sin = cos_sin.chunk(2, dim=-1) | ||
| if is_neox_style: | ||
| # shape [batch_size, seq_len]. | ||
| cos = cos.repeat(1, 1, 2).unsqueeze(-2) | ||
| sin = sin.repeat(1, 1, 2).unsqueeze(-2) |
There was a problem hiding this comment.
| # This is used by the Deepseek-V2 model | ||
| class TestGroupedTopK(CustomTestCase): | ||
| def _run_single_test(self, M, E, G, topk, topk_group, renormalize, dtype): | ||
|
|
There was a problem hiding this comment.
you may set a seed here, as topk uses an unstable sort.
test/srt/cpu/utils.py
Outdated
|
|
||
|
|
||
| def _rotate_neox(x: torch.Tensor) -> torch.Tensor: | ||
| x1 = x[..., : x.shape[-1] // 2] | ||
| x2 = x[..., x.shape[-1] // 2 :] | ||
| return torch.cat((-x2, x1), dim=-1) | ||
|
|
||
|
|
||
| def _rotate_gptj(x: torch.Tensor) -> torch.Tensor: | ||
| x1 = x[..., ::2] | ||
| x2 = x[..., 1::2] | ||
| x = torch.stack((-x2, x1), dim=-1) | ||
| return x.flatten(-2) |
test/srt/cpu/test_activation.py
Outdated
|
|
||
| import torch | ||
| import torch.nn.functional as F | ||
| from sgl_kernel.common_ops import silu_and_mul_cpu as silu_and_mul |
There was a problem hiding this comment.
There was a problem hiding this comment.
@zhyncs Thanks, has updated according to #6404 (comment).
|
|
|
Motivation
This PR is a follow-up on #2807 to add UTs for activation/topk/norm/rope kernels.
Modifications
Checklist