
Commit a27a9f7

fix tests

Signed-off-by: gnovack <[email protected]>
1 parent ec1cf16 · commit a27a9f7

File tree

9 files changed (+10, -27 lines)

csrc/moe/moe_lora_align_sum_kernels.cu
Lines changed: 4 additions & 5 deletions

@@ -28,8 +28,8 @@ __global__ void moe_lora_align_sum_kernel(
     int64_t block_size, int num_experts, int max_loras, size_t numel,
     int max_num_tokens_padded, int max_num_m_blocks,
     int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
-    int topk_num, int32_t* total_tokens_post_pad, int32_t* num_tokens_per_lora,
-    int32_t* adapter_enabled, int32_t* lora_ids) {
+    int topk_num, int32_t* total_tokens_post_pad, int32_t* adapter_enabled,
+    int32_t* lora_ids) {
   const size_t tokens_per_thread = div_ceil(numel, blockDim.x);
   const size_t start_idx = threadIdx.x * tokens_per_thread;
 

@@ -131,8 +131,8 @@ void moe_lora_align_block_size(
     int64_t num_experts, int64_t block_size, int64_t max_loras,
     int64_t max_num_tokens_padded, int64_t max_num_m_blocks,
     torch::Tensor sorted_token_ids, torch::Tensor expert_ids,
-    torch::Tensor num_tokens_post_pad, torch::Tensor num_tokens_per_lora,
-    torch::Tensor adapter_enabled, torch::Tensor lora_ids) {
+    torch::Tensor num_tokens_post_pad, torch::Tensor adapter_enabled,
+    torch::Tensor lora_ids) {
   const int topk_num = topk_ids.size(1);
 
   TORCH_CHECK(block_size > 0, "block_size should be greater than 0. ");

@@ -169,7 +169,6 @@ void moe_lora_align_block_size(
         max_num_m_blocks, sorted_token_ids.data_ptr<int32_t>(),
         expert_ids.data_ptr<int32_t>(), topk_num,
         num_tokens_post_pad.data_ptr<int32_t>(),
-        num_tokens_per_lora.data_ptr<int32_t>(),
         adapter_enabled.data_ptr<int32_t>(), lora_ids.data_ptr<int32_t>());
   });
 }
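For orientation, the padding this kernel performs follows the usual moe_align_block_size pattern: the (token, expert) assignments in topk_ids are grouped per expert and each group's length is rounded up to a multiple of block_size so the grouped GEMM only ever sees whole blocks. Below is a minimal pure-Python sketch of that padding rule, using only argument names visible in the hunks above; the helper pad_tokens_per_expert is hypothetical and not part of this commit.

import torch

def pad_tokens_per_expert(topk_ids: torch.Tensor, num_experts: int,
                          block_size: int) -> torch.Tensor:
    # Count how many (token, expert) assignments each expert receives.
    counts = torch.bincount(topk_ids.flatten(), minlength=num_experts)
    # Round each expert's count up to a multiple of block_size; this is the
    # same ceil-division idiom as div_ceil(numel, blockDim.x) above, applied
    # to per-expert segment lengths instead of thread work.
    return (counts + block_size - 1) // block_size * block_size

topk_ids = torch.tensor([[0, 2], [1, 2], [0, 0]])  # 3 tokens, top-2 routing
print(pad_tokens_per_expert(topk_ids, num_experts=4, block_size=16))
# tensor([16, 16, 16,  0])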

csrc/moe/moe_ops.h
Lines changed: 2 additions & 2 deletions

@@ -25,8 +25,8 @@ void moe_lora_align_block_size(
     int64_t num_experts, int64_t block_size, int64_t max_loras,
     int64_t max_num_tokens_padded, int64_t max_num_m_blocks,
     torch::Tensor sorted_token_ids, torch::Tensor expert_ids,
-    torch::Tensor num_tokens_post_pad, torch::Tensor num_tokens_per_lora,
-    torch::Tensor adapter_enabled, torch::Tensor lora_ids);
+    torch::Tensor num_tokens_post_pad, torch::Tensor adapter_enabled,
+    torch::Tensor lora_ids);
 #ifndef USE_ROCM
 torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
                              torch::Tensor b_qweight, torch::Tensor b_scales,

csrc/moe/torch_bindings.cpp
Lines changed: 0 additions & 1 deletion

@@ -45,7 +45,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
       " Tensor !sorted_token_ids,"
       " Tensor !experts_ids,"
       " Tensor !num_tokens_post_pad,"
-      " Tensor !num_tokens_per_lora,"
       " Tensor !adapter_enabled,"
       " Tensor !lora_ids) -> () ");
   m.impl("moe_lora_align_block_size", torch::kCUDA, &moe_lora_align_block_size);

tests/lora/test_fused_moe_lora_kernel.py
Lines changed: 2 additions & 3 deletions

@@ -134,9 +134,8 @@ def use_fused_moe_lora_kernel(
     )
     expert_ids = torch.empty((max_loras * max_num_m_blocks,), dtype=torch.int32)
     num_tokens_post_padded = torch.empty((max_loras,), dtype=torch.int32)
-    num_tokens_per_lora = torch.ones(max_loras + 1, dtype=torch.int32)
     adapter_enabled = torch.ones(max_loras + 1, dtype=torch.int32)
-    lora_ids = torch.arange(1, max_loras + 1, dtype=torch.int32)
+    lora_ids = torch.arange(max_loras + 2, dtype=torch.int32)
 
     # call kernel
     ops.moe_lora_align_block_size(

@@ -150,8 +149,8 @@ def use_fused_moe_lora_kernel(
         sorted_token_ids,
         expert_ids,
         num_tokens_post_padded,
-        num_tokens_per_lora,
         adapter_enabled,
+        lora_ids,
     )
 
     config = {
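For a concrete picture of the two buffers this hunk touches, here is a small standalone sketch with an illustrative max_loras (the test itself parametrizes this value):

import torch

max_loras = 2  # illustrative; the test parametrizes this

adapter_enabled = torch.ones(max_loras + 1, dtype=torch.int32)
lora_ids = torch.arange(max_loras + 2, dtype=torch.int32)

print(adapter_enabled)  # tensor([1, 1, 1], dtype=torch.int32)
print(lora_ids)         # tensor([0, 1, 2, 3], dtype=torch.int32)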

tests/lora/test_moe_lora_align_sum.py
Lines changed: 2 additions & 2 deletions

@@ -60,8 +60,8 @@ def test_moe_lora_align_block_size(
         (max_loras * max_num_m_blocks,), num_experts, dtype=torch.int32, device="cuda"
     )
     num_tokens_post_pad = torch.zeros((max_loras,), dtype=torch.int32, device="cuda")
-    num_tokens_per_lora = torch.ones((max_loras + 1,), dtype=torch.int32, device="cuda")
     adapter_enabled = torch.ones((max_loras + 1,), dtype=torch.int32, device="cuda")
+    lora_ids = torch.arange(max_loras + 2, dtype=torch.int32, device="cuda")
 
     # call kernel
     ops.moe_lora_align_block_size(

@@ -75,8 +75,8 @@ def test_moe_lora_align_block_size(
         sorted_token_ids,
         expert_ids,
         num_tokens_post_pad,
-        num_tokens_per_lora,
         adapter_enabled,
+        lora_ids,
     )
 
     # verify values

tests/lora/test_olmoe_tp.py
Lines changed: 0 additions & 4 deletions

@@ -60,7 +60,6 @@ def generate_and_test(
         for i in lora_id
     ]
 
-    print(f"Sending lora req {lora_request}")
     sampling_params = vllm.SamplingParams(temperature=0, max_tokens=64)
     outputs = llm.generate(prompts, sampling_params, lora_request=lora_request)
     # Print the outputs.

@@ -78,7 +77,6 @@ def generate_and_test(
             if req_lora_id is not None
             else EXPECTED_BASE_MODEL_OUTPUT[i]
         )
-
         assert generated_texts[i].startswith(expected_output)
 
 

@@ -100,8 +98,6 @@ def test_olmoe_lora(olmoe_lora_files):
 
 
 def test_olmoe_lora_base_model(olmoe_lora_files):
-    # We enable enforce_eager=True here to reduce VRAM usage for lora-test CI,
-    # Otherwise, the lora-test will fail due to CUDA OOM.
     llm = vllm.LLM(
         MODEL_PATH,
         max_model_len=1024,

vllm/_custom_ops.py
Lines changed: 0 additions & 2 deletions

@@ -1806,7 +1806,6 @@ def moe_lora_align_block_size(
     sorted_token_ids: torch.Tensor,
     experts_ids: torch.Tensor,
     num_tokens_post_pad: torch.Tensor,
-    num_tokens_per_lora: torch.Tensor,
     adapter_enabled: torch.Tensor,
     lora_ids: torch.Tensor,
 ) -> None:

@@ -1821,7 +1820,6 @@ def moe_lora_align_block_size(
         sorted_token_ids,
         experts_ids,
         num_tokens_post_pad,
-        num_tokens_per_lora,
         adapter_enabled,
         lora_ids,
     )

vllm/lora/layers/fused_moe.py
Lines changed: 0 additions & 6 deletions

@@ -108,11 +108,6 @@ def wrapper(*args, **kwargs):
                 block_shape=layer.quant_method.moe_quant_config.block_shape,
             )
 
-            (_, _, num_tokens_per_lora, _, _, _) = (
-                self.punica_wrapper.token_mapping_meta.meta_args(
-                    hidden_states.size(0)
-                )
-            )
             max_loras = self.w1_lora_a_stacked.shape[0]
             config = get_config_func(M)
             (

@@ -125,7 +120,6 @@ def wrapper(*args, **kwargs):
                 config["BLOCK_SIZE_M"],
                 global_num_experts,
                 max_loras,
-                num_tokens_per_lora,
                 self.adapter_enabled,
                 expert_map,
             )

vllm/lora/punica_wrapper/punica_gpu.py
Lines changed: 0 additions & 2 deletions

@@ -305,7 +305,6 @@ def moe_lora_align_block_size(
         block_size: int,
         num_experts: int,
         max_loras: int,
-        num_tokens_per_lora: torch.Tensor,
         adapter_enabled: torch.Tensor,
         expert_map: torch.Tensor | None = None,
         pad_sorted_ids: bool = False,

@@ -348,7 +347,6 @@ def moe_lora_align_block_size(
             sorted_ids,
             expert_ids,
             num_tokens_post_pad,
-            num_tokens_per_lora,
             adapter_enabled,
             lora_ids,
         )
