 import pytest_asyncio
 
 from vllm.config.multimodal import MultiModalConfig
-from vllm.engine.multiprocessing.client import MQLLMEngineClient
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_models import (BaseModelPath,
                                                     OpenAIServingModels)
 from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.v1.engine.async_llm import AsyncLLM
 
 from ...utils import RemoteOpenAIServer
 
@@ -276,7 +276,7 @@ def test_async_serving_chat_init():
 
 @pytest.mark.asyncio
 async def test_serving_chat_returns_correct_model_name():
-    mock_engine = MagicMock(spec=MQLLMEngineClient)
+    mock_engine = MagicMock(spec=AsyncLLM)
     mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
     mock_engine.errored = False
 
@@ -312,7 +312,7 @@ async def return_model_name(*args):
 
 @pytest.mark.asyncio
 async def test_serving_chat_should_set_correct_max_tokens():
-    mock_engine = MagicMock(spec=MQLLMEngineClient)
+    mock_engine = MagicMock(spec=AsyncLLM)
     mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
     mock_engine.errored = False
 
@@ -355,7 +355,7 @@ async def test_serving_chat_should_set_correct_max_tokens():
     }
 
     # Reinitialize the engine with new settings
-    mock_engine = MagicMock(spec=MQLLMEngineClient)
+    mock_engine = MagicMock(spec=AsyncLLM)
     mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
     mock_engine.errored = False
 
@@ -410,7 +410,7 @@ async def test_serving_chat_should_set_correct_max_tokens():
     }
 
    # Reinitialize the engine with new settings
-    mock_engine = MagicMock(spec=MQLLMEngineClient)
+    mock_engine = MagicMock(spec=AsyncLLM)
     mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
     mock_engine.errored = False
 
@@ -467,7 +467,7 @@ async def test_serving_chat_could_load_correct_generation_config():
         "repetition_penalty": 1.05
     }
 
-    mock_engine = MagicMock(spec=MQLLMEngineClient)
+    mock_engine = MagicMock(spec=AsyncLLM)
     mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
     mock_engine.errored = False
 
@@ -523,7 +523,7 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type):
     mock_model_config = MockModelConfig()
     mock_model_config.hf_config.model_type = model_type
 
-    mock_engine = MagicMock(spec=MQLLMEngineClient)
+    mock_engine = MagicMock(spec=AsyncLLM)
     mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
     mock_engine.errored = False
 
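Every hunk above applies the same substitution: the serving-chat tests mock the engine with `MagicMock(spec=AsyncLLM)` instead of the removed `MQLLMEngineClient`. For reference, here is a minimal sketch of that repeated setup written once as a pytest fixture; the fixture itself and the `MODEL_NAME` value are illustrative and are not part of this change:

```python
# Illustrative only (not part of this diff): the mock setup repeated in each
# test, consolidated into a pytest fixture. MODEL_NAME is a placeholder here;
# the real test module defines its own value near the top of the file.
from unittest.mock import MagicMock

import pytest

from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.v1.engine.async_llm import AsyncLLM

MODEL_NAME = "openai-community/gpt2"  # placeholder model for this sketch


@pytest.fixture
def mock_engine():
    # spec=AsyncLLM restricts the mock to the v1 engine's attribute surface,
    # so a test that still reads an MQLLMEngineClient-only attribute fails fast.
    engine = MagicMock(spec=AsyncLLM)
    engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
    engine.errored = False
    return engine
```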