35 changes: 35 additions & 0 deletions tests/entrypoints/llm/test_generate.py
@@ -162,6 +162,41 @@ def test_chat():
assert len(outputs) == 1


def test_multi_chat():

llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")

prompt1 = "Explain the concept of entropy."
prompt2 = "Explain what among us is."

conversation1 = [
{
"role": "system",
"content": "You are a helpful assistant"
},
{
"role": "user",
"content": prompt1
},
]

conversation2 = [
{
"role": "system",
"content": "You are a helpful assistant"
},
{
"role": "user",
"content": prompt2
},
]

messages = [conversation1, conversation2]

outputs = llm.chat(messages)
assert len(outputs) == 2


@pytest.mark.parametrize("image_urls",
[[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]])
def test_chat_multi_image(image_urls: List[str]):
125 changes: 86 additions & 39 deletions vllm/entrypoints/llm.py
@@ -352,6 +352,7 @@ def generate(
outputs = self._run_engine(use_tqdm=use_tqdm)
return LLMEngine.validate_outputs(outputs, RequestOutput)

@overload
def chat(
self,
messages: List[ChatCompletionMessageParam],
@@ -363,6 +364,34 @@ def chat(
add_generation_prompt: bool = True,
tools: Optional[List[Dict[str, Any]]] = None,
) -> List[RequestOutput]:
...

@overload
def chat(
self,
messages: List[List[ChatCompletionMessageParam]],
sampling_params: Optional[Union[SamplingParams,
List[SamplingParams]]] = None,
use_tqdm: bool = True,
lora_request: Optional[LoRARequest] = None,
chat_template: Optional[str] = None,
add_generation_prompt: bool = True,
tools: Optional[List[Dict[str, Any]]] = None,
) -> List[List[RequestOutput]]:
...

def chat(
self,
messages: Union[List[ChatCompletionMessageParam],
List[List[ChatCompletionMessageParam]]],
sampling_params: Optional[Union[SamplingParams,
List[SamplingParams]]] = None,
use_tqdm: bool = True,
lora_request: Optional[LoRARequest] = None,
chat_template: Optional[str] = None,
add_generation_prompt: bool = True,
tools: Optional[List[Dict[str, Any]]] = None,
) -> Union[List[RequestOutput], List[List[RequestOutput]]]:
"""
Generate responses for a chat conversation.

@@ -374,8 +403,9 @@ def chat(
to the OpenAI API.

Args:
messages: A single conversation represented as a list of messages.
Each message is a dictionary with 'role' and 'content' keys.
messages: A list of conversations or a single conversation.
- Each conversation is represented as a list of messages.
- Each message is a dictionary with 'role' and 'content' keys.
sampling_params: The sampling parameters for text generation.
If None, we use the default sampling parameters. When it
is a single value, it is applied to every prompt. When it
@@ -389,49 +419,66 @@ def chat(
to each message.

Returns:
A list of ``RequestOutput`` objects containing the generated
responses in the same order as the input messages.
A list of lists or a single list of ``RequestOutput`` objects
containing the generated responses, in the same order as the input
conversations and messages.
"""
list_of_messages: List[List[ChatCompletionMessageParam]]

tokenizer = self.get_tokenizer()
model_config = self.llm_engine.get_model_config()

conversation, mm_data = parse_chat_messages(messages, model_config,
tokenizer)

prompt_data: Union[str, List[int]]
if isinstance(tokenizer, MistralTokenizer):
prompt_data = apply_mistral_chat_template(
tokenizer,
messages=messages,
chat_template=chat_template,
add_generation_prompt=add_generation_prompt,
tools=tools,
)
# Handle multi and single conversations
if is_list_of(messages, list):
# messages is List[List[...]]
list_of_messages = messages
else:
prompt_data = apply_hf_chat_template(
tokenizer,
conversation=conversation,
chat_template=chat_template,
add_generation_prompt=add_generation_prompt,
tools=tools,
)
# messages is List[...]
list_of_messages = [messages]

outputs: List[List[RequestOutput]] = []

for msgs in list_of_messages:
tokenizer = self.get_tokenizer()
model_config = self.llm_engine.get_model_config()

conversation, mm_data = parse_chat_messages(
msgs, model_config, tokenizer)

prompt_data: Union[str, List[int]]
if isinstance(tokenizer, MistralTokenizer):
prompt_data = apply_mistral_chat_template(
tokenizer,
messages=msgs,
chat_template=chat_template,
add_generation_prompt=add_generation_prompt,
tools=tools,
)
else:
prompt_data = apply_hf_chat_template(
tokenizer,
conversation=conversation,
chat_template=chat_template,
add_generation_prompt=add_generation_prompt,
tools=tools,
)

prompt: PromptType
if is_list_of(prompt_data, int):
prompt = TokensPrompt(prompt_token_ids=prompt_data)
else:
prompt = TextPrompt(prompt=prompt_data)

prompt: PromptType
if is_list_of(prompt_data, int):
prompt = TokensPrompt(prompt_token_ids=prompt_data)
else:
prompt = TextPrompt(prompt=prompt_data)
if mm_data is not None:
prompt["multi_modal_data"] = mm_data

if mm_data is not None:
prompt["multi_modal_data"] = mm_data
out = self.generate(
prompt,
sampling_params=sampling_params,
use_tqdm=use_tqdm,
lora_request=lora_request,
)
outputs.append(out)
Member:

Hey @aandyw! I realized after some testing that this implementation is actually not what we want.

Here we're still parsing the list of messages, and calling .generate one by one, but what we really want is to make a batch of requests after parsing the list of messages, and call .generate on that one batch all together.

Can you make the changes accordingly? Thanks!

Contributor Author:

Sounds good. The overall output from llm.chat should be the same correct?

Member:

Yep!


return self.generate(
prompt,
sampling_params=sampling_params,
use_tqdm=use_tqdm,
lora_request=lora_request,
)
# When messages is List[...], return a single list
return outputs if len(outputs) > 1 else outputs[0]
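
# As a follow-up to the review comment above, a rough sketch of the batched
# approach being requested: build every prompt first, then issue a single
# generate call over the whole batch. This is a hypothetical illustration, not
# the code in this revision; it omits the MistralTokenizer branch for brevity,
# reuses the surrounding method's variables (tokenizer, model_config,
# list_of_messages), and assumes self.generate accepts a list of prompts as it
# does on the batch path:
#
#     prompts: List[PromptType] = []
#     for msgs in list_of_messages:
#         conversation, mm_data = parse_chat_messages(
#             msgs, model_config, tokenizer)
#         prompt_data = apply_hf_chat_template(
#             tokenizer,
#             conversation=conversation,
#             chat_template=chat_template,
#             add_generation_prompt=add_generation_prompt,
#             tools=tools,
#         )
#         prompt: PromptType = (TokensPrompt(prompt_token_ids=prompt_data)
#                               if is_list_of(prompt_data, int) else
#                               TextPrompt(prompt=prompt_data))
#         if mm_data is not None:
#             prompt["multi_modal_data"] = mm_data
#         prompts.append(prompt)
#
#     # One batched call; outputs come back flattened, in input order.
#     return self.generate(
#         prompts,
#         sampling_params=sampling_params,
#         use_tqdm=use_tqdm,
#         lora_request=lora_request,
#     )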

@overload # LEGACY: single (prompt + optional token ids)
def encode(