[Frontend] Batch inference for llm.chat() API #8648
Changes from 9 commits
@@ -352,6 +352,7 @@ def generate(
         outputs = self._run_engine(use_tqdm=use_tqdm)
         return LLMEngine.validate_outputs(outputs, RequestOutput)

+    @overload
     def chat(
         self,
         messages: List[ChatCompletionMessageParam],
@@ -363,6 +364,34 @@ def chat(
         add_generation_prompt: bool = True,
         tools: Optional[List[Dict[str, Any]]] = None,
     ) -> List[RequestOutput]:
+        ...
+
+    @overload
+    def chat(
+        self,
+        messages: List[List[ChatCompletionMessageParam]],
+        sampling_params: Optional[Union[SamplingParams,
+                                        List[SamplingParams]]] = None,
+        use_tqdm: bool = True,
+        lora_request: Optional[LoRARequest] = None,
+        chat_template: Optional[str] = None,
+        add_generation_prompt: bool = True,
+        tools: Optional[List[Dict[str, Any]]] = None,
+    ) -> List[List[RequestOutput]]:
+        ...
+
+    def chat(
+        self,
+        messages: Union[List[ChatCompletionMessageParam],
+                        List[List[ChatCompletionMessageParam]]],
+        sampling_params: Optional[Union[SamplingParams,
+                                        List[SamplingParams]]] = None,
+        use_tqdm: bool = True,
+        lora_request: Optional[LoRARequest] = None,
+        chat_template: Optional[str] = None,
+        add_generation_prompt: bool = True,
+        tools: Optional[List[Dict[str, Any]]] = None,
+    ) -> Union[List[RequestOutput], List[List[RequestOutput]]]:
         """
         Generate responses for a chat conversation.

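The two `@overload` stubs above exist only for static type checkers; the final `def chat` is the single implementation that runs at runtime. For readers unfamiliar with the pattern, a minimal, self-contained illustration (toy names, not part of this PR):

```python
from typing import List, Union, overload


@overload
def double(x: int) -> int: ...


@overload
def double(x: List[int]) -> List[int]: ...


def double(x: Union[int, List[int]]) -> Union[int, List[int]]:
    # Only this body executes; the stubs above let a type checker infer
    # that an int argument yields an int and a list yields a list,
    # mirroring how chat() maps a single conversation to
    # List[RequestOutput] and a batch to List[List[RequestOutput]].
    if isinstance(x, list):
        return [v * 2 for v in x]
    return x * 2
```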
@@ -374,8 +403,9 @@ def chat(
             to the OpenAI API.

         Args:
-            messages: A single conversation represented as a list of messages.
-                Each message is a dictionary with 'role' and 'content' keys.
+            messages: A list of conversations or a single conversation.
+                - Each conversation is represented as a list of messages.
+                - Each message is a dictionary with 'role' and 'content' keys.
             sampling_params: The sampling parameters for text generation.
                 If None, we use the default sampling parameters. When it
                 is a single value, it is applied to every prompt. When it
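To make the updated `messages` contract concrete, a hedged usage sketch (the model name and sampling settings are placeholders, not taken from this diff): each conversation is a list of `{'role': ..., 'content': ...}` dicts, and a batch is simply a list of such conversations.

```python
from vllm import LLM, SamplingParams

# Placeholder model; any model with a chat template should work the same way.
llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")
sampling_params = SamplingParams(temperature=0.8, max_tokens=128)

conversations = [
    [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Summarize why batching prompts helps throughput."},
    ],
    [
        {"role": "user", "content": "Write a haiku about GPUs."},
    ],
]

# Batched call: one List[RequestOutput] per conversation, in input order.
batched = llm.chat(conversations, sampling_params=sampling_params)
for conversation_outputs in batched:
    print(conversation_outputs[0].outputs[0].text)

# A single conversation is still accepted and returns a flat List[RequestOutput].
single = llm.chat(conversations[0], sampling_params=sampling_params)
print(single[0].outputs[0].text)
```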
@@ -389,49 +419,66 @@ def chat(
             to each message.

         Returns:
-            A list of ``RequestOutput`` objects containing the generated
-            responses in the same order as the input messages.
+            A list of lists or single list of ``RequestOutput`` objects
+            containing the generated responses in the same order as the input
+            conversations and messages.
         """
+        list_of_messages: List[List[ChatCompletionMessageParam]]

-        tokenizer = self.get_tokenizer()
-        model_config = self.llm_engine.get_model_config()
-
-        conversation, mm_data = parse_chat_messages(messages, model_config,
-                                                    tokenizer)
-
-        prompt_data: Union[str, List[int]]
-        if isinstance(tokenizer, MistralTokenizer):
-            prompt_data = apply_mistral_chat_template(
-                tokenizer,
-                messages=messages,
-                chat_template=chat_template,
-                add_generation_prompt=add_generation_prompt,
-                tools=tools,
-            )
-        else:
-            prompt_data = apply_hf_chat_template(
-                tokenizer,
-                conversation=conversation,
-                chat_template=chat_template,
-                add_generation_prompt=add_generation_prompt,
-                tools=tools,
-            )
-
-        prompt: PromptType
-        if is_list_of(prompt_data, int):
-            prompt = TokensPrompt(prompt_token_ids=prompt_data)
-        else:
-            prompt = TextPrompt(prompt=prompt_data)
-
-        if mm_data is not None:
-            prompt["multi_modal_data"] = mm_data
-
-        return self.generate(
-            prompt,
-            sampling_params=sampling_params,
-            use_tqdm=use_tqdm,
-            lora_request=lora_request,
-        )
+        # Handle multi and single conversations
+        if is_list_of(messages, list):
+            # messages is List[List[...]]
+            list_of_messages = messages
+        else:
+            # messages is List[...]
+            list_of_messages = [messages]
+
+        outputs: List[List[RequestOutput]] = []
+
+        for msgs in list_of_messages:
+            tokenizer = self.get_tokenizer()
+            model_config = self.llm_engine.get_model_config()
+
+            conversation, mm_data = parse_chat_messages(
+                msgs, model_config, tokenizer)
+
+            prompt_data: Union[str, List[int]]
+            if isinstance(tokenizer, MistralTokenizer):
+                prompt_data = apply_mistral_chat_template(
+                    tokenizer,
+                    messages=msgs,
+                    chat_template=chat_template,
+                    add_generation_prompt=add_generation_prompt,
+                    tools=tools,
+                )
+            else:
+                prompt_data = apply_hf_chat_template(
+                    tokenizer,
+                    conversation=conversation,
+                    chat_template=chat_template,
+                    add_generation_prompt=add_generation_prompt,
+                    tools=tools,
+                )
+
+            prompt: PromptType
+            if is_list_of(prompt_data, int):
+                prompt = TokensPrompt(prompt_token_ids=prompt_data)
+            else:
+                prompt = TextPrompt(prompt=prompt_data)
+
+            if mm_data is not None:
+                prompt["multi_modal_data"] = mm_data
+
+            out = self.generate(
+                prompt,
+                sampling_params=sampling_params,
+                use_tqdm=use_tqdm,
+                lora_request=lora_request,
+            )
+            outputs.append(out)
+
+        # When messages is List[...], return a single list
+        return outputs if len(outputs) > 1 else outputs[0]

     @overload  # LEGACY: single (prompt + optional token ids)
     def encode(
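For reference, a stripped-down, dependency-free sketch of the dispatch and return-shape rule implemented above (names here are illustrative, not vLLM APIs). Note the consequence of the final `return` statement: a batch containing exactly one conversation also collapses to a single flat list.

```python
from typing import Any, Dict, List, Union

Message = Dict[str, Any]          # {"role": ..., "content": ...}
Conversation = List[Message]


def _is_list_of_lists(messages: Union[Conversation, List[Conversation]]) -> bool:
    # Stands in for is_list_of(messages, list): every element is itself a list.
    return bool(messages) and all(isinstance(m, list) for m in messages)


def chat_shape(
    messages: Union[Conversation, List[Conversation]]
) -> Union[List[str], List[List[str]]]:
    # Normalize to a list of conversations, as the diff does.
    list_of_messages = messages if _is_list_of_lists(messages) else [messages]

    # Stand-in for the per-conversation self.generate(...) call.
    outputs = [[f"reply to {len(msgs)} message(s)"] for msgs in list_of_messages]

    # Same return rule as the diff: a single-conversation result
    # collapses to a flat list.
    return outputs if len(outputs) > 1 else outputs[0]


# One conversation and a one-conversation batch both come back flat;
# only batches of two or more keep the nested shape.
conv = [{"role": "user", "content": "hi"}]
assert chat_shape(conv) == ["reply to 1 message(s)"]
assert chat_shape([conv]) == ["reply to 1 message(s)"]
assert chat_shape([conv, conv]) == [["reply to 1 message(s)"],
                                    ["reply to 1 message(s)"]]
```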