Skip to content
Merged
1 change: 1 addition & 0 deletions docs/backend/sampling_params.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ Please refer to our dedicated guide on [constrained decoding](./structured_outpu
* `n: int = 1`: Specifies the number of output sequences to generate per request. (Generating multiple outputs in one request (n > 1) is discouraged; repeating the same prompt several times offers better control and efficiency.)
* `spaces_between_special_tokens: bool = True`: Whether or not to add spaces between special tokens during detokenization.
* `no_stop_trim: bool = False`: Don't trim stop words or EOS token from the generated text.
* `continue_final_message: bool = False`: When enabled, the final assistant message is removed and its content is used as a prefill so that the model continues that message instead of starting a new turn.
* `ignore_eos: bool = False`: Don't stop generation when EOS token is sampled.
* `skip_special_tokens: bool = True`: Remove special tokens during decoding.
* `custom_params: Optional[List[Optional[Dict[str, Any]]]] = None`: Used when employing `CustomLogitProcessor`. For usage see below.
Expand Down
42 changes: 38 additions & 4 deletions python/sglang/srt/openai_api/adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -911,9 +911,16 @@ def v1_chat_generate_request(
openai_compatible_messages.append(
{"role": message.role, "content": content["text"]}
)
if openai_compatible_messages[-1]["role"] == "assistant":
assistant_prefix = openai_compatible_messages[-1]["content"]
openai_compatible_messages = openai_compatible_messages[:-1]
if (
openai_compatible_messages
and openai_compatible_messages[-1]["role"] == "assistant"
):
if request.continue_final_message:
# Remove the final assistant message so its content can be continued.
assistant_prefix = openai_compatible_messages[-1]["content"]
openai_compatible_messages = openai_compatible_messages[:-1]
else:
assistant_prefix = None
else:
assistant_prefix = None

Expand Down Expand Up @@ -949,7 +956,33 @@ def v1_chat_generate_request(
modalities = []
else:
conv = generate_chat_conv(request, chat_template_name)
prompt = conv.get_prompt()
# If we should continue the final assistant message, adjust the conversation.
if (
request.continue_final_message
and request.messages
and request.messages[-1].role == "assistant"
):
# Remove the auto-added blank assistant turn, if present.
if conv.messages and conv.messages[-1][1] is None:
conv.messages.pop()
# Rebuild the prompt from the conversation.
prompt = conv.get_prompt()
# Strip any trailing stop tokens or separators that indicate end-of-assistant.
if isinstance(conv.stop_str, list):
for stop_token in conv.stop_str:
if prompt.endswith(stop_token):
prompt = prompt[: -len(stop_token)]
elif isinstance(conv.stop_str, str) and prompt.endswith(
conv.stop_str
):
prompt = prompt[: -len(conv.stop_str)]
if conv.sep and prompt.endswith(conv.sep):
prompt = prompt[: -len(conv.sep)]
if getattr(conv, "sep2", None) and prompt.endswith(conv.sep2):
prompt = prompt[: -len(conv.sep2)]
else:
prompt = conv.get_prompt()

image_data = conv.image_data
modalities = conv.modalities
stop = conv.stop_str or []
Expand All @@ -959,6 +992,7 @@ def v1_chat_generate_request(
else:
stop.extend(request.stop)
prompt_ids = tokenizer_manager.tokenizer.encode(prompt)

else:
# Use the raw prompt and stop strings if the messages is already a string.
prompt_ids = request.messages
Expand Down
1 change: 1 addition & 0 deletions python/sglang/srt/openai_api/protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,6 +333,7 @@ class ChatCompletionRequest(BaseModel):
stop_token_ids: Optional[List[int]] = None
no_stop_trim: bool = False
ignore_eos: bool = False
continue_final_message: bool = False
skip_special_tokens: bool = True
lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
session_params: Optional[Dict] = None
Expand Down
1 change: 1 addition & 0 deletions test/srt/test_openai_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -528,6 +528,7 @@ def test_response_prefill(self):
},
],
temperature=0,
extra_body={"continue_final_message": True},
)

assert (
Expand Down
Loading