Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 69 additions & 15 deletions xinference/model/llm/vllm/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -933,9 +933,21 @@ def _convert_request_output_to_completion(

async def _get_tokenizer(self, lora_request: Any) -> Any:
try:
return await self._engine.get_tokenizer(lora_request) # type: ignore
# vLLM 0.11.0+ get_tokenizer doesn't accept lora_request parameter
if (
VLLM_VERSION >= version.parse("0.11.0")
or VLLM_VERSION.base_version >= "0.11.0"
):
return await self._engine.get_tokenizer() # type: ignore
else:
return await self._engine.get_tokenizer(lora_request) # type: ignore
except AttributeError:
return await self._engine.get_tokenizer_async(lora_request) # type: ignore
# Fallback to get_tokenizer_async for older versions
try:
return await self._engine.get_tokenizer_async(lora_request) # type: ignore
except (AttributeError, TypeError):
# If all else fails, try without parameters
return await self._engine.get_tokenizer() # type: ignore

def _tokenize(self, tokenizer: Any, prompt: str, config: dict) -> List[int]:
truncate_prompt_tokens = config.get("truncate_prompt_tokens")
Expand Down Expand Up @@ -1016,23 +1028,65 @@ async def async_generate(
# guided decoding only available for vllm >= 0.6.3
from vllm.sampling_params import GuidedDecodingParams

guided_options = GuidedDecodingParams.from_optional(
json=sanitized_generate_config.pop("guided_json", None),
regex=sanitized_generate_config.pop("guided_regex", None),
choice=sanitized_generate_config.pop("guided_choice", None),
grammar=sanitized_generate_config.pop("guided_grammar", None),
json_object=sanitized_generate_config.pop("guided_json_object", None),
backend=sanitized_generate_config.pop("guided_decoding_backend", None),
whitespace_pattern=sanitized_generate_config.pop(
"guided_whitespace_pattern", None
),
# Extract guided decoding parameters
guided_params: dict[str, Any] = {}
guided_json = sanitized_generate_config.pop("guided_json", None)
if guided_json:
guided_params["json"] = guided_json

guided_regex = sanitized_generate_config.pop("guided_regex", None)
if guided_regex:
guided_params["regex"] = guided_regex

guided_choice = sanitized_generate_config.pop("guided_choice", None)
if guided_choice:
guided_params["choice"] = guided_choice

guided_grammar = sanitized_generate_config.pop("guided_grammar", None)
if guided_grammar:
guided_params["grammar"] = guided_grammar

guided_json_object = sanitized_generate_config.pop(
"guided_json_object", None
)
if guided_json_object:
guided_params["json_object"] = guided_json_object

guided_backend = sanitized_generate_config.pop(
"guided_decoding_backend", None
)
if guided_backend:
guided_params["_backend"] = guided_backend

sampling_params = SamplingParams(
guided_decoding=guided_options, **sanitized_generate_config
guided_whitespace_pattern = sanitized_generate_config.pop(
"guided_whitespace_pattern", None
)
if guided_whitespace_pattern:
guided_params["whitespace_pattern"] = guided_whitespace_pattern

# Create GuidedDecodingParams if we have any guided parameters
guided_options = None
if guided_params:
try:
guided_options = GuidedDecodingParams(**guided_params)
except Exception as e:
logger.warning(f"Failed to create GuidedDecodingParams: {e}")
guided_options = None

# Use structured_outputs for vLLM >= 0.11.0, guided_decoding for older versions
if (
VLLM_VERSION >= version.parse("0.11.0")
or VLLM_VERSION.base_version >= "0.11.0"
):
sampling_params = SamplingParams(
structured_outputs=guided_options, **sanitized_generate_config
)
else:
sampling_params = SamplingParams(
guided_decoding=guided_options, **sanitized_generate_config
)
else:
# ignore generate configs
# ignore generate configs for older versions
sanitized_generate_config.pop("guided_json", None)
sanitized_generate_config.pop("guided_regex", None)
sanitized_generate_config.pop("guided_choice", None)
Expand Down
Loading