33 changes: 7 additions & 26 deletions fastdeploy/model_executor/guided_decoding/base_guided_decoding.py
@@ -14,9 +14,10 @@
# limitations under the License.
"""

import multiprocessing
import os
import traceback
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import Future, ThreadPoolExecutor

from fastdeploy.config import ErnieArchitectures, FDConfig
from fastdeploy.engine.request import Request
@@ -135,9 +136,9 @@ class BackendBase:
"""

def __init__(self, fd_config: FDConfig):
self.cache = {}
self.fd_config = fd_config
self.executor = ThreadPoolExecutor()
max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2)
Copilot AI Nov 13, 2025

The logic for determining max_workers using (multiprocessing.cpu_count() + 1) // 2 seems arbitrary without documentation. Consider adding a comment explaining why half the CPU count plus one is chosen, or make this configurable through the FDConfig.

Suggested change
max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2)
# Determine max_workers for ThreadPoolExecutor.
# Default is half the CPU count plus one, to balance concurrency and avoid oversubscription.
# This can be overridden by setting 'max_workers' in FDConfig.
max_workers = getattr(self.fd_config, "max_workers", None)
if max_workers is None:
max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2)

self.executor = ThreadPoolExecutor(max_workers=max_workers)
self.max_cache_size = 2048
self.reasoning_parser = None
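
As a side note on the review suggestion above, here is a self-contained sketch of the worker-sizing pattern; the FD_GUIDED_DECODING_WORKERS environment variable is a hypothetical override, not an existing FastDeploy option.

import multiprocessing
import os
from concurrent.futures import ThreadPoolExecutor

def build_executor() -> ThreadPoolExecutor:
    # Default to roughly half the CPU count to limit oversubscription.
    default_workers = max(1, (multiprocessing.cpu_count() + 1) // 2)
    # Optional explicit override (hypothetical environment variable).
    max_workers = int(os.getenv("FD_GUIDED_DECODING_WORKERS", default_workers))
    return ThreadPoolExecutor(max_workers=max_workers)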

@@ -263,7 +264,7 @@ def get_logits_processor(
self,
schemata_key: tuple[str, str],
enable_thinking: bool = False,
) -> tuple[LogitsProcessorBase, bool]:
) -> Future[LogitsProcessorBase]:
"""
get logits processor by key from cache or create new one.
@@ -275,13 +276,8 @@
- LogitsProcessorBase: The logits processor instance
- bool: True if processor was from cache, False if newly created
"""
value = self.cache.get(schemata_key, None)
if value:
value_copy = value.copy()
value_copy.enable_reasoning = enable_thinking
return value_copy, True
value = self.executor.submit(self._init_logits_processor, schemata_key, enable_thinking)
return value, False
return value
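
For reference, a minimal sketch of how a caller might consume the Future that get_logits_processor now returns; backend and schemata_key are placeholder names, not identifiers from this file.

from concurrent.futures import Future

def resolve_processor(backend, schemata_key):
    # Hypothetical caller: get_logits_processor now always returns a Future.
    future: Future = backend.get_logits_processor(schemata_key, enable_thinking=False)
    return future.result()  # blocks until _init_logits_processor finishes (or re-raises its exception)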

def _get_tokenizer_hf(self):
"""
@@ -303,7 +299,7 @@ def _get_tokenizer_hf(self):

Copilot AI Nov 13, 2025

The change from use_fast=False to use_fast=True is a significant behavior change that could affect tokenization results. This should be documented in the PR description or commit message as it may impact model outputs.

Suggested change
# NOTE: Setting use_fast=True switches to the fast (Rust) tokenizer, which may produce different tokenization results
# compared to the slow (Python) tokenizer. This can affect model outputs and downstream processing.
# Please ensure this change is compatible with your use case.
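
If it helps to validate that switch, a hedged sanity-check sketch (the model path is a placeholder) that compares the slow and fast tokenizers on sample inputs before relying on use_fast=True:

from transformers import AutoTokenizer

model_path = "/path/to/model"  # placeholder
slow = AutoTokenizer.from_pretrained(model_path, use_fast=False)
fast = AutoTokenizer.from_pretrained(model_path, use_fast=True)
for text in ['{"name": "Ada", "age": 36}', "Generate a JSON object."]:
    assert slow.encode(text) == fast.encode(text), f"tokenization diverges on: {text!r}"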

tokenizer = AutoTokenizer.from_pretrained(
self.fd_config.model_config.model,
use_fast=False,
use_fast=True,
)

if not isinstance(tokenizer, PreTrainedTokenizerFast):
Expand Down Expand Up @@ -334,21 +330,6 @@ def _get_tokenizer_hf(self):
except Exception as e:
raise Exception(f"Fail to initialize hf tokenizer: {e}, {str(traceback.format_exc())}")

def add_cache(self, schemata_key: tuple[str, str], processor: LogitsProcessorBase) -> None:
"""
add logits processor to cache.
Args:
schemata_key (tuple[str, str]): Tuple containing processor type and schema string
processor (LogitsProcessorBase): Logits processor instance to cache
Returns:
None: No return value
"""
if len(self.cache) >= self.max_cache_size:
return
self.cache[schemata_key] = processor.copy()


class BaseChecker:
"""
113 changes: 66 additions & 47 deletions fastdeploy/model_executor/guided_decoding/xgrammar_backend.py
@@ -29,6 +29,7 @@
BaseChecker,
LogitsProcessorBase,
)
from fastdeploy.platforms import current_platform
from fastdeploy.utils import llm_logger

try:
@@ -86,6 +87,8 @@ def __init__(
terminate_without_stop_token=terminate_without_stop_token,
override_stop_tokens=override_stop_tokens,
)
# when matcher accept eos_token_id, is_terminated = True
self.is_terminated: bool = False

def allocate_token_bitmask(self) -> torch.Tensor:
"""
@@ -109,40 +112,6 @@ def fill_token_bitmask(self, token_bitmask: torch.Tensor, idx: int) -> None:
"""
self.matcher.fill_next_token_bitmask(token_bitmask, idx)

def apply_token_mask(
self,
logits: paddle.Tensor,
token_bitmask: torch.Tensor,
indices: Optional[List[int]] = None,
) -> paddle.Tensor:
"""
Apply the token mask to the logits, modifying probabilities of invalid tokens.

Args:
logits (paddle.Tensor): The logits tensor to modify
token_bitmask (torch.Tensor): The token bitmask indicating allowed tokens
indices (Optional[List[int]]): Optional list of batch indices to apply mask to

Returns:
paddle.Tensor: The modified logits tensor
"""
origin_place = logits.place
origin_dtype = logits.dtype
logits = torch.from_numpy(logits.numpy())

logits = logits.float() # cpu
apply_token_bitmask_inplace(
logits=logits,
bitmask=token_bitmask.to(logits.device, non_blocking=True),
indices=indices,
)

return paddle.to_tensor(
logits.numpy(),
dtype=origin_dtype,
place=origin_place,
)

def reset(self) -> None:
"""
Reset the grammar matcher state to initial conditions.
@@ -155,23 +124,21 @@ def reset(self) -> None:
def accept_token(self, token: int) -> None:
"""
Validate and accept a generated token against the grammar constraints.
when accept eos_token, is_terminated = True
Collaborator

Where is the eos_token actually checked here? And how is the over-long-output case handled?

Collaborator Author

Once the eos token is accepted, the matcher's state is is_terminated and it is reset below. Tokens generated after that are no longer grammar-constrained. With ignore_eos enabled, generation can also continue.


Args:
token (int): The token ID to validate

Raises:
AssertionError: If token is not allowed by the grammar
"""
assert self.matcher.accept_token(token), f"Failed to accept token {token}"

def is_terminated(self) -> bool:
"""
Check if the grammar matching process has terminated.

Returns:
bool: True if matching has terminated, False otherwise
"""
return self.matcher.is_terminated()
if self.is_terminated or self.matcher.is_terminated():
self.is_terminated = True
return False
if not self.matcher.accept_token(token):
self.matcher.reset()
return False
if self.matcher.is_terminated():
self.is_terminated = True
return True
Comment on lines 124 to +141
Copilot AI Nov 13, 2025

The accept_token method's behavior has changed significantly. In the original implementation, it raised an assertion error if the token couldn't be accepted. Now it returns False and resets the matcher (line 137). This is a breaking API change that could silently fail where it previously would have raised an error. The docstring on line 126-127 still mentions "when accept eos_token, is_terminated = True" but doesn't document the return value or the new reset behavior.
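
To make the new contract concrete, a hypothetical caller sketch (the names here are illustrative, not from this PR): accept_token now returns a bool instead of asserting, and the is_terminated attribute marks that the grammar has finished.

def advance_grammar(processor, token_id: int) -> None:
    # Hypothetical wrapper; processor is an XGrammarProcessor instance.
    if processor.is_terminated:
        return  # grammar already finished (e.g. eos accepted); later tokens are unconstrained
    if processor.accept_token(token_id) is False:
        # Mismatch path: the matcher was reset inside accept_token instead of raising.
        print(f"token {token_id} rejected by grammar")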


def copy(self) -> "XGrammarProcessor":
"""
@@ -216,7 +183,13 @@ def __init__(

try:
tokenizer_info = TokenizerInfo.from_huggingface(self.hf_tokenizer, vocab_size=self.vocab_size)
self.grammar_compiler = GrammarCompiler(tokenizer_info=tokenizer_info)
llm_logger.info(f"xgrammar_backend.py tokenzer_info={tokenizer_info.dump_metadata()}")
self.grammar_compiler = GrammarCompiler(
tokenizer_info=tokenizer_info,
max_threads=8,
cache_enabled=True,
cache_limit_bytes=4 * 1024 * 1024,
) # TODO cfg
except Exception as e:
raise Exception(f"Failed to load XGrammar tokenizer: {e}")
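
Regarding the # TODO cfg marker above, a hedged sketch of lifting the hard-coded compiler options into a small options object; these field names are assumptions and do not exist in FDConfig, and GrammarCompiler / tokenizer_info are assumed to be in scope as in the surrounding code.

from dataclasses import dataclass

@dataclass
class XGrammarCompilerOptions:
    # Defaults mirror the hard-coded values in the diff above.
    max_threads: int = 8
    cache_enabled: bool = True
    cache_limit_bytes: int = 4 * 1024 * 1024  # 4 MiB compiled-grammar cache

def build_compiler(tokenizer_info, opts: XGrammarCompilerOptions = None):
    opts = opts or XGrammarCompilerOptions()
    return GrammarCompiler(
        tokenizer_info=tokenizer_info,
        max_threads=opts.max_threads,
        cache_enabled=opts.cache_enabled,
        cache_limit_bytes=opts.cache_limit_bytes,
    )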

@@ -467,3 +440,49 @@ def schema_format(self, request: Request):
else:
# regex is not format
return request, None


def apply_token_mask(
logits: paddle.Tensor,
token_bitmask: torch.Tensor,
indices: Optional[List[int]] = None,
) -> paddle.Tensor:
"""
Apply the token mask to the logits, modifying probabilities of invalid tokens.

Args:
logits (paddle.Tensor): The logits tensor to modify
token_bitmask (torch.Tensor): The token bitmask indicating allowed tokens
indices (Optional[List[int]]): Optional list of batch indices to apply mask to

Returns:
paddle.Tensor: The modified logits tensor
"""

if current_platform.is_cuda():
dlpack = paddle.utils.dlpack.to_dlpack(logits)
t_logits = torch.from_dlpack(dlpack)
apply_token_bitmask_inplace(
Collaborator

This operator supports paddle.Tensor, doesn't it? Why convert to torch.Tensor here?

Collaborator Author

This is still the native xgr.apply_token_bitmask_inplace interface, which only accepts torch.Tensor.

logits=t_logits,
bitmask=token_bitmask.to(t_logits.device, non_blocking=True),
indices=indices,
)
dlpack2 = torch.utils.dlpack.to_dlpack(t_logits)
return paddle.utils.dlpack.from_dlpack(dlpack2)
Comment on lines +468 to +476
Copilot AI Nov 13, 2025

Potential memory safety issue with DLPack conversion. The code converts logits from Paddle to DLPack (line 463), then to PyTorch (line 464), performs in-place modification (line 465-469), converts back to DLPack (line 470), and finally back to Paddle (line 471). However, there's no guarantee that the Paddle tensor remains valid after the first DLPack conversion, and modifying through PyTorch could lead to undefined behavior if Paddle has already deallocated or moved the underlying memory. Consider adding documentation about the lifetime guarantees or testing this carefully with different tensor configurations.

else:
origin_place = logits.place
origin_dtype = logits.dtype
logits = torch.from_numpy(logits.numpy())

logits = logits.float() # cpu
apply_token_bitmask_inplace(
Collaborator

This operator doesn't seem to have been validated on other hardware backends? Not sure it can actually be used there.

Collaborator Author

This branch is a pure CPU operation. The line bitmask=token_bitmask.to(logits.device, non_blocking=True) is a bit misleading; the .to() actually still targets the CPU.

logits=logits,
bitmask=token_bitmask.to(logits.device, non_blocking=True),
indices=indices,
)

return paddle.to_tensor(
logits.numpy(),
dtype=origin_dtype,
place=origin_place,
)
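
To make the lifetime question from the review thread concrete, a minimal standalone sketch of the Paddle/Torch DLPack round trip used by the CUDA branch (illustration only, not the FastDeploy code; it runs on CPU as well). The original Paddle tensor must stay alive while the Torch view is in use.

import paddle
import torch

x = paddle.ones([2, 4], dtype="float32")                 # stand-in for the logits tensor
t = torch.from_dlpack(paddle.utils.dlpack.to_dlpack(x))  # zero-copy view over the same buffer
t.mul_(0.5)                                              # in-place edit is visible through both frameworks
y = paddle.utils.dlpack.from_dlpack(torch.utils.dlpack.to_dlpack(t))
print(float(y[0, 0]))                                    # 0.5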