[Optimization] xgrammar async compile, multi thread, speed up #4835
base: develop
File 1: guided decoding backend base (defines class BackendBase)
@@ -14,9 +14,10 @@
 # limitations under the License.
 """

+import multiprocessing
 import os
 import traceback
-from concurrent.futures import ThreadPoolExecutor
+from concurrent.futures import Future, ThreadPoolExecutor

 from fastdeploy.config import ErnieArchitectures, FDConfig
 from fastdeploy.engine.request import Request

@@ -135,9 +136,9 @@ class BackendBase:
     """

     def __init__(self, fd_config: FDConfig):
         self.cache = {}
         self.fd_config = fd_config
-        self.executor = ThreadPoolExecutor()
+        max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2)
+        self.executor = ThreadPoolExecutor(max_workers=max_workers)
         self.max_cache_size = 2048
         self.reasoning_parser = None

@@ -263,7 +264,7 @@ def get_logits_processor(
         self,
         schemata_key: tuple[str, str],
         enable_thinking: bool = False,
-    ) -> tuple[LogitsProcessorBase, bool]:
+    ) -> Future[LogitsProcessorBase]:
         """
         get logits processor by key from cache or create new one.

@@ -275,13 +276,8 @@ def get_logits_processor(
             - LogitsProcessorBase: The logits processor instance
             - bool: True if processor was from cache, False if newly created
         """
-        value = self.cache.get(schemata_key, None)
-        if value:
-            value_copy = value.copy()
-            value_copy.enable_reasoning = enable_thinking
-            return value_copy, True
         value = self.executor.submit(self._init_logits_processor, schemata_key, enable_thinking)
-        return value, False
+        return value

     def _get_tokenizer_hf(self):
         """

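For context on how the async compile is consumed: get_logits_processor now returns a concurrent.futures.Future rather than a (processor, from_cache) tuple, so call sites resolve the future only when the processor is actually needed and the grammar compilation overlaps with other request work. A minimal, self-contained sketch of that pattern (the stub _init_logits_processor below is a placeholder for the real compilation, not code from this PR):

```python
from concurrent.futures import Future, ThreadPoolExecutor

def _init_logits_processor(schemata_key, enable_thinking):
    # stand-in for the real grammar compilation, which can take tens of milliseconds
    return f"processor for {schemata_key} (thinking={enable_thinking})"

executor = ThreadPoolExecutor(max_workers=4)

# BackendBase.get_logits_processor now just submits and returns the Future:
future: Future = executor.submit(_init_logits_processor, ("json", "{...}"), False)

# The caller can schedule/prefill the request while compilation runs in the
# background, and block on the result only at the first constrained decode step.
processor = future.result(timeout=30)
print(processor)
```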
@@ -303,7 +299,7 @@ def _get_tokenizer_hf(self):

 # NOTE: Setting use_fast=True switches to the fast (Rust) tokenizer, which may produce different tokenization results
 # compared to the slow (Python) tokenizer. This can affect model outputs and downstream processing.
 # Please ensure this change is compatible with your use case.
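A quick way to sanity-check the concern in that note is to compare the fast and slow tokenizers on representative structured-output text. A hedged sketch (the model path is a placeholder; substitute the tokenizer the backend actually loads):

```python
from transformers import AutoTokenizer

model = "path/to/your/model"  # placeholder, not a path from this PR

slow = AutoTokenizer.from_pretrained(model, use_fast=False)
fast = AutoTokenizer.from_pretrained(model, use_fast=True)

text = '{"name": "value", "nested": {"k": [1, 2, 3]}}'
ids_slow = slow.encode(text)
ids_fast = fast.encode(text)

# If the two disagree, the grammar's token bitmask is built over a different
# tokenization than the one the model is served with.
print("identical:", ids_slow == ids_fast)
```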
File 2: xgrammar_backend.py
@@ -29,6 +29,7 @@
     BaseChecker,
     LogitsProcessorBase,
 )
+from fastdeploy.platforms import current_platform
 from fastdeploy.utils import llm_logger

 try:

@@ -86,6 +87,8 @@ def __init__(
             terminate_without_stop_token=terminate_without_stop_token,
             override_stop_tokens=override_stop_tokens,
         )
+        # when matcher accept eos_token_id, is_terminated = True
+        self.is_terminated: bool = False

     def allocate_token_bitmask(self) -> torch.Tensor:
         """

@@ -109,40 +112,6 @@ def fill_token_bitmask(self, token_bitmask: torch.Tensor, idx: int) -> None:
         """
         self.matcher.fill_next_token_bitmask(token_bitmask, idx)

-    def apply_token_mask(
-        self,
-        logits: paddle.Tensor,
-        token_bitmask: torch.Tensor,
-        indices: Optional[List[int]] = None,
-    ) -> paddle.Tensor:
-        """
-        Apply the token mask to the logits, modifying probabilities of invalid tokens.
-
-        Args:
-            logits (paddle.Tensor): The logits tensor to modify
-            token_bitmask (torch.Tensor): The token bitmask indicating allowed tokens
-            indices (Optional[List[int]]): Optional list of batch indices to apply mask to
-
-        Returns:
-            paddle.Tensor: The modified logits tensor
-        """
-        origin_place = logits.place
-        origin_dtype = logits.dtype
-        logits = torch.from_numpy(logits.numpy())
-
-        logits = logits.float()  # cpu
-        apply_token_bitmask_inplace(
-            logits=logits,
-            bitmask=token_bitmask.to(logits.device, non_blocking=True),
-            indices=indices,
-        )
-
-        return paddle.to_tensor(
-            logits.numpy(),
-            dtype=origin_dtype,
-            place=origin_place,
-        )

     def reset(self) -> None:
         """
         Reset the grammar matcher state to initial conditions.

@@ -155,23 +124,21 @@ def reset(self) -> None:
     def accept_token(self, token: int) -> None:
         """
         Validate and accept a generated token against the grammar constraints.
+        when accept eos_token, is_terminated = True

         Args:
             token (int): The token ID to validate

         Raises:
             AssertionError: If token is not allowed by the grammar
         """
-        assert self.matcher.accept_token(token), f"Failed to accept token {token}"
-
-    def is_terminated(self) -> bool:
-        """
-        Check if the grammar matching process has terminated.
-
-        Returns:
-            bool: True if matching has terminated, False otherwise
-        """
-        return self.matcher.is_terminated()
+        if self.is_terminated or self.matcher.is_terminated():
+            self.is_terminated = True
+            return False
+        if not self.matcher.accept_token(token):
+            self.matcher.reset()
+            return False
+        if self.matcher.is_terminated():
+            self.is_terminated = True
+            return True

     def copy(self) -> "XGrammarProcessor":
         """

Review comments on the new accept_token body (lines 124 to 141):

Collaborator: Where is the eos_token actually checked here? And how is the over-length output case (generation exceeding the length limit) handled?

Author: Once eos has been accepted, the matcher's state is is_terminated, and it gets reset right below; tokens generated after that are no longer format-constrained. With ignore_eos enabled, generation can also continue.
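To make the new contract concrete: accept_token no longer asserts, and once eos has been accepted the processor latches is_terminated and stops constraining later tokens, which is what allows generation to keep going under ignore_eos. A small self-contained sketch of that state machine, using a stub in place of the real xgrammar GrammarMatcher (the stub and its token IDs are illustrative only):

```python
class _StubMatcher:
    """Stands in for xgrammar's GrammarMatcher in this sketch."""
    def __init__(self, allowed, eos_id):
        self.allowed, self.eos_id, self.done = allowed, eos_id, False
    def accept_token(self, tok):
        if tok not in self.allowed and tok != self.eos_id:
            return False          # token not permitted by the grammar
        self.done = tok == self.eos_id
        return True
    def is_terminated(self):
        return self.done
    def reset(self):
        self.done = False

class Processor:
    def __init__(self, matcher):
        self.matcher = matcher
        self.is_terminated = False  # latched once eos is accepted

    def accept_token(self, token: int) -> bool:
        # follows the new XGrammarProcessor.accept_token control flow
        # (with an explicit True for the ordinary accept case)
        if self.is_terminated or self.matcher.is_terminated():
            self.is_terminated = True
            return False          # already terminated: stop constraining output
        if not self.matcher.accept_token(token):
            self.matcher.reset()  # illegal token: drop the grammar state
            return False
        if self.matcher.is_terminated():
            self.is_terminated = True
        return True

p = Processor(_StubMatcher(allowed={1, 2}, eos_id=0))
print(p.accept_token(1))   # True  - legal token under the grammar
print(p.accept_token(0))   # True  - eos accepted, processor terminates
print(p.accept_token(2))   # False - past eos (e.g. ignore_eos), no longer constrained
```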
@@ -216,7 +183,13 @@ def __init__(

         try:
             tokenizer_info = TokenizerInfo.from_huggingface(self.hf_tokenizer, vocab_size=self.vocab_size)
-            self.grammar_compiler = GrammarCompiler(tokenizer_info=tokenizer_info)
+            llm_logger.info(f"xgrammar_backend.py tokenzer_info={tokenizer_info.dump_metadata()}")
+            self.grammar_compiler = GrammarCompiler(
+                tokenizer_info=tokenizer_info,
+                max_threads=8,
+                cache_enabled=True,
+                cache_limit_bytes=4 * 1024 * 1024,
+            )  # TODO cfg
         except Exception as e:
             raise Exception(f"Failed to load XGrammar tokenizer: {e}")

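Note that these knobs are separate from the Python ThreadPoolExecutor above: max_threads drives xgrammar's own internal compilation threads, and the cache keeps recently compiled grammars in memory so repeated schemata skip recompilation. A hedged sketch of how the compiler is typically exercised (the model path and schema are placeholders; the parameter names follow this diff and xgrammar's public API):

```python
from transformers import AutoTokenizer
from xgrammar import GrammarCompiler, TokenizerInfo

tokenizer = AutoTokenizer.from_pretrained("path/to/your/model")  # placeholder path
tokenizer_info = TokenizerInfo.from_huggingface(tokenizer)

compiler = GrammarCompiler(
    tokenizer_info=tokenizer_info,
    max_threads=8,                      # xgrammar's internal compile threads
    cache_enabled=True,
    cache_limit_bytes=4 * 1024 * 1024,  # ~4 MiB of compiled grammars kept hot
)

schema = '{"type": "object", "properties": {"name": {"type": "string"}}}'
compiled = compiler.compile_json_schema(schema)        # first call pays the compile cost
compiled_again = compiler.compile_json_schema(schema)  # served from the compiler cache
```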
@@ -467,3 +440,49 @@ def schema_format(self, request: Request):
         else:
             # regex is not format
             return request, None
+
+
+def apply_token_mask(
+    logits: paddle.Tensor,
+    token_bitmask: torch.Tensor,
+    indices: Optional[List[int]] = None,
+) -> paddle.Tensor:
+    """
+    Apply the token mask to the logits, modifying probabilities of invalid tokens.
+
+    Args:
+        logits (paddle.Tensor): The logits tensor to modify
+        token_bitmask (torch.Tensor): The token bitmask indicating allowed tokens
+        indices (Optional[List[int]]): Optional list of batch indices to apply mask to
+
+    Returns:
+        paddle.Tensor: The modified logits tensor
+    """
+
+    if current_platform.is_cuda():
+        dlpack = paddle.utils.dlpack.to_dlpack(logits)
+        t_logits = torch.from_dlpack(dlpack)
+        apply_token_bitmask_inplace(
+            logits=t_logits,
+            bitmask=token_bitmask.to(t_logits.device, non_blocking=True),
+            indices=indices,
+        )
+        dlpack2 = torch.utils.dlpack.to_dlpack(t_logits)
+        return paddle.utils.dlpack.from_dlpack(dlpack2)
+    else:
+        origin_place = logits.place
+        origin_dtype = logits.dtype
+        logits = torch.from_numpy(logits.numpy())
+
+        logits = logits.float()  # cpu
+        apply_token_bitmask_inplace(
+            logits=logits,
+            bitmask=token_bitmask.to(logits.device, non_blocking=True),
+            indices=indices,
+        )
+
+        return paddle.to_tensor(
+            logits.numpy(),
+            dtype=origin_dtype,
+            place=origin_place,
+        )

Review comments on the CUDA path (lines 468 to 476):

Collaborator: This operator supports paddle.Tensor, doesn't it? Why convert to torch.Tensor?

Author: This still calls the native xgr.apply_token_bitmask_inplace interface, which only accepts torch.Tensor.

Review comments on the CPU fallback path:

Collaborator: This operator does not seem to have been validated on other hardware; I'm not sure it can be used there.

Author: This branch is a pure CPU operation: bitmask=token_bitmask.to(logits.device, non_blocking=True).
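The CUDA branch avoids the numpy round trip by viewing the paddle logits as a torch tensor through DLPack, so xgrammar's torch-only apply_token_bitmask_inplace can write the mask directly into the shared buffer. A minimal CPU sketch of that round trip (runs without a GPU; the PR's hot path does the same thing on CUDA tensors):

```python
import numpy as np
import paddle
import torch

# Start with a paddle tensor (CPU here so the sketch runs anywhere).
logits = paddle.to_tensor(np.arange(8, dtype="float32"))

# View the same storage as a torch tensor via DLPack (no copy expected).
capsule = paddle.utils.dlpack.to_dlpack(logits)
t_logits = torch.from_dlpack(capsule)

# An in-place edit through the torch view...
t_logits[2] = float("-inf")  # what the bitmask kernel does to banned tokens

# ...and back to paddle, again through DLPack.
masked = paddle.utils.dlpack.from_dlpack(torch.utils.dlpack.to_dlpack(t_logits))
print(masked.numpy())  # index 2 is -inf; the other values are untouched
```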
Review comment:

The logic for determining max_workers using (multiprocessing.cpu_count() + 1) // 2 seems arbitrary without documentation. Consider adding a comment explaining why half the CPU count plus one is chosen, or make this configurable through the FDConfig.
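One possible shape for that suggestion is sketched below: a documented heuristic plus a config override. The structured_outputs_compile_workers field is hypothetical and not part of FDConfig in this PR.

```python
import multiprocessing
from concurrent.futures import ThreadPoolExecutor

def _resolve_compile_workers(fd_config) -> int:
    """Pick the thread-pool size for async grammar compilation.

    Defaults to roughly half the logical CPUs: compilation is CPU-bound work
    that releases the GIL, but the serving process still needs cores for
    tokenization and scheduling. (Heuristic, not a tuned value.)
    """
    # Hypothetical config field -- not present in FDConfig as of this PR.
    configured = getattr(fd_config, "structured_outputs_compile_workers", None)
    if configured:
        return max(1, int(configured))
    return max(1, (multiprocessing.cpu_count() + 1) // 2)

# In BackendBase.__init__ this would replace the inline computation:
# self.executor = ThreadPoolExecutor(max_workers=_resolve_compile_workers(fd_config))
```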