This repository was archived by the owner on Sep 4, 2025. It is now read-only.

Commit 7a6d518

Squash 9631
Signed-off-by: Jefferson Fialho <[email protected]>
1 parent e949f65


vllm/model_executor/guided_decoding/outlines_logits_processors.py

Lines changed: 8 additions & 2 deletions
@@ -15,11 +15,11 @@
 # limitations under the License.
 import copy
 import json
-import math
 from collections import defaultdict
 from functools import lru_cache
 from typing import Callable, DefaultDict, Dict, List, Union
 
+import numpy as np
 import torch
 from lark import Lark
 from outlines import grammars
@@ -77,8 +77,14 @@ def __call__(self, input_ids: List[int],
                f"Unsupported instruction type {type(instruction)}")
 
        mask = torch.full((scores.shape[-1], ),
-                          -math.inf,
+                          -torch.inf,
                          device=scores.device)
+        # The tokenizer may support more token ids than the model can generate,
+        # e.g. Llama 3.2 Vision models have an `<|image|>` token with id 128256
+        # but scores.shape == torch.Size([128256]).
+        # Using NumPy is faster for filtering token ids.
+        allowed_tokens = np.array(allowed_tokens)
+        allowed_tokens = allowed_tokens[allowed_tokens < scores.shape[-1]]
        mask[allowed_tokens] = 0
        scores.add_(mask)
        return scores
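
For context, the block below is a minimal, self-contained sketch of the patched masking path. It is not the commit's code verbatim: the vocabulary size (8), the allow-list [1, 3, 9], and the names scores/allowed are hypothetical, with id 9 standing in for an out-of-range token such as Llama 3.2 Vision's `<|image|>` (id 128256 against a 128256-wide scores row).

import numpy as np
import torch

scores = torch.zeros(8)     # stands in for one row of model logits
allowed_tokens = [1, 3, 9]  # hypothetical: id 9 exceeds scores.shape[-1]

# Start from -inf everywhere, then re-open only the allowed token ids.
mask = torch.full((scores.shape[-1], ), -torch.inf, device=scores.device)

# Drop ids the model cannot generate before indexing, as the commit does;
# without this filter, mask[allowed_tokens] = 0 would raise an IndexError.
allowed = np.array(allowed_tokens)
allowed = allowed[allowed < scores.shape[-1]]

mask[allowed] = 0
scores.add_(mask)           # in-place add biases the logits
print(scores)               # only indices 1 and 3 stay finite

Filtering with a NumPy boolean mask keeps the bounds check vectorized rather than looping over the allow-list in Python, which is the speedup the in-diff comment refers to.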
