@@ -8,7 +8,7 @@
 from vllm.sampling_params import SamplingParams, SamplingType
 from vllm.sequence import SequenceData, SequenceGroupMetadata
 from vllm.utils import (async_tensor_h2d, is_pin_memory_available,
-                        maybe_expand_dim, is_hpu)
+                        maybe_expand_dim)

 _SAMPLING_EPS = 1e-5
 _SEED_0_REPLACEMENT = 3403598558
@@ -501,19 +501,19 @@ def from_lists(cls, temperatures: List[float], top_ps: List[float],
         sample_indices_t = torch.tensor(
             sample_indices,
             device="cpu",
-            dtype=torch.int,
+            dtype=torch.long,
             pin_memory=pin_memory,
         )
         prompt_tensor = torch.tensor(
             prompt_padded_tokens,
             device="cpu",
-            dtype=torch.int,
+            dtype=torch.long,
             pin_memory=pin_memory,
         )
         output_tensor = torch.tensor(
             output_padded_tokens,
             device="cpu",
-            dtype=torch.int,
+            dtype=torch.long,
             pin_memory=pin_memory,
         )
         # need to transpose and make contiguous to
@@ -522,7 +522,7 @@ def from_lists(cls, temperatures: List[float], top_ps: List[float],
         sampling_seeds_t = torch.tensor(
             sampling_seeds,
             device="cpu",
-            dtype=torch.int,
+            dtype=torch.long,
             pin_memory=pin_memory,
         ).T.contiguous()

@@ -571,7 +571,7 @@ def _get_sequence_seeds(
         else:
             generator = random.Random(str((seed, ) + extra_entropy))
             randint_fn = generator.randint
-        lo, hi = torch.iinfo(torch.int).min, torch.iinfo(torch.int).max
+        lo, hi = torch.iinfo(torch.long).min, torch.iinfo(torch.long).max
         # If the user/random sets seed = 0 but request should
         # have sampling, we need to change it to something
        # else. We use a constant in that case.
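Note on the dtype change: after this diff, _get_sequence_seeds draws seeds with randint across the full torch.long (int64) range, so the tensors holding those seeds (and the widened index tensors) can no longer be int32 without overflowing. A minimal sketch of the failure mode, separate from the commit itself and assuming only stock PyTorch:

    import torch

    # After the change, seeds may span the full int64 range.
    seed = torch.iinfo(torch.long).max  # 9223372036854775807

    # An int64 tensor holds the value without trouble.
    ok = torch.tensor([seed], dtype=torch.long, device="cpu")

    # An int32 tensor cannot represent it; PyTorch raises an
    # overflow RuntimeError rather than silently truncating.
    try:
        torch.tensor([seed], dtype=torch.int, device="cpu")
    except RuntimeError as err:
        print(err)  # value cannot be converted without overflow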