diff --git a/vllm/worker/embedding_model_runner.py b/vllm/worker/embedding_model_runner.py index 5c5d20a51e7d..1ccf10f1a60d 100644 --- a/vllm/worker/embedding_model_runner.py +++ b/vllm/worker/embedding_model_runner.py @@ -103,7 +103,8 @@ def execute_model( # a placeholder (it has wide hardware support). kv_caches = [ torch.tensor([], dtype=torch.float32, device=self.device) - ] * num_layers + for _ in range(num_layers) + ] execute_model_kwargs = { "input_ids": diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index 0f8b4eeacde0..90dfad62e028 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -348,7 +348,8 @@ def profile_run(self) -> None: # a placeholder (it has wide hardware support). kv_caches = [ torch.tensor([], dtype=torch.float32, device=self.device) - ] * num_layers + for _ in range(num_layers) + ] finished_requests_ids = [seq.request_id for seq in seqs] model_input = self.prepare_model_input( seqs, finished_requests_ids=finished_requests_ids) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 76c04ce66fc2..40c0f5d0d99d 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1244,9 +1244,13 @@ def profile_run(self) -> None: # it by reference, rather by specializing on the value ``None``. # the `dtype` argument does not matter, and we use `float32` as # a placeholder (it has wide hardware support). + # it is important to create tensors inside the loop, rather than + # multiplying the list, to prevent Dynamo from treating them as + # tensor aliases. kv_caches = [ torch.tensor([], dtype=torch.float32, device=self.device) - ] * num_layers + for _ in range(num_layers) + ] finished_requests_ids = [seq.request_id for seq in seqs] model_input = self.prepare_model_input( seqs, finished_requests_ids=finished_requests_ids)