
Commit d25ec0e

Add TP support
1 parent b65ed98 commit d25ec0e

4 files changed: +21 -12 lines

examples/offline_inference/offline_inference.py

Lines changed: 1 addition & 1 deletion

@@ -11,7 +11,7 @@
 sampling_params = SamplingParams() #temperature=0.8, top_p=0.95)
 
 # Create an LLM.
-llm = LLM(model="Qwen/Qwen2-1.5B-Instruct", max_model_len=512, max_num_seqs=16)
+llm = LLM(model="Qwen/Qwen2-1.5B-Instruct", max_model_len=256, max_num_seqs=16, tensor_parallel_size=4)
 # Generate texts from the prompts. The output is a list of RequestOutput objects
 # that contain the prompt, generated text, and other information.
 outputs = llm.generate(prompts, sampling_params)
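
In context, this change makes the offline example shard the 1.5B model across four chips while halving the context length. A minimal end-to-end sketch of running the new configuration (assuming a single host with four TPU chips visible to vLLM; the prompts are illustrative):

    from vllm import LLM, SamplingParams

    prompts = ["Hello, my name is", "The capital of France is"]
    sampling_params = SamplingParams()  # default sampling, as in the example

    # tensor_parallel_size=4 splits each layer's weight matrices across
    # 4 chips, so every forward pass runs collectively on all of them.
    llm = LLM(model="Qwen/Qwen2-1.5B-Instruct",
              max_model_len=256,
              max_num_seqs=16,
              tensor_parallel_size=4)

    outputs = llm.generate(prompts, sampling_params)
    for output in outputs:
        print(output.prompt, "->", output.outputs[0].text)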

requirements-tpu.txt

Lines changed: 1 addition & 0 deletions

@@ -9,6 +9,7 @@ setuptools-scm>=8
 wheel
 jinja2
 ray[default]
+ray[adag] # TODO: Remove this
 
 # Install torch_xla
 --pre
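
The ray[adag] extra pulls in the dependencies for Ray's experimental compiled-graph ("accelerated DAG") API, which backs the executor's _compiled_ray_dag() path on CUDA. A hedged sketch of how such a graph is typically built and compiled (the InputNode/MultiOutputNode/experimental_compile names follow Ray's experimental interface at the time and may change, hence the TODO):

    import ray
    from ray.dag import InputNode, MultiOutputNode

    @ray.remote
    class EchoWorker:
        # Stand-in actor; vLLM binds RayWorkerWrapper.execute_model instead.
        def execute_model(self, scheduler_output):
            return f"ran {scheduler_output}"

    workers = [EchoWorker.remote() for _ in range(2)]

    # Bind every worker's execute_model to one shared input node, then
    # compile the whole graph once so each step is a single dispatch.
    with InputNode() as scheduler_output:
        nodes = [w.execute_model.bind(scheduler_output) for w in workers]
    forward_dag = MultiOutputNode(nodes).experimental_compile()

    print(ray.get(forward_dag.execute("step-0")))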

vllm/platforms/tpu.py

Lines changed: 1 addition & 1 deletion

@@ -83,7 +83,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         scheduler_config = vllm_config.scheduler_config
         if parallel_config.worker_cls == "auto":
             if envs.VLLM_USE_V1:
-                parallel_config.worker_cls = "vllm.v1.worker.tpu_worker.TRUWorker"
+                parallel_config.worker_cls = "vllm.v1.worker.tpu_worker.TPUWorker"
             else:
                 if scheduler_config.is_multi_step:
                     parallel_config.worker_cls = \
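
This hunk fixes a typo in the V1 worker class path ("TRUWorker" -> "TPUWorker"). The string matters because worker_cls is resolved dynamically from the dotted path when workers are created, so a misspelling only surfaces at import time. A minimal sketch of that kind of resolution (resolve_worker_cls is a hypothetical helper, not the actual vLLM loader):

    import importlib

    def resolve_worker_cls(dotted_path: str):
        # "vllm.v1.worker.tpu_worker.TPUWorker" -> module path + class name.
        module_name, class_name = dotted_path.rsplit(".", 1)
        module = importlib.import_module(module_name)
        # A typo such as "TRUWorker" raises AttributeError here.
        return getattr(module, class_name)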

vllm/v1/executor/ray_executor.py

Lines changed: 18 additions & 10 deletions

@@ -7,6 +7,7 @@
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.utils import get_distributed_init_method, get_ip, get_open_port
+from vllm.platforms import current_platform
 from vllm.v1.executor.abstract import Executor
 from vllm.v1.executor.ray_utils import (RayWorkerWrapper,
                                         initialize_ray_cluster, ray)

@@ -27,13 +28,17 @@ def __init__(self, vllm_config: VllmConfig) -> None:
         self.vllm_config = vllm_config
         self.parallel_config = vllm_config.parallel_config
         self.model_config = vllm_config.model_config
+
         self.forward_dag: Optional[ray.dag.CompiledDAG] = None
 
         # Disable Ray usage stats collection.
         ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0")
         if ray_usage != "1":
             os.environ["RAY_USAGE_STATS_ENABLED"] = "0"
 
+        self.device_str = "TPU" if current_platform.is_tpu() else "GPU"
+        self.use_dag = current_platform.is_cuda()
+
         initialize_ray_cluster(self.parallel_config)
         placement_group = self.parallel_config.placement_group

@@ -42,16 +47,16 @@ def __init__(self, vllm_config: VllmConfig) -> None:
 
     def _init_workers_ray(self, placement_group: "PlacementGroup",
                           **ray_remote_kwargs):
-        # A list of workers to run a model.
-        self.workers: List[RayWorkerWrapper] = []
-        if self.parallel_config.ray_workers_use_nsight:
+        if (current_platform.is_cuda()
+                and self.parallel_config.ray_workers_use_nsight):
             ray_remote_kwargs = self._configure_ray_workers_use_nsight(
                 ray_remote_kwargs)
 
         # Create the workers.
+        self.workers: List[RayWorkerWrapper] = []
         driver_ip = get_ip()
         for bundle_id, bundle in enumerate(placement_group.bundle_specs):
-            if not bundle.get("GPU", 0):
+            if not bundle.get(self.device_str, 0):
                 # Skip bundles that don't have GPUs,
                 # as each worker needs one GPU.
                 continue

@@ -63,7 +68,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
 
             worker = ray.remote(
                 num_cpus=0,
-                num_gpus=1,
+                resources={self.device_str: 1},
                 scheduling_strategy=scheduling_strategy,
                 **ray_remote_kwargs,
             )(RayWorkerWrapper).remote(vllm_config=self.vllm_config)

@@ -279,11 +284,14 @@ def execute_model(
         self,
         scheduler_output,
     ) -> ModelRunnerOutput:
-        if self.forward_dag is None:
-            self.forward_dag = self._compiled_ray_dag()
-        # Only the first worker (with rank 0) returns the execution result.
-        # Others return None.
-        output = ray.get(self.forward_dag.execute(scheduler_output))[0]
+        if self.use_dag:
+            if self.forward_dag is None:
+                self.forward_dag = self._compiled_ray_dag()
+
+            output = ray.get(self.forward_dag.execute(scheduler_output))[0]
+        else:
+            output = self._run_workers("execute_model", scheduler_output)[0]
+
         return output
 
     def profile(self, is_start=True):
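
The executor change keeps the lazily compiled Ray DAG on CUDA and falls back to plain per-step _run_workers RPCs on TPU, where the compiled-graph path is not used. The control flow, reduced to a self-contained toy (ToyExecutor and its callables are illustrative stand-ins, not the vLLM classes):

    from typing import Callable, List, Optional

    class ToyExecutor:
        def __init__(self, workers: List[Callable], use_dag: bool) -> None:
            self.workers = workers      # rank 0 first, like vLLM's self.workers
            self.use_dag = use_dag      # True on CUDA, False on TPU
            self._dag: Optional[Callable] = None

        def _compiled_dag(self) -> Callable:
            # Stand-in for ray.dag compilation: fuse all worker calls
            # into a single callable that is built once and then reused.
            def dag(inp):
                return [w(inp) for w in self.workers]
            return dag

        def execute_model(self, scheduler_output):
            if self.use_dag:
                if self._dag is None:
                    self._dag = self._compiled_dag()  # compile once, reuse
                results = self._dag(scheduler_output)
            else:
                # TPU path: invoke every worker directly on each step.
                results = [w(scheduler_output) for w in self.workers]
            # Only the first worker (rank 0) returns the execution result;
            # the others return None, so take index 0.
            return results[0]

    workers = [lambda s: f"rank 0 ran {s}"] + [(lambda s: None)] * 3
    print(ToyExecutor(workers, use_dag=False).execute_model("step-1"))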
