Commit ef3e030
043 release fixes (vllm-project#40)

This includes some fixes for supporting vllm 0.4.3+. Mostly the `generate` API changed, so we have to update our gRPC server accordingly.

Signed-off-by: Joe Runde <[email protected]>
1 parent a17c8fb commit ef3e030
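
For context, here is a minimal sketch of the call-site change this commit adapts to. The keyword names (`inputs`, `prompt`, `prompt_token_ids`) come straight from the diff below; the helper function and its argument values are illustrative assumptions, not part of this commit.

from typing import List

from vllm import SamplingParams
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.inputs import TextTokensPrompt


async def generate_pretokenized(engine: AsyncLLMEngine, text: str,
                                token_ids: List[int], request_id: str):
    """Submit a request whose prompt the caller has already tokenized.

    vLLM <= 0.4.2 took the text and the token ids as separate kwargs:
        engine.generate(prompt=text, sampling_params=params,
                        request_id=request_id, prompt_token_ids=token_ids)
    vLLM 0.4.3+ bundles both into a single `inputs` argument instead.
    """
    params = SamplingParams(max_tokens=16)
    inputs = TextTokensPrompt(prompt=text, prompt_token_ids=token_ids)
    # AsyncLLMEngine.generate is an async generator of RequestOutput objects.
    async for output in engine.generate(inputs=inputs,
                                        sampling_params=params,
                                        request_id=request_id):
        yield output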

File tree

2 files changed: +19 −11 lines


Dockerfile.ubi

Lines changed: 5 additions & 5 deletions
@@ -161,7 +161,7 @@ RUN microdnf install -y \
 
 ARG PYTHON_VERSION
 # 0.4.2 is built for CUDA 12.1 and PyTorch 2.3.0
-ARG VLLM_WHEEL_VERSION=0.4.2
+ARG VLLM_WHEEL_VERSION=0.4.3
 
 RUN curl -Lo vllm.whl https://github.com/vllm-project/vllm/releases/download/v${VLLM_WHEEL_VERSION}/vllm-${VLLM_WHEEL_VERSION}-cp${PYTHON_VERSION//.}-cp${PYTHON_VERSION//.}-manylinux1_x86_64.whl \
     && unzip vllm.whl \
@@ -208,12 +208,12 @@ COPY --link vllm vllm
 # Comment if building *.so files from scratch
 ##################################################
 # Copy the prebuilt *.so files
-# COPY --from=prebuilt-wheel --link /workspace/vllm/*.so /workspace/vllm/
-# ENV VLLM_USE_PRECOMPILED=1
+COPY --from=prebuilt-wheel --link /workspace/vllm/*.so /workspace/vllm/
+ENV VLLM_USE_PRECOMPILED=1
 ##################################################
 # Comment if not building .so files from scratch
-RUN microdnf install -y git \
-    && microdnf clean all
+#RUN microdnf install -y git \
+#    && microdnf clean all
 ##################################################
 
 # Copy over the generated *.pb2 files
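
The wheel URL in the curl step above is assembled from two build args; as a quick illustration, here is how it resolves in Python (PYTHON_VERSION=3.11 is an assumed example value, not something this diff sets):

# ${PYTHON_VERSION//.} in the Dockerfile strips the dots, so "3.11" becomes
# the "cp311" ABI tag; the values below are assumed example build args.
VLLM_WHEEL_VERSION = "0.4.3"
PYTHON_VERSION = "3.11"

cp_tag = "cp" + PYTHON_VERSION.replace(".", "")
wheel_url = (
    "https://github.com/vllm-project/vllm/releases/download/"
    f"v{VLLM_WHEEL_VERSION}/"
    f"vllm-{VLLM_WHEEL_VERSION}-{cp_tag}-{cp_tag}-manylinux1_x86_64.whl"
)
print(wheel_url)
# https://github.com/vllm-project/vllm/releases/download/v0.4.3/vllm-0.4.3-cp311-cp311-manylinux1_x86_64.whl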

vllm/entrypoints/grpc/grpc_server.py

Lines changed: 14 additions & 6 deletions
@@ -31,6 +31,7 @@
                                          TokenizeResponse)
 from vllm.entrypoints.grpc.validation import validate_input, validate_params
 from vllm.entrypoints.openai.serving_completion import merge_async_iterators
+from vllm.inputs import TextTokensPrompt
 from vllm.logger import init_logger
 from vllm.sequence import Logprob
 from vllm.tgis_utils import logs
@@ -151,13 +152,16 @@ async def Generate(self, request: BatchedGenerationRequest,
             input_ids, max_is_token_limit[i]\
                 = await self._validate_prompt_and_tokenize(
                     sampling_params, truncate_input_tokens, req.text, context)
+            inputs = TextTokensPrompt(
+                prompt=req.text,
+                prompt_token_ids=input_ids
+            )
             generators.append(
                 # prompt is supplied for observability, the text is not
                 # re-tokenized when `prompt_token_ids` is supplied
-                self.engine.generate(prompt=req.text,
+                self.engine.generate(inputs=inputs,
                                      sampling_params=sampling_params,
-                                     request_id=f"{request_id}-{i}",
-                                     prompt_token_ids=input_ids),
+                                     request_id=f"{request_id}-{i}"),
             )
 
         # TODO handle cancellation
@@ -213,13 +217,17 @@ async def GenerateStream(
             sampling_params, truncate_input_tokens, request.request.text,
             context)
 
+        inputs = TextTokensPrompt(
+            prompt=request.request.text,
+            prompt_token_ids=input_ids
+        )
+
         result_generator = self.engine.generate(
             # prompt is supplied for observability, the text is not
             # re-tokenized when `prompt_token_ids` is supplied
-            prompt=request.request.text,
+            inputs=inputs,
             sampling_params=sampling_params,
-            request_id=request_id,
-            prompt_token_ids=input_ids,
+            request_id=request_id
         )
 
         resp_options = request.params.response
