Commit ef3e030
043 release fixes (vllm-project#40)

This includes some fixes for supporting vllm 0.4.3+. Mostly the `generate` API changed, so we have to update our gRPC server accordingly.

Signed-off-by: Joe Runde <[email protected]>
1 parent a17c8fb commit ef3e030
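
For context, here is a minimal sketch of the call-site change this commit adapts to. The keyword names (`inputs`, `prompt`, `prompt_token_ids`) come straight from the diff below; the helper function and its argument values are illustrative assumptions, not part of this commit.

from typing import List

from vllm import SamplingParams
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.inputs import TextTokensPrompt


async def generate_pretokenized(engine: AsyncLLMEngine, text: str,
                                token_ids: List[int], request_id: str):
    """Submit a request whose prompt the caller has already tokenized.

    vLLM <= 0.4.2 took the text and the token ids as separate kwargs:
        engine.generate(prompt=text, sampling_params=params,
                        request_id=request_id, prompt_token_ids=token_ids)
    vLLM 0.4.3+ bundles both into a single `inputs` argument instead.
    """
    params = SamplingParams(max_tokens=16)
    inputs = TextTokensPrompt(prompt=text, prompt_token_ids=token_ids)
    # AsyncLLMEngine.generate is an async generator of RequestOutput objects.
    async for output in engine.generate(inputs=inputs,
                                        sampling_params=params,
                                        request_id=request_id):
        yield output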

File tree

2 files changed: +19 −11 lines


Dockerfile.ubi

Lines changed: 5 additions & 5 deletions
@@ -161,7 +161,7 @@ RUN microdnf install -y \
 
 ARG PYTHON_VERSION
 # 0.4.2 is built for CUDA 12.1 and PyTorch 2.3.0
-ARG VLLM_WHEEL_VERSION=0.4.2
+ARG VLLM_WHEEL_VERSION=0.4.3
 
 RUN curl -Lo vllm.whl https://github.com/vllm-project/vllm/releases/download/v${VLLM_WHEEL_VERSION}/vllm-${VLLM_WHEEL_VERSION}-cp${PYTHON_VERSION//.}-cp${PYTHON_VERSION//.}-manylinux1_x86_64.whl \
     && unzip vllm.whl \
@@ -208,12 +208,12 @@ COPY --link vllm vllm
 # Comment if building *.so files from scratch
 ##################################################
 # Copy the prebuilt *.so files
-# COPY --from=prebuilt-wheel --link /workspace/vllm/*.so /workspace/vllm/
-# ENV VLLM_USE_PRECOMPILED=1
+COPY --from=prebuilt-wheel --link /workspace/vllm/*.so /workspace/vllm/
+ENV VLLM_USE_PRECOMPILED=1
 ##################################################
 # Comment if not building .so files from scratch
-RUN microdnf install -y git \
-    && microdnf clean all
+#RUN microdnf install -y git \
+#    && microdnf clean all
 ##################################################
 
 # Copy over the generated *.pb2 files
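
The wheel URL in the curl step above is assembled from two build args; as a quick illustration, here is how it resolves in Python (PYTHON_VERSION=3.11 is an assumed example value, not something this diff sets):

# ${PYTHON_VERSION//.} in the Dockerfile strips the dots, so "3.11" becomes
# the "cp311" ABI tag; the values below are assumed example build args.
VLLM_WHEEL_VERSION = "0.4.3"
PYTHON_VERSION = "3.11"

cp_tag = "cp" + PYTHON_VERSION.replace(".", "")
wheel_url = (
    "https://github.com/vllm-project/vllm/releases/download/"
    f"v{VLLM_WHEEL_VERSION}/"
    f"vllm-{VLLM_WHEEL_VERSION}-{cp_tag}-{cp_tag}-manylinux1_x86_64.whl"
)
print(wheel_url)
# https://github.com/vllm-project/vllm/releases/download/v0.4.3/vllm-0.4.3-cp311-cp311-manylinux1_x86_64.whl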

vllm/entrypoints/grpc/grpc_server.py

Lines changed: 14 additions & 6 deletions
@@ -31,6 +31,7 @@
                                          TokenizeResponse)
 from vllm.entrypoints.grpc.validation import validate_input, validate_params
 from vllm.entrypoints.openai.serving_completion import merge_async_iterators
+from vllm.inputs import TextTokensPrompt
 from vllm.logger import init_logger
 from vllm.sequence import Logprob
 from vllm.tgis_utils import logs
@@ -151,13 +152,16 @@ async def Generate(self, request: BatchedGenerationRequest,
             input_ids, max_is_token_limit[i]\
                 = await self._validate_prompt_and_tokenize(
                     sampling_params, truncate_input_tokens, req.text, context)
+            inputs = TextTokensPrompt(
+                prompt=req.text,
+                prompt_token_ids=input_ids
+            )
             generators.append(
                 # prompt is supplied for observability, the text is not
                 # re-tokenized when `prompt_token_ids` is supplied
-                self.engine.generate(prompt=req.text,
+                self.engine.generate(inputs=inputs,
                                      sampling_params=sampling_params,
-                                     request_id=f"{request_id}-{i}",
-                                     prompt_token_ids=input_ids),
+                                     request_id=f"{request_id}-{i}"),
             )
 
         # TODO handle cancellation
@@ -213,13 +217,17 @@ async def GenerateStream(
             sampling_params, truncate_input_tokens, request.request.text,
             context)
 
+        inputs = TextTokensPrompt(
+            prompt=request.request.text,
+            prompt_token_ids=input_ids
+        )
+
         result_generator = self.engine.generate(
             # prompt is supplied for observability, the text is not
             # re-tokenized when `prompt_token_ids` is supplied
-            prompt=request.request.text,
+            inputs=inputs,
             sampling_params=sampling_params,
-            request_id=request_id,
-            prompt_token_ids=input_ids,
+            request_id=request_id
         )
 
         resp_options = request.params.response
