Commits (127)
bf041d9
Merge remote-tracking branch 'upstream/main' into llm-engine-spec
cadedaniel Apr 5, 2024
fa8705d
wip
cadedaniel Apr 7, 2024
8495321
wip
cadedaniel Apr 7, 2024
b63975b
wip
cadedaniel Apr 7, 2024
cb23e8c
wip
cadedaniel Apr 7, 2024
143ca28
wip
cadedaniel Apr 7, 2024
d8d4725
fix
cadedaniel Apr 7, 2024
b2728e0
wip
cadedaniel Apr 7, 2024
6250f6c
assertion
cadedaniel Apr 7, 2024
a930755
fix
cadedaniel Apr 7, 2024
5b896a3
fix
cadedaniel Apr 7, 2024
bb43b53
lint
cadedaniel Apr 7, 2024
cde3160
fix
cadedaniel Apr 7, 2024
dd8aeff
fix
cadedaniel Apr 7, 2024
46e4847
test
cadedaniel Apr 7, 2024
8454edc
test fixes
cadedaniel Apr 7, 2024
819e656
lint
cadedaniel Apr 7, 2024
67fd287
Merge remote-tracking branch 'upstream/main' into llm-engine-spec
cadedaniel Apr 7, 2024
c3449ba
Merge branch 'executor_base' into llm-engine-spec
cadedaniel Apr 7, 2024
d0fbe47
clean
cadedaniel Apr 7, 2024
5445af6
refactor out beam search model processor
cadedaniel Apr 7, 2024
632b439
fix
cadedaniel Apr 7, 2024
26e7368
dedup stop check
cadedaniel Apr 7, 2024
06e7c01
wip
cadedaniel Apr 7, 2024
184a52c
del
cadedaniel Apr 7, 2024
34468fe
rename
cadedaniel Apr 7, 2024
208c467
wip
cadedaniel Apr 8, 2024
3c6abcc
wip
cadedaniel Apr 8, 2024
bbbcef7
wip
cadedaniel Apr 8, 2024
b58762d
fix
cadedaniel Apr 8, 2024
8b500d4
wip
cadedaniel Apr 8, 2024
782ce22
unit tests for block decode
cadedaniel Apr 8, 2024
3062e1c
stop token ids
cadedaniel Apr 8, 2024
fba3b30
format
cadedaniel Apr 8, 2024
bda141f
fixing spec tests
cadedaniel Apr 8, 2024
49865fb
lint
cadedaniel Apr 8, 2024
1a17ed1
clean up gpu executor
cadedaniel Apr 8, 2024
dea67bb
wip
cadedaniel Apr 8, 2024
189d7eb
fix
cadedaniel Apr 8, 2024
a70a040
wip
cadedaniel Apr 8, 2024
3e1b8f5
detokenization
cadedaniel Apr 8, 2024
b9777a6
lint
cadedaniel Apr 8, 2024
29b4f12
docstrings
cadedaniel Apr 8, 2024
42aa0bc
fix
cadedaniel Apr 8, 2024
0ebd93b
more spec test
cadedaniel Apr 8, 2024
33a3d72
remove
cadedaniel Apr 8, 2024
15c942d
wip
cadedaniel Apr 8, 2024
063e34b
strip
cadedaniel Apr 8, 2024
672a855
print
cadedaniel Apr 8, 2024
b4543c8
[Model] add minicpm (#3893)
SUDA-HLT-ywfang Apr 8, 2024
f46864d
[Bugfix] Added Command-R GPTQ support (#3849)
egortolmachev Apr 8, 2024
bc0c019
[Bugfix] Enable Proper `attention_bias` Usage in Llama Model Configur…
Ki6an Apr 8, 2024
59a6abf
[Hotfix][CI/Build][Kernel] CUDA 11.8 does not support layernorm optim…
mawong-amd Apr 8, 2024
d036198
[BugFix][Model] Fix commandr RoPE max_position_embeddings (#3919)
esmeetu Apr 8, 2024
8021b38
fix flaky test
cadedaniel Apr 8, 2024
8e93fff
reduce output len
cadedaniel Apr 8, 2024
d06e9a4
strip
cadedaniel Apr 8, 2024
91cf0fc
Merge branch 'executor_base' into llm-engine-spec
cadedaniel Apr 9, 2024
6d592eb
[Core] separate distributed_init from worker (#3904)
youkaichao Apr 9, 2024
e7c7067
[Misc] [Core] Implement RFC "Augment BaseExecutor interfaces to enabl…
cadedaniel Apr 9, 2024
f6c7b2e
Zhuohan offline pr feedback
cadedaniel Apr 9, 2024
e23a43a
[Bugfix] Fix KeyError on loading GPT-NeoX (#3925)
jsato8094 Apr 9, 2024
0283fae
Merge remote-tracking branch 'upstream/main' into llm-engine-spec
cadedaniel Apr 9, 2024
96f81c4
lint
cadedaniel Apr 9, 2024
6c0b045
[ROCm][Hardware][AMD] Use Triton Kernel for default FA on ROCm (#3643)
jpvillam-amd Apr 9, 2024
11dd6eb
[Misc] Avoid loading incorrect LoRA config (#3777)
jeejeelee Apr 10, 2024
c013d32
[Benchmark] Add cpu options to bench scripts (#3915)
zedong-peng Apr 10, 2024
c2e00af
[Bugfix] fix utils.py/merge_dict func TypeError: 'type' object is no…
zhaotyer Apr 10, 2024
b3104b2
[Bugfix] Fix logits processor when prompt_logprobs is not None (#3899)
huyiwen Apr 10, 2024
0258b7a
[Bugfix] handle prompt_logprobs in _apply_min_tokens_penalty (#3876)
tjohnson31415 Apr 10, 2024
bd3c144
[Bugfix][ROCm] Add numba to Dockerfile.rocm (#3962)
WoosukKwon Apr 10, 2024
8b317c6
[Model][AMD] ROCm support for 256 head dims for Gemma (#3972)
jamestwhedbee Apr 10, 2024
e353974
[Doc] Add doc to state our model support policy (#3948)
youkaichao Apr 10, 2024
e4c4072
[Bugfix] Remove key sorting for `guided_json` parameter in OpenAi com…
dmarasco Apr 10, 2024
92cd2e2
[Doc] Fix getting stared to use publicly available model (#3963)
fpaupier Apr 10, 2024
de16919
pr feedback
cadedaniel Apr 10, 2024
934d366
[Bugfix] handle hf_config with architectures == None (#3982)
tjohnson31415 Apr 10, 2024
63e7176
[Core][Refactor] move parallel_utils into vllm/distributed (#3950)
youkaichao Apr 10, 2024
67b4221
[Core][5/N] Fully working chunked prefill e2e (#3884)
rkooo567 Apr 11, 2024
d933e50
Merge branch 'main' into llm-engine-spec
cadedaniel Apr 11, 2024
caada5e
[Core][Model] torch.compile for layernorm in commandr (#3985)
youkaichao Apr 11, 2024
e42df72
[Test] Add xformer and flash attn tests (#3961)
rkooo567 Apr 11, 2024
e9da5a4
[Misc] Add indirection layer for custom ops (#3913)
jikunshang Apr 11, 2024
f3d0bf7
[Doc][Installation] delete python setup.py develop (#3989)
youkaichao Apr 11, 2024
c1dc547
[Kernel] Fused MoE Config for Mixtral 8x22 (#4002)
ywang96 Apr 11, 2024
08ccee1
punica fix-bgmv-kernel-640 (#4007)
kingljl Apr 11, 2024
8afca50
[Hardware][Intel] Isolate CPUModelRunner and ModelRunner for better m…
bigPYJ1151 Apr 11, 2024
a10d305
[Core] Set `linear_weights` directly on the layer (#3977)
Yard1 Apr 11, 2024
559eb85
[Core] init_distributed_environment align with init_process_group(#4014)
youkaichao Apr 11, 2024
95e7d4a
Fix echo/logprob OpenAI completion bug (#3441)
dylanwhawk Apr 11, 2024
1e96c33
Add extra punica sizes to support bigger vocabs (#4015)
Yard1 Apr 11, 2024
e46a60a
[BugFix] Fix handling of stop strings and stop token ids (#3672)
njhill Apr 11, 2024
c2b4a1b
[Doc] Add typing hints / mypy types cleanup (#3816)
michaelfeil Apr 12, 2024
1096717
[Core] Support LoRA on quantized models (#4012)
jeejeelee Apr 12, 2024
7fd3949
[Frontend][Core] Move `merge_async_iterators` to utils (#4026)
DarkLight1337 Apr 12, 2024
36729ba
[Test] Test multiple attn backend for chunked prefill. (#4023)
rkooo567 Apr 12, 2024
96b6a6d
[Bugfix] fix type hint for py 3.8 (#4036)
youkaichao Apr 12, 2024
d4ec9ff
[Misc] Fix typo in scheduler.py (#4022)
zhuohan123 Apr 12, 2024
09473ee
[mypy] Add mypy type annotation part 1 (#4006)
rkooo567 Apr 12, 2024
fbb9d9e
[Core] fix custom allreduce default value (#4040)
youkaichao Apr 12, 2024
d04973a
Fix triton compilation issue (#3984)
Bellk17 Apr 12, 2024
b8aacac
[Bugfix] Fix LoRA bug (#4032)
jeejeelee Apr 12, 2024
546e721
[CI/Test] expand ruff and yapf for all supported python version (#4037)
youkaichao Apr 13, 2024
5c2e66e
[Bugfix] More type hint fixes for py 3.8 (#4039)
dylanwhawk Apr 13, 2024
98afde1
[Core][Distributed] improve logging for init dist (#4042)
youkaichao Apr 13, 2024
ec8e3c6
[Bugfix] fix_log_time_in_metrics (#4050)
zspo Apr 13, 2024
0a430b4
[Bugfix] fix_small_bug_in_neuron_executor (#4051)
zspo Apr 13, 2024
989ae25
[Kernel] Add punica dimension for Baichuan-13B (#4053)
jeejeelee Apr 13, 2024
711a000
[Frontend] [Core] feat: Add model loading using `tensorizer` (#3476)
sangstar Apr 14, 2024
2cd6b4f
[Core] avoid too many cuda context by caching p2p test (#4021)
youkaichao Apr 14, 2024
563c54f
[BugFix] Fix tensorizer extra in setup.py (#4072)
njhill Apr 14, 2024
aceb17c
[Docs] document that mixtral 8x22b is supported (#4073)
simon-mo Apr 14, 2024
8db1bf3
[Misc] Upgrade triton to 2.2.0 (#4061)
esmeetu Apr 15, 2024
e11e200
[Bugfix] Fix filelock version requirement (#4075)
zhuohan123 Apr 15, 2024
0003e91
[Misc][Minor] Fix CPU block num log in CPUExecutor. (#4088)
bigPYJ1151 Apr 15, 2024
eb46fbf
[Core] Simplifications to executor classes (#4071)
njhill Apr 15, 2024
d619ae2
[Doc] Add better clarity for tensorizer usage (#4090)
sangstar Apr 15, 2024
4695397
[Bugfix] Fix ray workers profiling with nsight (#4095)
rickyyx Apr 15, 2024
37e84a4
[Typing] Fix Sequence type GenericAlias only available after Python 3…
rkooo567 Apr 15, 2024
4e7ee66
[Core] Fix engine-use-ray broken (#4105)
rkooo567 Apr 16, 2024
0543476
LM Format Enforcer Guided Decoding Support (#3868)
noamgat Apr 16, 2024
2a19f5e
allow append empty tokens in block table
cadedaniel Apr 16, 2024
79325d3
Merge remote-tracking branch 'upstream/main' into llm-engine-spec
cadedaniel Apr 16, 2024
b6e9e82
rebase on stop string fixes
cadedaniel Apr 16, 2024
bf0c37c
test spec
cadedaniel Apr 16, 2024
a158256
lint & mypy
cadedaniel Apr 16, 2024
5a69f6c
doc
cadedaniel Apr 16, 2024
13 changes: 12 additions & 1 deletion .buildkite/test-pipeline.yaml
@@ -12,7 +12,13 @@ steps:
command: pytest -v -s async_engine

- label: Basic Correctness Test
command: pytest -v -s basic_correctness
commands:
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
- VLLM_ATTENTION_BACKEND=ROCM_FLASH pytest -v -s basic_correctness/test_basic_correctness.py
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_ATTENTION_BACKEND=ROCM_FLASH pytest -v -s basic_correctness/test_chunked_prefill.py

- label: Core Test
command: pytest -v -s core
@@ -29,6 +35,8 @@ steps:
- pytest -v -s test_pynccl.py
- TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_basic_distributed_correctness.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_chunked_prefill_distributed.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_chunked_prefill_distributed.py

- label: Engine Test
command: pytest -v -s engine tokenization test_sequence.py test_config.py
@@ -83,6 +91,9 @@ steps:
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 4

- label: Tensorizer Test
command: apt-get install curl libsodium23 && pytest -v -s tensorizer

- label: Metrics Test
command: pytest -v -s metrics

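Note on the Basic Correctness / chunked-prefill entries above: the same test files are fanned out across attention backends through the `VLLM_ATTENTION_BACKEND` environment variable. A minimal local equivalent of that matrix, as a sketch only (assumes the commands are run from the repository's tests directory; which backends actually work depends on the installed hardware and packages):

# Sketch: run the basic-correctness suites once per attention backend,
# mirroring the Buildkite commands above. Backend names and test paths are
# copied verbatim from the pipeline entries.
import os
import subprocess

BACKENDS = ["XFORMERS", "FLASH_ATTN", "ROCM_FLASH"]
TESTS = [
    "basic_correctness/test_basic_correctness.py",
    "basic_correctness/test_chunked_prefill.py",
]

for backend in BACKENDS:
    for test in TESTS:
        env = {**os.environ, "VLLM_ATTENTION_BACKEND": backend}
        subprocess.run(["pytest", "-v", "-s", test], env=env, check=True)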
50 changes: 50 additions & 0 deletions .github/workflows/mypy.yaml
@@ -0,0 +1,50 @@
name: mypy

on:
# Trigger the workflow on push or pull request,
# but only for the main branch
push:
branches:
- main
pull_request:
branches:
- main

jobs:
ruff:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.8", "3.9", "3.10", "3.11"]
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install mypy==1.9.0
pip install types-setuptools
pip install types-PyYAML
pip install types-requests
pip install types-setuptools
- name: Mypy
run: |
mypy vllm/attention/*.py --follow-imports=skip --config-file pyproject.toml
mypy vllm/core/*.py --follow-imports=skip --config-file pyproject.toml
mypy vllm/distributed/*.py --follow-imports=skip --config-file pyproject.toml
mypy vllm/entrypoints/*.py --follow-imports=skip --config-file pyproject.toml
mypy vllm/executor/*.py --follow-imports=skip --config-file pyproject.toml
mypy vllm/usage/*.py --follow-imports=skip --config-file pyproject.toml
mypy vllm/*.py --follow-imports=skip --config-file pyproject.toml
mypy vllm/transformers_utils/*.py --follow-imports=skip --config-file pyproject.toml

# TODO(sang): Follow up
# mypy vllm/engine/*.py --follow-imports=skip --config-file pyproject.toml
# mypy vllm/worker/*.py --follow-imports=skip --config-file pyproject.toml
# mypy vllm/spec_decoding/*.py --follow-imports=skip --config-file pyproject.toml
# mypy vllm/model_executor/*.py --follow-imports=skip --config-file pyproject.toml
# mypy vllm/lora/*.py --follow-imports=skip --config-file pyproject.toml

2 changes: 1 addition & 1 deletion .github/workflows/ruff.yml
@@ -15,7 +15,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.10"]
python-version: ["3.8", "3.9", "3.10", "3.11"]
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
2 changes: 1 addition & 1 deletion .github/workflows/yapf.yml
@@ -14,7 +14,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.10"]
python-version: ["3.8", "3.9", "3.10", "3.11"]
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
16 changes: 15 additions & 1 deletion Dockerfile.rocm
@@ -23,6 +23,9 @@ RUN echo "FA_BRANCH is $FA_BRANCH"
# In that case, we need to use the python reference attention implementation in vllm
ARG BUILD_FA="1"

# whether to build triton on rocm
ARG BUILD_TRITON="1"

# Install some basic utilities
RUN apt-get update && apt-get install python3 python3-pip -y

@@ -75,9 +78,20 @@ RUN if [ "$BUILD_FA" = "1" ]; then \
RUN if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \
rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi

# build triton
RUN if [ "$BUILD_TRITON" = "1" ]; then \
mkdir -p libs \
&& cd libs \
&& pip uninstall -y triton \
&& git clone https://github.com/ROCm/triton.git \
&& cd triton/python \
&& pip3 install . \
&& cd ../..; \
fi

COPY ./ /app/vllm

RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --upgrade pip numba
RUN python3 -m pip install xformers==0.0.23 --no-deps

RUN cd /app \
3 changes: 2 additions & 1 deletion README.md
@@ -70,8 +70,9 @@ vLLM seamlessly supports many Hugging Face models, including the following architectures:
- InternLM2 (`internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc.)
- Jais (`core42/jais-13b`, `core42/jais-13b-chat`, `core42/jais-30b-v3`, `core42/jais-30b-chat-v3`, etc.)
- LLaMA & LLaMA-2 (`meta-llama/Llama-2-70b-hf`, `lmsys/vicuna-13b-v1.3`, `young-geng/koala`, `openlm-research/open_llama_13b`, etc.)
- MiniCPM (`openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, etc.)
- Mistral (`mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.)
- Mixtral (`mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, etc.)
- Mixtral (`mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc.)
- MPT (`mosaicml/mpt-7b`, `mosaicml/mpt-30b`, etc.)
- OLMo (`allenai/OLMo-1B`, `allenai/OLMo-7B`, etc.)
- OPT (`facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.)
62 changes: 33 additions & 29 deletions benchmarks/backend_request_func.py
@@ -27,8 +27,8 @@ class RequestFuncInput:
class RequestFuncOutput:
generated_text: str = ""
success: bool = False
latency: float = 0
ttft: float = 0 # Time to first token
latency: float = 0.0
ttft: float = 0.0 # Time to first token
itl: List[float] = field(
default_factory=list) # List of inter-token latencies
prompt_len: int = 0
@@ -58,23 +58,24 @@ async def async_request_tgi(
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len

ttft = 0
ttft = 0.0
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload) as response:
if response.status == 200:
async for chunk in response.content:
chunk = chunk.strip()
if not chunk:
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue

chunk = remove_prefix(chunk.decode("utf-8"), "data:")
chunk = remove_prefix(chunk_bytes.decode("utf-8"),
"data:")

data = json.loads(chunk)
timestamp = time.perf_counter()
# First token
if ttft == 0:
if ttft == 0.0:
ttft = time.perf_counter() - st
output.ttft = ttft

@@ -119,23 +120,24 @@ async def async_request_trt_llm(
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len

ttft = 0
ttft = 0.0
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload) as response:
if response.status == 200:
async for chunk in response.content:
chunk = chunk.strip()
if not chunk:
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue

chunk = remove_prefix(chunk.decode("utf-8"), "data:")
chunk = remove_prefix(chunk_bytes.decode("utf-8"),
"data:")

data = json.loads(chunk)
timestamp = time.perf_counter()
# First token
if ttft == 0:
if ttft == 0.0:
ttft = time.perf_counter() - st
output.ttft = ttft

@@ -151,7 +153,7 @@ async def async_request_trt_llm(
output.success = True

else:
output.error = response.reason
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
@@ -195,7 +197,7 @@ async def async_request_deepspeed_mii(
output.generated_text = parsed_resp["text"][0]
output.success = True
else:
output.error = response.reason
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
@@ -234,19 +236,20 @@ async def async_request_openai_completions(
output.prompt_len = request_func_input.prompt_len

generated_text = ""
ttft = 0
ttft = 0.0
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload,
headers=headers) as response:
if response.status == 200:
async for chunk in response.content:
chunk = chunk.strip()
if not chunk:
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue

chunk = remove_prefix(chunk.decode("utf-8"), "data: ")
chunk = remove_prefix(chunk_bytes.decode("utf-8"),
"data: ")
if chunk == "[DONE]":
latency = time.perf_counter() - st
else:
@@ -255,7 +258,7 @@
if data["choices"][0]["text"]:
timestamp = time.perf_counter()
# First token
if ttft == 0:
if ttft == 0.0:
ttft = time.perf_counter() - st
output.ttft = ttft

@@ -315,19 +318,20 @@ async def async_request_openai_chat_completions(
output.prompt_len = request_func_input.prompt_len

generated_text = ""
ttft = 0
ttft = 0.0
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload,
headers=headers) as response:
if response.status == 200:
async for chunk in response.content:
chunk = chunk.strip()
if not chunk:
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue

chunk = remove_prefix(chunk.decode("utf-8"), "data: ")
chunk = remove_prefix(chunk_bytes.decode("utf-8"),
"data: ")
if chunk == "[DONE]":
latency = time.perf_counter() - st
else:
Expand All @@ -337,7 +341,7 @@ async def async_request_openai_chat_completions(
delta = data["choices"][0]["delta"]
if delta.get("content", None):
# First token
if ttft == 0:
if ttft == 0.0:
ttft = time.perf_counter() - st
output.ttft = ttft

@@ -354,7 +358,7 @@
output.success = True
output.latency = latency
else:
output.error = response.reason
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
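The streaming handlers above strip the SSE `data:` prefix from each decoded chunk via a `remove_prefix` helper. That helper is defined elsewhere in the file; a plausible minimal sketch of it, assuming it exists as a compatibility shim because `str.removeprefix` is only available on Python 3.9+ (the real implementation may differ):

# Hypothetical sketch of the remove_prefix helper used in the handlers above.
def remove_prefix(text: str, prefix: str) -> str:
    # Drop `prefix` from the start of `text` if present, otherwise return text unchanged.
    if text.startswith(prefix):
        return text[len(prefix):]
    return text


# Example: server-sent-event chunks arrive as b'data: {...}' byte strings.
chunk_bytes = b'data: {"choices": [{"text": "hi"}]}'
chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ")
print(chunk)  # -> {"choices": [{"text": "hi"}]}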
7 changes: 3 additions & 4 deletions benchmarks/benchmark_latency.py
@@ -169,16 +169,15 @@ def run_to_completion(profile_dir: Optional[str] = None):
"--device",
type=str,
default="cuda",
choices=["cuda"],
help='device type for vLLM execution, supporting CUDA only currently.')
choices=["cuda", "cpu"],
help='device type for vLLM execution, supporting CUDA and CPU.')
parser.add_argument('--block-size',
type=int,
default=16,
help='block size of key/value cache')
parser.add_argument(
'--enable-chunked-prefill',
type=bool,
default=False,
action='store_true',
help='If True, the prefill requests can be chunked based on the '
'max_num_batched_tokens')
parser.add_argument(
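The `--enable-chunked-prefill` change above replaces `type=bool` with `action='store_true'`. This fixes a standard argparse pitfall: argparse calls `bool()` on the raw command-line string, and any non-empty string (including "False") is truthy, so the flag could never be turned off. A small standalone demonstration of the pitfall and the fix (illustrative only, not part of the diff):

import argparse

# Broken pattern: bool("False") is True because the string is non-empty,
# so the option cannot actually be disabled from the command line.
broken = argparse.ArgumentParser()
broken.add_argument("--enable-feature", type=bool, default=False)
print(broken.parse_args(["--enable-feature", "False"]).enable_feature)  # True

# Fixed pattern: store_true leaves the flag False unless it is passed at all.
fixed = argparse.ArgumentParser()
fixed.add_argument("--enable-feature", action="store_true")
print(fixed.parse_args([]).enable_feature)                    # False
print(fixed.parse_args(["--enable-feature"]).enable_feature)  # True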