This repository was archived by the owner on Oct 11, 2024. It is now read-only.

Commit 7bbd2cc (1 parent: aebf20b)

Authored by: andy-neuma, ronensc, Maxusmusti, esmeetu, WoosukKwon, and others

upstream merge sync 2024-03-11 (#108)

SUMMARY:
* upstream merge (sync) up to `657061fdced8a33a60c1b09f5da2525de9da8f03`
* some minor changes related to `ruff` and `yapf`

NOTES: we are now consistently getting out-of-memory errors when running `tests/models/test_marlin.py`. I've disabled the test and created an ASANA ticket to track the issue.

TEST PLAN: runs on remote push

---------

Signed-off-by: Tao He <[email protected]>
Signed-off-by: Yuan Tang <[email protected]>
Co-authored-by: Ronen Schaffer <[email protected]>
Co-authored-by: Mustafa Eyceoz <[email protected]>
Co-authored-by: Roy <[email protected]>
Co-authored-by: Woosuk Kwon <[email protected]>
Co-authored-by: Massimiliano Pronesti <[email protected]>
Co-authored-by: 44670 <[email protected]>
Co-authored-by: zhaoyang-star <[email protected]>
Co-authored-by: Harry Mellor <[email protected]>
Co-authored-by: Jared Moore <[email protected]>
Co-authored-by: Philipp Moritz <[email protected]>
Co-authored-by: Cade Daniel <[email protected]>
Co-authored-by: 张大成 <[email protected]>
Co-authored-by: zhangdacheng <[email protected]>
Co-authored-by: Jingru <[email protected]>
Co-authored-by: Dylan Hawk <[email protected]>
Co-authored-by: Tao He <[email protected]>
Co-authored-by: Ganesh Jagadeesan <[email protected]>
Co-authored-by: Allen.Dou <[email protected]>
Co-authored-by: Liangfu Chen <[email protected]>
Co-authored-by: CHU Tianxiang <[email protected]>
Co-authored-by: Jae-Won Chung <[email protected]>
Co-authored-by: Seonghyeon <[email protected]>
Co-authored-by: Billy Cao <[email protected]>
Co-authored-by: Nick Hill <[email protected]>
Co-authored-by: felixzhu555 <[email protected]>
Co-authored-by: br3no <[email protected]>
Co-authored-by: simon-mo <[email protected]>
Co-authored-by: Sherry <[email protected]>
Co-authored-by: Yuan Tang <[email protected]>
Co-authored-by: Huarong <[email protected]>
Co-authored-by: huohuarong <[email protected]>
Co-authored-by: Robert Shaw <[email protected]>
Co-authored-by: Robert Shaw <[email protected]>
Co-authored-by: alexm <[email protected]>
Co-authored-by: zixiao <[email protected]>
Co-authored-by: cloudhan <[email protected]>
Co-authored-by: Sage Moore <[email protected]>
Co-authored-by: ElizaWszola <[email protected]>
Co-authored-by: Michael Goin <[email protected]>
Co-authored-by: Jason Cox <[email protected]>
Co-authored-by: Zhuohan Li <[email protected]>
Co-authored-by: Roger Wang <[email protected]>
Co-authored-by: TianYu GUO <[email protected]>
Co-authored-by: Jialun Lyu <[email protected]>
Co-authored-by: ttbachyinsda <[email protected]>
Co-authored-by: guofangze <[email protected]>
Co-authored-by: Antoni Baum <[email protected]>
Co-authored-by: Avnish Narayan <[email protected]>
Co-authored-by: Chen Wang <[email protected]>
Co-authored-by: Hongxia Yang <[email protected]>
Co-authored-by: lcskrishna <[email protected]>
Co-authored-by: SangBin Cho <[email protected]>
Co-authored-by: Chujie Zheng <[email protected]>
Co-authored-by: TechxGenus <[email protected]>
Co-authored-by: Michael Goin <[email protected]>
Co-authored-by: jacobthebanana <[email protected]>
Co-authored-by: whyiug <[email protected]>
Co-authored-by: Terry <[email protected]>
Co-authored-by: Douglas Lehr <[email protected]>
Co-authored-by: andy-neuma <[email protected]>

File tree: 126 files changed (+5123 additions, -813 deletions). Selected diffs follow.

.buildkite/test-pipeline.yaml (7 additions, 1 deletion)

@@ -13,6 +13,9 @@ steps:
 
 - label: Basic Correctness Test
   command: pytest -v -s --forked basic_correctness
+
+- label: Core Test
+  command: pytest -v -s core
 
 - label: Distributed Comm Ops Test
   command: pytest -v -s --forked test_comm_ops.py
@@ -25,7 +28,7 @@ steps:
   num_gpus: 2 # only support 1 or 2 for now.
 
 - label: Engine Test
-  command: pytest -v -s engine
+  command: pytest -v -s engine test_sequence.py
 
 - label: Entrypoints Test
   command: pytest -v -s entrypoints
@@ -49,6 +52,9 @@ steps:
 - label: Worker Test
   command: pytest -v -s worker
 
+- label: Speculative decoding tests
+  command: pytest -v -s spec_decode
+
 - label: LoRA Test
   command: pytest -v -s lora --forked
 

.github/workflows/remote-push.yml (1 addition, 1 deletion)

@@ -21,7 +21,7 @@ jobs:
     uses: ./.github/workflows/build-test.yml
     with:
       label: aws-avx2-192G-4-a10g-96G
-      timeout: 180
+      timeout: 240
       gitref: '${{ github.ref }}'
       Gi_per_thread: 4
       python: ${{ matrix.python }}

Dockerfile.rocm (25 additions, 5 deletions)

@@ -23,6 +23,9 @@ RUN echo "FA_BRANCH is $FA_BRANCH"
 # In that case, we need to use the python reference attention implementation in vllm
 ARG BUILD_FA="1"
 
+# whether to build cupy on rocm
+ARG BUILD_CUPY="1"
+
 # Install some basic utilities
 RUN apt-get update && apt-get install python3 python3-pip -y
 
@@ -70,16 +73,33 @@ RUN if [ "$BUILD_FA" = "1" ]; then \
     && cd ..; \
     fi
 
-COPY ./ /app/vllm
-
-RUN python3 -m pip install --upgrade pip
-RUN python3 -m pip install xformers==0.0.23 --no-deps
-
 # Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt.
 # Manually removed it so that later steps of numpy upgrade can continue
 RUN if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \
     rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi
 
+# build cupy
+RUN if [ "$BUILD_CUPY" = "1" ]; then \
+    mkdir -p libs \
+    && cd libs \
+    && git clone -b hipgraph_enablement --recursive https://github.com/ROCm/cupy.git \
+    && cd cupy \
+    && pip install mpi4py-mpich \
+    && pip install scipy==1.9.3 \
+    && pip install cython==0.29.* \
+    && env CC=$MPI_HOME/bin/mpicc python -m pip install mpi4py \
+    && export CUPY_INSTALL_USE_HIP=1 \
+    && export ROCM_HOME=/opt/rocm \
+    && export HCC_AMDGPU_TARGET="gfx90a,gfx942,gfx1100" \
+    && pip install . \
+    && cd ..; \
+    fi
+
+COPY ./ /app/vllm
+
+RUN python3 -m pip install --upgrade pip
+RUN python3 -m pip install xformers==0.0.23 --no-deps
+
 RUN cd /app \
     && cd vllm \
     && pip install -U -r requirements-rocm.txt \

README.md (2 additions, 2 deletions; the changed lines differ only in trailing whitespace, consistent with the `ruff`/`yapf` cleanup noted in the commit message)

@@ -27,7 +27,7 @@ pip install -e .
 
 ## Quickstart
 
-Neural Magic maintains a variety of sparse models on our Hugging Face organization profiles, [neuralmagic](https://huggingface.co/neuralmagic) and [nm-testing](https://huggingface.co/nm-testing). 
+Neural Magic maintains a variety of sparse models on our Hugging Face organization profiles, [neuralmagic](https://huggingface.co/neuralmagic) and [nm-testing](https://huggingface.co/nm-testing).
 
 A collection of ready-to-use SparseGPT and GPTQ models in inference optimized marlin format are [available on Hugging Face](https://huggingface.co/collections/neuralmagic/compressed-llms-for-nm-vllm-65e73e3d51d3200e34b77431)
 
@@ -63,7 +63,7 @@
 from vllm import LLM, SamplingParams
 
 model = LLM(
-    "neuralmagic/llama2.c-stories110M-pruned50", 
+    "neuralmagic/llama2.c-stories110M-pruned50",
     sparsity="sparse_w16a16",   # If left off, model will be loaded as dense
 )
 

benchmarks/backend_request_func.py (70 additions, 0 deletions)

@@ -277,10 +277,80 @@ async def async_request_openai_completions(
     return output
 
 
+async def async_request_openai_chat_completions(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    api_url = request_func_input.api_url
+    assert api_url.endswith(
+        "v1/chat/completions"
+    ), "OpenAI Chat API URL must end with 'v1/chat/completions'."
+
+    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+        assert not request_func_input.use_beam_search
+        payload = {
+            "model": request_func_input.model,
+            "messages": [
+                {
+                    "role": "user",
+                    "content": request_func_input.prompt,
+                },
+            ],
+            "temperature": 0.0,
+            "max_tokens": request_func_input.output_len,
+            "stream": True,
+        }
+        headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
+        }
+
+        output = RequestFuncOutput()
+        output.prompt_len = request_func_input.prompt_len
+
+        generated_text = ""
+        ttft = 0
+        st = time.perf_counter()
+        try:
+            async with session.post(url=api_url, json=payload,
+                                    headers=headers) as response:
+                if response.status == 200:
+                    async for chunk in response.content:
+                        if ttft == 0:
+                            ttft = time.perf_counter() - st
+                            output.ttft = ttft
+
+                        chunk = chunk.strip()
+                        if not chunk:
+                            continue
+
+                        chunk = chunk.decode("utf-8").lstrip("data: ")
+                        if chunk == "[DONE]":
+                            latency = time.perf_counter() - st
+                        else:
+                            body = json.loads(chunk)
+                            if "content" in body["choices"][0]["delta"]:
+                                generated_text += body["choices"][0]["delta"][
+                                    "content"]
+
+                    output.generated_text = generated_text
+                    output.success = True
+                    output.latency = latency
+                else:
+                    output.success = False
+        except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
+            output.success = False
+
+    if pbar:
+        pbar.update(1)
+    return output
+
+
 ASYNC_REQUEST_FUNCS = {
     "tgi": async_request_tgi,
     "vllm": async_request_vllm,
     "deepspeed-mii": async_request_deepspeed_mii,
     "openai": async_request_openai_completions,
+    "openai-chat": async_request_openai_chat_completions,
     "tensorrt-llm": async_request_trt_llm,
 }
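
The new client is registered under the "openai-chat" key, so it can be selected like any other entry in ASYNC_REQUEST_FUNCS. As a rough, hypothetical usage sketch (not part of this commit; the RequestFuncInput field names follow this file's dataclass, while the URL, model name, and token counts are placeholders):

import asyncio

from backend_request_func import ASYNC_REQUEST_FUNCS, RequestFuncInput


async def main():
    # Look up the streaming chat-completions client added above.
    request_func = ASYNC_REQUEST_FUNCS["openai-chat"]
    result = await request_func(
        RequestFuncInput(
            prompt="Explain speculative decoding in one sentence.",
            api_url="http://localhost:8000/v1/chat/completions",
            prompt_len=9,    # assumed prompt token count
            output_len=64,   # becomes max_tokens in the payload
            model="your-model",  # placeholder model name
            use_beam_search=False,  # the new client asserts this is False
        ))
    # ttft is measured from request start to the first streamed chunk.
    print(result.success, result.ttft, result.generated_text)


asyncio.run(main())

Note that the function requires the API URL to end with 'v1/chat/completions'; the assertion at the top of the diff enforces this.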

benchmarks/benchmark_serving.py (3 additions, 3 deletions)

@@ -12,7 +12,7 @@
 On the client side, run:
     python benchmarks/benchmark_serving.py \
         --backend <backend> \
-        --tokenizer <your_model> --dataset <target_dataset> \
+        --model <your_model> --dataset <target_dataset> \
         --request-rate <request_rate>
 """
 import argparse
@@ -171,10 +171,10 @@ async def benchmark(
     else:
        raise ValueError(f"Unknown backend: {backend}")
 
-    pbar = None if disable_tqdm else tqdm(total=len(input_requests))
-
     print(f"Traffic request rate: {request_rate}")
 
+    pbar = None if disable_tqdm else tqdm(total=len(input_requests))
+
     benchmark_start_time = time.perf_counter()
     tasks = []
     async for request in get_request(input_requests, request_rate):

csrc/attention/attention_kernels.cu (0 additions, 8 deletions)

@@ -15,9 +15,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#ifdef USE_ROCM
-#include <hip/hip_runtime.h>
-#endif
 
 #include <torch/extension.h>
 #include <ATen/cuda/CUDAContext.h>
@@ -31,11 +28,6 @@
 
 #include <algorithm>
 
-#ifndef USE_ROCM
-#define WARP_SIZE 32
-#else
-#define WARP_SIZE warpSize
-#endif
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b))

csrc/cuda_compat.h (10 additions, 0 deletions)

@@ -1,5 +1,15 @@
 #pragma once
 
+#ifdef USE_ROCM
+#include <hip/hip_runtime.h>
+#endif
+
+#ifndef USE_ROCM
+#define WARP_SIZE 32
+#else
+#define WARP_SIZE warpSize
+#endif
+
 #ifndef USE_ROCM
 #define VLLM_LDG(arg) __ldg(arg)
 #else

csrc/punica/bgmv/bgmv_config.h (3 additions, 0 deletions)

@@ -14,13 +14,15 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
   f(in_T, out_T, W_T, narrow, 128) \
   f(in_T, out_T, W_T, narrow, 256) \
   f(in_T, out_T, W_T, narrow, 512) \
+  f(in_T, out_T, W_T, narrow, 768) \
   f(in_T, out_T, W_T, narrow, 1024) \
   f(in_T, out_T, W_T, narrow, 1280) \
   f(in_T, out_T, W_T, narrow, 1728) \
   f(in_T, out_T, W_T, narrow, 1792) \
   f(in_T, out_T, W_T, narrow, 2048) \
   f(in_T, out_T, W_T, narrow, 2560) \
   f(in_T, out_T, W_T, narrow, 2752) \
+  f(in_T, out_T, W_T, narrow, 2816) \
   f(in_T, out_T, W_T, narrow, 3072) \
   f(in_T, out_T, W_T, narrow, 3456) \
   f(in_T, out_T, W_T, narrow, 3584) \
@@ -36,6 +38,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
   f(in_T, out_T, W_T, narrow, 10240) \
   f(in_T, out_T, W_T, narrow, 11008) \
   f(in_T, out_T, W_T, narrow, 12288) \
+  f(in_T, out_T, W_T, narrow, 13696) \
   f(in_T, out_T, W_T, narrow, 13824) \
   f(in_T, out_T, W_T, narrow, 14336) \
   f(in_T, out_T, W_T, narrow, 16384) \
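
For orientation: each f(...) entry names a hidden dimension for which the punica batched LoRA (bgmv) kernels are explicitly instantiated at compile time, so the new entries (768, 2816, 13696) extend the set of model shapes the LoRA path can serve. A loose Python analogy of the preprocessor expansion; the rank list is an assumption for illustration and is not taken from this header:

# One kernel instantiation per (narrow rank, hidden dim) pair.
narrow_ranks = [8, 16, 32, 64]  # assumed LoRA ranks, illustration only
hidden_dims = [512, 768, 1024, 2816, 13696, 16384]  # subset of the header's list

instantiations = [(r, d) for r in narrow_ranks for d in hidden_dims]
print(f"{len(instantiations)} bgmv kernel variants, e.g. {instantiations[:3]}")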

csrc/reduction_utils.cuh (3 additions, 3 deletions)

@@ -24,15 +24,15 @@ namespace vllm {
 template<typename T>
 __inline__ __device__ T warpReduceSum(T val) {
 #pragma unroll
-  for (int mask = 16; mask > 0; mask >>= 1)
+  for (int mask = WARP_SIZE/2; mask > 0; mask >>= 1)
     val += VLLM_SHFL_XOR_SYNC(val, mask);
   return val;
 }
 
 /* Calculate the sum of all elements in a block */
 template<typename T>
 __inline__ __device__ T blockReduceSum(T val) {
-  static __shared__ T shared[32];
+  static __shared__ T shared[WARP_SIZE];
   int lane = threadIdx.x & 0x1f;
   int wid = threadIdx.x >> 5;
 
@@ -45,7 +45,7 @@ __inline__ __device__ T blockReduceSum(T val) {
 
   // Modify from blockDim.x << 5 to blockDim.x / 32. to prevent
   // blockDim.x is not divided by 32
-  val = (threadIdx.x < (blockDim.x / 32.f)) ? shared[lane] : (T)(0.0f);
+  val = (threadIdx.x < (blockDim.x / (WARP_SIZE * 1.0f))) ? shared[lane] : (T)(0.0f);
   val = warpReduceSum<T>(val);
   return val;
 }
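
The motivation behind these edits: an AMD wavefront has 64 lanes (warpSize == 64), so a reduction loop that starts its shuffle mask at a hardcoded 16 only combines half the lanes on ROCm. Below is a minimal Python model of the XOR-butterfly pattern behind VLLM_SHFL_XOR_SYNC, offered as an illustration rather than a translation of the CUDA code:

# Simulate the shuffle-XOR tree reduction across one warp/wavefront.
def warp_reduce_sum(lanes):
    width = len(lanes)  # WARP_SIZE: 32 on NVIDIA, 64 on AMD
    vals = list(lanes)
    mask = width // 2   # mirrors the new WARP_SIZE/2 starting mask
    while mask > 0:
        # Every lane adds the value held by its XOR partner, in lockstep.
        vals = [vals[i] + vals[i ^ mask] for i in range(width)]
        mask >>= 1
    return vals[0]      # every lane ends up holding the full sum


assert warp_reduce_sum(range(32)) == sum(range(32))
assert warp_reduce_sum(range(64)) == sum(range(64))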
