Skip to content

Commit 5f24f38

Browse files
authored
Merge branch 'main' into refactor-wait-kv-load
2 parents 1f0041a + 4b1ff13 commit 5f24f38

File tree

99 files changed

+2063
-410
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

99 files changed

+2063
-410
lines changed

.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml

Lines changed: 0 additions & 12 deletions
This file was deleted.

.buildkite/test-pipeline.yaml

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -472,7 +472,9 @@ steps:
472472
- tests/compile
473473
commands:
474474
- pytest -v -s compile/test_full_graph.py
475-
- pytest -v -s compile/test_fusions_e2e.py
475+
# Limit to no custom ops to reduce running time
476+
# Wrap with quotes to escape yaml and avoid starting -k string with a -
477+
- "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and -quant_fp8'"
476478

477479
- label: Cudagraph test
478480
timeout_in_minutes: 20
@@ -546,8 +548,11 @@ steps:
546548

547549
- label: Model Executor Test # 23min
548550
timeout_in_minutes: 35
551+
torch_nightly: true
549552
mirror_hardwares: [amdexperimental]
550553
source_file_dependencies:
554+
- vllm/engine/arg_utils.py
555+
- vllm/config/model.py
551556
- vllm/model_executor
552557
- tests/model_executor
553558
- tests/entrypoints/openai/test_tensorizer_entrypoint.py
@@ -926,6 +931,29 @@ steps:
926931
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
927932
# this runner has 2 GPUs available even though num_gpus=2 is not set
928933
- pytest -v -s tests/compile/test_fusion_all_reduce.py
934+
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
935+
# Wrap with quotes to escape yaml
936+
- "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'"
937+
938+
- label: Blackwell Fusion E2E Tests # 30 min
939+
timeout_in_minutes: 40
940+
working_dir: "/vllm-workspace/"
941+
gpu: b200
942+
optional: true
943+
num_gpus: 2
944+
source_file_dependencies:
945+
- csrc/quantization/fp4/
946+
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
947+
- vllm/v1/attention/backends/flashinfer.py
948+
- vllm/compilation/
949+
# can affect pattern matching
950+
- vllm/model_executor/layers/layernorm.py
951+
- vllm/model_executor/layers/activation.py
952+
- vllm/model_executor/layers/quantization/input_quant_fp8.py
953+
- tests/compile/test_fusions_e2e.py
954+
commands:
955+
- nvidia-smi
956+
# Run all e2e fusion tests
929957
- pytest -v -s tests/compile/test_fusions_e2e.py
930958

931959
- label: Blackwell GPT-OSS Eval

.github/CODEOWNERS

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,3 +127,8 @@ mkdocs.yaml @hmellor
127127
/vllm/config/pooler.py @noooop
128128
/vllm/pooling_params.py @noooop
129129
/vllm/model_executor/layers/pooler.py @noooop
130+
131+
# Security guide and policies
132+
/docs/usage/security.md @russellb
133+
/SECURITY.md @russellb
134+
/docs/contributing/vulnerability_management.md @russellb

benchmarks/kernels/benchmark_grouped_gemm_cutlass.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@
1616
from vllm.utils.argparse_utils import FlexibleArgumentParser
1717

1818
DEFAULT_MODELS = [
19-
"nm-testing/Mixtral-8x7B-Instruct-v0.1",
20-
"nm-testing/deepseekv2-lite",
19+
"mistralai/Mixtral-8x7B-Instruct-v0.1",
20+
"deepseek-ai/DeepSeek-V2-Lite",
2121
"ibm-granite/granite-3.0-1b-a400m",
2222
"ibm-granite/granite-3.0-3b-a800m",
2323
]

benchmarks/kernels/benchmark_shapes.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,11 +78,11 @@
7878
}
7979

8080
WEIGHT_SHAPES_MOE = {
81-
"nm-testing/Mixtral-8x7B-Instruct-v0.1": [
81+
"mistralai/Mixtral-8x7B-Instruct-v0.1": [
8282
[8, 2, 4096, 28672],
8383
[8, 2, 14336, 4096],
8484
],
85-
"nm-testing/deepseekv2-lite": [
85+
"deepseek-ai/DeepSeek-V2-Lite": [
8686
[64, 6, 2048, 1408],
8787
],
8888
"ibm-granite/granite-3.0-1b-a400m": [

csrc/quantization/fp4/nvfp4_quant_kernels.cu

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,13 @@
3131

3232
namespace vllm {
3333

34+
template <typename Int>
35+
__host__ __device__ inline Int round_up(Int x, Int y) {
36+
static_assert(std::is_integral_v<Int>,
37+
"round_up argument must be integral type");
38+
return (x + y - 1) / y * y;
39+
}
40+
3441
// Use UE4M3 by default.
3542
template <class Type, bool UE8M0_SF = false>
3643
__global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
@@ -42,10 +49,21 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
4249
static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
4350
"Vec size is not matched.");
4451

52+
int sf_m = round_up<int>(numRows, 128);
53+
int sf_n_unpadded = numCols / CVT_FP4_SF_VEC_SIZE;
54+
int sf_n_int = round_up<int>(sf_n_unpadded, 4) / 4;
55+
for (int row = numRows + blockIdx.x; row < sf_m; row += gridDim.x) {
56+
// Each thread writes 4 uint32_t elements.
57+
for (int col = sf_n_unpadded + threadIdx.x * 4; col < sf_n_int;
58+
col += blockDim.x * 4) {
59+
SFout[row * sf_n_int + col] = 0x00;
60+
}
61+
}
62+
4563
// Get the global scaling factor, which will be applied to the SF.
4664
// Note SFScale is the same as next GEMM's alpha, which is
4765
// (448.f / (Alpha_A / 6.f)).
48-
float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[0];
66+
float const global_scale = SFScale == nullptr ? 1.0f : SFScale[0];
4967

5068
// Input tensor row/col loops.
5169
for (int rowIdx = blockIdx.x; rowIdx < numRows; rowIdx += gridDim.x) {
@@ -64,7 +82,7 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
6482
rowIdx, colIdx, numCols, SFout);
6583

6684
out_pos =
67-
cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, SFScaleVal, sf_out);
85+
cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, global_scale, sf_out);
6886
}
6987
}
7088
}

docs/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ vLLM is flexible and easy to use with:
5656
- Tensor, pipeline, data and expert parallelism support for distributed inference
5757
- Streaming outputs
5858
- OpenAI-compatible API server
59-
- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend.
59+
- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, Arm CPUs and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend.
6060
- Prefix caching support
6161
- Multi-LoRA support
6262

docs/deployment/frameworks/helm.md

Lines changed: 44 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ Before you begin, ensure that you have the following:
1313
- A running Kubernetes cluster
1414
- NVIDIA Kubernetes Device Plugin (`k8s-device-plugin`): This can be found at [https://github.com/NVIDIA/k8s-device-plugin](https://github.com/NVIDIA/k8s-device-plugin)
1515
- Available GPU resources in your cluster
16-
- An S3 with the model which will be deployed
16+
- (Optional) An S3 bucket or other storage with the model weights, if using automatic model download
1717

1818
## Installing the chart
1919

@@ -61,10 +61,16 @@ The following table describes configurable parameters of the chart in `values.ya
6161
| deploymentStrategy | object | {} | Deployment strategy configuration |
6262
| externalConfigs | list | [] | External configuration |
6363
| extraContainers | list | [] | Additional containers configuration |
64-
| extraInit | object | {"pvcStorage":"1Gi","s3modelpath":"relative_s3_model_path/opt-125m", "awsEc2MetadataDisabled": true} | Additional configuration for the init container |
65-
| extraInit.pvcStorage | string | "1Gi" | Storage size of the s3 |
66-
| extraInit.s3modelpath | string | "relative_s3_model_path/opt-125m" | Path of the model on the s3 which hosts model weights and config files |
67-
| extraInit.awsEc2MetadataDisabled | boolean | true | Disables the use of the Amazon EC2 instance metadata service |
64+
| extraInit | object | {"modelDownload":{"enabled":true},"initContainers":[],"pvcStorage":"1Gi"} | Additional configuration for init containers |
65+
| extraInit.modelDownload | object | {"enabled":true} | Model download functionality configuration |
66+
| extraInit.modelDownload.enabled | bool | true | Enable automatic model download job and wait container |
67+
| extraInit.modelDownload.image | object | {"repository":"amazon/aws-cli","tag":"2.6.4","pullPolicy":"IfNotPresent"} | Image for model download operations |
68+
| extraInit.modelDownload.waitContainer | object | {} | Wait container configuration (command, args, env) |
69+
| extraInit.modelDownload.downloadJob | object | {} | Download job configuration (command, args, env) |
70+
| extraInit.initContainers | list | [] | Custom init containers (appended after model download if enabled) |
71+
| extraInit.pvcStorage | string | "1Gi" | Storage size for the PVC |
72+
| extraInit.s3modelpath | string | "relative_s3_model_path/opt-125m" | (Optional) Path of the model on S3 |
73+
| extraInit.awsEc2MetadataDisabled | bool | true | (Optional) Disable AWS EC2 metadata service |
6874
| extraPorts | list | [] | Additional ports configuration |
6975
| gpuModels | list | ["TYPE_GPU_USED"] | Type of gpu used |
7076
| image | object | {"command":["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"],"repository":"vllm/vllm-openai","tag":"latest"} | Image configuration |
@@ -98,3 +104,36 @@ The following table describes configurable parameters of the chart in `values.ya
98104
| serviceName | string | "" | Service name |
99105
| servicePort | int | 80 | Service port |
100106
| labels.environment | string | test | Environment name |
107+
108+
## Configuration Examples
109+
110+
### Using S3 Model Download (Default)
111+
112+
```yaml
113+
extraInit:
114+
modelDownload:
115+
enabled: true
116+
pvcStorage: "10Gi"
117+
s3modelpath: "models/llama-7b"
118+
```
119+
120+
### Using Custom Init Containers Only
121+
122+
For use cases like llm-d where you need custom sidecars without model download:
123+
124+
```yaml
125+
extraInit:
126+
modelDownload:
127+
enabled: false
128+
initContainers:
129+
- name: llm-d-routing-proxy
130+
image: ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0
131+
imagePullPolicy: IfNotPresent
132+
ports:
133+
- containerPort: 8080
134+
name: proxy
135+
securityContext:
136+
runAsUser: 1000
137+
restartPolicy: Always
138+
pvcStorage: "10Gi"
139+
```

docs/getting_started/installation/cpu.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ Currently, there are no pre-built CPU wheels.
9494
## Related runtime environment variables
9595

9696
- `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g., `VLLM_CPU_KVCACHE_SPACE=40` means 40 GiB space for KV cache). A larger setting allows vLLM to run more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. Default value is `0`.
97-
- `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads, can be set as CPU id lists or `auto` (by default). For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. `VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores. By setting to `auto`, the OpenMP threads of each rank are bound to the CPU cores in each NUMA node respectively.
97+
- `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads, can be set as CPU id lists, `auto` (by default), or `nobind` (to disable binding to individual CPU cores and to inherit user-defined OpenMP variables). For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. `VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores. By setting to `auto`, the OpenMP threads of each rank are bound to the CPU cores in each NUMA node respectively. If set to `nobind`, the number of OpenMP threads is determined by the standard `OMP_NUM_THREADS` environment variable.
9898
- `VLLM_CPU_NUM_OF_RESERVED_CPU`: specify the number of CPU cores which are not dedicated to the OpenMP threads for each rank. The variable only takes effect when `VLLM_CPU_OMP_THREADS_BIND` is set to `auto`. Default value is `None`. If the value is not set and `auto` thread binding is used, no CPU is reserved when `world_size == 1`, and 1 CPU per rank is reserved when `world_size > 1`.
9999
- `CPU_VISIBLE_MEMORY_NODES`: specify visible NUMA memory nodes for vLLM CPU workers, similar to `CUDA_VISIBLE_DEVICES`. The variable only takes effect when `VLLM_CPU_OMP_THREADS_BIND` is set to `auto`. The variable provides more control for the auto thread-binding feature, such as masking nodes and changing the node binding sequence.
100100
- `VLLM_CPU_MOE_PREPACK` (x86 only): whether to use prepack for MoE layer. This will be passed to `ipex.llm.modules.GatedMLPMOE`. Default is `1` (True). On unsupported CPUs, you might need to set this to `0` (False).

examples/online_serving/chart-helm/README.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,15 @@ This directory contains a Helm chart for deploying the vllm application. The cha
1919
- templates/pvc.yaml: Template for Persistent Volume Claims.
2020
- templates/secrets.yaml: Template for Kubernetes Secrets.
2121
- templates/service.yaml: Template for creating Services.
22+
23+
## Running Tests
24+
25+
This chart includes unit tests using [helm-unittest](https://github.com/helm-unittest/helm-unittest). Install the plugin and run tests:
26+
27+
```bash
28+
# Install plugin
29+
helm plugin install https://github.com/helm-unittest/helm-unittest
30+
31+
# Run tests
32+
helm unittest .
33+
```

0 commit comments

Comments
 (0)