vllm-project · youkaichao · Feb 7, 2024 · Feb 7, 2024 · Feb 12, 2024 · Feb 15, 2024
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
@@ -22,8 +22,13 @@ steps:
   working_dir: "/vllm-workspace/tests/distributed"
   num_gpus: 2 # only support 1 or 2 for now.
 
-- label: Distributed Correctness Test
-  command: pytest -v -s --forked test_basic_distributed_correctness.py
+- label: Distributed Correctness Test-facebook/opt-125m
+  command: TEST_DIST_MODEL=facebook/opt-125m pytest -v -s --forked test_basic_distributed_correctness.py
+  working_dir: "/vllm-workspace/tests/distributed"
+  num_gpus: 2 # only support 1 or 2 for now.
+
+- label: Distributed Correctness Test-meta-llama/Llama-2-7b-hf
+  command: TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s --forked test_basic_distributed_correctness.py
   working_dir: "/vllm-workspace/tests/distributed"
   num_gpus: 2 # only support 1 or 2 for now.
 

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
@@ -49,7 +49,7 @@ jobs:
       matrix:
           os: ['ubuntu-20.04']
           python-version: ['3.8', '3.9', '3.10', '3.11']
-          pytorch-version: ['2.1.2']  # Must be the most recent version that meets requirements.txt.
+          pytorch-version: ['2.2.1']  # Must be the most recent version that meets requirements.txt.
           cuda-version: ['11.8', '12.1']
 
     steps:

diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ requires = [
     "ninja",
     "packaging",
     "setuptools >= 49.4.0",
-    "torch == 2.1.2",
+    "torch == 2.2.1",
     "wheel",
 ]
 build-backend = "setuptools.build_meta"

diff --git a/requirements-build.txt b/requirements-build.txt
@@ -2,5 +2,5 @@
 ninja
 packaging
 setuptools>=49.4.0
-torch==2.1.2
+torch==2.2.1
 wheel
diff --git a/requirements.txt b/requirements.txt
@@ -3,9 +3,9 @@ psutil
 ray >= 2.9
 sentencepiece  # Required for LLaMA tokenizer.
 numpy
-torch == 2.1.2
+torch == 2.2.1
 transformers >= 4.38.0  # Required for Gemma.
-xformers == 0.0.23.post1  # Required for CUDA 12.1.
+xformers == 0.0.25  # Requires PyTorch 2.2.1.
 fastapi
 uvicorn[standard]
 pydantic >= 2.0  # Required for OpenAI server.

diff --git a/tests/distributed/test_basic_distributed_correctness.py b/tests/distributed/test_basic_distributed_correctness.py
@@ -1,13 +1,23 @@
 """Compare the outputs of HF and distributed vLLM when using greedy sampling.
 
-Run `pytest tests/distributed/test_basic_distributed_correctness.py --forked`.
+vLLM will allocate all the available memory, so we need to run the tests one
+by one. The solution is to pass arguments (model name) by environment
+variables.
+Run:
+
+```sh
+TEST_DIST_MODEL=facebook/opt-125m pytest \
+    test_basic_distributed_correctness.py
+TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf \
+    test_basic_distributed_correctness.py
+```
 """
+import os
 import pytest
 import torch
 
 MODELS = [
-    "facebook/opt-125m",
-    "meta-llama/Llama-2-7b-hf",
+    os.environ["TEST_DIST_MODEL"],
 ]