
Commit 62936e3

maktukmak authored and bigPYJ1151 committed
CPU only build (vllm-project#9)
1 parent e20ae23 commit 62936e3

11 files changed: +185 −36 lines changed

Makefile

Lines changed: 3 additions & 0 deletions

@@ -28,6 +28,9 @@ sanitizer:
 py_install:
 	VLLM_BUILD_CPU_OPS=1 MAX_JOBS=JOBS pip install --no-build-isolation -v -e .
 
+py_install_cpu:
+	VLLM_BUILD_CPU_ONLY=1 MAX_JOBS=JOBS pip install --no-build-isolation -v -e .
+
 package:
 	VLLM_BUILD_CPU_OPS=1 MAX_JOBS=JOBS python setup.py bdist_wheel
 	echo "Wheel package is saved in ./dist/"

cpu.Dockerfile

Lines changed: 77 additions & 0 deletions

@@ -0,0 +1,77 @@
+FROM python:3.10 AS dev
+
+RUN apt-get update -y \
+    && apt-get install -y python3-pip
+
+WORKDIR /workspace
+
+# install build and runtime dependencies
+COPY requirements-cpu.txt requirements-cpu.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install -r requirements-cpu.txt
+
+# install development dependencies
+COPY requirements-dev.txt requirements-dev.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install -r requirements-dev.txt
+
+# image to build pytorch extensions
+FROM dev AS build
+
+# install build dependencies
+COPY requirements-build-cpu.txt requirements-build-cpu.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install -r requirements-build-cpu.txt
+
+# copy input files
+COPY csrc csrc
+COPY setup.py setup.py
+COPY requirements-cpu.txt requirements-cpu.txt
+COPY pyproject.toml pyproject.toml
+COPY vllm/__init__.py vllm/__init__.py
+
+# max jobs used by Ninja to build extensions
+ENV MAX_JOBS=$max_jobs
+RUN python3 setup.py build_ext --inplace
+
+# image to run unit testing suite
+FROM dev AS test
+
+# copy pytorch extensions separately to avoid having to rebuild
+# when python code changes
+COPY --from=build /workspace/vllm/*.so /workspace/vllm/
+COPY tests tests
+COPY vllm vllm
+
+ENTRYPOINT ["python3", "-m", "pytest", "tests"]
+
+# runtime base image (plain Python base; the CPU build needs no CUDA runtime)
+FROM python:3.10 AS vllm-base
+
+# python3-pip is needed to install the runtime requirements
+RUN apt-get update -y \
+    && apt-get install -y python3-pip
+
+WORKDIR /workspace
+COPY requirements-cpu.txt requirements-cpu.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install -r requirements-cpu.txt
+
+FROM vllm-base AS vllm
+COPY --from=build /workspace/vllm/*.so /workspace/vllm/
+COPY vllm vllm
+
+EXPOSE 8000
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.api_server"]
+
+# openai api server alternative
+FROM vllm-base AS vllm-openai
+# install additional dependencies for openai api server
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install accelerate fschat
+
+COPY --from=build /workspace/vllm/*.so /workspace/vllm/
+COPY vllm vllm
+
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
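Note that the runtime stage originally reused the stage name dev and carried stale comments copied from the GPU Dockerfile ("use CUDA base", "libnccl required for ray"), even though the later FROM vllm-base stages would then reference a stage that was never defined; the diff above names that stage vllm-base and corrects the comments. Also, ENV MAX_JOBS=$max_jobs references $max_jobs with no ARG max_jobs declared anywhere in the file, so MAX_JOBS resolves to an empty string unless such an ARG is added. A hedged smoke test against a running container (assumes docker run -p 8000:8000 with a model configured; the /generate endpoint and payload fields follow vLLM's demo api_server of this era and should be treated as assumptions):

    # Hypothetical smoke test; endpoint and field names are assumptions.
    import json
    import urllib.request

    payload = {"prompt": "Hello, my name is", "max_tokens": 16}
    req = urllib.request.Request(
        "http://localhost:8000/generate",
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req) as resp:
        print(json.loads(resp.read()))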

csrc/dispatch_utils.h

Lines changed: 4 additions & 0 deletions

@@ -14,10 +14,14 @@
 #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
   AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
 
+#ifdef VLLM_BUILD_CPU_ONLY
+#define VLLM_DISPATCH_TO_CUDA_CASE(BASENAME, ...)
+#else
 #define VLLM_DISPATCH_TO_CUDA_CASE(BASENAME, ...) \
   case c10::DeviceType::CUDA: { \
     return BASENAME(__VA_ARGS__); \
   }
+#endif
 
 #ifdef VLLM_BUILD_CPU_OPS
 #define VLLM_DISPATCH_TO_CPU_CASE(BASENAME, ...) \
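With VLLM_BUILD_CPU_ONLY defined, VLLM_DISPATCH_TO_CUDA_CASE expands to nothing, so the generated device switch simply has no CUDA arm. A conceptual Python analogue of that dispatch (names are illustrative, not vLLM's actual API):

    import torch

    CPU_ONLY = True  # stands in for the VLLM_BUILD_CPU_ONLY compile-time flag

    def dispatch(x: torch.Tensor, cpu_impl, cuda_impl=None):
        # Route by device, like VLLM_DISPATCH_DEVICES; in a CPU-only
        # build the CUDA arm does not exist at all.
        if x.device.type == "cpu":
            return cpu_impl(x)
        if not CPU_ONLY and cuda_impl is not None and x.device.type == "cuda":
            return cuda_impl(x)
        raise RuntimeError(f"no kernel compiled for device {x.device}")

    print(dispatch(torch.tensor([-1.0, 2.0]), cpu_impl=torch.relu))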

csrc/pybind.cpp

Lines changed: 6 additions & 0 deletions

@@ -87,6 +87,12 @@ void gptq_shuffle_dispatch(
   VLLM_DISPATCH_DEVICES(q_weight.device(), gptq_shuffle, q_weight, q_perm);
 }
 
+#ifdef VLLM_BUILD_CPU_ONLY
+int get_device_attribute(
+  int attribute,
+  int device_id) { return 94387; }
+#endif
+
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
   // vLLM custom ops
   pybind11::module ops = m.def_submodule("ops", "vLLM custom operators");
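The stub keeps the get_device_attribute binding linkable in a CPU-only build; the hard-coded 94387 is an arbitrary placeholder, not a real device attribute. Callers should therefore branch on device availability rather than trust the returned value. A hedged sketch of such a guard (the raw_query callable is a stand-in for however the binding is exposed):

    import torch

    def query_device_attribute(raw_query, attribute: int, device_id: int = 0) -> int:
        # In a CPU-only build the C++ stub returns a fixed placeholder,
        # so never interpret it as a real attribute value.
        if not torch.cuda.is_available():
            return 0
        return raw_query(attribute, device_id)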

Dockerfile renamed to gpu.Dockerfile

Lines changed: 7 additions & 7 deletions

@@ -6,9 +6,9 @@ RUN apt-get update -y \
 WORKDIR /workspace
 
 # install build and runtime dependencies
-COPY requirements.txt requirements.txt
+COPY requirements-gpu.txt requirements-gpu.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r requirements.txt
+    pip install -r requirements-gpu.txt
 
 # install development dependencies
 COPY requirements-dev.txt requirements-dev.txt
@@ -19,14 +19,14 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 FROM dev AS build
 
 # install build dependencies
-COPY requirements-build.txt requirements-build.txt
+COPY requirements-build-gpu.txt requirements-build-gpu.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r requirements-build.txt
+    pip install -r requirements-build-gpu.txt
 
 # copy input files
 COPY csrc csrc
 COPY setup.py setup.py
-COPY requirements.txt requirements.txt
+COPY requirements-gpu.txt requirements-gpu.txt
 COPY pyproject.toml pyproject.toml
 COPY vllm/__init__.py vllm/__init__.py
 
@@ -60,9 +60,9 @@ RUN apt-get update -y \
     && apt-get install -y python3-pip
 
 WORKDIR /workspace
-COPY requirements.txt requirements.txt
+COPY requirements-gpu.txt requirements-gpu.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r requirements.txt
+    pip install -r requirements-gpu.txt
 
 FROM vllm-base AS vllm
 COPY --from=build /workspace/vllm/*.so /workspace/vllm/

requirements-build-cpu.txt

Lines changed: 6 additions & 0 deletions

@@ -0,0 +1,6 @@
+# Should be mirrored in pyproject.toml
+ninja
+packaging
+setuptools>=49.4.0
+torch @ https://download.pytorch.org/whl/cpu-cxx11-abi/torch-2.1.0%2Bcpu.cxx11.abi-cp310-cp310-linux_x86_64.whl#sha256=88f1ee550c6291af8d0417871fb7af84b86527d18bc02ac4249f07dcd84dda56  # 2.1.0+cpu
+wheel

requirements-build.txt renamed to requirements-build-gpu.txt

File renamed without changes.

requirements-cpu.txt

Lines changed: 14 additions & 0 deletions

@@ -0,0 +1,14 @@
+ninja  # For faster builds.
+psutil
+ray >= 2.5.1
+pandas  # Required for Ray data.
+pyarrow  # Required for Ray data.
+sentencepiece  # Required for LLaMA tokenizer.
+numpy
+einops  # Required for phi-1_5
+torch @ https://download.pytorch.org/whl/cpu-cxx11-abi/torch-2.1.0%2Bcpu.cxx11.abi-cp310-cp310-linux_x86_64.whl#sha256=88f1ee550c6291af8d0417871fb7af84b86527d18bc02ac4249f07dcd84dda56  # 2.1.0+cpu
+transformers >= 4.34.0  # Required for Mistral.
+fastapi
+uvicorn[standard]
+pydantic == 1.10.13  # Required for OpenAI server.
+aioprometheus[starlette]
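Both new requirements files pin torch to the same CPU-only, cxx11-ABI wheel via a PEP 508 direct URL, which keeps the build-time and runtime torch ABIs consistent. A quick hedged check that the installed torch really is that CPU wheel:

    import torch

    # The pinned wheel reports a +cpu local version and no CUDA runtime.
    assert torch.__version__.startswith("2.1.0+cpu"), torch.__version__
    assert torch.version.cuda is None
    print("CPU-only torch confirmed:", torch.__version__)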

requirements.txt renamed to requirements-gpu.txt

File renamed without changes.

setup.py

Lines changed: 66 additions & 29 deletions
@@ -8,7 +8,13 @@
 from packaging.version import parse, Version
 import setuptools
 import torch
-from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME, ROCM_HOME
+
+BUILD_CPU_ONLY = os.getenv('VLLM_BUILD_CPU_ONLY', "0") == "1"
+
+if not BUILD_CPU_ONLY:
+    from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME, ROCM_HOME
+else:
+    from torch.utils.cpp_extension import BuildExtension, CppExtension
 
 ROOT_DIR = os.path.dirname(__file__)
 
@@ -21,11 +27,11 @@
 
 
 def _is_hip() -> bool:
-    return torch.version.hip is not None
+    return torch.version.hip is not None and not BUILD_CPU_ONLY
 
 
 def _is_cuda() -> bool:
-    return torch.version.cuda is not None
+    return torch.version.cuda is not None and not BUILD_CPU_ONLY
 
 
 # Compiler flags.
@@ -86,7 +92,6 @@ def get_hipcc_rocm_version():
         print("Could not find HIP version in the output")
         return None
 
-
 def get_nvcc_cuda_version(cuda_dir: str) -> Version:
     """Get the CUDA version from nvcc.
 
@@ -137,6 +142,19 @@ def get_torch_arch_list() -> Set[str]:
             stacklevel=2)
     return arch_list
 
+if not BUILD_CPU_ONLY:
+    # First, check the TORCH_CUDA_ARCH_LIST environment variable.
+    compute_capabilities = get_torch_arch_list()
+    if not compute_capabilities:
+        # If TORCH_CUDA_ARCH_LIST is not defined or empty, target all available
+        # GPUs on the current machine.
+        device_count = torch.cuda.device_count()
+        for i in range(device_count):
+            major, minor = torch.cuda.get_device_capability(i)
+            if major < 7:
+                raise RuntimeError(
+                    "GPUs with compute capability below 7.0 are not supported.")
+            compute_capabilities.add(f"{major}.{minor}")
 
 # First, check the TORCH_CUDA_ARCH_LIST environment variable.
 compute_capabilities = get_torch_arch_list()
@@ -211,9 +229,11 @@ def get_torch_arch_list() -> Set[str]:
             f"amdgpu_arch_found: {amd_arch}")
 
 # Setup CPU Operations
-BUILD_CPU_OPS = os.getenv('VLLM_BUILD_CPU_OPS', "0") == "1"
+BUILD_CPU_OPS = (os.getenv('VLLM_BUILD_CPU_OPS', "0") == "1" or BUILD_CPU_ONLY)
 CPU_OPS_SOURCES = []
 if BUILD_CPU_OPS:
+    if BUILD_CPU_ONLY:
+        CXX_FLAGS += ["-DVLLM_BUILD_CPU_ONLY"]
     CXX_FLAGS += [
         "-DVLLM_BUILD_CPU_OPS", "-fopenmp", "-mavx512f", "-mavx512bf16",
         "-mavx512vl"
@@ -228,29 +248,42 @@ def get_torch_arch_list() -> Set[str]:
 
 ext_modules = []
 
-vllm_extension_sources = [
-    "csrc/cache_kernels.cu",
-    "csrc/attention/attention_kernels.cu",
-    "csrc/pos_encoding_kernels.cu",
-    "csrc/activation_kernels.cu",
-    "csrc/layernorm_kernels.cu",
-    "csrc/quantization/squeezellm/quant_cuda_kernel.cu",
-    "csrc/quantization/gptq/q_gemm.cu",
-    "csrc/cuda_utils_kernels.cu",
-    "csrc/pybind.cpp",
-] + CPU_OPS_SOURCES
+if not BUILD_CPU_ONLY:
+    vllm_extension_sources = [
+        "csrc/cache_kernels.cu",
+        "csrc/attention/attention_kernels.cu",
+        "csrc/pos_encoding_kernels.cu",
+        "csrc/activation_kernels.cu",
+        "csrc/layernorm_kernels.cu",
+        "csrc/quantization/squeezellm/quant_cuda_kernel.cu",
+        "csrc/quantization/gptq/q_gemm.cu",
+        "csrc/cuda_utils_kernels.cu",
+        "csrc/pybind.cpp",
+    ] + CPU_OPS_SOURCES
+
+    if _is_cuda():
+        vllm_extension_sources.append("csrc/quantization/awq/gemm_kernels.cu")
+
+    vllm_extension = CUDAExtension(
+        name="vllm._C",
+        sources=vllm_extension_sources,
+        extra_compile_args={
+            "cxx": CXX_FLAGS,
+            "nvcc": NVCC_FLAGS,
+        },
+    )
+else:
+    vllm_extension_sources = [
+        "csrc/pybind.cpp",
+    ] + CPU_OPS_SOURCES
+    vllm_extension = CppExtension(
+        name="vllm._C",
+        sources=vllm_extension_sources,
+        extra_compile_args={
+            "cxx": CXX_FLAGS,
+        },
+    )
 
-if _is_cuda():
-    vllm_extension_sources.append("csrc/quantization/awq/gemm_kernels.cu")
-
-vllm_extension = CUDAExtension(
-    name="vllm._C",
-    sources=vllm_extension_sources,
-    extra_compile_args={
-        "cxx": CXX_FLAGS,
-        "nvcc": NVCC_FLAGS,
-    },
-)
 ext_modules.append(vllm_extension)
 
 
@@ -280,7 +313,7 @@ def get_vllm_version() -> str:
         if hipcc_version != MAIN_CUDA_VERSION:
             rocm_version_str = hipcc_version.replace(".", "")[:3]
             version += f"+rocm{rocm_version_str}"
-    else:
+    elif _is_cuda():
         cuda_version = str(nvcc_cuda_version)
         if cuda_version != MAIN_CUDA_VERSION:
             cuda_version_str = cuda_version.replace(".", "")[:3]
@@ -303,9 +336,13 @@ def get_requirements() -> List[str]:
     if _is_hip():
         with open(get_path("requirements-rocm.txt")) as f:
             requirements = f.read().strip().split("\n")
+    elif _is_cuda():
+        with open(get_path("requirements-gpu.txt")) as f:
+            requirements = f.read().strip().split("\n")
     else:
-        with open(get_path("requirements.txt")) as f:
+        with open(get_path("requirements-cpu.txt")) as f:
             requirements = f.read().strip().split("\n")
+
     return requirements
 
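Taken together, one setup.py now serves three targets, ROCm, CUDA, and CPU-only, with BUILD_CPU_ONLY masking the GPU probes. A condensed sketch of the resulting selection logic (simplified from the hunks above):

    import os
    import torch

    BUILD_CPU_ONLY = os.getenv("VLLM_BUILD_CPU_ONLY", "0") == "1"

    def _is_hip() -> bool:
        return torch.version.hip is not None and not BUILD_CPU_ONLY

    def _is_cuda() -> bool:
        return torch.version.cuda is not None and not BUILD_CPU_ONLY

    def requirements_file() -> str:
        if _is_hip():
            return "requirements-rocm.txt"
        if _is_cuda():
            return "requirements-gpu.txt"
        return "requirements-cpu.txt"

    print(requirements_file())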
