88from packaging .version import parse , Version
99import setuptools
1010import torch
11- from torch .utils .cpp_extension import BuildExtension , CUDAExtension , CUDA_HOME , ROCM_HOME
11+
12+ BUILD_CPU_ONLY = os .getenv ('VLLM_BUILD_CPU_ONLY' , "0" ) == "1"
13+
14+ if not BUILD_CPU_ONLY :
15+ from torch .utils .cpp_extension import BuildExtension , CUDAExtension , CUDA_HOME , ROCM_HOME
16+ else :
17+ from torch .utils .cpp_extension import BuildExtension , CppExtension
1218
1319ROOT_DIR = os .path .dirname (__file__ )
1420
2127
2228
def _is_hip() -> bool:
    """Return True when building against ROCm/HIP.

    Always False in CPU-only mode (VLLM_BUILD_CPU_ONLY=1), even if a HIP
    runtime is present, so CPU-only builds skip every HIP code path.
    """
    return torch.version.hip is not None and not BUILD_CPU_ONLY
2531
2632
def _is_cuda() -> bool:
    """Return True when building against CUDA.

    Always False in CPU-only mode (VLLM_BUILD_CPU_ONLY=1), even if a CUDA
    runtime is present, so CPU-only builds skip every CUDA code path.
    """
    return torch.version.cuda is not None and not BUILD_CPU_ONLY
2935
3036
3137# Compiler flags.
@@ -86,7 +92,6 @@ def get_hipcc_rocm_version():
8692 print ("Could not find HIP version in the output" )
8793 return None
8894
89-
9095def get_nvcc_cuda_version (cuda_dir : str ) -> Version :
9196 """Get the CUDA version from nvcc.
9297
@@ -137,6 +142,19 @@ def get_torch_arch_list() -> Set[str]:
137142 stacklevel = 2 )
138143 return arch_list
139144
if not BUILD_CPU_ONLY:
    # First, check the TORCH_CUDA_ARCH_LIST environment variable.
    compute_capabilities = get_torch_arch_list()
    if not compute_capabilities:
        # If TORCH_CUDA_ARCH_LIST is not defined or empty, target all
        # available GPUs on the current machine.
        device_count = torch.cuda.device_count()
        for i in range(device_count):
            major, minor = torch.cuda.get_device_capability(i)
            # vLLM kernels require SM 7.0+ (Volta or newer).
            if major < 7:
                raise RuntimeError(
                    "GPUs with compute capability below 7.0 are not supported.")
            # NOTE: the arch string must be exactly "<major>.<minor>" with no
            # embedded spaces; the garbled source's f"{ major } .{ minor } "
            # would have produced invalid entries.
            compute_capabilities.add(f"{major}.{minor}")
140158
141159# First, check the TORCH_CUDA_ARCH_LIST environment variable.
142160compute_capabilities = get_torch_arch_list ()
@@ -211,9 +229,11 @@ def get_torch_arch_list() -> Set[str]:
211229 f"amdgpu_arch_found: { amd_arch } " )
212230
# Setup CPU Operations.
# CPU ops are built when explicitly requested via VLLM_BUILD_CPU_OPS=1, and
# are always required for a CPU-only build (VLLM_BUILD_CPU_ONLY=1).
BUILD_CPU_OPS = os.getenv('VLLM_BUILD_CPU_OPS', "0") == "1" or BUILD_CPU_ONLY
CPU_OPS_SOURCES = []
216234if BUILD_CPU_OPS :
235+ if BUILD_CPU_ONLY :
236+ CXX_FLAGS += ["-DVLLM_BUILD_CPU_ONLY" ]
217237 CXX_FLAGS += [
218238 "-DVLLM_BUILD_CPU_OPS" , "-fopenmp" , "-mavx512f" , "-mavx512bf16" ,
219239 "-mavx512vl"
@@ -228,29 +248,42 @@ def get_torch_arch_list() -> Set[str]:
228248
ext_modules = []

if not BUILD_CPU_ONLY:
    # GPU build: all CUDA/HIP kernels plus the pybind glue.
    vllm_extension_sources = [
        "csrc/cache_kernels.cu",
        "csrc/attention/attention_kernels.cu",
        "csrc/pos_encoding_kernels.cu",
        "csrc/activation_kernels.cu",
        "csrc/layernorm_kernels.cu",
        "csrc/quantization/squeezellm/quant_cuda_kernel.cu",
        "csrc/quantization/gptq/q_gemm.cu",
        "csrc/cuda_utils_kernels.cu",
        "csrc/pybind.cpp",
    ] + CPU_OPS_SOURCES

    # The AWQ kernels are only added for CUDA builds (not ROCm/HIP).
    if _is_cuda():
        vllm_extension_sources.append("csrc/quantization/awq/gemm_kernels.cu")

    vllm_extension = CUDAExtension(
        name="vllm._C",
        sources=vllm_extension_sources,
        extra_compile_args={
            "cxx": CXX_FLAGS,
            "nvcc": NVCC_FLAGS,
        },
    )
else:
    # CPU-only build: no .cu sources and no nvcc flags; compile just the
    # pybind module (plus the CPU op sources) as a plain C++ extension.
    vllm_extension_sources = [
        "csrc/pybind.cpp",
    ] + CPU_OPS_SOURCES
    vllm_extension = CppExtension(
        name="vllm._C",
        sources=vllm_extension_sources,
        extra_compile_args={
            "cxx": CXX_FLAGS,
        },
    )

ext_modules.append(vllm_extension)
255288
256289
@@ -280,7 +313,7 @@ def get_vllm_version() -> str:
280313 if hipcc_version != MAIN_CUDA_VERSION :
281314 rocm_version_str = hipcc_version .replace ("." , "" )[:3 ]
282315 version += f"+rocm{ rocm_version_str } "
283- else :
316+ elif _is_cuda () :
284317 cuda_version = str (nvcc_cuda_version )
285318 if cuda_version != MAIN_CUDA_VERSION :
286319 cuda_version_str = cuda_version .replace ("." , "" )[:3 ]
def get_requirements() -> List[str]:
    """Return the pinned Python dependencies for the active build target.

    Chooses the requirements file by backend: ROCm, CUDA, or CPU-only.
    """
    if _is_hip():
        with open(get_path("requirements-rocm.txt")) as f:
            requirements = f.read().strip().split("\n")
    elif _is_cuda():
        with open(get_path("requirements-gpu.txt")) as f:
            requirements = f.read().strip().split("\n")
    else:
        # CPU-only build (neither HIP nor CUDA detected/enabled).
        # NOTE: filename reconstructed from garbled "requirements-cpu .txt".
        with open(get_path("requirements-cpu.txt")) as f:
            requirements = f.read().strip().split("\n")

    return requirements
310347
311348
# (Web-page residue from the pasted diff: "0 commit comments".)