From c591456be645a5c963f1dcb81e0284b71e112962 Mon Sep 17 00:00:00 2001 From: woWoosuk Kwon Date: Sat, 6 May 2023 06:57:19 +0000 Subject: [PATCH 1/8] Minor --- setup.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/setup.py b/setup.py index 52ff89f63b0e..5a0f7a837b2f 100644 --- a/setup.py +++ b/setup.py @@ -1,13 +1,16 @@ import setuptools import torch -from torch.utils import cpp_extension +from torch.utils.cpp_extension import BuildExtension, CUDAExtension +from torch.utils.cpp_extension import CUDA_HOME + +# Build custom operators. CXX_FLAGS = ['-g'] NVCC_FLAGS = ['-O2'] if not torch.cuda.is_available(): raise RuntimeError( - f'Cannot find CUDA at CUDA_HOME: {cpp_extension.CUDA_HOME}. ' + f'Cannot find CUDA at CUDA_HOME: {CUDA_HOME}. ' 'CUDA must be available in order to build the package.') # FIXME(woosuk): Consider the case where the machine has multiple GPUs with @@ -21,7 +24,7 @@ ext_modules = [] # Cache operations. -cache_extension = cpp_extension.CUDAExtension( +cache_extension = CUDAExtension( name='cacheflow.cache_ops', sources=['csrc/cache.cpp', 'csrc/cache_kernels.cu'], extra_compile_args={'cxx': CXX_FLAGS, 'nvcc': NVCC_FLAGS}, @@ -29,7 +32,7 @@ ext_modules.append(cache_extension) # Attention kernels. -attention_extension = cpp_extension.CUDAExtension( +attention_extension = CUDAExtension( name='cacheflow.attention_ops', sources=['csrc/attention.cpp', 'csrc/attention/attention_kernels.cu'], extra_compile_args={'cxx': CXX_FLAGS, 'nvcc': NVCC_FLAGS}, @@ -37,7 +40,7 @@ ext_modules.append(attention_extension) # Positional encoding kernels. -positional_encoding_extension = cpp_extension.CUDAExtension( +positional_encoding_extension = CUDAExtension( name='cacheflow.pos_encoding_ops', sources=['csrc/pos_encoding.cpp', 'csrc/pos_encoding_kernels.cu'], extra_compile_args={'cxx': CXX_FLAGS, 'nvcc': NVCC_FLAGS}, @@ -45,7 +48,7 @@ ext_modules.append(positional_encoding_extension) # Layer normalization kernels. -layernorm_extension = cpp_extension.CUDAExtension( +layernorm_extension = CUDAExtension( name='cacheflow.layernorm_ops', sources=['csrc/layernorm.cpp', 'csrc/layernorm_kernels.cu'], extra_compile_args={'cxx': CXX_FLAGS, 'nvcc': NVCC_FLAGS}, @@ -53,7 +56,7 @@ ext_modules.append(layernorm_extension) # Activation kernels. -activation_extension = cpp_extension.CUDAExtension( +activation_extension = CUDAExtension( name='cacheflow.activation_ops', sources=['csrc/activation.cpp', 'csrc/activation_kernels.cu'], extra_compile_args={'cxx': CXX_FLAGS, 'nvcc': NVCC_FLAGS}, @@ -63,5 +66,5 @@ setuptools.setup( name='cacheflow', ext_modules=ext_modules, - cmdclass={'build_ext': cpp_extension.BuildExtension}, + cmdclass={'build_ext': BuildExtension}, ) From 0f8969484cde74882c6817055cef9c25845e2099 Mon Sep 17 00:00:00 2001 From: woWoosuk Kwon Date: Sat, 6 May 2023 07:09:53 +0000 Subject: [PATCH 2/8] single quote -> double quote --- setup.py | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/setup.py b/setup.py index 5a0f7a837b2f..5f217084ade2 100644 --- a/setup.py +++ b/setup.py @@ -5,13 +5,13 @@ # Build custom operators. -CXX_FLAGS = ['-g'] -NVCC_FLAGS = ['-O2'] +CXX_FLAGS = ["-g"] +NVCC_FLAGS = ["-O2"] if not torch.cuda.is_available(): raise RuntimeError( - f'Cannot find CUDA at CUDA_HOME: {CUDA_HOME}. ' - 'CUDA must be available in order to build the package.') + f"Cannot find CUDA at CUDA_HOME: {CUDA_HOME}. " + "CUDA must be available in order to build the package.") # FIXME(woosuk): Consider the case where the machine has multiple GPUs with # different compute capabilities. @@ -19,52 +19,52 @@ major, minor = compute_capability # Enable bfloat16 support if the compute capability is >= 8.0. if major >= 8: - NVCC_FLAGS.append('-DENABLE_BF16') + NVCC_FLAGS.append("-DENABLE_BF16") ext_modules = [] # Cache operations. cache_extension = CUDAExtension( - name='cacheflow.cache_ops', - sources=['csrc/cache.cpp', 'csrc/cache_kernels.cu'], - extra_compile_args={'cxx': CXX_FLAGS, 'nvcc': NVCC_FLAGS}, + name="cacheflow.cache_ops", + sources=["csrc/cache.cpp", "csrc/cache_kernels.cu"], + extra_compile_args={"cxx": CXX_FLAGS, "nvcc": NVCC_FLAGS}, ) ext_modules.append(cache_extension) # Attention kernels. attention_extension = CUDAExtension( - name='cacheflow.attention_ops', - sources=['csrc/attention.cpp', 'csrc/attention/attention_kernels.cu'], - extra_compile_args={'cxx': CXX_FLAGS, 'nvcc': NVCC_FLAGS}, + name="cacheflow.attention_ops", + sources=["csrc/attention.cpp", "csrc/attention/attention_kernels.cu"], + extra_compile_args={"cxx": CXX_FLAGS, "nvcc": NVCC_FLAGS}, ) ext_modules.append(attention_extension) # Positional encoding kernels. positional_encoding_extension = CUDAExtension( - name='cacheflow.pos_encoding_ops', - sources=['csrc/pos_encoding.cpp', 'csrc/pos_encoding_kernels.cu'], - extra_compile_args={'cxx': CXX_FLAGS, 'nvcc': NVCC_FLAGS}, + name="cacheflow.pos_encoding_ops", + sources=["csrc/pos_encoding.cpp", "csrc/pos_encoding_kernels.cu"], + extra_compile_args={"cxx": CXX_FLAGS, "nvcc": NVCC_FLAGS}, ) ext_modules.append(positional_encoding_extension) # Layer normalization kernels. layernorm_extension = CUDAExtension( - name='cacheflow.layernorm_ops', - sources=['csrc/layernorm.cpp', 'csrc/layernorm_kernels.cu'], - extra_compile_args={'cxx': CXX_FLAGS, 'nvcc': NVCC_FLAGS}, + name="cacheflow.layernorm_ops", + sources=["csrc/layernorm.cpp", "csrc/layernorm_kernels.cu"], + extra_compile_args={"cxx": CXX_FLAGS, "nvcc": NVCC_FLAGS}, ) ext_modules.append(layernorm_extension) # Activation kernels. activation_extension = CUDAExtension( - name='cacheflow.activation_ops', - sources=['csrc/activation.cpp', 'csrc/activation_kernels.cu'], - extra_compile_args={'cxx': CXX_FLAGS, 'nvcc': NVCC_FLAGS}, + name="cacheflow.activation_ops", + sources=["csrc/activation.cpp", "csrc/activation_kernels.cu"], + extra_compile_args={"cxx": CXX_FLAGS, "nvcc": NVCC_FLAGS}, ) ext_modules.append(activation_extension) setuptools.setup( - name='cacheflow', + name="cacheflow", ext_modules=ext_modules, - cmdclass={'build_ext': BuildExtension}, + cmdclass={"build_ext": BuildExtension}, ) From 883ab1f1fc8acb392558a65ff4208d834ac27913 Mon Sep 17 00:00:00 2001 From: woWoosuk Kwon Date: Sat, 6 May 2023 07:10:24 +0000 Subject: [PATCH 3/8] Minor --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 5f217084ade2..a0bbdef93f6e 100644 --- a/setup.py +++ b/setup.py @@ -6,6 +6,7 @@ # Build custom operators. CXX_FLAGS = ["-g"] +# TODO(woosuk): Should we use -O3? NVCC_FLAGS = ["-O2"] if not torch.cuda.is_available(): From e00e42cf20b89b70ae022dcd306834f9f26f605f Mon Sep 17 00:00:00 2001 From: woWoosuk Kwon Date: Sat, 6 May 2023 07:29:35 +0000 Subject: [PATCH 4/8] Define dependencies in requirements.txt --- requirements.txt | 8 ++++++++ setup.py | 11 +++++++++++ 2 files changed, 19 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 000000000000..aceb871be6fa --- /dev/null +++ b/requirements.txt @@ -0,0 +1,8 @@ +ninja # For faster builds. +psutil +ray +sentencepiece # Required for LLaMA tokenizer. +numpy +torch >= 2.0.0 +transformers >= 4.28.0 # Required for LLaMA. +xformers >= 0.0.19 diff --git a/setup.py b/setup.py index a0bbdef93f6e..94bbcb648a9d 100644 --- a/setup.py +++ b/setup.py @@ -1,3 +1,5 @@ +from typing import List + import setuptools import torch from torch.utils.cpp_extension import BuildExtension, CUDAExtension @@ -64,8 +66,17 @@ ) ext_modules.append(activation_extension) + +def get_requirements() -> List[str]: + """Get Python package dependencies from requirements.txt.""" + with open("requirements.txt") as f: + requirements = f.read().strip().split("\n") + return requirements + + setuptools.setup( name="cacheflow", + install_requires=get_requirements(), ext_modules=ext_modules, cmdclass={"build_ext": BuildExtension}, ) From 72b30b8e5e7ab84fb6d4e59bf285b7dc2149fa3c Mon Sep 17 00:00:00 2001 From: woWoosuk Kwon Date: Sat, 6 May 2023 07:29:46 +0000 Subject: [PATCH 5/8] Fix README --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 0543b9def659..dbd2da9290c3 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,10 @@ # CacheFlow -## Installation +## Build from source ```bash -pip install ninja psutil numpy sentencepiece ray torch transformers xformers -pip install -e . +pip install -r requirements.txt +pip install -e . # This may take several minutes. ``` ## Test simple server From 9fe74496fd6b6de2aedf1c9aa2a1daa7fdced1d5 Mon Sep 17 00:00:00 2001 From: woWoosuk Kwon Date: Sat, 6 May 2023 07:39:34 +0000 Subject: [PATCH 6/8] Specify python version --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 94bbcb648a9d..48538ffe9834 100644 --- a/setup.py +++ b/setup.py @@ -76,6 +76,7 @@ def get_requirements() -> List[str]: setuptools.setup( name="cacheflow", + python_requires=">=3.8", install_requires=get_requirements(), ext_modules=ext_modules, cmdclass={"build_ext": BuildExtension}, From 99a9fd0b033e16d1ca3a26851055cd68f5cfa0a3 Mon Sep 17 00:00:00 2001 From: woWoosuk Kwon Date: Sat, 6 May 2023 22:35:43 +0000 Subject: [PATCH 7/8] Add FastAPI and uvicorn to requirements.txt --- requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements.txt b/requirements.txt index aceb871be6fa..bcb79da5213a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,5 @@ numpy torch >= 2.0.0 transformers >= 4.28.0 # Required for LLaMA. xformers >= 0.0.19 +fastapi +uvicorn From 3fc23577187ce453485586cf68ea6b95fac3d29f Mon Sep 17 00:00:00 2001 From: woWoosuk Kwon Date: Sun, 7 May 2023 23:30:13 +0000 Subject: [PATCH 8/8] Minor --- README.md | 5 ----- 1 file changed, 5 deletions(-) diff --git a/README.md b/README.md index dbd2da9290c3..b70cde906a53 100644 --- a/README.md +++ b/README.md @@ -21,11 +21,6 @@ python simple_server.py --help ## FastAPI server -Install the following additional dependencies: -```bash -pip install fastapi uvicorn -``` - To start the server: ```bash ray start --head