From 79c1b54c98443a20bd736a38905ea97dc856f423 Mon Sep 17 00:00:00 2001
From: "s.m.kochetkov"
Date: Sat, 1 Jun 2024 12:16:07 +0000
Subject: [PATCH 01/13] it_works

---
 csrc/punica/bgmv/bgmv_config.h         |  6 ++++
 vllm/model_executor/models/commandr.py | 46 +++++++++++++++++++++++---
 2 files changed, 48 insertions(+), 4 deletions(-)

diff --git a/csrc/punica/bgmv/bgmv_config.h b/csrc/punica/bgmv/bgmv_config.h
index 19c058cacfbc..7d96bf1d3fd1 100644
--- a/csrc/punica/bgmv/bgmv_config.h
+++ b/csrc/punica/bgmv/bgmv_config.h
@@ -51,6 +51,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
   f(in_T, out_T, W_T, narrow, 16384) \
   f(in_T, out_T, W_T, narrow, 20480) \
   f(in_T, out_T, W_T, narrow, 22016) \
+  f(in_T, out_T, W_T, narrow, 22528) \
   f(in_T, out_T, W_T, narrow, 24576) \
   f(in_T, out_T, W_T, narrow, 27392) \
   f(in_T, out_T, W_T, narrow, 28672) \
@@ -71,6 +72,8 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
   f(in_T, out_T, W_T, narrow, 128000) \
   f(in_T, out_T, W_T, narrow, 128256) \
   f(in_T, out_T, W_T, narrow, 128512) \
+  f(in_T, out_T, W_T, narrow, 60544) \
+  f(in_T, out_T, W_T, narrow, 60672) \
 
 // Keep above in sync with vllm/lora/layers::LogitsProcessorWithLoRA
 // and vllm/tests/lora/test_punica.py
@@ -119,6 +122,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
   f(in_T, out_T, W_T, 16384, narrow) \
   f(in_T, out_T, W_T, 20480, narrow) \
   f(in_T, out_T, W_T, 22016, narrow) \
+  f(in_T, out_T, W_T, 22528, narrow) \
   f(in_T, out_T, W_T, 24576, narrow) \
   f(in_T, out_T, W_T, 27392, narrow) \
   f(in_T, out_T, W_T, 28672, narrow) \
@@ -139,6 +143,8 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
   f(in_T, out_T, W_T, 128000, narrow) \
   f(in_T, out_T, W_T, 128256, narrow) \
   f(in_T, out_T, W_T, 128512, narrow) \
+  f(in_T, out_T, W_T, 60544, narrow) \
+  f(in_T, out_T, W_T, 60672, narrow) \
 
 // Keep above in sync with vllm/lora/layers::SamplerWithLoRA

diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py
index 17c2f1223d96..83b9575f50f5 100644
--- a/vllm/model_executor/models/commandr.py
+++ b/vllm/model_executor/models/commandr.py
@@ -29,6 +29,7 @@
 from transformers import CohereConfig
 
 from vllm.attention import Attention, AttentionMetadata
+from vllm.config import LoRAConfig
 from vllm.distributed import (get_tensor_model_parallel_rank,
                               get_tensor_model_parallel_world_size)
 from vllm.model_executor.layers.activation import SiluAndMul
@@ -259,10 +260,14 @@ def __init__(
         self,
         config: CohereConfig,
         quant_config: Optional[QuantizationConfig] = None,
+        lora_config: Optional[LoRAConfig] = None,
     ):
         super().__init__()
         self.config = config
-        self.vocab_size = config.vocab_size
+        lora_vocab = (lora_config.lora_extra_vocab_size *
+                      (lora_config.max_loras or 1)) if lora_config else 0
+        self.vocab_size = config.vocab_size + lora_vocab
+        self.org_vocab_size = config.vocab_size
         self.embed_tokens = VocabParallelEmbedding(config.vocab_size,
                                                    config.hidden_size)
         self.layers = nn.ModuleList([
@@ -296,17 +301,48 @@ def forward(
 
 
 class CohereForCausalLM(nn.Module):
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+    # LoRA specific attributes
+    supported_lora_modules = [
+        "qkv_proj",
+        "o_proj",
+        "gate_up_proj",
+        "down_proj",
+        "embed_tokens",
+        "lm_head",
+    ]
+    embedding_modules = {
+        "embed_tokens": "input_embeddings",
+        "lm_head": "output_embeddings",
+    }
+    embedding_padding_modules = ["lm_head"]
 
     def __init__(
         self,
         config: CohereConfig,
         quant_config: Optional[QuantizationConfig] = None,
+        lora_config: Optional[LoRAConfig] = None,
     ) -> None:
         super().__init__()
         self.config = config
+        self.unpadded_vocab_size = config.vocab_size
+        if lora_config:
+            self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
        self.quant_config = quant_config
-        self.logits_processor = LogitsProcessor(config.vocab_size,
+        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
+                                                config.vocab_size,
                                                 scale=config.logit_scale)
-        self.model = CohereModel(config, quant_config)
+        self.model = CohereModel(config, quant_config, lora_config=lora_config)
         self.sampler = Sampler()
 
     @torch.no_grad()
@@ -323,7 +359,9 @@ def forward(
 
     def compute_logits(self, hidden_states: torch.Tensor,
                        sampling_metadata: SamplingMetadata) -> torch.Tensor:
-        logits = self.logits_processor(self.model.embed_tokens.weight,
+        #TODO not sure, check later
+        embedding_weights = self.model.embed_tokens.weight if hasattr(self.model.embed_tokens,'weight') else self.model.embed_tokens.base_layer.weight
+        logits = self.logits_processor(embedding_weights,
                                        hidden_states, sampling_metadata)
         return logits
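Patch 01 wires Command-R into vLLM's LoRA machinery on two fronts: the punica
BGMV kernel table gains new output dimensions (22528 appears to be Command-R's
MLP width, and 60544/60672 are likely shard sizes for the author's deployment),
and CohereModel reserves extra embedding rows for adapter-added tokens. A
minimal sketch of that vocab arithmetic, with illustrative values assumed
(Command-R ships a 256000-token vocabulary; 256 extra rows per adapter slot and
max_loras=4 are assumed defaults, not values set by this patch):

# Mirrors the lora_vocab expression added to CohereModel.__init__ above.
def lora_padded_vocab(vocab_size: int, lora_extra_vocab_size: int,
                      max_loras: int) -> int:
    lora_vocab = lora_extra_vocab_size * (max_loras or 1)
    return vocab_size + lora_vocab

assert lora_padded_vocab(256000, 256, 4) == 257024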
From b9a68d0d3811be40fcc92edb773929c28c172563 Mon Sep 17 00:00:00 2001
From: "s.m.kochetkov"
Date: Sat, 1 Jun 2024 13:14:05 +0000
Subject: [PATCH 02/13] lora

---
 vllm/model_executor/models/commandr.py | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py
index cafa54942abe..04d981e29fe6 100644
--- a/vllm/model_executor/models/commandr.py
+++ b/vllm/model_executor/models/commandr.py
@@ -29,11 +29,7 @@
 from transformers import CohereConfig
 
 from vllm.attention import Attention, AttentionMetadata
-<<<<<<< HEAD
-from vllm.config import LoRAConfig
-=======
-from vllm.config import CacheConfig
->>>>>>> main
+from vllm.config import CacheConfig, LoRAConfig
 from vllm.distributed import (get_tensor_model_parallel_rank,
                               get_tensor_model_parallel_world_size)
 from vllm.model_executor.layers.activation import SiluAndMul
@@ -321,7 +317,7 @@ class CohereForCausalLM(nn.Module):
             "up_proj",
         ],
     }
-    # LoRA specific attributes 
+    # LoRA specific attributes
     supported_lora_modules = [
         "qkv_proj",
         "o_proj",
@@ -349,10 +345,13 @@ def __init__(
         if lora_config:
             self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
         self.quant_config = quant_config
-        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, 
+        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                 config.vocab_size,
                                                 scale=config.logit_scale)
-        self.model = CohereModel(config, cache_config, quant_config, lora_config=lora_config)
+        self.model = CohereModel(config,
+                                 cache_config,
+                                 quant_config,
+                                 lora_config=lora_config)
         self.sampler = Sampler()
 
     @torch.no_grad()
@@ -370,9 +369,11 @@ def forward(
     def compute_logits(self, hidden_states: torch.Tensor,
                        sampling_metadata: SamplingMetadata) -> torch.Tensor:
         #TODO not sure, check later
-        embedding_weights = self.model.embed_tokens.weight if hasattr(self.model.embed_tokens,'weight') else self.model.embed_tokens.base_layer.weight
-        logits = self.logits_processor(embedding_weights,
-                                       hidden_states, sampling_metadata)
+        embedding_weights = self.model.embed_tokens.weight if hasattr(
+            self.model.embed_tokens,
+            'weight') else self.model.embed_tokens.base_layer.weight
+        logits = self.logits_processor(embedding_weights, hidden_states,
+                                       sampling_metadata)
         return logits
 
     def sample(
From b67dabfcb40db76907ffd468d3ee0ffb9a1a1e79 Mon Sep 17 00:00:00 2001
From: "s.m.kochetkov"
Date: Mon, 3 Jun 2024 05:14:57 +0000
Subject: [PATCH 03/13] fix: just retrigger ci/cd

---
 vllm/model_executor/models/commandr.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py
index 04d981e29fe6..4914623a555b 100644
--- a/vllm/model_executor/models/commandr.py
+++ b/vllm/model_executor/models/commandr.py
@@ -332,6 +332,7 @@ class CohereForCausalLM(nn.Module):
     }
     embedding_padding_modules = ["lm_head"]
 
+
     def __init__(
         self,
         config: CohereConfig,

From 2bac9151a6d7e23c8aac25bef0c0d9ef7a7b9c0b Mon Sep 17 00:00:00 2001
From: "s.m.kochetkov"
Date: Mon, 3 Jun 2024 05:17:05 +0000
Subject: [PATCH 04/13] fix: just retrigger ci/cd

---
 vllm/model_executor/models/commandr.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py
index 4914623a555b..04d981e29fe6 100644
--- a/vllm/model_executor/models/commandr.py
+++ b/vllm/model_executor/models/commandr.py
@@ -332,7 +332,6 @@ class CohereForCausalLM(nn.Module):
     }
     embedding_padding_modules = ["lm_head"]
 
-
     def __init__(
         self,
         config: CohereConfig,

From 1893e7fe65517f9923719a0a23b91efb6b60f823 Mon Sep 17 00:00:00 2001
From: "s.m.kochetkov"
Date: Mon, 3 Jun 2024 14:24:42 +0000
Subject: [PATCH 05/13] trigger_ci/cd

---
 csrc/punica/bgmv/bgmv_config.h | 1 -
 1 file changed, 1 deletion(-)
 mode change 100644 => 100755 csrc/punica/bgmv/bgmv_config.h

diff --git a/csrc/punica/bgmv/bgmv_config.h b/csrc/punica/bgmv/bgmv_config.h
old mode 100644
new mode 100755
index 1727769979dd..c664326e09df
--- a/csrc/punica/bgmv/bgmv_config.h
+++ b/csrc/punica/bgmv/bgmv_config.h
@@ -9,7 +9,6 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
                  int64_t layer_idx, float scale);
 
 // clang-format off
-
 #define FOR_BGMV_WIDE(f, in_T, out_T, W_T, narrow) \
   f(in_T, out_T, W_T, narrow, 128) \
   f(in_T, out_T, W_T, narrow, 256) \
From a9bc7c10c8e6b1a9f46e69dc9bcc6dcaf003e606 Mon Sep 17 00:00:00 2001
From: "s.m.kochetkov"
Date: Thu, 6 Jun 2024 09:38:52 +0000
Subject: [PATCH 06/13] fix: kernels size

---
 csrc/punica/bgmv/bgmv_config.h | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/csrc/punica/bgmv/bgmv_config.h b/csrc/punica/bgmv/bgmv_config.h
index c664326e09df..a4460375be7c 100755
--- a/csrc/punica/bgmv/bgmv_config.h
+++ b/csrc/punica/bgmv/bgmv_config.h
@@ -9,6 +9,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
                  int64_t layer_idx, float scale);
 
 // clang-format off
+
 #define FOR_BGMV_WIDE(f, in_T, out_T, W_T, narrow) \
   f(in_T, out_T, W_T, narrow, 128) \
   f(in_T, out_T, W_T, narrow, 256) \
@@ -44,6 +45,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
   f(in_T, out_T, W_T, narrow, 9216) \
   f(in_T, out_T, W_T, narrow, 10240) \
   f(in_T, out_T, W_T, narrow, 11008) \
+  f(in_T, out_T, W_T, narrow, 11264) \
   f(in_T, out_T, W_T, narrow, 12288) \
   f(in_T, out_T, W_T, narrow, 13696) \
   f(in_T, out_T, W_T, narrow, 13824) \
@@ -65,6 +67,8 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
   f(in_T, out_T, W_T, narrow, 36864) \
   f(in_T, out_T, W_T, narrow, 43264) \
   f(in_T, out_T, W_T, narrow, 49152) \
+  f(in_T, out_T, W_T, narrow, 60544) \
+  f(in_T, out_T, W_T, narrow, 60672) \
   f(in_T, out_T, W_T, narrow, 64000) \
   f(in_T, out_T, W_T, narrow, 64256) \
   f(in_T, out_T, W_T, narrow, 64512) \
@@ -74,8 +78,8 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
   f(in_T, out_T, W_T, narrow, 128000) \
   f(in_T, out_T, W_T, narrow, 128256) \
   f(in_T, out_T, W_T, narrow, 128512) \
-  f(in_T, out_T, W_T, narrow, 60544) \
-  f(in_T, out_T, W_T, narrow, 60672) \
+
+
 
 // Keep above in sync with vllm/lora/layers::LogitsProcessorWithLoRA
 // and vllm/tests/lora/test_punica.py
@@ -118,6 +122,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
   f(in_T, out_T, W_T, 9216, narrow) \
   f(in_T, out_T, W_T, 10240, narrow) \
   f(in_T, out_T, W_T, 11008, narrow) \
+  f(in_T, out_T, W_T, 11264, narrow) \
   f(in_T, out_T, W_T, 12288, narrow) \
   f(in_T, out_T, W_T, 13696, narrow) \
   f(in_T, out_T, W_T, 13824, narrow) \
@@ -139,6 +144,8 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
   f(in_T, out_T, W_T, 36864, narrow) \
   f(in_T, out_T, W_T, 43264, narrow) \
   f(in_T, out_T, W_T, 49152, narrow) \
+  f(in_T, out_T, W_T, 60544, narrow) \
+  f(in_T, out_T, W_T, 60672, narrow) \
   f(in_T, out_T, W_T, 64000, narrow) \
   f(in_T, out_T, W_T, 64256, narrow) \
   f(in_T, out_T, W_T, 64512, narrow) \
@@ -148,8 +155,6 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
   f(in_T, out_T, W_T, 128000, narrow) \
   f(in_T, out_T, W_T, 128256, narrow) \
   f(in_T, out_T, W_T, 128512, narrow) \
-  f(in_T, out_T, W_T, 60544, narrow) \
-  f(in_T, out_T, W_T, 60672, narrow) \
 
 // Keep above in sync with vllm/lora/layers::SamplerWithLoRA

From 961bc60e7f545e1d69c632528135262fde67fcee Mon Sep 17 00:00:00 2001
From: "s.m.kochetkov"
Date: Thu, 6 Jun 2024 14:59:13 +0000
Subject: [PATCH 07/13] remove_vllm_head

---
 vllm/model_executor/models/commandr.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py
index 04d981e29fe6..561d94667ea6 100644
--- a/vllm/model_executor/models/commandr.py
+++ b/vllm/model_executor/models/commandr.py
@@ -323,14 +323,12 @@ class CohereForCausalLM(nn.Module):
         "o_proj",
         "gate_up_proj",
         "down_proj",
-        "embed_tokens",
-        "lm_head",
+        "embed_tokens"
     ]
     embedding_modules = {
-        "embed_tokens": "input_embeddings",
-        "lm_head": "output_embeddings",
+        "embed_tokens": "input_embeddings"
     }
-    embedding_padding_modules = ["lm_head"]
+    embedding_padding_modules = []
 
     def __init__(
         self,
@@ -368,10 +366,12 @@ def forward(
 
     def compute_logits(self, hidden_states: torch.Tensor,
                        sampling_metadata: SamplingMetadata) -> torch.Tensor:
-        #TODO not sure, check later
-        embedding_weights = self.model.embed_tokens.weight if hasattr(
-            self.model.embed_tokens,
-            'weight') else self.model.embed_tokens.base_layer.weight
+        is_not_lora=hasattr(self.model.embed_tokens,'weight')
+        if is_not_lora:
+            embedding_weights = self.model.embed_tokens.weight
+        else:
+            embedding_weights = self.model.embed_tokens.base_layer.weight
+
         logits = self.logits_processor(embedding_weights, hidden_states,
                                        sampling_metadata)
         return logits
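Patch 07 replaces the conditional expression with an explicit is_not_lora
branch. The fallback exists because, with LoRA enabled, vLLM wraps
VocabParallelEmbedding in a LoRA layer that keeps the original module as
base_layer, so the raw weight sits one attribute deeper, and the hasattr()
check picks the right path by duck typing. A runnable sketch of the two shapes
involved (FakeLoRAEmbedding is an assumed stand-in that only mimics the
wrapper's structure, not vLLM's real class):

import torch.nn as nn

class FakeLoRAEmbedding(nn.Module):
    """Stand-in for a LoRA-wrapped embedding: exposes base_layer, no .weight."""

    def __init__(self, base_layer: nn.Embedding):
        super().__init__()
        self.base_layer = base_layer

plain = nn.Embedding(16, 4)
wrapped = FakeLoRAEmbedding(plain)

for module in (plain, wrapped):
    is_not_lora = hasattr(module, "weight")
    weights = module.weight if is_not_lora else module.base_layer.weight
    assert weights.shape == (16, 4)  # same underlying tensor either way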
From b0d4d4a62731e1bf2fdf953ff84c659e18d17082 Mon Sep 17 00:00:00 2001
From: "s.m.kochetkov"
Date: Fri, 7 Jun 2024 05:09:50 +0000
Subject: [PATCH 08/13] fix: reformat

---
 vllm/model_executor/models/commandr.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py
index 561d94667ea6..cd2dae897775 100644
--- a/vllm/model_executor/models/commandr.py
+++ b/vllm/model_executor/models/commandr.py
@@ -366,7 +366,7 @@ def forward(
 
     def compute_logits(self, hidden_states: torch.Tensor,
                        sampling_metadata: SamplingMetadata) -> torch.Tensor:
-        is_not_lora=hasattr(self.model.embed_tokens,'weight')
+        is_not_lora = hasattr(self.model.embed_tokens, 'weight')
         if is_not_lora:
             embedding_weights = self.model.embed_tokens.weight
         else:

From bb72113257faac6207fe246a1be9e6b9d4039f47 Mon Sep 17 00:00:00 2001
From: "s.m.kochetkov"
Date: Fri, 7 Jun 2024 05:10:54 +0000
Subject: [PATCH 09/13] fix: reformat

---
 vllm/model_executor/models/commandr.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py
index cd2dae897775..7b0bb52ea31f 100644
--- a/vllm/model_executor/models/commandr.py
+++ b/vllm/model_executor/models/commandr.py
@@ -371,7 +371,7 @@ def compute_logits(self, hidden_states: torch.Tensor,
             embedding_weights = self.model.embed_tokens.weight
         else:
             embedding_weights = self.model.embed_tokens.base_layer.weight
-        
+
         logits = self.logits_processor(embedding_weights, hidden_states,
                                        sampling_metadata)
         return logits

From da05ea496612aa02f4eba0ead87e706a337ae727 Mon Sep 17 00:00:00 2001
From: "s.m.kochetkov"
Date: Fri, 7 Jun 2024 05:25:08 +0000
Subject: [PATCH 10/13] test: new shapes added to test_punica.py

---
 tests/lora/test_punica.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/lora/test_punica.py b/tests/lora/test_punica.py
index f021c003b132..80c4c0139c67 100644
--- a/tests/lora/test_punica.py
+++ b/tests/lora/test_punica.py
@@ -75,10 +75,12 @@ def _lora_ref_impl(
     9216,
     10240,
     11008,
+    11264,
     13824,
     14336,
     15360,
     22016,
+    22528,
     24576,
     27392,
     27648,
@@ -90,6 +92,8 @@ def _lora_ref_impl(
     36864,
     43264,
     49152,
+    60544,
+    60672,
     64000,
     64256,
     102400,

From 6d43ac61d591b5f0b0ba7da292241b1ddd23496d Mon Sep 17 00:00:00 2001
From: "s.m.kochetkov"
Date: Fri, 7 Jun 2024 05:31:51 +0000
Subject: [PATCH 11/13] format

---
 vllm/model_executor/models/commandr.py | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py
index 7b0bb52ea31f..4a14634d7319 100644
--- a/vllm/model_executor/models/commandr.py
+++ b/vllm/model_executor/models/commandr.py
@@ -319,15 +319,9 @@ class CohereForCausalLM(nn.Module):
     }
     # LoRA specific attributes
     supported_lora_modules = [
-        "qkv_proj",
-        "o_proj",
-        "gate_up_proj",
-        "down_proj",
-        "embed_tokens"
+        "qkv_proj", "o_proj", "gate_up_proj", "down_proj", "embed_tokens"
     ]
-    embedding_modules = {
-        "embed_tokens": "input_embeddings"
-    }
+    embedding_modules = {"embed_tokens": "input_embeddings"}
     embedding_padding_modules = []

From 7c605d6ea1e20f548a0de4d5f4d4d76e78d89c4b Mon Sep 17 00:00:00 2001
From: "s.m.kochetkov"
Date: Fri, 7 Jun 2024 06:23:48 +0000
Subject: [PATCH 12/13] ci: trigger ci/cd

---
 tests/lora/test_punica.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/lora/test_punica.py b/tests/lora/test_punica.py
index 80c4c0139c67..1ab2afc3d6aa 100644
--- a/tests/lora/test_punica.py
+++ b/tests/lora/test_punica.py
@@ -99,7 +99,7 @@ def _lora_ref_impl(
     102400,
     102656,
     128000,
-    128256,
+    128256
 ]
 H2 = [64] + H2
 R = [1, 2, 4]

From f27e4e44668049b6b272301d5b95626214fda1d9 Mon Sep 17 00:00:00 2001
From: "s.m.kochetkov"
Date: Fri, 7 Jun 2024 06:31:17 +0000
Subject: [PATCH 13/13] reformat

---
 tests/lora/test_punica.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/lora/test_punica.py b/tests/lora/test_punica.py
index 1ab2afc3d6aa..80c4c0139c67 100644
--- a/tests/lora/test_punica.py
+++ b/tests/lora/test_punica.py
@@ -99,7 +99,7 @@ def _lora_ref_impl(
     102400,
     102656,
     128000,
-    128256
+    128256,
 ]
 H2 = [64] + H2
 R = [1, 2, 4]
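With the full series applied, Command-R should be servable with LoRA adapters
through vLLM's standard multi-LoRA path. An end-to-end sketch, assuming the
public vLLM API of this period; the adapter name and path are placeholders,
not artifacts of this series:

from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

# enable_lora activates the punica BGMV kernels whose output dimensions
# patches 01 and 06 register; the adapter rank must fit within the narrow
# sizes compiled into bgmv_config.h.
llm = LLM(model="CohereForAI/c4ai-command-r-v01",
          enable_lora=True,
          max_loras=1,
          max_lora_rank=16)

outputs = llm.generate(
    ["Write a haiku about tensor parallelism."],
    SamplingParams(temperature=0.0, max_tokens=64),
    lora_request=LoRARequest("commandr-adapter", 1, "/path/to/lora"))
print(outputs[0].outputs[0].text)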