
Commit 02b4cac

Add BigDL Llama worker for batching on decoding (vllm-project#4)
* Init
* refine
1 parent: e36fc39

File tree: 8 files changed (+248 −96 lines)

tests/under_models/send_mock_request.py

Lines changed: 2 additions & 1 deletion
@@ -43,6 +43,7 @@ async def step_async(self) -> List[RequestOutput]:
             blocks_to_swap_out={},
             blocks_to_copy={},
         )
+        print(output)
 
         # TODO: change this to real one
         return RequestOutput(request_id=request_id, prompt="", prompt_token_ids=[1, 3087, 8970, 338, 263], outputs=[], finished=False)
@@ -109,7 +110,7 @@ async def _run_workers_async(
 @pytest.mark.asyncio
 async def test_model_execution():
     # Let's build an engine_args
-    engine_args = AsyncEngineArgs(model='/models/vicuna-7b/', tokenizer='/models/vicuna-7b/', tokenizer_mode='auto', trust_remote_code=False, download_dir=None, load_format='auto', dtype='auto', seed=0, max_model_len=None, worker_use_ray=False, pipeline_parallel_size=1, tensor_parallel_size=1, block_size=16, swap_space=16, gpu_memory_utilization=0.9, max_num_batched_tokens=None, max_num_seqs=256, disable_log_stats=False, revision=None, tokenizer_revision=None, quantization=None, engine_use_ray=False, disable_log_requests=True, max_log_len=None)
+    engine_args = AsyncEngineArgs(model='/models/vicuna-7b/', tokenizer='/models/vicuna-7b/', tokenizer_mode='auto', trust_remote_code=False, download_dir=None, load_format='dummy', dtype='auto', seed=0, max_model_len=None, worker_use_ray=False, pipeline_parallel_size=1, tensor_parallel_size=1, block_size=16, swap_space=16, gpu_memory_utilization=0.9, max_num_batched_tokens=None, max_num_seqs=256, disable_log_stats=False, revision=None, tokenizer_revision=None, quantization=None, engine_use_ray=False, disable_log_requests=True, max_log_len=None)
     # Start the engine
     engine = AsyncLLMEngine.from_engine_args(engine_args)
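
For context, a minimal sketch of how a mock request could be driven through the engine once test_model_execution has built it. The prompt text, sampling settings, and request id below are illustrative assumptions, not values from this commit; only AsyncLLMEngine.generate and SamplingParams are taken from vLLM itself.

from vllm import SamplingParams
from vllm.engine.async_llm_engine import AsyncLLMEngine

async def drive_mock_request(engine: AsyncLLMEngine) -> None:
    # Hypothetical settings for a single short decode.
    sampling_params = SamplingParams(temperature=0.0, max_tokens=16)
    # AsyncLLMEngine.generate yields RequestOutput objects as decoding progresses.
    async for request_output in engine.generate("Mock prompt",
                                                sampling_params,
                                                request_id="mock-0"):
        print(request_output)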

vllm/core/scheduler.py

Lines changed: 6 additions & 6 deletions
@@ -68,12 +68,12 @@ def __init__(
 
         # Instantiate the scheduling policy.
         self.policy = PolicyFactory.get_policy(policy_name="fcfs")
-        # Create the block space manager.
-        self.block_manager = BlockSpaceManager(
-            block_size=self.cache_config.block_size,
-            num_gpu_blocks=self.cache_config.num_gpu_blocks,
-            num_cpu_blocks=self.cache_config.num_cpu_blocks,
-            sliding_window=self.cache_config.sliding_window)
+        # # Create the block space manager.
+        # self.block_manager = BlockSpaceManager(
+        #     block_size=self.cache_config.block_size,
+        #     num_gpu_blocks=self.cache_config.num_gpu_blocks,
+        #     num_cpu_blocks=self.cache_config.num_cpu_blocks,
+        #     sliding_window=self.cache_config.sliding_window)
 
         # TODO(zhuohan): Use deque instead of list for better performance.
         # Sequence groups in the WAITING state.

vllm/engine/llm_engine.py

Lines changed: 2 additions & 2 deletions
@@ -109,8 +109,8 @@ def __init__(
         else:
             self._init_workers(distributed_init_method)
 
-        # Profile the memory usage and initialize the cache.
-        self._init_cache()
+        # # Profile the memory usage and initialize the cache.
+        # self._init_cache()
 
         # Create the scheduler.
         self.scheduler = Scheduler(scheduler_config, cache_config)

vllm/model_executor/input_metadata.py

Lines changed: 27 additions & 27 deletions
@@ -25,48 +25,48 @@ def __init__(
         seq_groups: List[Tuple[List[int], SamplingParams]],
         seq_data: Dict[int, SequenceData],
         prompt_lens: List[int],
-        slot_mapping: torch.Tensor,
+        # slot_mapping: torch.Tensor,
         context_lens: torch.Tensor,
         max_context_len: int,
-        block_tables: torch.Tensor,
+        # block_tables: torch.Tensor,
         sliding_window: Optional[int] = None,
     ) -> None:
         self.seq_groups = seq_groups
         self.seq_data = seq_data
         self.prompt_lens = prompt_lens
-        self.slot_mapping = slot_mapping
+        # self.slot_mapping = slot_mapping
         self.context_lens = context_lens
         self.max_context_len = max_context_len
-        self.block_tables = block_tables
+        # self.block_tables = block_tables
 
         self.to_cache = None
-        if sliding_window is not None:
-            # We need to keep the positions of sliding windows within
-            # the key / value tables, this is helpful to know which
-            # elements we need to cache and where
-            to_cache, start_idx = [], 0
-            for prompt_len in self.prompt_lens:
-                to_cache.extend(
-                    range(
-                        start_idx + max(0, prompt_len - sliding_window),
-                        start_idx + prompt_len,
-                    ))
-                start_idx += prompt_len
-            to_cache.extend(range(start_idx, slot_mapping.shape[0]))
-            self.to_cache = torch.tensor(to_cache,
-                                         dtype=torch.int32,
-                                         device=self.slot_mapping.device)
+        # if sliding_window is not None:
+        #     # We need to keep the positions of sliding windows within
+        #     # the key / value tables, this is helpful to know which
+        #     # elements we need to cache and where
+        #     to_cache, start_idx = [], 0
+        #     for prompt_len in self.prompt_lens:
+        #         to_cache.extend(
+        #             range(
+        #                 start_idx + max(0, prompt_len - sliding_window),
+        #                 start_idx + prompt_len,
+        #             ))
+        #         start_idx += prompt_len
+        #     to_cache.extend(range(start_idx, slot_mapping.shape[0]))
+        #     self.to_cache = torch.tensor(to_cache,
+        #                                  dtype=torch.int32,
+        #                                  device=self.slot_mapping.device)
 
         self.num_prompts = len(prompt_lens)
         self.num_prompt_tokens = sum(prompt_lens)
         self.num_generation_tokens = context_lens.shape[0]
-        self.num_valid_tokens = slot_mapping.shape[0]
-        if block_tables.numel() > 0:
-            self.max_num_blocks_per_seq = block_tables.shape[1]
-        else:
-            self.max_num_blocks_per_seq = 0
-        assert block_tables.shape[0] == self.num_generation_tokens
-        assert context_lens.shape[0] == self.num_generation_tokens
+        # self.num_valid_tokens = slot_mapping.shape[0]
+        # if block_tables.numel() > 0:
+        #     self.max_num_blocks_per_seq = block_tables.shape[1]
+        # else:
+        #     self.max_num_blocks_per_seq = 0
+        # assert block_tables.shape[0] == self.num_generation_tokens
+        # assert context_lens.shape[0] == self.num_generation_tokens
 
         # Set during the execution of the first attention op.
         self.attn_bias: List[AttentionBias] = []
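
With the slot-mapping and block-table fields commented out, InputMetadata reduces to token and length bookkeeping. A rough sketch of a decode-step construction matching the trimmed signature; the sequence id and token ids are made-up illustrative values, not from this commit.

import torch

from vllm.model_executor.input_metadata import InputMetadata
from vllm.sampling_params import SamplingParams
from vllm.sequence import SequenceData

# Hypothetical single-sequence decoding step: seq id 0 with a 5-token context.
metadata = InputMetadata(
    seq_groups=[([0], SamplingParams())],
    seq_data={0: SequenceData([1, 3087, 8970, 338, 263])},
    prompt_lens=[],                                    # no prompts in a pure decode batch
    context_lens=torch.tensor([5], dtype=torch.int32),
    max_context_len=5,
)
print(metadata.num_generation_tokens)                  # -> 1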

vllm/model_executor/model_loader.py

Lines changed: 1 addition & 1 deletion
@@ -23,7 +23,7 @@
     "GPTJForCausalLM": GPTJForCausalLM,
     "GPTNeoXForCausalLM": GPTNeoXForCausalLM,
     "InternLMForCausalLM": InternLMForCausalLM,
-    "LlamaForCausalLM": LlamaForCausalLM,
+    "LlamaForCausalLM": BigDLLlamaForCausalLM,
     "LLaMAForCausalLM": LlamaForCausalLM,  # For decapoda-research/llama-*
     "MistralForCausalLM": MistralForCausalLM,
     "MPTForCausalLM": MPTForCausalLM,

vllm/model_executor/models/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -9,6 +9,7 @@
 from vllm.model_executor.models.gpt_neox import GPTNeoXForCausalLM
 from vllm.model_executor.models.internlm import InternLMForCausalLM
 from vllm.model_executor.models.llama import LlamaForCausalLM
+from vllm.model_executor.models.bigdl_llama import BigDLLlamaForCausalLM
 from vllm.model_executor.models.mpt import MPTForCausalLM
 from vllm.model_executor.models.opt import OPTForCausalLM
 from vllm.model_executor.models.qwen import QWenLMHeadModel
@@ -26,6 +27,7 @@
     "GPTNeoXForCausalLM",
     "InternLMForCausalLM",
     "LlamaForCausalLM",
+    "BigDLLlamaForCausalLM",
     "MPTForCausalLM",
     "OPTForCausalLM",
     "QWenLMHeadModel",

vllm/model_executor/models/bigdl_llama.py

Lines changed: 140 additions & 0 deletions
@@ -0,0 +1,140 @@
+import torch
+from torch import nn
+
+from transformers import AutoTokenizer, AutoModelForCausalLM, PreTrainedTokenizerBase, LlamaConfig
+from typing import Optional, Tuple, List, Type, Dict
+
+from vllm.transformers_utils.tokenizer import (detokenize_incrementally,
+                                               get_tokenizer)
+from vllm.model_executor.quantization_utils import QuantizationConfig
+from vllm.sequence import SamplerOutput, SequenceOutputs
+import math
+
+import pdb
+
+from transformers.generation.logits_process import (
+    LogitsProcessorList,
+    RepetitionPenaltyLogitsProcessor,
+    TemperatureLogitsWarper,
+    TopKLogitsWarper,
+    TopPLogitsWarper,
+)
+
+def prepare_logits_processor(
+    temperature: float, repetition_penalty: float, top_p: float, top_k: int
+) -> LogitsProcessorList:
+    processor_list = LogitsProcessorList()
+    # TemperatureLogitsWarper doesn't accept 0.0, 1.0 makes it a no-op so we skip two cases.
+    if temperature >= 1e-5 and temperature != 1.0:
+        processor_list.append(TemperatureLogitsWarper(temperature))
+    # if repetition_penalty > 1.0:
+    #     processor_list.append(RepetitionPenaltyLogitsProcessor(repetition_penalty))
+    if 1e-8 <= top_p < 1.0:
+        processor_list.append(TopPLogitsWarper(top_p))
+    if top_k > 0:
+        processor_list.append(TopKLogitsWarper(top_k))
+    return processor_list
+
+class BigDLLlamaForCausalLM(nn.Module):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+    ):
+        super().__init__()
+        # pdb.set_trace()
+        self.config = config
+        self.model = AutoModelForCausalLM.from_pretrained(config._name_or_path)
+        self.tokenizer = AutoTokenizer.from_pretrained(config._name_or_path)
+
+    def decode(self, generated_ids: List[int]) -> str:
+        return self.tokenizer.decode(
+            generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )
+
+    def forward(
+        self, seq_group_meta_data_lists, kv_cache: Optional = None
+    ) -> Tuple[torch.Tensor, List[Tuple[torch.Tensor, torch.Tensor]]]:
+        kv_cache_0 = self.model.config.num_hidden_layers
+        kv_cache_1 = 2
+        bigdl_kv_cache = [[torch.Tensor() for _ in range(kv_cache_1)] for _ in range(kv_cache_0)]
+        seq_len = len(seq_group_meta_data_lists)
+        for i in range(seq_len):
+            if kv_cache.get(i) is None:
+                kv_cache[i] = bigdl_kv_cache[:]
+
+        bigdl_input_ids = []
+        bigdl_position_ids = []
+        cur_seq_ids = []
+        bigdl_sampling_params = {}
+
+        all_decoding = True
+        for seq_group_meta_data in seq_group_meta_data_lists:
+            req_id = seq_group_meta_data.request_id
+            all_decoding = all_decoding and (not seq_group_meta_data.is_prompt)
+            seq_ids = list(seq_group_meta_data.seq_data.keys())
+            seq_id = seq_ids[0]
+            print(seq_id)
+            cur_seq_ids.append(seq_id)
+            seq_data = seq_group_meta_data.seq_data[seq_id]
+
+            cur_seq_input_ids = seq_data.get_token_ids()
+            bigdl_input_ids.append(cur_seq_input_ids)
+
+            bigdl_sampling_params[seq_id] = seq_group_meta_data.sampling_params
+
+            context_len = seq_data.get_len()
+            bigdl_position_ids.append(range(context_len))
+        if all_decoding:
+            for seq_group_meta_data in seq_group_meta_data_lists:
+                for i in range(kv_cache_0):
+                    for j in range(kv_cache_1):
+                        bigdl_kv_cache[i][j] = torch.cat((bigdl_kv_cache[i][j], kv_cache[seq_id][i][j]), dim=0)
+
+        bigdl_input_ids = torch.tensor(bigdl_input_ids, device="cuda")
+        bigdl_position_ids = torch.tensor(bigdl_position_ids, device="cuda")
+        if all_decoding:
+            kwargs = {
+                "input_ids": bigdl_input_ids,
+                "position_ids": bigdl_position_ids,
+                "past_key_values": bigdl_kv_cache,
+                "use_cache": True,
+                "return_dict": True,
+            }
+        else:
+            kwargs = {
+                "input_ids": bigdl_input_ids,
+                "position_ids": bigdl_position_ids,
+                "past_key_values": None,
+                "use_cache": True,
+                "return_dict": True,
+            }
+        # kwargs["position_ids"] = position_ids
+        outputs = self.model.forward(**kwargs)
+        index = 0
+        bigdl_output = []
+        for seq_id in cur_seq_ids:
+            cur_sampling_params = bigdl_sampling_params[seq_id]
+            logits_processor = prepare_logits_processor(
+                cur_sampling_params.temperature, 1,
+                cur_sampling_params.top_p, cur_sampling_params.top_k
+            )
+
+            last_token_logits = logits_processor(None, outputs.logits[index:index+1, -1, :])[0]
+            probs = torch.softmax(last_token_logits, dim=-1)
+            indices = torch.multinomial(probs, num_samples=2)
+            tokens = [int(token) for token in indices.tolist()]
+
+            logprobs = math.log(probs[tokens[0]])
+            seq_output = SequenceOutputs(
+                parent_seq_id = seq_id,
+                output_token = tokens[0],
+                logprobs = {tokens[0]: logprobs}
+            )
+            bigdl_output.append([seq_output])
+
+            for i in range(kv_cache_0):
+                for j in range(kv_cache_1):
+                    kv_cache[seq_id][i][j] = outputs.past_key_values[i][j][index].unsqueeze(0)
+            index = index + 1
+        return bigdl_output
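
The sampling tail of forward (build the logits processors, softmax the last-token logits, draw with multinomial, record a logprob) can be exercised in isolation. Below is a small self-contained sketch using made-up logits for a toy 8-token vocabulary and reusing the same prepare_logits_processor construction as the file above; the temperature/top-p/top-k values are arbitrary.

import math

import torch
from transformers.generation.logits_process import (
    LogitsProcessorList,
    TemperatureLogitsWarper,
    TopKLogitsWarper,
    TopPLogitsWarper,
)

def prepare_logits_processor(temperature: float, repetition_penalty: float,
                             top_p: float, top_k: int) -> LogitsProcessorList:
    # Same construction as bigdl_llama.py above (repetition penalty left disabled there too).
    processor_list = LogitsProcessorList()
    if temperature >= 1e-5 and temperature != 1.0:
        processor_list.append(TemperatureLogitsWarper(temperature))
    if 1e-8 <= top_p < 1.0:
        processor_list.append(TopPLogitsWarper(top_p))
    if top_k > 0:
        processor_list.append(TopKLogitsWarper(top_k))
    return processor_list

# Made-up last-token logits for a toy vocabulary of 8 tokens.
logits = torch.tensor([[0.1, 2.0, 0.3, 1.5, -0.2, 0.0, 0.7, 0.9]])
processor = prepare_logits_processor(temperature=0.7, repetition_penalty=1.0,
                                     top_p=0.9, top_k=4)
last_token_logits = processor(None, logits)[0]
probs = torch.softmax(last_token_logits, dim=-1)
token = int(torch.multinomial(probs, num_samples=1))   # sampled token id
logprob = math.log(probs[token])                        # logprob recorded for that token
print(token, logprob)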
