
Commit 92a8bbb

wip flashinfer mla backend
Signed-off-by: Lucas Wilkinson <[email protected]>

wip flash-infer

Signed-off-by: Lucas Wilkinson <[email protected]>

wip debugging

Signed-off-by: Lucas Wilkinson <[email protected]>
1 parent e7ef74e commit 92a8bbb

File tree

7 files changed: +808 -112 lines changed


examples/deepseek-chat.py

Lines changed: 85 additions & 0 deletions
@@ -0,0 +1,85 @@
# SPDX-License-Identifier: Apache-2.0

from vllm import LLM, SamplingParams

llm = LLM(
    model="deepseek-ai/DeepSeek-V2-Lite",
    trust_remote_code=True,
)
sampling_params = SamplingParams(temperature=0.5)


def print_outputs(outputs):
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
    print("-" * 80)


print("=" * 80)

# In this script, we demonstrate how to pass input to the chat method:

conversation = [
    {
        "role": "system",
        "content": "You are a helpful assistant"
    },
    {
        "role": "user",
        "content": "Hello"
    },
    {
        "role": "assistant",
        "content": "Hello! How can I assist you today?"
    },
    {
        "role": "user",
        "content": "Write an essay about the importance of higher education.",
    },
]
outputs = llm.chat(conversation,
                   sampling_params=sampling_params,
                   use_tqdm=False)
print_outputs(outputs)

# You can run batch inference with llm.chat API
conversation = [
    {
        "role": "system",
        "content": "You are a helpful assistant"
    },
    {
        "role": "user",
        "content": "Hello"
    },
    {
        "role": "assistant",
        "content": "Hello! How can I assist you today?"
    },
    {
        "role": "user",
        "content": "Write an essay about the importance of higher education.",
    },
]
conversations = [conversation for _ in range(10)]

# We turn on tqdm progress bar to verify it's indeed running batch inference
outputs = llm.chat(messages=conversations,
                   sampling_params=sampling_params,
                   use_tqdm=True)
print_outputs(outputs)

# A chat template can be optionally supplied.
# If not, the model will use its default chat template.

# with open('template_falcon_180b.jinja', "r") as f:
#     chat_template = f.read()

# outputs = llm.chat(
#     conversations,
#     sampling_params=sampling_params,
#     use_tqdm=False,
#     chat_template=chat_template,
# )
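The example above prints full prompts and completions via print_outputs. As a purely illustrative addition that is not part of this commit, a small helper like the sketch below could summarize the batched results; it relies only on an attribute path the example already uses (output.outputs[0].text) and on llm.chat returning one result per submitted conversation, in order.

from typing import List


def summarize_batch(conversations: List[list], outputs: List) -> None:
    # Hypothetical helper (not in the commit): print a one-line summary for
    # each batched chat result.
    assert len(outputs) == len(conversations), "one result per conversation"
    for i, output in enumerate(outputs):
        # Reuse the attribute path the example's print_outputs accesses.
        print(f"[{i}] completion chars: {len(output.outputs[0].text)}")

If desired, it could be called as summarize_batch(conversations, outputs) right after the batched llm.chat call.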

vllm/attention/backends/flashinfer.py

Lines changed: 7 additions & 72 deletions
@@ -32,12 +32,13 @@
                                               AttentionMetadata,
                                               AttentionMetadataBuilder,
                                               AttentionState, AttentionType)
-from vllm.attention.backends.utils import (PAD_SLOT_ID, compute_slot_mapping,
+from vllm.attention.backends.utils import (PAD_SLOT_ID, PerLayerParameters,
+                                            compute_slot_mapping,
                                             compute_slot_mapping_start_idx,
+                                            infer_global_hyperparameters,
                                             is_block_tables_empty)
-from vllm.attention.layer import Attention
 from vllm.attention.ops.paged_attn import PagedAttention
-from vllm.config import VllmConfig, get_current_vllm_config
+from vllm.config import get_current_vllm_config
 from vllm.utils import (async_tensor_h2d, get_kv_cache_torch_dtype,
                         make_tensor_with_pad)
 
@@ -106,72 +107,6 @@ def get_fp8_dtype_for_flashinfer(kv_cache_dtype: str) -> torch.dtype:
         raise ValueError(f"Unrecognized FP8 dtype: {kv_cache_dtype}")
 
 
-@dataclass
-class PerLayerParameters:
-    """
-    Currently, FlashInfer backend only support models in which all layers share
-    the same values for the following hyperparameters.
-    """
-
-    window_left: int
-    logits_soft_cap: Optional[float]
-    sm_scale: float
-
-
-def get_per_layer_parameters(
-        vllm_config: VllmConfig) -> Dict[str, PerLayerParameters]:
-    """
-    Scan all attention layers and determine some hyperparameters
-    to use during `plan`.
-    """
-
-    layers = vllm_config.compilation_config.static_forward_context
-    per_layer_params: Dict[str, PerLayerParameters] = {}
-
-    for key, layer in layers.items():
-        assert isinstance(layer, Attention)
-
-        impl = layer.impl
-        assert isinstance(impl, FlashInferImpl)
-
-        # Infer hyperparameters from the attention layer
-        window_size = impl.sliding_window
-        window_left = window_size[0] if window_size is not None else -1
-        logits_soft_cap = impl.logits_soft_cap
-        sm_scale = impl.scale
-
-        per_layer_params[key] = PerLayerParameters(window_left,
-                                                   logits_soft_cap, sm_scale)
-
-    return per_layer_params
-
-
-def infer_global_hyperparameters(
-        per_layer_params: Dict[str, PerLayerParameters]) -> PerLayerParameters:
-    """
-    Currently, FlashInfer backend only support models in which all layers share
-    the same values for the following hyperparameters:
-    - `window_left`
-    - `logits_soft_cap`
-    - `sm_scale`
-
-    So this function asserts that all layers share the same values for these
-    hyperparameters and returns the global values.
-    """
-
-    assert len(per_layer_params) > 0, "No attention layers found in the model."
-
-    param_sets = list(per_layer_params.values())
-    global_params = param_sets[0]
-    for params in param_sets:
-        assert params == global_params, (
-            "FlashInfer backend currently only supports models in which all "
-            "layers share the same values for the following hyperparameters: "
-            "`window_left`, `logits_soft_cap`, `sm_scale`.")
-
-    return global_params
-
-
 class FlashInferState(AttentionState):
 
     def __init__(self, runner):
@@ -293,8 +228,8 @@ def graph_capture_get_metadata_for_batch(
             batch_size + 1,
             dtype=torch.int32)
 
-        global_params = infer_global_hyperparameters(
-            get_per_layer_parameters(self.vllm_config))
+        global_params = infer_global_hyperparameters(self.vllm_config,
+                                                     FlashInferImpl)
 
         attn_metadata = self.runner.attn_backend.make_metadata(
             num_prefills=0,
@@ -652,7 +587,7 @@ def prepare(self):
         # - `logits_soft_cap`
         # - `sm_scale`
         inferred_params = infer_global_hyperparameters(
-            get_per_layer_parameters(self.vllm_config))
+            self.vllm_config, FlashInferImpl)
         self.global_hyperparameters = inferred_params
         self.window_left = inferred_params.window_left
         self.logits_soft_cap = inferred_params.logits_soft_cap
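For context, the helpers removed above appear to have been relocated to vllm/attention/backends/utils.py in a generalized form: the new imports pull PerLayerParameters and infer_global_hyperparameters from that module, and both call sites now pass (self.vllm_config, FlashInferImpl) instead of a precomputed per-layer dict. The sketch below reconstructs what such a generalized helper could look like, based only on the removed code and the new call sites; the parameter name cls, the exact signature, and the module layout are assumptions, not taken from this diff.

# Hedged sketch of the relocated helpers (assumed to live in
# vllm/attention/backends/utils.py); the logic mirrors the code removed above,
# with the attention-impl class passed in instead of being hard-coded to
# FlashInferImpl. The two-argument signature is inferred from the new call
# site `infer_global_hyperparameters(self.vllm_config, FlashInferImpl)`.
from dataclasses import dataclass
from typing import Dict, Optional, Type

from vllm.attention.layer import Attention
from vllm.config import VllmConfig


@dataclass
class PerLayerParameters:
    """Per-layer hyperparameters consumed during FlashInfer `plan`."""
    window_left: int
    logits_soft_cap: Optional[float]
    sm_scale: float


def infer_global_hyperparameters(vllm_config: VllmConfig,
                                 cls: Type) -> PerLayerParameters:
    """Scan all attention layers whose impl is `cls`, assert they share the
    same hyperparameters, and return the shared values (same checks as the
    removed per-layer code)."""
    layers = vllm_config.compilation_config.static_forward_context
    per_layer_params: Dict[str, PerLayerParameters] = {}
    for key, layer in layers.items():
        assert isinstance(layer, Attention)
        impl = layer.impl
        assert isinstance(impl, cls)
        # Infer hyperparameters from the attention layer, as before.
        window_size = impl.sliding_window
        window_left = window_size[0] if window_size is not None else -1
        per_layer_params[key] = PerLayerParameters(window_left,
                                                   impl.logits_soft_cap,
                                                   impl.scale)

    assert len(per_layer_params) > 0, "No attention layers found in the model."
    param_sets = list(per_layer_params.values())
    global_params = param_sets[0]
    for params in param_sets:
        assert params == global_params, (
            "All layers must share `window_left`, `logits_soft_cap`, and "
            "`sm_scale`.")
    return global_params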
