-
-
Notifications
You must be signed in to change notification settings - Fork 15.1k
[Model] Activated LoRA #19710
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Model] Activated LoRA #19710
Changes from 34 commits
8a9610e
a68e70b
b254fb7
3897b1b
24ff376
412eacd
32098e4
fb6d28e
5f62d8b
f9396b0
6f36f6d
c6ffe8f
4a4b568
4cbef84
49a5bdc
a9ac26d
5c2e181
ceae7c7
99b8b60
477ab6e
91f39d1
5abbb78
0a20f2a
438ab6f
51edf96
a9d5986
6fbc108
cb373e9
24dfc4a
6c1b46a
b8444d9
4e513cc
b9df31f
643d893
199ee89
03b6480
76744da
6b83cc4
18397d7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,27 @@ | ||
| # LoRA Examples | ||
|
|
||
| This folder contains examples of offline inference using LoRA. | ||
|
|
||
| ## Multi-LoRA | ||
|
|
||
| This example shows how to use the multi-LoRA functionality: | ||
|
|
||
| ```bash | ||
| python examples/offline_inference/lora/multilora_inference.py | ||
| ``` | ||
|
|
||
| ## LoRA with Quantization | ||
|
|
||
| This example shows how to use LoRA with different quantization techniques: | ||
|
|
||
| ```bash | ||
| python examples/offline_inference/lora/lora_with_quantization_inference.py | ||
| ``` | ||
|
|
||
| ## Activated LoRA | ||
|
|
||
| This example shows how to use [activated LoRA](https://arxiv.org/abs/2504.12397): ||
|
|
||
| ```bash | ||
| python examples/offline_inference/lora/activated_lora.py | ||
| ``` |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,71 @@ | ||
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Offline-inference example for Activated LoRA (aLoRA).

Runs a base Granite model, then re-runs the base model's answer through an
aLoRA uncertainty adapter whose weights only activate after the invocation
string appears in the prompt.
"""
import time

import torch
from huggingface_hub import snapshot_download

from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

BASE_NAME = "ibm-granite/granite-3.2-8b-instruct"

ALORA_NAME = "ibm-granite/granite-3.2-8b-alora-uncertainty"
# The adapter's weights activate only on tokens after this marker.
invocation_string = "<|start_of_role|>certainty<|end_of_role|>"

# download your LoRA adapter to ~/.cache/huggingface/…
alora_path = snapshot_download(repo_id=ALORA_NAME)

print(alora_path)
#######################################

llm = LLM(
    model=BASE_NAME,
    enable_lora=True,
    enable_activated_lora=True,
    dtype=torch.bfloat16,
    max_lora_rank=64,
)

prompts = [
    (
        "<|start_of_role|>user<|end_of_role|>What is MIT?<|end_of_text|>\n"
        "<|start_of_role|>assistant<|end_of_role|>"
    ),
]

sampling_params = SamplingParams(temperature=0, max_tokens=600)

# First pass: generate answers with the base model (no adapter).
outputs_base = llm.generate(
    prompts,
    sampling_params,
    use_tqdm=False,
)
base_generations = []
for output in outputs_base:
    prompt = output.prompt
    base_generations.append(output.outputs[0].text)
    print(f"Prompt: {prompt!r}, Generated text: {base_generations[-1]!r}")

# Second pass: append each base answer plus the invocation string so the
# aLoRA adapter activates and scores the model's certainty.
prompts_alora = [
    x + y + "<|end_of_text|>\n" + invocation_string
    for x, y in zip(prompts, base_generations)
]

sampling_params = SamplingParams(temperature=0, max_tokens=10)

t0 = time.time()
outputs = llm.generate(
    prompts_alora,
    sampling_params,
    lora_request=LoRARequest("UQ_adapter", 1, alora_path),
    use_tqdm=False,
)
t = time.time() - t0
print(f"Time: {t}")

for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -4,7 +4,7 @@ | |
| # pylint: disable=unused-argument | ||
| import math | ||
| from dataclasses import dataclass | ||
| from typing import TYPE_CHECKING, Optional, Union, cast | ||
| from typing import Optional, Union, cast | ||
|
|
||
| import torch | ||
| import torch.nn as nn | ||
|
|
@@ -19,6 +19,8 @@ | |
| tensor_model_parallel_all_gather, | ||
| tensor_model_parallel_all_reduce) | ||
| from vllm.distributed.utils import divide | ||
| from vllm.forward_context import get_forward_context | ||
| from vllm.lora.punica_wrapper import PunicaWrapperBase | ||
| # yapf: disable | ||
| from vllm.model_executor.layers.linear import (ColumnParallelLinear, | ||
| LinearBase, | ||
|
|
@@ -32,9 +34,6 @@ | |
| VocabParallelEmbedding) | ||
| from vllm.platforms import current_platform | ||
|
|
||
| if TYPE_CHECKING: | ||
| from vllm.lora.punica_wrapper import PunicaWrapperBase | ||
|
|
||
|
|
||
| def _get_lora_device(base_layer: nn.Module) -> torch.device: | ||
| # code borrowed from https://github.com/fmmoret/vllm/blob/fm-support-lora-on-quantized-models/vllm/lora/layers.py#L34 | ||
|
|
@@ -1190,3 +1189,44 @@ def can_replace_layer( | |
| ) -> bool: | ||
| # Special handling for the LogitsProcessor. | ||
| return False | ||
|
|
||
|
|
||
class ActivatedLoRAMixin:
    """Mixin adding the Activated LoRA (aLoRA) forward pass to LoRA layers.

    aLoRA applies the LoRA delta only to token positions *after* the
    invocation sequence; positions before it keep the base-layer output.
    The per-token activation mask is read from the forward context.
    """

    # Attributes supplied by the concrete LoRA layer this is mixed into.
    base_layer: LinearBase
    punica_wrapper: PunicaWrapperBase
    # NOTE: these are types, so ``torch.Tensor`` (the class), not
    # ``torch.tensor`` (the factory function).
    lora_a_stacked: torch.Tensor
    lora_b_stacked: torch.Tensor
    lora_bias_stacked: Optional[tuple[torch.Tensor, ...]]
    output_slices: tuple[int, ...]

    def apply(self,
              x: torch.Tensor,
              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Run the base layer, then blend in LoRA output per the aLoRA mask.

        Args:
            x: Input activations, ``(seq_len, hidden_dim)`` or, with the
                transformers backend, ``(1, seq_len, hidden_dim)``.
            bias: Optional bias forwarded to the base layer's quant method.

        Returns:
            Per-token mix of base output (masked positions) and
            LoRA-augmented output (unmasked positions).
        """
        output = self.base_layer.quant_method.apply(self.base_layer, x, bias)

        # In transformers backend, x and output have extra batch dimension like
        # (1, seq_len, hidden_dim), while punica expects (seq_len, hidden_dim),
        # therefore we need to flatten the batch dimensions.
        if x.ndim == 3 and output.ndim == 3:
            output = output.flatten(0, 1)
            x = x.flatten(0, 1)

        # Extract aLoRA batch metadata from forward context
        alora_metadata = get_forward_context().alora_metadata

        mask1d = alora_metadata.mask1d
        mask2d = mask1d.unsqueeze(1).to(output.dtype)

        # Clone base layer output before running LoRA
        # TODO(tdoublep): pass in mask1d and only operate on valid entries
        orig_out = output.clone()

        # Apply LoRA in-place on `output`:
        self.punica_wrapper.add_lora_linear(output, x, self.lora_a_stacked,
                                            self.lora_b_stacked,
                                            self.lora_bias_stacked, 1.0,
                                            self.output_slices)
        # Blend: masked positions keep the base output, others take LoRA.
        final_output = orig_out.mul(mask2d) + output.mul(1.0 - mask2d)
        return final_output
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -9,6 +9,7 @@ | |
| from vllm.inputs import ProcessorInputs, PromptType, SingletonInputs | ||
| from vllm.inputs.parse import split_enc_dec_inputs | ||
| from vllm.inputs.preprocess import InputPreprocessor | ||
| from vllm.lora.peft_helper import PEFTHelper | ||
| from vllm.lora.request import LoRARequest | ||
| from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry | ||
| from vllm.multimodal.cache import processor_cache_from_config | ||
|
|
@@ -429,6 +430,36 @@ def process_inputs( | |
| identifier=decoder_mm_hashes[modality][idx], | ||
| mm_position=decoder_mm_positions[modality][idx])) | ||
|
|
||
| # Handle aLoRA invocation sequence if applicable. | ||
| if (self.lora_config and self.lora_config.activated_lora_enabled | ||
| and lora_request is not None): | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. maybe also check if it is a
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We can't actually know that until we've called the PEFTHelper below to look at the adapter config |
||
|
|
||
| text_config = self.model_config.hf_config.get_text_config() | ||
|
|
||
| peft_helper = PEFTHelper.from_local_dir( | ||
| lora_request.lora_path, text_config.max_position_embeddings, | ||
| lora_request.tensorizer_config_dict) | ||
|
|
||
| if peft_helper.alora_invocation_tokens is not None: | ||
| invocation_tokens = peft_helper.alora_invocation_tokens | ||
| invocation_start = -1 | ||
| n = len(invocation_tokens) | ||
tdoublep marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| token_ids = decoder_inputs["prompt_token_ids"] | ||
| if n > 0 and len(token_ids) >= n: | ||
| # scan backward for the last match | ||
| # (faster than full forward scan+max) | ||
| for idx in range(len(token_ids) - n, -1, -1): | ||
| if token_ids[idx:idx + n] == invocation_tokens: | ||
| # weights activated after start | ||
| invocation_start = idx | ||
| break | ||
| if invocation_start == -1: | ||
| raise ValueError( | ||
| "Invocation sequence not found in prompt " | ||
| f"for request '{request_id}'. aLoRA models require the " | ||
| "invocation tokens to be present in the input.") | ||
| lora_request.invocation_start = invocation_start | ||
|
|
||
| return decoder_inputs.get("prompt"), EngineCoreRequest( | ||
| request_id=request_id, | ||
| prompt_token_ids=decoder_inputs["prompt_token_ids"], | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.