Add support for loading a module quantized with ModuleFqnToConfig using regex

jerryzh168 · jerryzh168 · commit 21383c797865 · 2025-10-08T17:19:31.000-07:00
Summary: As the title says, we are adding regex support to simplify the config, and enabling it in both transformers and vLLM to make sure the regex config works everywhere. torchao PR that adds the functionality to the quantize_ API: pytorch/ao#3084 transformer PR: Test Plan: We save the model with the regex config in transformers; in vLLM we just make sure we can load the model: pytest tests/quantization/test_torchao.py -k test_opt_125m_module_fqn_to_config_regex_model Reviewers: Subscribers: Tasks: Tags: Signed-off-by: Jerry Zhang <jerryzh168@gmail.com>
diff --git a/tests/quantization/test_torchao.py b/tests/quantization/test_torchao.py
@@ -232,6 +232,22 @@ def test_opt_125m_float8_weight_only_safetensors_model_loading_with_params(vllm_
 
         assert output
 
+@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
+@pytest.mark.skip(
+    reason="since torchao nightly is only compatible with torch nightly "
+    "currently https://github.com/pytorch/ao/issues/2919, we'll have to skip "
+    "torchao tests that requires newer versions (0.14.0.dev+) for now"
+)
+def test_opt_125m_module_fqn_to_config_regex_model(vllm_runner):
+    torch._dynamo.reset()
+    model_name = "torchao-testing/opt-125m-ModuleFqnToConfig-v1-regex-0.14.0.dev"
+    with vllm_runner(
+        model_name=model_name, dtype="bfloat16", pt_load_map_location="cuda:0"
+    ) as llm:
+        output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
+
+        assert output  # smoke check: the regex-config checkpoint loads and generates
+
 
 if __name__ == "__main__":
     pytest.main([__file__])
diff --git a/vllm/model_executor/layers/quantization/torchao.py b/vllm/model_executor/layers/quantization/torchao.py
@@ -5,6 +5,7 @@
 from importlib.util import find_spec
 from typing import Any, Optional
 
+import regex as re
 import torch
 import torch.nn.functional as F
 from packaging import version
@@ -192,9 +193,26 @@ def get_quant_method(
         module_fqn = prefix
         if isinstance(self.torchao_config, ModuleFqnToConfig):
             module_fqn_to_config = self.torchao_config.module_fqn_to_config
-            c = module_fqn_to_config.get(module_fqn) or module_fqn_to_config.get(
-                "_default", None
-            )
+            c = None
+            if module_fqn in module_fqn_to_config:
+                assert not module_fqn.startswith("re:"), (
+                    "module fqn should not start with"
+                    "`re:`, which is used for specifying regex"
+                )
+                c = module_fqn_to_config[module_fqn]
+            else:
+                for maybe_module_fqn_pattern in module_fqn_to_config:
+                    if not maybe_module_fqn_pattern.startswith("re:"):
+                        continue
+                    elif re.fullmatch(maybe_module_fqn_pattern[3:], module_fqn):
+                        # we'll apply the config for first fully matched pattern
+                        c = module_fqn_to_config[maybe_module_fqn_pattern]
+                        break
+                else:
+                    # fallback to use default if no module specific
+                    # config is provided
+                    c = module_fqn_to_config.get("_default", None)
+
             if c is not None:
                 current_torchao_config = TorchAOConfig(
                     c, self.skip_modules, self.is_checkpoint_torchao_serialized