Commit 4fd9375

[2/N][torch.compile] make compilation cfg part of vllm cfg (#10383)

Signed-off-by: youkaichao <[email protected]>
1 parent: 661a34f
27 files changed: +359 -283 lines
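Every hunk below follows the same migration: CompilationLevel and CompilationConfig move out of vllm.compilation.levels / vllm.compilation.config into vllm.config, and the compilation settings become part of VllmConfig. A minimal sketch of the new surface, assuming only what the diffs below show (the new import paths and the compilation_config keyword):

# Old import paths, deleted by this commit:
#   from vllm.compilation.levels import CompilationLevel
#   from vllm.compilation.config import CompilationConfig
# New home for both, alongside VllmConfig:
from vllm.config import CompilationConfig, CompilationLevel, VllmConfig

# Compilation settings now travel inside the top-level vLLM config.
vllm_config = VllmConfig(compilation_config=CompilationConfig(
    custom_ops=["all"]))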

tests/compile/piecewise/test_simple.py
Lines changed: 5 additions & 3 deletions

@@ -11,8 +11,8 @@
 from vllm.compilation.compile_context import set_compile_context
 from vllm.compilation.counter import compilation_counter
 from vllm.compilation.decorators import support_torch_compile
-from vllm.compilation.levels import CompilationLevel
-from vllm.config import VllmConfig
+from vllm.config import CompilationLevel, VllmConfig
+from vllm.plugins import set_current_vllm_config
 from vllm.utils import direct_register_custom_op
 
 global_counter = 0
@@ -82,7 +82,9 @@ def test_simple_piecewise_compile():
     os.environ["VLLM_TORCH_COMPILE_CONFIG"] = config
     os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(CompilationLevel.PIECEWISE)
 
-    model = SillyModel(vllm_config=VllmConfig(), prefix='')
+    vllm_config = VllmConfig()
+    with set_current_vllm_config(vllm_config):
+        model = SillyModel(vllm_config=vllm_config, prefix='')
 
     inputs = torch.randn(100).cuda()
 
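The second hunk introduces the construction pattern that recurs through the rest of this commit: build the VllmConfig first, then instantiate the model inside set_current_vllm_config so that layers created during __init__ can see the active config. A standalone sketch of the pattern; ToyModel here is a placeholder, not a vLLM class:

import torch.nn as nn

from vllm.config import VllmConfig
from vllm.plugins import set_current_vllm_config


class ToyModel(nn.Module):
    # Stand-in for SillyModel / LlamaModel from the tests in this commit.
    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
        self.proj = nn.Linear(8, 8)


vllm_config = VllmConfig()
with set_current_vllm_config(vllm_config):
    # Layers built here can look up the current config.
    model = ToyModel(vllm_config=vllm_config, prefix="")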

tests/compile/piecewise/test_toy_llama.py
Lines changed: 12 additions & 10 deletions

@@ -15,12 +15,10 @@
 from torch.library import Library
 
 from vllm.compilation.compile_context import set_compile_context
-from vllm.compilation.config import CompilationConfig
 from vllm.compilation.counter import compilation_counter
 from vllm.compilation.decorators import support_torch_compile
-from vllm.compilation.levels import CompilationLevel
-from vllm.config import VllmConfig
-from vllm.plugins import set_compilation_config
+from vllm.config import CompilationConfig, CompilationLevel, VllmConfig
+from vllm.plugins import set_compilation_config, set_current_vllm_config
 from vllm.utils import direct_register_custom_op
 
 # create a library to hold the custom op
@@ -272,9 +270,11 @@ def run_model(llama_config,
             CompilationLevel.NO_COMPILATION)
         set_compilation_config(None)
 
-    model = LlamaModel(config=llama_config,
-                       vllm_config=VllmConfig(),
-                       prefix="").eval().cuda()
+    vllm_config = VllmConfig()
+    with set_current_vllm_config(vllm_config):
+        model = LlamaModel(config=llama_config,
+                           vllm_config=vllm_config,
+                           prefix="").eval().cuda()
 
     B = 16  # max batch size
     input_ids = torch.randint(0, llama_config.vocab_size, (B, )).cuda()
@@ -395,9 +395,11 @@ def benchmark():
     else:
         set_compilation_config(None)
 
-    model = LlamaModel(config=llama_config,
-                       vllm_config=VllmConfig(),
-                       prefix="").eval().cuda().to(torch.bfloat16)
+    vllm_config = VllmConfig()
+    with set_current_vllm_config(vllm_config):
+        model = LlamaModel(config=llama_config,
+                           vllm_config=vllm_config,
+                           prefix="").eval().cuda().to(torch.bfloat16)
 
     B = 256  # max batch size
     input_ids = torch.randint(0, llama_config.vocab_size, (B, )).cuda()

tests/compile/test_basic_correctness.py
Lines changed: 1 addition & 1 deletion

@@ -3,7 +3,7 @@
 
 import pytest
 
-from vllm.compilation.levels import CompilationLevel
+from vllm.config import CompilationLevel
 from vllm.utils import cuda_device_count_stateless
 
 from ..utils import compare_all_settings

tests/compile/test_full_graph.py
Lines changed: 1 addition & 1 deletion

@@ -1,6 +1,6 @@
 import pytest
 
-from vllm.compilation.levels import CompilationLevel
+from vllm.config import CompilationLevel
 
 from ..utils import fork_new_process_for_each_test
 from .utils import TEST_MODELS, check_full_graph_support

tests/compile/test_fusion.py
Lines changed: 1 addition & 1 deletion

@@ -3,10 +3,10 @@
 from compressed_tensors.quantization import FP8_DTYPE
 
 import vllm.envs as envs
-from vllm.compilation.config import CompilationConfig
 from vllm.compilation.fusion import (FusionPass, find_auto_fn,
                                      find_auto_fn_maybe)
 from vllm.compilation.reshapes import RedundantReshapesPass
+from vllm.config import CompilationConfig
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     apply_fp8_linear)

tests/compile/test_wrapper.py
Lines changed: 3 additions & 1 deletion

@@ -3,6 +3,7 @@
 import torch
 
 from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher
+from vllm.config import CompilationLevel
 
 
 class MyMod(torch.nn.Module):
@@ -18,7 +19,8 @@ class MyWrapper(TorchCompileWrapperWithCustomDispatcher):
     def __init__(self, model):
        self.model = model
        compiled_callable = torch.compile(self.forward, backend="eager")
-        super().__init__(compiled_callable)
+        super().__init__(compiled_callable,
+                         compilation_level=CompilationLevel.DYNAMO_ONCE)
 
     def forward(self, x: torch.Tensor, cache: Optional[torch.Tensor] = None):
         # this is the function to be compiled
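The wrapper's compilation level is now passed explicitly at construction time; the test passes compilation_level=CompilationLevel.DYNAMO_ONCE where it previously passed nothing. A stripped-down sketch of the same subclassing pattern (whether the argument has a default is not visible in this diff):

import torch

from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher
from vllm.config import CompilationLevel


class EagerWrapper(TorchCompileWrapperWithCustomDispatcher):
    # Minimal subclass mirroring MyWrapper from the test above.
    def __init__(self):
        compiled_callable = torch.compile(self.forward, backend="eager")
        super().__init__(compiled_callable,
                         compilation_level=CompilationLevel.DYNAMO_ONCE)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # the function to be compiled
        return x + 1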

tests/compile/utils.py
Lines changed: 1 addition & 1 deletion

@@ -4,7 +4,7 @@
 
 from tests.quantization.utils import is_quant_method_supported
 from vllm import LLM, SamplingParams
-from vllm.compilation.levels import CompilationLevel
+from vllm.config import CompilationLevel
 from vllm.platforms import current_platform
 
 TEST_MODELS = [

tests/model_executor/test_enabled_custom_ops.py
Lines changed: 26 additions & 26 deletions

@@ -3,11 +3,13 @@
 
 import pytest
 
+from vllm.config import CompilationConfig, VllmConfig
 from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.layers.activation import (GeluAndMul,
                                                    ReLUSquaredActivation,
                                                    SiluAndMul)
 from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.plugins import set_current_vllm_config
 
 
 # Registered subclass for test
@@ -51,42 +53,40 @@ class Relu3(ReLUSquaredActivation):
 ])
 def test_enabled_ops(env: str, torch_level: int, ops_enabled: List[int],
                      default_on: bool):
-    os.environ["VLLM_CUSTOM_OPS"] = env
     os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(torch_level)
+    vllm_config = VllmConfig(compilation_config=CompilationConfig(
+        custom_ops=env.split(",")))
+    with set_current_vllm_config(vllm_config):
+        assert CustomOp.default_on() == default_on
 
-    # Reset default_on (computed once):
-    CustomOp.default_on.cache_clear()
+        ops_enabled = [bool(x) for x in ops_enabled]
 
-    assert CustomOp.default_on() == default_on
+        assert RMSNorm(1024).enabled() == ops_enabled[0]
+        assert CustomOp.op_registry["rms_norm"].enabled() == ops_enabled[0]
 
-    ops_enabled = [bool(x) for x in ops_enabled]
+        assert SiluAndMul().enabled() == ops_enabled[1]
+        assert CustomOp.op_registry["silu_and_mul"].enabled() == ops_enabled[1]
 
-    assert RMSNorm(1024).enabled() == ops_enabled[0]
-    assert CustomOp.op_registry["rms_norm"].enabled() == ops_enabled[0]
+        assert GeluAndMul().enabled() == ops_enabled[2]
+        assert CustomOp.op_registry["gelu_and_mul"].enabled() == ops_enabled[2]
 
-    assert SiluAndMul().enabled() == ops_enabled[1]
-    assert CustomOp.op_registry["silu_and_mul"].enabled() == ops_enabled[1]
+        # If registered, subclasses should follow their own name
+        assert Relu3().enabled() == ops_enabled[3]
+        assert CustomOp.op_registry["relu3"].enabled() == ops_enabled[3]
 
-    assert GeluAndMul().enabled() == ops_enabled[2]
-    assert CustomOp.op_registry["gelu_and_mul"].enabled() == ops_enabled[2]
+        # Unregistered subclass
+        class SiluAndMul2(SiluAndMul):
+            pass
 
-    # If registered, subclasses should follow their own name
-    assert Relu3().enabled() == ops_enabled[3]
-    assert CustomOp.op_registry["relu3"].enabled() == ops_enabled[3]
-
-    # Unregistered subclass
-    class SiluAndMul2(SiluAndMul):
-        pass
-
-    # Subclasses should not require registration
-    assert SiluAndMul2().enabled() == SiluAndMul().enabled()
+        # Subclasses should not require registration
+        assert SiluAndMul2().enabled() == SiluAndMul().enabled()
 
 
 @pytest.mark.parametrize(
     "env", ["all,none", "all,+rms_norm,all", "+rms_norm,-rms_norm"])
 def test_enabled_ops_invalid(env: str):
-    os.environ["VLLM_CUSTOM_OPS"] = env
-    CustomOp.default_on.cache_clear()
-
-    with pytest.raises(AssertionError):
-        RMSNorm(1024).enabled()
+    with pytest.raises(Exception):  # noqa
+        vllm_config = VllmConfig(compilation_config=CompilationConfig(
+            custom_ops=env.split(",")))
+        with set_current_vllm_config(vllm_config):
+            RMSNorm(1024).enabled()
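This rewrite shows the other half of the migration: the custom-op list moves from the VLLM_CUSTOM_OPS environment variable (plus the CustomOp.default_on cache) into CompilationConfig.custom_ops, consulted while a config is current. A sketch of the new flow; the token semantics ("all"/"none" defaults with "+op"/"-op" overrides) are implied by the test parameters above rather than spelled out in this diff:

from vllm.config import CompilationConfig, VllmConfig
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.plugins import set_current_vllm_config

# Assumed semantics: "all" turns custom ops on by default, and
# "-rms_norm" force-disables that one op.
vllm_config = VllmConfig(compilation_config=CompilationConfig(
    custom_ops=["all", "-rms_norm"]))
with set_current_vllm_config(vllm_config):
    assert not RMSNorm(1024).enabled()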

tests/tpu/test_compilation.py
Lines changed: 1 addition & 1 deletion

@@ -5,7 +5,7 @@
 
 import depyf
 
-from vllm.compilation.levels import CompilationLevel
+from vllm.config import CompilationLevel
 
 # disable custom dispatcher, let Dynamo takes over
 # all the control

tests/tpu/test_custom_dispatcher.py
Lines changed: 1 addition & 1 deletion

@@ -1,6 +1,6 @@
 import os
 
-from vllm.compilation.levels import CompilationLevel
+from vllm.config import CompilationLevel
 
 from ..utils import compare_two_settings
 