Skip to content

Commit 046aac0

Browse files
cms42CloudRipple
and committed
refactor: update DacVAE Architecture, Configuration, ComponentLoader and revert necessary changes (sgl-project#20)
* refactor: enhance KL divergence method in DiagonalGaussianDistribution for flexible dimension handling and clean up DAC class by removing unused code * refactor: update DacVAE architecture, configuration and its customized loader. * Revert "fix: update adjust_frames parameter to False for improved multi-GPU compatibility" * revert changes in base pipeline configs * revert changes in configs/sample/__init__.py * [Feature] Remove weight norm in DAC * [Fix] Use legacy weight norm, which can be removed * [Fix] remove weight norm at the right place * [Chore] update test script * Revert "[Fix] remove weight norm at the right place" This reverts commit 3a0accbae41650e926c5828025323a12454827a4. * Revert "[Fix] Use legacy weight norm, which can be removed" This reverts commit eb93f20f134888adba4a5124fa1d167b93d180e7. * Revert "[Feature] Remove weight norm in DAC" This reverts commit aaa64abbc25112a706bf3d3604ffeac390a1d8a8. * [Feature] Remove all weight norm from DAC modeling --------- Co-authored-by: CloudRipple <[email protected]>
1 parent 32dbb5d commit 046aac0

8 files changed

Lines changed: 324 additions & 792 deletions

File tree

python/sglang/multimodal_gen/configs/models/vaes/dac.py

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,19 +2,29 @@
22
# SPDX-License-Identifier: Apache-2.0
33

44
from dataclasses import dataclass, field
5+
from typing import List
56

6-
from sglang.multimodal_gen.configs.models.vaes.base import VAEArchConfig, VAEConfig
7+
from sglang.multimodal_gen.configs.models.base import ArchConfig, ModelConfig
78

89

910
@dataclass
10-
class DacVAEArchConfig(VAEArchConfig):
11-
sample_rate: int = 44100
12-
hop_length: int = 2048
11+
class DacVAEArchConfig(ArchConfig):
12+
codebook_dim: int = 8
13+
codebook_size: int = 1024
14+
continuous: bool = True
15+
decoder_dim: int = 2048
16+
decoder_rates: List[int] = field(default_factory=lambda: [8, 5, 4, 3, 2])
17+
encoder_dim: int = 128
18+
encoder_rates: List[int] = field(default_factory=lambda: [2, 3, 4, 5, 8])
19+
hop_length: int = 3840
1320
latent_dim: int = 128
21+
n_codebooks: int = 9
22+
quantizer_dropout: bool = False
23+
sample_rate: int = 48000
1424

1525

1626
@dataclass
17-
class DacVAEConfig(VAEConfig):
18-
arch_config: VAEArchConfig = field(default_factory=DacVAEArchConfig)
19-
load_encoder: bool = False
27+
class DacVAEConfig(ModelConfig):
28+
arch_config: DacVAEArchConfig = field(default_factory=DacVAEArchConfig)
29+
load_encoder: bool = True
2030
load_decoder: bool = True

python/sglang/multimodal_gen/configs/pipeline_configs/base.py

Lines changed: 2 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -536,29 +536,9 @@ def from_kwargs(
536536
)
537537
from sglang.multimodal_gen.registry import get_pipeline_config_classes
538538

539-
pipeline_config_cls = None
540-
541-
# If users explicitly specify a pipeline class name, try to resolve
542-
# pipeline config classes directly without relying on model_index.json.
543-
if pipeline_class_name:
544-
config_classes = get_pipeline_config_classes(pipeline_class_name)
545-
if config_classes is not None:
546-
pipeline_config_cls, _ = config_classes
547-
logger.info(
548-
"Using pipeline_class_name '%s' to resolve PipelineConfig: %s",
549-
pipeline_class_name,
550-
pipeline_config_cls.__name__,
551-
)
552-
else:
553-
logger.warning(
554-
"pipeline_class_name '%s' not found in pipeline config registry; "
555-
"falling back to model auto-detection.",
556-
pipeline_class_name,
557-
)
558-
559539
# If model_path is a safetensors file and pipeline_class_name is specified,
560540
# try to get PipelineConfig from the registry first
561-
if pipeline_config_cls is None and is_safetensors_file and pipeline_class_name:
541+
if is_safetensors_file and pipeline_class_name:
562542
config_classes = get_pipeline_config_classes(pipeline_class_name)
563543
if config_classes is not None:
564544
pipeline_config_cls, _ = config_classes
@@ -582,7 +562,7 @@ def from_kwargs(
582562
f"Available pipelines with config classes: {available_pipelines}"
583563
)
584564
pipeline_config_cls = model_info.pipeline_config_cls
585-
elif pipeline_config_cls is None:
565+
else:
586566
model_info = get_model_info(model_path)
587567
if model_info is None:
588568
raise ValueError(

python/sglang/multimodal_gen/configs/sample/__init__.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
from sglang.multimodal_gen.configs.sample.diffusers_generic import (
44
DiffusersGenericSamplingParams,
55
)
6-
from sglang.multimodal_gen.configs.sample.mova import MovaSamplingParams
76
from sglang.multimodal_gen.configs.sample.sampling_params import SamplingParams
87

9-
__all__ = ["SamplingParams", "DiffusersGenericSamplingParams", "MovaSamplingParams"]
8+
__all__ = ["SamplingParams", "DiffusersGenericSamplingParams"]

python/sglang/multimodal_gen/configs/sample/sampling_params.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ class SamplingParams:
148148
# if True, disallow user params to override subclass-defined protected fields
149149
no_override_protected_fields: bool = False
150150
# whether to adjust num_frames for multi-GPU friendly splitting (default: True)
151-
adjust_frames: bool = False
151+
adjust_frames: bool = True
152152

153153
def _set_output_file_ext(self):
154154
# add extension if needed

python/sglang/multimodal_gen/runtime/loader/component_loader.py

Lines changed: 59 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -704,40 +704,67 @@ def should_offload(
704704
return server_args.vae_cpu_offload
705705

706706
def load_customized(
707-
self, component_model_path: str, server_args: ServerArgs, module_name: str
707+
self,
708+
component_model_path: str,
709+
server_args: ServerArgs,
710+
module_name: str | None = None,
708711
):
709-
from sglang.multimodal_gen.runtime.models.vaes.dac import DAC
710712

711-
server_args.model_paths[module_name] = component_model_path
712-
# Prefer diffusers-style directory if present
713-
config_path = os.path.join(component_model_path, "config.json")
714-
if os.path.isfile(config_path):
715-
audio_vae = DAC.from_pretrained(component_model_path)
716-
return audio_vae.eval()
717-
718-
# Fallback: load from a single checkpoint file
719-
if os.path.isfile(component_model_path):
720-
if component_model_path.endswith(".dac"):
721-
return DAC.load(component_model_path).eval()
722-
state_dict = torch.load(component_model_path, map_location="cpu")
723-
audio_vae = DAC()
724-
audio_vae.load_state_dict(state_dict, strict=False)
725-
return audio_vae.eval()
726-
727-
# Attempt to load any supported file in directory
728-
for candidate in ("model.safetensors", "pytorch_model.bin", "model.pth"):
729-
candidate_path = os.path.join(component_model_path, candidate)
730-
if os.path.isfile(candidate_path):
731-
if candidate_path.endswith(".safetensors"):
732-
state_dict = safetensors_load_file(candidate_path)
733-
else:
734-
state_dict = torch.load(candidate_path, map_location="cpu")
735-
audio_vae = DAC()
736-
audio_vae.load_state_dict(state_dict, strict=False)
737-
return audio_vae.eval()
738-
raise FileNotFoundError(
739-
f"Cannot locate audio VAE weights in {component_model_path}"
740-
)
713+
config = get_diffusers_component_config(model_path=component_model_path)
714+
class_name = config.pop("_class_name", None)
715+
assert (
716+
class_name is not None
717+
), "Model config does not contain a _class_name attribute. Only diffusers format is supported."
718+
719+
module_key = "audio_vae"
720+
if module_name in ("audio_vae",):
721+
module_key = module_name
722+
server_args.model_paths[module_key] = component_model_path
723+
logger.info("HF model config: %s", config)
724+
725+
audio_vae_config = server_args.pipeline_config.audio_vae_config
726+
audio_vae_config.update_model_arch(config)
727+
728+
should_offload = self.should_offload(server_args)
729+
target_device = self.target_device(should_offload)
730+
731+
# Check for auto_map first (custom VAE classes)
732+
auto_map = config.get("auto_map", {})
733+
auto_model_map = auto_map.get("AutoModel")
734+
if auto_model_map:
735+
module_path, cls_name = auto_model_map.rsplit(".", 1)
736+
custom_module_file = os.path.join(component_model_path, f"{module_path}.py")
737+
spec = importlib.util.spec_from_file_location("_custom", custom_module_file)
738+
custom_module = importlib.util.module_from_spec(spec)
739+
spec.loader.exec_module(custom_module)
740+
vae_cls = getattr(custom_module, cls_name)
741+
vae_dtype = PRECISION_TO_TYPE[server_args.pipeline_config.vae_precision]
742+
with set_default_torch_dtype(vae_dtype):
743+
vae = vae_cls.from_pretrained(
744+
component_model_path,
745+
revision=server_args.revision,
746+
trust_remote_code=server_args.trust_remote_code,
747+
)
748+
vae = vae.to(device=target_device, dtype=vae_dtype)
749+
return vae.eval()
750+
751+
# Load from ModelRegistry (standard VAE classes)
752+
with (
753+
set_default_torch_dtype(
754+
PRECISION_TO_TYPE[server_args.pipeline_config.vae_precision]
755+
),
756+
skip_init_modules(),
757+
):
758+
audio_vae_cls, _ = ModelRegistry.resolve_model_cls(class_name)
759+
audio_vae = audio_vae_cls(audio_vae_config).to(target_device)
760+
761+
safetensors_list = _list_safetensors_files(component_model_path)
762+
assert (
763+
len(safetensors_list) == 1
764+
), f"Found {len(safetensors_list)} safetensors files in {component_model_path}"
765+
loaded = safetensors_load_file(safetensors_list[0])
766+
audio_vae.load_state_dict(loaded, strict=False)
767+
return audio_vae.eval()
741768

742769

743770
class MovaDiTLoader(ComponentLoader):

python/sglang/multimodal_gen/runtime/models/vaes/common.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -611,15 +611,17 @@ def sample(self, generator: torch.Generator | None = None) -> torch.Tensor:
611611
return x
612612

613613
def kl(
614-
self, other: Optional["DiagonalGaussianDistribution"] = None
614+
self,
615+
other: Optional["DiagonalGaussianDistribution"] = None,
616+
dims: tuple[int, ...] = (1, 2, 3),
615617
) -> torch.Tensor:
616618
if self.deterministic:
617619
return torch.Tensor([0.0])
618620
else:
619621
if other is None:
620622
return 0.5 * torch.sum(
621623
torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar,
622-
dim=[1, 2, 3],
624+
dim=dims,
623625
)
624626
else:
625627
return 0.5 * torch.sum(
@@ -628,7 +630,7 @@ def kl(
628630
- 1.0
629631
- self.logvar
630632
+ other.logvar,
631-
dim=[1, 2, 3],
633+
dim=dims,
632634
)
633635

634636
def nll(

0 commit comments

Comments (0)