Skip to content

Commit 251fe60

Browse files
kevalmorabia97, chtruong814, and AAnoosheh
committed
Bump modelopt to 0.35.0 and remove safe_import("modelopt") in llm collection (#14656)
* Bump modelopt to 0.35.0 and remove safe_import in llm collection Signed-off-by: Keval Morabia <[email protected]> * Update eagle architecture spec setting Signed-off-by: Asha Anoosheh <[email protected]> * Reduce specdec memory usage Signed-off-by: Asha Anoosheh <[email protected]> --------- Signed-off-by: Keval Morabia <[email protected]> Signed-off-by: Asha Anoosheh <[email protected]> Co-authored-by: Charlie Truong <[email protected]> Co-authored-by: Asha Anoosheh <[email protected]> Signed-off-by: Charlie Truong <[email protected]>
1 parent 1b06710 commit 251fe60

13 files changed

Lines changed: 87 additions & 65 deletions

File tree

docker/common/install_dep.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -304,7 +304,7 @@ extra() {
304304
"llama-index==0.10.43" # incompatible with nvidia-pytriton
305305
"ctc_segmentation==1.7.1 ; (platform_machine == 'x86_64' and platform_system != 'Darwin')" # requires numpy<2.0.0 to be installed before
306306
"nemo_run"
307-
"nvidia-modelopt[torch]==0.33.0 ; platform_system != 'Darwin'" # We want a specific version of nvidia-modelopt
307+
"nvidia-modelopt==0.35.0" # We want a specific version of nvidia-modelopt
308308
)
309309
if [[ "${NVIDIA_PYTORCH_VERSION}" != "" ]]; then
310310
DEPS+=(

nemo/collections/llm/api.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@
4242
PruningConfig,
4343
QuantizationConfig,
4444
Quantizer,
45-
prune_gpt_model,
45+
prune_language_model,
4646
save_pruned_model,
4747
set_modelopt_spec_if_exists_in_ckpt,
4848
setup_trainer_and_restore_model_with_modelopt_spec,
@@ -310,6 +310,8 @@ def prune(
310310
num_nodes: int = 1,
311311
tp_size: int = 1,
312312
pp_size: int = 1,
313+
num_layers_in_first_pipeline_stage: int | None = None,
314+
num_layers_in_last_pipeline_stage: int | None = None,
313315
num_train_samples: int = 1024,
314316
data: pl.LightningDataModule | None = None,
315317
tokenizer_path: str | None = None,
@@ -327,6 +329,8 @@ def prune(
327329
tp_size (int): The tensor parallel size.
328330
pp_size (int): The pipeline parallel size.
329331
num_train_samples (int): Number of training samples for importance estimation using forward pass.
332+
num_layers_in_first_pipeline_stage (int): The number of layers in the first pipeline stage.
333+
num_layers_in_last_pipeline_stage (int): The number of layers in the last pipeline stage.
330334
data (pl.LightningDataModule): The data module for forward pass.
331335
Required if not dropping layers.
332336
tokenizer_path (str): Path to the tokenizer if not using model's tokenizer.
@@ -362,6 +366,8 @@ def prune(
362366
model_path=nemo_checkpoint,
363367
tensor_model_parallel_size=tp_size,
364368
pipeline_model_parallel_size=pp_size,
369+
num_layers_in_first_pipeline_stage=num_layers_in_first_pipeline_stage,
370+
num_layers_in_last_pipeline_stage=num_layers_in_last_pipeline_stage,
365371
devices=devices,
366372
num_nodes=num_nodes,
367373
inference_only=True,
@@ -371,7 +377,7 @@ def prune(
371377
trainer_kwargs={"max_steps": steps, "limit_val_batches": steps, "val_check_interval": steps},
372378
model_config_overrides={"sequence_parallel": False},
373379
)
374-
prune_gpt_model(model, pruning_config, data, trainer)
380+
prune_language_model(model, pruning_config, data, trainer)
375381
save_pruned_model(trainer, save_path)
376382

377383
console = Console()

nemo/collections/llm/modelopt/distill/model.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323
from nemo.collections.nlp.modules.common.megatron.utils import average_losses_across_data_parallel_group
2424
from nemo.lightning.megatron_parallel import MaskedTokenLossReduction
2525
from nemo.utils import logging
26-
from nemo.utils.import_utils import safe_import
2726
from nemo.utils.model_utils import unwrap_model
2827

2928
from .utils import adjust_distillation_model_for_mcore, load_distillation_config, teacher_provider
@@ -32,7 +31,7 @@
3231
from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec
3332
from nemo.lightning.pytorch.optim import OptimizerModule
3433

35-
mtd, HAVE_MODELOPT = safe_import("modelopt.torch.distill")
34+
import modelopt.torch.distill as mtd
3635

3736

3837
class _DistillationLossReduction(MaskedTokenLossReduction):
@@ -134,8 +133,6 @@ def __init__(
134133
tokenizer: Tokenizer.
135134
model_transform: Transform to apply to model during setup.
136135
"""
137-
if not HAVE_MODELOPT:
138-
raise RuntimeError("nvidia-modelopt is needed to use DistillationGPTModel")
139136
super().__init__(config, optim, tokenizer, model_transform)
140137
self._teacher_config = teacher_config
141138
self._teacher_ckpt_path = teacher_ckpt_path

nemo/collections/llm/modelopt/distill/utils.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@
2828
from nemo import lightning as nl
2929
from nemo.collections import llm
3030
from nemo.utils import logging
31-
from nemo.utils.import_utils import safe_import, safe_import_from
3231

3332
from .loss import HiddenStateCosineLoss, LogitsAndIntermediatesLossBalancer, LogitsKLLoss, ProjectionLayer
3433

@@ -39,9 +38,8 @@
3938

4039
from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec
4140

42-
mto, HAVE_MODELOPT = safe_import("modelopt.torch.opt")
43-
DistillationModel, _ = safe_import_from("modelopt.torch.distill", "DistillationModel", alt=object)
44-
DistillationLossBalancer, _ = safe_import_from("modelopt.torch.distill", "DistillationLossBalancer", alt=object)
41+
import modelopt.torch.opt as mto
42+
from modelopt.torch.distill import DistillationLossBalancer, DistillationModel
4543

4644

4745
@dataclass
@@ -242,8 +240,6 @@ def get_tensor_shapes_adjust_fn_for_distillation(
242240
Currently only used during non-interleaved pipelining for Distillation.
243241
Concatenates sizes of student and teacher output tensors for inter-process communication.
244242
"""
245-
if not HAVE_MODELOPT:
246-
return None
247243
if (
248244
forward_only
249245
or parallel_state.get_pipeline_model_parallel_world_size() == 1

nemo/collections/llm/modelopt/model_utils.py

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from typing import TYPE_CHECKING, Callable, Optional, Union
1919

2020
import lightning.pytorch as L
21+
import modelopt.torch.opt as mto
2122
import torch
2223
import torch.nn as nn
2324
from lightning.pytorch.plugins.io.wrapper import _WrappingCheckpointIO
@@ -32,8 +33,6 @@
3233
from nemo.utils.import_utils import safe_import
3334
from nemo.utils.model_utils import unwrap_model
3435

35-
mto, HAVE_MODELOPT = safe_import("modelopt.torch.opt")
36-
3736
_, HAVE_TE = safe_import("transformer_engine")
3837
if HAVE_TE:
3938
# These custom modelopt specs are a mix of local MCORE and TE specs.
@@ -214,8 +213,6 @@ def restore_modelopt_state(
214213
path (str): The path to the checkpoint.
215214
trainer (pl.Trainer): The trainer object, in case path not provided.
216215
"""
217-
if not HAVE_MODELOPT:
218-
return
219216
if not path:
220217
if trainer is None:
221218
return
@@ -254,9 +251,6 @@ def save_modelopt_state(model: "MegatronParallel", path: str, checkpoint_io: "Ch
254251
path (str): The path to the checkpoint.
255252
checkpoint_io (CheckpointIO): The checkpoint IO object from MegatronStrategy.
256253
"""
257-
if not HAVE_MODELOPT:
258-
return
259-
260254
# Save ModelOpt state too, if it exists.
261255
core_model = unwrap_model(model)
262256
if not mto.ModeloptStateManager.is_converted(core_model):

nemo/collections/llm/modelopt/prune/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,6 @@
1414

1515
"""Prune utilities for using TensorRT Model Optimizer."""
1616

17-
from .pruner import PruningConfig, prune_gpt_model, save_pruned_model
17+
from .pruner import PruningConfig, prune_language_model, save_pruned_model
1818

19-
__all__ = ["PruningConfig", "prune_gpt_model", "save_pruned_model"]
19+
__all__ = ["PruningConfig", "prune_language_model", "save_pruned_model"]

nemo/collections/llm/modelopt/prune/pruner.py

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from dataclasses import dataclass
1616
from functools import partial
1717

18+
import modelopt.torch.prune as mtp
1819
import pytorch_lightning as pl
1920
from megatron.core import dist_checkpointing
2021

@@ -24,16 +25,15 @@
2425
from nemo.lightning.io.pl import TrainerContext, ckpt_to_weights_subdir
2526
from nemo.utils import logging
2627
from nemo.utils.get_rank import is_global_rank_zero
27-
from nemo.utils.import_utils import safe_import
28-
29-
mtp, HAVE_MODELOPT = safe_import("modelopt.torch.prune")
3028

3129
SUPPORTED_PRUNING_HPARAMS = {
3230
# Width pruning
3331
"ffn_hidden_size",
3432
"hidden_size",
3533
"num_attention_heads",
3634
"num_query_groups",
35+
"mamba_num_heads",
36+
"mamba_head_dim",
3737
# Depth pruning
3838
"num_layers",
3939
}
@@ -50,6 +50,8 @@ class PruningConfig:
5050
Required if `target_num_query_groups` is provided.
5151
target_num_query_groups (int, optional): Target number of query groups for grouped-query attention.
5252
Required if `target_num_attention_heads` is provided.
53+
target_mamba_num_heads (int, optional): Target number of Mamba attention heads.
54+
target_mamba_head_dim (int, optional): Target dimension of Mamba attention heads.
5355
target_num_layers (int, optional): Target number of transformer layers using importance metric.
5456
drop_layers (list[int], optional): List of specific layer indices (1-indexed) to drop from the model.
5557
Cannot be used with other pruning parameters.
@@ -59,6 +61,8 @@ class PruningConfig:
5961
target_hidden_size: int | None = None
6062
target_num_attention_heads: int | None = None
6163
target_num_query_groups: int | None = None
64+
target_mamba_num_heads: int | None = None
65+
target_mamba_head_dim: int | None = None
6266
target_num_layers: int | None = None
6367
drop_layers: list[int] | None = None
6468

@@ -69,19 +73,21 @@ def __post_init__(self):
6973
self.target_hidden_size,
7074
self.target_num_attention_heads,
7175
self.target_num_query_groups,
76+
self.target_mamba_num_heads,
77+
self.target_mamba_head_dim,
7278
self.target_num_layers,
7379
]
7480
if any(p is not None for p in other_params):
7581
raise ValueError("drop_layers cannot be used with other pruning parameters")
7682

7783

78-
def prune_gpt_model(
84+
def prune_language_model(
7985
model: llm.GPTModel,
8086
pruning_config: PruningConfig,
8187
data_module: pl.LightningDataModule | None = None,
8288
trainer: nl.Trainer | None = None,
8389
) -> llm.GPTModel:
84-
"""Prune a GPT model in-place based on the provided pruning configuration.
90+
"""Prune a GPT / Mamba (sub-class of GPT) model in-place based on the provided pruning configuration.
8591
8692
Args:
8793
model (llm.GPTModel): The model to prune.
@@ -94,9 +100,8 @@ def prune_gpt_model(
94100
Returns:
95101
llm.GPTModel: The pruned model.
96102
"""
97-
assert HAVE_MODELOPT, "nvidia-modelopt is required to prune the model."
98103
if pruning_config.drop_layers:
99-
mtp.plugins.drop_mcore_gpt_layers(model, layers_to_drop=pruning_config.drop_layers)
104+
mtp.plugins.drop_mcore_language_model_layers(model, layers_to_drop=pruning_config.drop_layers)
100105
else:
101106
assert data_module is not None, "data_module is required to prune the model."
102107
assert trainer is not None, "trainer is required to prune the model."
@@ -111,7 +116,7 @@ def prune_gpt_model(
111116
}
112117
mtp.prune(
113118
model,
114-
mode="mcore_gpt_minitron",
119+
mode="mcore_minitron",
115120
constraints={"export_config": export_config},
116121
dummy_input=None, # Not used
117122
config={"forward_loop": partial(llm.validate, data=data_module, trainer=trainer, tokenizer="model")},

nemo/collections/llm/modelopt/quantization/quant_cfg_choices.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,7 @@
1414

1515
from typing import Any, Dict
1616

17-
from nemo.utils.import_utils import safe_import
18-
19-
mtq, HAVE_MODELOPT = safe_import("modelopt.torch.quantization")
17+
import modelopt.torch.quantization as mtq
2018

2119

2220
def get_quant_cfg_choices() -> Dict[str, Dict[str, Any]]:
@@ -32,9 +30,6 @@ def get_quant_cfg_choices() -> Dict[str, Dict[str, Any]]:
3230
dict: A dictionary where keys are short names (e.g., "fp8") and values are the
3331
corresponding modelopt quantization configuration objects.
3432
"""
35-
if not HAVE_MODELOPT:
36-
return {}
37-
3833
QUANT_CFG_NAMES = [
3934
("int8", "INT8_DEFAULT_CFG"),
4035
("int8_sq", "INT8_SMOOTHQUANT_CFG"),

nemo/collections/llm/modelopt/quantization/quantizer.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@
2121
from pathlib import Path
2222
from typing import TYPE_CHECKING, Optional, Union
2323

24+
import modelopt.torch.export as mte
25+
import modelopt.torch.opt as mto
26+
import modelopt.torch.quantization as mtq
2427
import torch
2528
from datasets import load_dataset
2629
from megatron.core.inference.common_inference_params import CommonInferenceParams
@@ -37,7 +40,6 @@
3740
from nemo.lightning.io.pl import TrainerContext, ckpt_to_weights_subdir
3841
from nemo.utils import logging
3942
from nemo.utils.get_rank import is_global_rank_zero
40-
from nemo.utils.import_utils import safe_import
4143
from nemo.utils.model_utils import unwrap_model
4244

4345
if TYPE_CHECKING:
@@ -46,10 +48,6 @@
4648
from nemo.lightning import Trainer
4749
from nemo.lightning.megatron_parallel import MegatronParallel
4850

49-
mte, HAVE_MODELOPT_MTE = safe_import("modelopt.torch.export")
50-
mtq, HAVE_MODELOPT_MTQ = safe_import("modelopt.torch.quantization")
51-
mto, HAVE_MODELOPT_MTO = safe_import("modelopt.torch.opt")
52-
HAVE_MODELOPT = HAVE_MODELOPT_MTQ and HAVE_MODELOPT_MTE and HAVE_MODELOPT_MTO
5351

5452
QUANT_CFG_CHOICES = get_quant_cfg_choices()
5553
SUPPORTED_DTYPE = [16, "16", "bf16"] # Default precision for non-quantized layers
@@ -121,8 +119,6 @@ class Quantizer:
121119

122120
def __init__(self, quantization_config: QuantizationConfig, export_config: ExportConfig):
123121
"""Initialize Quantizer with quantization and export configurations."""
124-
if not HAVE_MODELOPT:
125-
raise RuntimeError("nvidia-modelopt is needed to use Quantizer")
126122
if not torch.cuda.is_available():
127123
raise EnvironmentError("GPU is required for the quantization.")
128124

nemo/collections/llm/modelopt/speculative/model_transform.py

Lines changed: 21 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -12,28 +12,22 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15+
import modelopt.torch.opt as mto
16+
import modelopt.torch.speculative as mtsp
1517
import torch.nn as nn
1618

1719
from nemo.collections.llm import GPTModel
1820
from nemo.utils import logging
19-
from nemo.utils.import_utils import UnavailableError, safe_import
2021
from nemo.utils.model_utils import unwrap_model
2122

22-
mto, HAVE_MODELOPT = safe_import("modelopt.torch.opt")
23-
mtsp, _ = safe_import("modelopt.torch.speculative")
24-
25-
try:
26-
ALGORITHMS = {
27-
"eagle3": mtsp.EAGLE3_DEFAULT_CFG,
28-
# more TBD
29-
}
30-
except UnavailableError:
31-
ALGORITHMS = {}
23+
ALGORITHMS = {
24+
"eagle3": mtsp.EAGLE3_DEFAULT_CFG,
25+
# more TBD
26+
}
3227

3328

3429
def apply_speculative_decoding(model: nn.Module, algorithm: str = "eagle3") -> nn.Module:
35-
"""
36-
Transform a model to enable Speculative Decoding using Model Optimizer.
30+
"""Transform a model to enable Speculative Decoding using Model Optimizer.
3731
3832
Args:
3933
model: The model to transform.
@@ -43,9 +37,6 @@ def apply_speculative_decoding(model: nn.Module, algorithm: str = "eagle3") -> n
4337
Returns:
4438
The transformed model.
4539
"""
46-
if not HAVE_MODELOPT:
47-
raise ImportError("nvidia-modelopt is required to use Speculative Decoding")
48-
4940
assert algorithm in ALGORITHMS, f"Invalid algorithm: {algorithm}. Choices: {ALGORITHMS.keys()}"
5041
mode_cfg = ALGORITHMS[algorithm]
5142
mode, cfg = mode_cfg["algorithm"], mode_cfg["config"]
@@ -63,16 +54,26 @@ def apply_speculative_decoding(model: nn.Module, algorithm: str = "eagle3") -> n
6354
if unwrapped_model.config.virtual_pipeline_model_parallel_size is not None:
6455
raise ValueError("Speculative decoding is incompatible with virtual pipeline parallelism.")
6556

66-
logging.info(f"Converting to Speculative Decoding model with mode: {mode} and config:\n{cfg}")
57+
# Adjust decoder head architecture
58+
if "eagle_architecture_config" in cfg:
59+
# These ones are necessary
60+
cfg["eagle_architecture_config"]["hidden_size"] = unwrapped_model.config.hidden_size
61+
cfg["eagle_architecture_config"]["vocab_size"] = unwrapped_model.vocab_size
62+
cfg["eagle_architecture_config"]["draft_vocab_size"] = unwrapped_model.vocab_size
63+
# These ones are optional but we copy base model's to scale memory usage reasonably
64+
cfg["eagle_architecture_config"]["intermediate_size"] = unwrapped_model.config.ffn_hidden_size
65+
cfg["eagle_architecture_config"]["num_attention_heads"] = unwrapped_model.config.num_attention_heads
66+
cfg["eagle_architecture_config"]["num_key_value_heads"] = unwrapped_model.config.num_query_groups
67+
68+
# Convert
69+
logging.info(f"Converting to Speculative Decoding model with mode: '{mode}' and config:\n{cfg}")
6770
mtsp.convert(unwrapped_model, [(mode, cfg)]) # assumes in-place
6871

6972
return model
7073

7174

7275
def _has_same_speculative_decoding_state(model: nn.Module, mode: str) -> bool:
73-
"""
74-
Check if the model has the same Speculative Decoding state as the incoming algorithm mode.
75-
"""
76+
"""Check if the model has the same Speculative Decoding state as the incoming algorithm mode."""
7677
from modelopt.torch.opt.mode import _ModeRegistryCls
7778

7879
mode_registry = _ModeRegistryCls.get_registry_by_name("speculative")

0 commit comments

Comments (0)