6 changes: 0 additions & 6 deletions docs/source/asr/api.rst
@@ -39,12 +39,6 @@ Model Classes
:show-inheritance:
:members: from_asr_config, from_pretrained_models, save_asr_model_to, setup_training_data

.. _confidence-ensembles-api:

.. autoclass:: nemo.collections.asr.models.confidence_ensemble.ConfidenceEnsembleModel
:show-inheritance:
:members: transcribe

.. _asr-api-modules:

Modules
32 changes: 0 additions & 32 deletions docs/source/asr/models.rst
@@ -309,38 +309,6 @@ For the detailed information see:
* :ref:`Text-only dataset <Hybrid-ASR-TTS_model__Text-Only-Data>` preparation
* :ref:`Configs and training <Hybrid-ASR-TTS_model__Config>`


.. _Confidence-Ensembles:

Confidence-based Ensembles
--------------------------

A confidence-based ensemble is a simple way to combine multiple models into a single system by retaining only the
output of the most confident model. Below is a schematic illustration of how such ensembles work.

.. image:: images/conf-ensembles-overview.png
:align: center
:alt: confidence-based ensembles
:scale: 50%

For more details about this model, see the `paper <https://arxiv.org/abs/2306.15824>`_
or read our `tutorial <https://colab.research.google.com/github/NVIDIA/NeMo/blob/stable/tutorials/asr/Confidence_Ensembles.ipynb>`_.

NeMo supports confidence-based ensembles through the
:ref:`nemo.collections.asr.models.confidence_ensemble.ConfidenceEnsembleModel <confidence-ensembles-api>` class.

A typical workflow to create and use an ensemble is as follows:

1. Run the `scripts/confidence_ensembles/build_ensemble.py <https://github.com/NVIDIA/NeMo/blob/main/scripts/confidence_ensembles/build_ensemble.py>`_
script to create an ensemble from existing models. See the documentation inside the script for usage examples
and a description of all supported functionality.
2. The script outputs a checkpoint that combines all the models in the ensemble. It can be used directly to transcribe
speech by calling the ``.transcribe()`` method or by using
`examples/asr/transcribe_speech.py <https://github.com/NVIDIA/NeMo/blob/main/examples/asr/transcribe_speech.py>`_ (see the sketch below).

Note that the ensemble cannot be modified after construction (e.g., it does not support fine-tuning), and only
transcription is supported (e.g., ``.forward()`` is not properly defined).
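
Below is a minimal sketch of transcribing with a saved ensemble checkpoint. It assumes the checkpoint was produced by
``build_ensemble.py``; the checkpoint path and audio file names are placeholders.

.. code-block:: python

    from nemo.collections.asr.models.confidence_ensemble import ConfidenceEnsembleModel

    # load the checkpoint produced by build_ensemble.py (path is a placeholder)
    ensemble = ConfidenceEnsembleModel.restore_from("confidence_ensemble.nemo", map_location="cpu")

    # transcribe audio files; the output of the most confident model is returned
    transcriptions = ensemble.transcribe(
        paths2audio_files=["audio1.wav", "audio2.wav"],
        batch_size=4,
    )
    print(transcriptions)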

.. _Jasper_model:

Jasper
3 changes: 0 additions & 3 deletions docs/source/starthere/tutorials.rst
@@ -152,9 +152,6 @@ Tutorial Overview
* - ASR
- ASR Confidence Estimation
- `ASR Confidence Estimation <https://colab.research.google.com/github/NVIDIA/NeMo/blob/stable/tutorials/asr/ASR_Confidence_Estimation.ipynb>`_
* - ASR
- Confidence-based Ensembles
- `Confidence-based Ensembles <https://colab.research.google.com/github/NVIDIA/NeMo/blob/stable/tutorials/asr/Confidence_Ensembles.ipynb>`_

.. list-table:: **Text-to-Speech (TTS) Tutorials**
:widths: 15 35 50
198 changes: 0 additions & 198 deletions nemo/collections/asr/models/confidence_ensemble.py
@@ -17,33 +17,23 @@
import pickle
import warnings
from dataclasses import dataclass
from typing import Dict, List, Optional, Union

try:
from joblib.numpy_pickle_utils import _read_fileobject as _validate_joblib_file
except ImportError:
from joblib.numpy_pickle_utils import _validate_fileobject_and_memmap as _validate_joblib_file
import numpy as np
import torch
from lightning.pytorch import Trainer
from omegaconf import DictConfig, open_dict
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from nemo.collections.asr.models.asr_model import ASRModel
from nemo.collections.asr.models.hybrid_rnnt_ctc_models import EncDecHybridRNNTCTCModel
from nemo.collections.asr.parts.preprocessing.segment import ChannelSelectorType
from nemo.collections.asr.parts.utils.asr_confidence_utils import (
ConfidenceConfig,
ConfidenceMethodConfig,
get_confidence_aggregation_bank,
get_confidence_measure_bank,
)
from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis
from nemo.core.classes import ModelPT
from nemo.utils import model_utils
from nemo.utils.decorators import deprecated


# frozen is required to allow hashing of this class and use it
Expand Down Expand Up @@ -241,191 +231,3 @@ class SecurityError(Exception):
"""Custom exception for security-related errors."""

pass


@deprecated(version='v2.1.0')
class ConfidenceEnsembleModel(ModelPT):
"""Implementation of the confidence ensemble model.

See https://arxiv.org/abs/2306.15824 for details.

.. note::
Currently this class only supports the `transcribe` method, as it requires
full-utterance confidence scores to operate.
"""

def __init__(
self,
cfg: DictConfig,
trainer: 'Trainer' = None,
):
super().__init__(cfg=cfg, trainer=trainer)

# either we load all models from ``load_models`` cfg parameter
# or all of them are specified in the config as modelX alongside the num_models key
#
# ideally, we'd like to directly store all models in a list, but that
# is not currently supported by the submodule logic
# so to access all the models, we do something like
#
# for model_idx in range(self.num_models):
# model = getattr(self, f"model{model_idx}")

if 'num_models' in self.cfg:
self.num_models = self.cfg.num_models
for idx in range(self.num_models):
cfg_field = f"model{idx}"
model_cfg = self.cfg[cfg_field]
model_class = model_utils.import_class_by_path(model_cfg['target'])
self.register_nemo_submodule(
name=cfg_field,
config_field=cfg_field,
model=model_class(model_cfg, trainer=trainer),
)
else:
self.num_models = len(cfg.load_models)
with open_dict(self.cfg):
self.cfg.num_models = self.num_models
for idx, model in enumerate(cfg.load_models):
cfg_field = f"model{idx}"
if model.endswith(".nemo"):
self.register_nemo_submodule(
name=cfg_field,
config_field=cfg_field,
model=ASRModel.restore_from(model, trainer=trainer, map_location="cpu"),
)
else:
self.register_nemo_submodule(
cfg_field,
config_field=cfg_field,
model=ASRModel.from_pretrained(model, map_location="cpu"),
)

# registering model selection block - this is expected to be a joblib-saved
# pretrained sklearn pipeline containing standardization + logistic regression
# trained to predict "most-confident" model index from the confidence scores of all models
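# for illustration only - a hypothetical sketch of how such a block might be built offline
# (scripts/confidence_ensembles/build_ensemble.py is the actual entry point that produces it):
#   pipe = Pipeline([("scaler", StandardScaler()), ("clf", LogisticRegression())])
#   pipe.fit(confidence_features, best_model_indices)  # one confidence score per model per utterance
#   joblib.dump(pipe, "model_selection_block.pkl")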
model_selection_block_path = self.register_artifact("model_selection_block", cfg.model_selection_block)
try:
self.model_selection_block = safe_joblib_load(model_selection_block_path)
except SecurityError as e:
raise RuntimeError(f"Security error loading model selection block: {str(e)}")
except Exception as e:
raise RuntimeError(f"Error loading model selection block: {str(e)}")

self.confidence_cfg = ConfidenceConfig(**self.cfg.confidence)

# making sure each model has correct temperature setting in the decoder strategy
for model_idx in range(self.num_models):
model = getattr(self, f"model{model_idx}")
# for now we assume users are directly responsible for matching
# decoder type when building ensemble with inference type
# TODO: add automatic checks for errors
if isinstance(model, EncDecHybridRNNTCTCModel):
self.update_decoding_parameters(model.cfg.decoding)
model.change_decoding_strategy(model.cfg.decoding, decoder_type="rnnt")
self.update_decoding_parameters(model.cfg.aux_ctc.decoding)
model.change_decoding_strategy(model.cfg.aux_ctc.decoding, decoder_type="ctc")
else:
self.update_decoding_parameters(model.cfg.decoding)
model.change_decoding_strategy(model.cfg.decoding)

def update_decoding_parameters(self, decoding_cfg: DictConfig):
"""Updating temperature/preserve_alignment parameters of the config."""
with open_dict(decoding_cfg):
decoding_cfg.temperature = self.cfg.temperature
decoding_cfg.preserve_alignments = True

def setup_training_data(self, train_data_config: Union[DictConfig, Dict]):
"""Pass-through to the ensemble models.

Note that training is not actually supported for this class!
"""
for model_idx in range(self.num_models):
getattr(self, f"model{model_idx}").setup_training_data(train_data_config)

def setup_validation_data(self, val_data_config: Union[DictConfig, Dict]):
"""Pass-through to the ensemble models."""
for model_idx in range(self.num_models):
getattr(self, f"model{model_idx}").setup_validation_data(val_data_config)

def change_attention_model(
self, self_attention_model: str = None, att_context_size: List[int] = None, update_config: bool = True
):
"""Pass-through to the ensemble models."""
for model_idx in range(self.num_models):
getattr(self, f"model{model_idx}").change_attention_model(
self_attention_model, att_context_size, update_config
)

def change_decoding_strategy(self, decoding_cfg: Optional[DictConfig] = None, decoder_type: str = None):
"""Pass-through to the ensemble models.

The only change here is that we always require the expected temperature
to be set, as well as ``decoding_cfg.preserve_alignments = True``.
"""
self.update_decoding_parameters(decoding_cfg)
for model_idx in range(self.num_models):
model = getattr(self, f"model{model_idx}")
if isinstance(model, EncDecHybridRNNTCTCModel):
model.change_decoding_strategy(decoding_cfg, decoder_type=decoder_type)
else:
model.change_decoding_strategy(decoding_cfg)

@torch.no_grad()
def transcribe(
self,
paths2audio_files: List[str],
batch_size: int = 4,
return_hypotheses: bool = False,
num_workers: int = 0,
channel_selector: Optional[ChannelSelectorType] = None,
augmentor: DictConfig = None,
verbose: bool = True,
**kwargs, # any other model specific parameters are passed directly
) -> List[str]:
"""Confidence-ensemble transcribe method.

Consists of the following steps:

1. Run all models (TODO: in parallel)
2. Compute confidence for each model
3. Use logistic regression to pick the "most confident" model
4. Return the output of that model
"""
confidences = []
all_transcriptions = []
# always require hypotheses to be returned
# TODO: make sure to return text only if return_hypotheses was False originally
return_hypotheses = True
for model_idx in range(self.num_models):
model = getattr(self, f"model{model_idx}")
transcriptions = model.transcribe(
paths2audio_files=paths2audio_files,
batch_size=batch_size,
return_hypotheses=return_hypotheses,
num_workers=num_workers,
channel_selector=channel_selector,
augmentor=augmentor,
verbose=verbose,
**kwargs,
)
if isinstance(transcriptions, tuple): # transducers return a tuple
transcriptions = transcriptions[0]

model_confidences = []
for transcription in transcriptions:
model_confidences.append(compute_confidence(transcription, self.confidence_cfg))
confidences.append(model_confidences)
all_transcriptions.append(transcriptions)

# transposing with zip(*list)
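# features has shape (num_utterances, num_models): one confidence score per model for each utterance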
features = np.array(list(zip(*confidences)))
model_indices = self.model_selection_block.predict(features)
final_transcriptions = []
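# for each utterance, keep the transcription from the model selected by the classifier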
for transcription_idx in range(len(all_transcriptions[0])):
final_transcriptions.append(all_transcriptions[model_indices[transcription_idx]][transcription_idx])

return final_transcriptions

def list_available_models(self):
return []