6 changes: 0 additions & 6 deletions docs/source/asr/api.rst
@@ -39,12 +39,6 @@ Model Classes
:show-inheritance:
:members: from_asr_config, from_pretrained_models, save_asr_model_to, setup_training_data

.. _confidence-ensembles-api:

.. autoclass:: nemo.collections.asr.models.confidence_ensemble.ConfidenceEnsembleModel
:show-inheritance:
:members: transcribe

.. _asr-api-modules:

Modules
32 changes: 0 additions & 32 deletions docs/source/asr/models.rst
@@ -309,38 +309,6 @@ For the detailed information see:
* :ref:`Text-only dataset <Hybrid-ASR-TTS_model__Text-Only-Data>` preparation
* :ref:`Configs and training <Hybrid-ASR-TTS_model__Config>`


.. _Confidence-Ensembles:

Confidence-based Ensembles
--------------------------

A confidence-based ensemble is a simple way to combine multiple models into a single system by retaining only the
output of the most confident model. Below is a schematic illustration of how such ensembles work.

.. image:: images/conf-ensembles-overview.png
:align: center
:alt: confidence-based ensembles
:scale: 50%

For more details about this model, see the `paper <https://arxiv.org/abs/2306.15824>`_
or read our `tutorial <https://colab.research.google.com/github/NVIDIA/NeMo/blob/stable/tutorials/asr/Confidence_Ensembles.ipynb>`_.

NeMo supports confidence-based ensembles through the
:ref:`nemo.collections.asr.models.confidence_ensemble.ConfidenceEnsembleModel <confidence-ensembles-api>` class.

A typical workflow to create and use an ensemble is as follows:

1. Run the `scripts/confidence_ensembles/build_ensemble.py <https://github.com/NVIDIA/NeMo/blob/main/scripts/confidence_ensembles/build_ensemble.py>`_
script to create an ensemble from existing models. See the documentation inside the script for usage examples
and a description of all supported functionality.
2. The script outputs a checkpoint that combines all the models in the ensemble. It can be used directly to transcribe
speech by calling the ``.transcribe()`` method or by using
`examples/asr/transcribe_speech.py <https://github.com/NVIDIA/NeMo/blob/main/examples/asr/transcribe_speech.py>`_ (see the sketch below).

Note that the ensemble cannot be modified after construction (e.g., it does not support fine-tuning), and only
transcription is supported (e.g., ``.forward()`` is not properly defined).
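
Below is a minimal sketch of transcribing with a saved ensemble checkpoint. It assumes the checkpoint was produced by
``build_ensemble.py``; the checkpoint path and audio file names are placeholders.

.. code-block:: python

    from nemo.collections.asr.models.confidence_ensemble import ConfidenceEnsembleModel

    # load the checkpoint produced by build_ensemble.py (path is a placeholder)
    ensemble = ConfidenceEnsembleModel.restore_from("confidence_ensemble.nemo", map_location="cpu")

    # transcribe audio files; the output of the most confident model is returned
    transcriptions = ensemble.transcribe(
        paths2audio_files=["audio1.wav", "audio2.wav"],
        batch_size=4,
    )
    print(transcriptions)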

.. _Jasper_model:

Jasper
3 changes: 0 additions & 3 deletions docs/source/starthere/tutorials.rst
@@ -152,9 +152,6 @@ Tutorial Overview
* - ASR
- ASR Confidence Estimation
- `ASR Confidence Estimation <https://colab.research.google.com/github/NVIDIA/NeMo/blob/stable/tutorials/asr/ASR_Confidence_Estimation.ipynb>`_
* - ASR
- Confidence-based Ensembles
- `Confidence-based Ensembles <https://colab.research.google.com/github/NVIDIA/NeMo/blob/stable/tutorials/asr/Confidence_Ensembles.ipynb>`_

.. list-table:: **Text-to-Speech (TTS) Tutorials**
:widths: 15 35 50
198 changes: 0 additions & 198 deletions nemo/collections/asr/models/confidence_ensemble.py
@@ -17,33 +17,23 @@
import pickle
import warnings
from dataclasses import dataclass
from typing import Dict, List, Optional, Union

try:
from joblib.numpy_pickle_utils import _read_fileobject as _validate_joblib_file
except ImportError:
from joblib.numpy_pickle_utils import _validate_fileobject_and_memmap as _validate_joblib_file
import numpy as np
import torch
from lightning.pytorch import Trainer
from omegaconf import DictConfig, open_dict
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from nemo.collections.asr.models.asr_model import ASRModel
from nemo.collections.asr.models.hybrid_rnnt_ctc_models import EncDecHybridRNNTCTCModel
from nemo.collections.asr.parts.preprocessing.segment import ChannelSelectorType
from nemo.collections.asr.parts.utils.asr_confidence_utils import (
ConfidenceConfig,
ConfidenceMethodConfig,
get_confidence_aggregation_bank,
get_confidence_measure_bank,
)
from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis
from nemo.core.classes import ModelPT
from nemo.utils import model_utils
from nemo.utils.decorators import deprecated


# frozen is required to allow hashing of this class and use it
Expand Down Expand Up @@ -241,191 +231,3 @@ class SecurityError(Exception):
"""Custom exception for security-related errors."""

pass


@deprecated(version='v2.1.0')
class ConfidenceEnsembleModel(ModelPT):
"""Implementation of the confidence ensemble model.

See https://arxiv.org/abs/2306.15824 for details.

.. note::
Currently this class only supports the `transcribe` method, as it requires
full-utterance confidence scores to operate.
"""

def __init__(
self,
cfg: DictConfig,
trainer: 'Trainer' = None,
):
super().__init__(cfg=cfg, trainer=trainer)

# either we load all models from ``load_models`` cfg parameter
# or all of them are specified in the config as modelX alongside the num_models key
#
# ideally, we'd like to directly store all models in a list, but that
# is not currently supported by the submodule logic
# so to access all the models, we do something like
#
# for model_idx in range(self.num_models):
# model = getattr(self, f"model{model_idx}")

if 'num_models' in self.cfg:
self.num_models = self.cfg.num_models
for idx in range(self.num_models):
cfg_field = f"model{idx}"
model_cfg = self.cfg[cfg_field]
model_class = model_utils.import_class_by_path(model_cfg['target'])
self.register_nemo_submodule(
name=cfg_field,
config_field=cfg_field,
model=model_class(model_cfg, trainer=trainer),
)
else:
self.num_models = len(cfg.load_models)
with open_dict(self.cfg):
self.cfg.num_models = self.num_models
for idx, model in enumerate(cfg.load_models):
cfg_field = f"model{idx}"
if model.endswith(".nemo"):
self.register_nemo_submodule(
name=cfg_field,
config_field=cfg_field,
model=ASRModel.restore_from(model, trainer=trainer, map_location="cpu"),
)
else:
self.register_nemo_submodule(
cfg_field,
config_field=cfg_field,
model=ASRModel.from_pretrained(model, map_location="cpu"),
)

# registering model selection block - this is expected to be a joblib-saved
# pretrained sklearn pipeline containing standardization + logistic regression
# trained to predict "most-confident" model index from the confidence scores of all models
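# for illustration only - a hypothetical sketch of how such a block might be built offline
# (scripts/confidence_ensembles/build_ensemble.py is the actual entry point that produces it):
#   pipe = Pipeline([("scaler", StandardScaler()), ("clf", LogisticRegression())])
#   pipe.fit(confidence_features, best_model_indices)  # one confidence score per model per utterance
#   joblib.dump(pipe, "model_selection_block.pkl")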
model_selection_block_path = self.register_artifact("model_selection_block", cfg.model_selection_block)
try:
self.model_selection_block = safe_joblib_load(model_selection_block_path)
except SecurityError as e:
raise RuntimeError(f"Security error loading model selection block: {str(e)}")
except Exception as e:
raise RuntimeError(f"Error loading model selection block: {str(e)}")

self.confidence_cfg = ConfidenceConfig(**self.cfg.confidence)

# making sure each model has correct temperature setting in the decoder strategy
for model_idx in range(self.num_models):
model = getattr(self, f"model{model_idx}")
# for now we assume users are directly responsible for matching
# decoder type when building ensemble with inference type
# TODO: add automatic checks for errors
if isinstance(model, EncDecHybridRNNTCTCModel):
self.update_decoding_parameters(model.cfg.decoding)
model.change_decoding_strategy(model.cfg.decoding, decoder_type="rnnt")
self.update_decoding_parameters(model.cfg.aux_ctc.decoding)
model.change_decoding_strategy(model.cfg.aux_ctc.decoding, decoder_type="ctc")
else:
self.update_decoding_parameters(model.cfg.decoding)
model.change_decoding_strategy(model.cfg.decoding)

def update_decoding_parameters(self, decoding_cfg: DictConfig):
"""Updating temperature/preserve_alignment parameters of the config."""
with open_dict(decoding_cfg):
decoding_cfg.temperature = self.cfg.temperature
decoding_cfg.preserve_alignments = True

def setup_training_data(self, train_data_config: Union[DictConfig, Dict]):
"""Pass-through to the ensemble models.

Note that training is not actually supported for this class!
"""
for model_idx in range(self.num_models):
getattr(self, f"model{model_idx}").setup_training_data(train_data_config)

def setup_validation_data(self, val_data_config: Union[DictConfig, Dict]):
"""Pass-through to the ensemble models."""
for model_idx in range(self.num_models):
getattr(self, f"model{model_idx}").setup_validation_data(val_data_config)

def change_attention_model(
self, self_attention_model: str = None, att_context_size: List[int] = None, update_config: bool = True
):
"""Pass-through to the ensemble models."""
for model_idx in range(self.num_models):
getattr(self, f"model{model_idx}").change_attention_model(
self_attention_model, att_context_size, update_config
)

def change_decoding_strategy(self, decoding_cfg: Optional[DictConfig] = None, decoder_type: str = None):
"""Pass-through to the ensemble models.

The only change here is that we always require the expected temperature
to be set, as well as ``decoding_cfg.preserve_alignments = True``.
"""
self.update_decoding_parameters(decoding_cfg)
for model_idx in range(self.num_models):
model = getattr(self, f"model{model_idx}")
if isinstance(model, EncDecHybridRNNTCTCModel):
model.change_decoding_strategy(decoding_cfg, decoder_type=decoder_type)
else:
model.change_decoding_strategy(decoding_cfg)

@torch.no_grad()
def transcribe(
self,
paths2audio_files: List[str],
batch_size: int = 4,
return_hypotheses: bool = False,
num_workers: int = 0,
channel_selector: Optional[ChannelSelectorType] = None,
augmentor: DictConfig = None,
verbose: bool = True,
**kwargs, # any other model specific parameters are passed directly
) -> List[str]:
"""Confidence-ensemble transcribe method.

Consists of the following steps:

1. Run all models (TODO: in parallel)
2. Compute confidence for each model
3. Use logistic regression to pick the "most confident" model
4. Return the output of that model
"""
confidences = []
all_transcriptions = []
# always require hypotheses to be returned
# TODO: make sure to return text only if return_hypotheses was False originally
return_hypotheses = True
for model_idx in range(self.num_models):
model = getattr(self, f"model{model_idx}")
transcriptions = model.transcribe(
paths2audio_files=paths2audio_files,
batch_size=batch_size,
return_hypotheses=return_hypotheses,
num_workers=num_workers,
channel_selector=channel_selector,
augmentor=augmentor,
verbose=verbose,
**kwargs,
)
if isinstance(transcriptions, tuple): # transducers return a tuple
transcriptions = transcriptions[0]

model_confidences = []
for transcription in transcriptions:
model_confidences.append(compute_confidence(transcription, self.confidence_cfg))
confidences.append(model_confidences)
all_transcriptions.append(transcriptions)

# transposing with zip(*list)
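# features has shape (num_utterances, num_models): one confidence score per model for each utterance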
features = np.array(list(zip(*confidences)))
model_indices = self.model_selection_block.predict(features)
final_transcriptions = []
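# for each utterance, keep the transcription from the model selected by the classifier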
for transcription_idx in range(len(all_transcriptions[0])):
final_transcriptions.append(all_transcriptions[model_indices[transcription_idx]][transcription_idx])

return final_transcriptions

def list_available_models(self):
return []