Merge branch 'fix-pad-inconsistency-feature-extractor' of https://github.com/nvidia/nemo into fix-pad-inconsistency-feature-extractor

pzelasko · pzelasko · commit c314f436daca · 2025-06-05T19:29:11.000-04:00
diff --git a/nemo/collections/asr/models/confidence_ensemble.py b/nemo/collections/asr/models/confidence_ensemble.py
@@ -19,7 +19,10 @@
 from dataclasses import dataclass
 from typing import Dict, List, Optional, Union
 
-import joblib
+try:
+    from joblib.numpy_pickle_utils import _read_fileobject as _validate_joblib_file
+except ImportError:
+    from joblib.numpy_pickle_utils import _validate_fileobject_and_memmap as _validate_joblib_file
 import numpy as np
 import torch
 from lightning.pytorch import Trainer
@@ -205,13 +208,16 @@ def find_class(self, module, name):
             warnings.simplefilter("ignore")
             # First try to load with our custom unpickler
             try:
-                with open(file_path, 'rb') as f:
-                    unpickler = RestrictedUnpickler(f)
-                    model = unpickler.load()
-            except (pickle.UnpicklingError, AttributeError):
-                # If that fails, try loading with joblib's default loader first
-                # then validate the loaded object
-                model = joblib.load(file_path)
+                with open(file_path, 'rb') as rawf:
+                    with _validate_joblib_file(rawf, file_path, mmap_mode=None) as stream:
+                        if isinstance(stream, tuple):
+                            stream = stream[0]
+
+                        if isinstance(stream, str):
+                            with open(stream, "rb") as f:
+                                model = RestrictedUnpickler(f).load()
+                        else:
+                            model = RestrictedUnpickler(stream).load()
 
                 # Validate the loaded object is a sklearn Pipeline
                 if not isinstance(model, Pipeline):
@@ -222,6 +228,9 @@ def find_class(self, module, name):
                     if not (isinstance(step_obj, (StandardScaler, LogisticRegression))):
                         raise ValueError(f"Unauthorized pipeline step: {type(step_obj)}")
 
+            except (pickle.UnpicklingError, AttributeError) as e:
+                raise SecurityError(f"Failed to safely load model: {e}")
+
         return model
 
     except Exception as e:
diff --git a/requirements/requirements_automodel.txt b/requirements/requirements_automodel.txt
@@ -1,2 +1,2 @@
-bitsandbytes==0.45.3 ; (platform_machine == 'x86_64' and platform_system != 'Darwin')
+bitsandbytes==0.45.5 ; (platform_machine == 'x86_64' and platform_system != 'Darwin')
 # liger-kernel ; (platform_machine == 'x86_64' and platform_system != 'Darwin')
diff --git a/tests/collections/asr/test_confidence_ensembles.py b/tests/collections/asr/test_confidence_ensembles.py
@@ -15,6 +15,9 @@
 import joblib
 import pytest
 from omegaconf import DictConfig, ListConfig
+from sklearn.linear_model import LogisticRegression
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import StandardScaler
 
 from nemo.collections.asr.models import EncDecCTCModel, EncDecHybridRNNTCTCModel, EncDecRNNTModel
 from nemo.collections.asr.models.confidence_ensemble import ConfidenceEnsembleModel
@@ -98,26 +101,34 @@ class TestConfidenceEnsembles:
 
     @pytest.mark.unit
     @pytest.mark.parametrize(
-        "model_class0", [EncDecCTCModel, EncDecRNNTModel, EncDecHybridRNNTCTCModel],
+        "model_class0",
+        [EncDecCTCModel, EncDecRNNTModel, EncDecHybridRNNTCTCModel],
     )
     @pytest.mark.parametrize(
-        "model_class1", [EncDecCTCModel, EncDecRNNTModel, EncDecHybridRNNTCTCModel],
+        "model_class1",
+        [EncDecCTCModel, EncDecRNNTModel, EncDecHybridRNNTCTCModel],
     )
     def test_model_creation_2models(self, tmp_path, model_class0, model_class1):
         """Basic test to check that ensemble of 2 models can be created."""
         model_config0 = get_model_config(model_class0)
         model_config1 = get_model_config(model_class1)
 
         # dummy pickle file for the model selection block
-        joblib.dump({}, tmp_path / 'dummy.pkl')
+        pipe = Pipeline([("scaler", StandardScaler()), ("clf", LogisticRegression())])
+        joblib.dump(pipe, tmp_path / 'dummy.pkl')
 
         # default confidence
         confidence_config = ConfidenceConfig(
             # we keep frame confidences and apply aggregation manually to get full-utterance confidence
             preserve_frame_confidence=True,
             exclude_blank=True,
             aggregation="mean",
-            method_cfg=ConfidenceMethodConfig(name="entropy", entropy_type="renyi", alpha=0.25, entropy_norm="lin",),
+            method_cfg=ConfidenceMethodConfig(
+                name="entropy",
+                entropy_type="renyi",
+                alpha=0.25,
+                entropy_norm="lin",
+            ),
         )
 
         # just checking that no errors are raised when creating the model
@@ -140,15 +151,21 @@ def test_model_creation_5models(self, tmp_path):
         model_configs = [get_model_config(EncDecCTCModel) for _ in range(5)]
 
         # dummy pickle file for the model selection block
-        joblib.dump({}, tmp_path / 'dummy.pkl')
+        pipe = Pipeline([("scaler", StandardScaler()), ("clf", LogisticRegression())])
+        joblib.dump(pipe, tmp_path / 'dummy.pkl')
 
         # default confidence
         confidence_config = ConfidenceConfig(
             # we keep frame confidences and apply aggregation manually to get full-utterance confidence
             preserve_frame_confidence=True,
             exclude_blank=True,
             aggregation="mean",
-            method_cfg=ConfidenceMethodConfig(name="entropy", entropy_type="renyi", alpha=0.25, entropy_norm="lin",),
+            method_cfg=ConfidenceMethodConfig(
+                name="entropy",
+                entropy_type="renyi",
+                alpha=0.25,
+                entropy_norm="lin",
+            ),
         )
 
         # just checking that no errors are raised when creating the model

Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,2 @@`
`1`		`-bitsandbytes==0.45.3 ; (platform_machine == 'x86_64' and platform_system != 'Darwin')`
	`1`	`+bitsandbytes==0.45.5 ; (platform_machine == 'x86_64' and platform_system != 'Darwin')`
`2`	`2`	`# liger-kernel ; (platform_machine == 'x86_64' and platform_system != 'Darwin')`