diff --git a/python-client/giskard/core/model_validation.py b/python-client/giskard/core/model_validation.py
index dade7e65f9..02054dc5e6 100644
--- a/python-client/giskard/core/model_validation.py
+++ b/python-client/giskard/core/model_validation.py
@@ -86,7 +86,28 @@ def validate_model_execution(model: BaseModel, dataset: Dataset, deterministic:
     try:
         prediction = model.predict(validation_ds)
     except Exception as e:
-        raise ValueError(error_message) from e
+        features = model.meta.feature_names if model.meta.feature_names is not None else validation_ds.df.columns
+        number_of_features = len(features)
+
+        # Some models (mostly sklearn) expect a 1-dimensional ndarray or pd.Series as input in the case they're
+        # trained with 1 feature. Here we try to detect in case a user defines their prediction function using
+        # model.predict_proba(df) (which would break) instead of model.predict_proba(df.feature)
+        one_dimension_case = (
+            isinstance(e, IndexError)
+            and "index 1 is out of bounds for axis 0 with size 1" in str(e)
+            and number_of_features == 1
+        )
+
+        if not one_dimension_case:
+            raise ValueError(error_message) from e
+
+        feature = features[0]
+        one_dimension_error_message = (
+            "Your model returned an error when we passed a 'pandas.Dataframe' as input. "
+            f"Try to use a one-dimensional input like 'df.{feature}' "
+            "inside your prediction function."
+        )
+        raise ValueError(one_dimension_error_message) from e
 
     # testing one entry
     validation_size = min(len(dataset), 1)
diff --git a/python-client/tests/models/test_function_model.py b/python-client/tests/models/test_function_model.py
index d193a2b71e..f401e239cf 100644
--- a/python-client/tests/models/test_function_model.py
+++ b/python-client/tests/models/test_function_model.py
@@ -1,5 +1,6 @@
 import numpy as np
 import pandas as pd
+import pytest
 
 import tests.utils
 from giskard import Dataset, Model
@@ -22,3 +23,48 @@ def test_prediction_function_upload():
     )
 
     tests.utils.verify_model_upload(gsk_model, Dataset(df=pd.DataFrame({"x": [1, 2, 3], "y": [1, 0, 1]}), target="y"))
+
+
+def test_single_feature():
+    """Validation must report the one-dimensional-input hint for a model declared with a single feature."""
+    import datasets
+    from sklearn.ensemble import RandomForestClassifier
+    from sklearn.feature_extraction.text import TfidfVectorizer
+    from sklearn.pipeline import Pipeline
+
+    # Load training data
+    train_data = datasets.load_dataset("sst2", split="train[:5]").to_pandas()
+
+    preprocessor = TfidfVectorizer(max_features=10, lowercase=False)
+
+    classifier = RandomForestClassifier(n_estimators=10, n_jobs=-1)
+
+    model = Pipeline([("preprocessor", preprocessor), ("classifier", classifier)])
+
+    X = train_data.sentence
+    y = train_data.label
+
+    model.fit(X, y)
+
+    giskard_dataset = Dataset(
+        df=train_data,
+        target="label",
+        name="review_classification_dataset",
+    )
+
+    giskard_model = Model(
+        model=model.predict_proba,
+        model_type="classification",
+        name="review_classifier",
+        classification_labels=model.classes_,
+        feature_names=["sentence"],
+    )
+
+    from giskard.core.model_validation import validate_model
+
+    # The pipeline is fitted on the `sentence` column alone, while `predict_proba` is registered as the
+    # prediction function without selecting that column, so validation is expected to fail.
+    with pytest.raises(Exception) as e:
+        validate_model(giskard_model, giskard_dataset)
+    # Checked after the block: a statement placed after the raising call inside `with` would never run.
+    assert e.match(r"Your model returned an error when we passed a 'pandas.Dataframe' as input.*")