Giskard-AI · mattbit · Sep 19, 2023 · Sep 15, 2023 · Sep 15, 2023 · Sep 15, 2023
diff --git a/python-client/giskard/core/model_validation.py b/python-client/giskard/core/model_validation.py
@@ -86,7 +86,28 @@ def validate_model_execution(model: BaseModel, dataset: Dataset, deterministic:
     try:
         prediction = model.predict(validation_ds)
     except Exception as e:
-        raise ValueError(error_message) from e
+        features = model.meta.feature_names if model.meta.feature_names is not None else validation_ds.df.columns
+        number_of_features = len(features)
+
+        # Some models (mostly sklearn) expect a 1-dimensional ndarray or pd.Series as input when they are
+        # trained on a single feature. Here we try to detect the case where a user defines their prediction
+        # function using model.predict_proba(df) (which would break) instead of model.predict_proba(df.feature).
+        one_dimension_case = (
+            isinstance(e, IndexError)
+            and "index 1 is out of bounds for axis 0 with size 1" in str(e)
+            and number_of_features == 1
+        )
+
+        if not one_dimension_case:
+            raise ValueError(error_message) from e
+
+        feature = features[0]
+        one_dimension_error_message = (
+            "Your model returned an error when we passed a 'pandas.Dataframe' as input. "
+            f"Try to use a one-dimensional input like 'df.{feature}' "
+            "inside your prediction function."
+        )
+        raise ValueError(one_dimension_error_message) from e
 
     # testing one entry
     validation_size = min(len(dataset), 1)

diff --git a/python-client/tests/models/test_function_model.py b/python-client/tests/models/test_function_model.py
@@ -1,5 +1,6 @@
 import numpy as np
 import pandas as pd
+import pytest
 
 import tests.utils
 from giskard import Dataset, Model
@@ -22,3 +23,44 @@ def test_prediction_function_upload():
     )
 
     tests.utils.verify_model_upload(gsk_model, Dataset(df=pd.DataFrame({"x": [1, 2, 3], "y": [1, 0, 1]}), target="y"))
+
+
+def test_single_feature():
+    """A single-feature sklearn model fed a DataFrame must fail validation with a 1-D input hint."""
+    from sklearn.ensemble import RandomForestClassifier
+    from sklearn.feature_extraction.text import TfidfVectorizer
+    from sklearn.pipeline import Pipeline
+
+    from giskard.core.model_validation import validate_model
+
+    # Inline corpus instead of a downloaded dataset, so the test also runs offline.
+    train_data = pd.DataFrame(
+        {
+            "sentence": ["a charming film", "dull and lifeless", "truly great fun", "a tedious mess", "warm and witty"],
+            "label": [1, 0, 1, 0, 1],
+        }
+    )
+
+    model = Pipeline(
+        [
+            ("preprocessor", TfidfVectorizer(max_features=10, lowercase=False)),
+            ("classifier", RandomForestClassifier(n_estimators=10, n_jobs=-1)),
+        ]
+    )
+    # Fitted on a pd.Series, so the pipeline expects one-dimensional input at prediction time.
+    model.fit(train_data.sentence, train_data.label)
+
+    giskard_dataset = Dataset(df=train_data, target="label", name="review_classification_dataset")
+
+    # The prediction function is deliberately given as-is: validation will call it with a DataFrame.
+    giskard_model = Model(
+        model=model.predict_proba,
+        model_type="classification",
+        name="review_classifier",
+        classification_labels=model.classes_,
+        feature_names=["sentence"],
+    )
+
+    expected = r"Your model returned an error when we passed a 'pandas\.Dataframe' as input"
+    with pytest.raises(ValueError, match=expected):
+        validate_model(giskard_model, giskard_dataset)