Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 22 additions & 1 deletion python-client/giskard/core/model_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,28 @@ def validate_model_execution(model: BaseModel, dataset: Dataset, deterministic:
try:
prediction = model.predict(validation_ds)
except Exception as e:
raise ValueError(error_message) from e
features = model.meta.feature_names if model.meta.feature_names is not None else validation_ds.df.columns
number_of_features = len(features)

# Some models (mostly sklearn) expect a 1-dimensional ndarray or pd.Series as input when they are
# trained with a single feature. Here we try to detect the case where a user defines their prediction
# function using model.predict_proba(df) (which would break) instead of model.predict_proba(df.feature)
one_dimension_case = (
isinstance(e, IndexError)
and "index 1 is out of bounds for axis 0 with size 1" in str(e)
and number_of_features == 1
)

if not one_dimension_case:
raise ValueError(error_message) from e

feature = features[0]
one_dimension_error_message = (
"Your model returned an error when we passed a 'pandas.Dataframe' as input. "
f"Try to use a one-dimensional input like 'df.{feature}' "
"inside your prediction function."
)
raise ValueError(one_dimension_error_message) from e

# testing one entry
validation_size = min(len(dataset), 1)
Expand Down
42 changes: 42 additions & 0 deletions python-client/tests/models/test_function_model.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import numpy as np
import pandas as pd
import pytest

import tests.utils
from giskard import Dataset, Model
Expand All @@ -22,3 +23,44 @@ def test_prediction_function_upload():
)

tests.utils.verify_model_upload(gsk_model, Dataset(df=pd.DataFrame({"x": [1, 2, 3], "y": [1, 0, 1]}), target="y"))


def test_single_feature():
    """Check the error reported when a single-feature model is validated.

    A text pipeline is fitted on the ``sentence`` column alone and wrapped through its
    ``predict_proba`` with ``feature_names=["sentence"]``. Validation is then expected to
    raise with the message that points the user to a one-dimensional input.
    """
    import datasets
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.pipeline import Pipeline

    from giskard.core.model_validation import validate_model

    # Only the first five training rows are loaded.
    reviews = datasets.load_dataset("sst2", split="train[:5]").to_pandas()

    pipeline = Pipeline(
        [
            ("preprocessor", TfidfVectorizer(max_features=10, lowercase=False)),
            ("classifier", RandomForestClassifier(n_estimators=10, n_jobs=-1)),
        ]
    )
    # Fit on the single text column (not on a DataFrame).
    pipeline.fit(reviews.sentence, reviews.label)

    wrapped_dataset = Dataset(
        df=reviews,
        target="label",
        name="review_classification_dataset",
    )
    wrapped_model = Model(
        model=pipeline.predict_proba,
        model_type="classification",
        name="review_classifier",
        classification_labels=pipeline.classes_,
        feature_names=["sentence"],
    )

    expected_message = r"Your model returned an error when we passed a 'pandas.Dataframe' as input.*"
    with pytest.raises(Exception, match=expected_message):
        validate_model(wrapped_model, wrapped_dataset)