Giskard-AI · andreybavt · Jun 21, 2023 · Jun 19, 2023
diff --git a/python-client/giskard/demo/__init__.py b/python-client/giskard/demo/__init__.py
@@ -1,100 +1,28 @@
-import os
-
-import numpy as np
-import pandas as pd
-from sklearn import model_selection
-from sklearn.compose import ColumnTransformer
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.impute import SimpleImputer
-from sklearn.linear_model import LinearRegression
-from sklearn.linear_model import LogisticRegression
-from sklearn.pipeline import Pipeline
-from sklearn.preprocessing import OneHotEncoder, StandardScaler
+from . import titanic_classification, linear_regression
 
 
 def titanic_df():
-    df = pd.read_csv(os.path.join(os.path.dirname(__file__), "titanic.csv"))
-    df.drop(["Ticket", "Cabin"], axis=1, inplace=True)
-    _classification_labels = {0: "no", 1: "yes"}
-    df["Survived"] = df["Survived"].apply(lambda x: _classification_labels[x])
-    return df
+    return titanic_classification.get_df()  # titanic.csv frame: Ticket/Cabin dropped, Survived as "no"/"yes"
 
 
 def titanic():
-    df = titanic_df()
-    cat_cols = ["Pclass", "Sex", "SibSp", "Parch", "Embarked"]
-    num_cols = ["PassengerId", "Age", "Fare"]
-    text_cols = ["Name"]
-    target = "Survived"
-
-    # tfidf the text column
-    text_transformer = Pipeline([("tfidf", TfidfVectorizer(lowercase=False, strip_accents=None))])
-
-    # transform and scale the numeric columns
-    num_transformer = Pipeline([("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())])
-
-    # one hot encode the categorical values
-    cat_transormer = Pipeline(
-        [
-            ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
-            ("onehot", OneHotEncoder(handle_unknown="ignore", sparse=False)),
-        ]
-    )
-
-    # Perform preprocessing of the columns with the above pipelines
-    preprocessor = ColumnTransformer(
-        transformers=[
-            ("text", text_transformer, text_cols[0]),
-            ("num", num_transformer, num_cols),
-            ("cat", cat_transormer, cat_cols),
-        ]
-    )
-
-    # Pipeline for the model Logistic Regression
-    clf = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression())])
-
-    Y = df[target]
-    X = df.drop(target, axis=1)
-    X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
-        X, Y, test_size=0.50, random_state=30, stratify=Y
-    )
-
-    clf.fit(X_train, Y_train)
-
-    test_data = pd.concat([X_test, Y_test], axis=1)
-
-    return clf, test_data
+    return titanic_classification.get_model_and_df()  # (fitted Pipeline, held-out test frame)
 
 
 def titanic_pipeline():
-    clf, _ = titanic()
-
-    def preprocessor(df):
-        return clf[0].transform(df)
-
-    return preprocessor, clf[1]
+    return titanic_classification.get_pipeline()  # (preprocessor function, fitted classifier step)
 
 
 def linear_df():
-    df = pd.DataFrame({"x": np.arange(100), "y": np.arange(100)})
-    return df
+    return linear_regression.get_df()  # 100-row frame whose "x" and "y" columns both hold 0..99
 
 
 def linear():
-    df = linear_df()
-
-    reg = LinearRegression()
-    reg.fit(df["x"].to_numpy().reshape(100, 1), df["y"].to_numpy().reshape(100, 1))
-    return reg, df
+    return linear_regression.get_model_and_df()  # (fitted LinearRegression, the frame it was fitted on)
 
 
 def linear_pipeline():
-    reg, _ = linear()
-
-    def preprocessor(df):
-        return df["x"].to_numpy().reshape(len(df["x"]), 1)
-
-    return preprocessor, reg
+    return linear_regression.get_pipeline()  # (preprocessor function, fitted LinearRegression)
 
 
 __all__ = [

diff --git a/python-client/giskard/demo/linear_regression.py b/python-client/giskard/demo/linear_regression.py
@@ -0,0 +1,25 @@
+import numpy as np
+import pandas as pd
+from sklearn.linear_model import LinearRegression
+
+
+def get_df():
+    """Build the demo frame: 100 rows whose "x" and "y" columns both hold 0..99."""
+    return pd.DataFrame({"x": np.arange(100), "y": np.arange(100)})
+
+
+def get_model_and_df():
+    """Fit a LinearRegression of "y" on "x" over get_df() and return (model, frame)."""
+    df = get_df()
+    reg = LinearRegression()
+    reg.fit(df["x"].to_numpy().reshape(-1, 1), df["y"].to_numpy().reshape(-1, 1))  # -1 infers the row count
+    return reg, df
+
+
+def get_pipeline():
+    """Return (preprocessor, model): preprocessor turns a frame's "x" column into an (n, 1) array."""
+
+    def preprocessor(df):
+        return np.reshape(df["x"].to_numpy(), (len(df["x"]), 1))
+
+    return preprocessor, get_model_and_df()[0]
diff --git a/python-client/giskard/demo/titanic_classification.py b/python-client/giskard/demo/titanic_classification.py
@@ -0,0 +1,72 @@
+import os
+import pandas as pd
+from sklearn import model_selection
+from sklearn.compose import ColumnTransformer
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.impute import SimpleImputer
+from sklearn.linear_model import LogisticRegression
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import OneHotEncoder, StandardScaler
+
+
+def get_df():
+    """Load titanic.csv from this package, drop Ticket/Cabin and relabel Survived 0/1 as "no"/"yes"."""
+    labels = {0: "no", 1: "yes"}
+    passengers = pd.read_csv(os.path.join(os.path.dirname(__file__), "titanic.csv"))
+    passengers = passengers.drop(columns=["Ticket", "Cabin"])
+    return passengers.assign(Survived=passengers["Survived"].apply(lambda code: labels[code]))
+
+
+def get_model_and_df():
+    """Train the Titanic demo classifier and return it with the held-out half of the data.
+
+    Returns:
+        (clf, test_data): the fitted preprocessing + LogisticRegression pipeline, and the
+        test-split features with the "Survived" column appended.
+    """
+    df = get_df()
+    target = "Survived"
+    text_col = "Name"
+    num_cols = ["PassengerId", "Age", "Fare"]
+    cat_cols = ["Pclass", "Sex", "SibSp", "Parch", "Embarked"]
+
+    # Name -> TF-IDF features, with lowercasing and accent stripping switched off
+    text_steps = [("tfidf", TfidfVectorizer(lowercase=False, strip_accents=None))]
+
+    # numeric columns: impute the median, then standardise
+    num_steps = [("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
+
+    # categorical columns: impute the constant "missing", then one-hot encode
+    # NOTE(review): newer scikit-learn renames `sparse` to `sparse_output` - confirm the pinned version
+    cat_steps = [
+        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
+        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse=False)),
+    ]
+
+    # each branch only sees its own columns; "Name" is passed as a bare string, not a list
+    column_transformer = ColumnTransformer(
+        transformers=[
+            ("text", Pipeline(text_steps), text_col),
+            ("num", Pipeline(num_steps), num_cols),
+            ("cat", Pipeline(cat_steps), cat_cols),
+        ]
+    )
+    clf = Pipeline(steps=[("preprocessor", column_transformer), ("classifier", LogisticRegression())])
+
+    features, labels = df.drop(target, axis=1), df[target]
+    # 50/50 split, stratified on the label, with a fixed random_state
+    train_x, test_x, train_y, test_y = model_selection.train_test_split(
+        features, labels, test_size=0.50, random_state=30, stratify=labels
+    )
+    clf.fit(train_x, train_y)
+
+    return clf, pd.concat([test_x, test_y], axis=1)
+
+
+def get_pipeline():  # returns (preprocessor function, fitted classifier step)
+    fitted = get_model_and_df()[0]  # the held-out test frame is not needed here
+
+    def preprocessor(df):
+        return fitted[0].transform(df)  # step 0 is the fitted ColumnTransformer
+
+    return preprocessor, fitted[1]  # step 1 is the LogisticRegression