Merged · Changes from 8 commits
28 changes: 28 additions & 0 deletions Dockerfile
@@ -0,0 +1,28 @@
FROM python:3.9.2-buster AS build

COPY ml_project ml_project
WORKDIR /ml_project
RUN pip install --upgrade build
RUN python -m build

FROM python:3.9.2-buster

COPY online_inference/requirements.txt ./requirements.txt
RUN pip install -r requirements.txt

COPY --from=build /ml_project/dist dist
RUN pip install dist/*.whl && rm -rf dist

COPY online_inference/api api

COPY ml_project/models/model.pkl model.pkl
COPY ml_project/models/pipeline.pkl pipeline.pkl
COPY ml_project/models/metadata.pkl metadata.pkl

WORKDIR /

ENV model_path="/model.pkl"
ENV pipeline_path="/pipeline.pkl"
ENV metadata_path="/metadata.pkl"

CMD ["uvicorn", "api.api:app", "--host", "0.0.0.0", "--port", "80"]
24 changes: 24 additions & 0 deletions README.md
@@ -3,3 +3,27 @@
Repository for the MADE course "Machine Learning in Production".

[Profile](https://data.mail.ru/profile/e.polikutin/)

Building the image
-----------------
```shell
python -m heart_disease.models.train_model
docker build . -t ml_project:latest
```

Publishing the image
-----------------
```shell
docker tag ml_project:latest polikutinevgeny/ml_project:latest
docker push polikutinevgeny/ml_project:latest
```

Running the image
-------------
```shell
docker pull polikutinevgeny/ml_project:latest
docker run -p 8000:80 polikutinevgeny/ml_project:latest
```

Exercise the service with the request script:
`python -m online_inference.api.make_request`
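For a manual check without the helper script, something like the sketch below should work, assuming the service exposes a JSON `POST /predict` endpoint that accepts rows with the raw feature columns (the endpoint path and payload shape are assumptions, not confirmed by this diff):

```python
# Manual smoke test; endpoint path and payload schema are assumed, not
# taken from this diff. Feature names match the dataset columns.
import requests

row = {
    "age": 54, "sex": 1, "cp": 0, "trestbps": 130, "chol": 250,
    "fbs": 0, "restecg": 1, "thalach": 150, "exang": 0,
    "oldpeak": 1.0, "slope": 2, "ca": 0, "thal": 2,
}
response = requests.post("http://localhost:8000/predict", json=[row])
response.raise_for_status()
print(response.json())
```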
120 changes: 103 additions & 17 deletions ml_project/tests/conftest.py → conftest.py
@@ -1,16 +1,20 @@
from collections import OrderedDict
from pathlib import Path
from typing import List, Union, Dict, Callable
from typing import List, Union, Dict, Callable, Tuple

import numpy as np
import pandas as pd
import pytest
from numpy.random import Generator, PCG64

from heart_disease.data.make_dataset import read_data
from heart_disease.entities.data_loading_config import DataLoadingConfig
from heart_disease.entities.feature_config import FeatureConfig, RandomProjectionFeaturesConfig, \
StatisticalFeaturesConfig, KMeansFeaturesConfig, PolynomialFeaturesConfig, RawFeaturesConfig
from heart_disease.features.build_features import build_feature_pipeline
from heart_disease.entities.model_config import TrainModelConfig, EvaluateModelConfig, ModelType
from heart_disease.entities.pipeline_config import TrainingConfig
from heart_disease.entities.splitting_config import SplittingConfig
from heart_disease.features.build_features import build_feature_pipeline, extract_raw_features
from heart_disease.models.train_model import train_pipeline


def get_row_generators(rng: Generator) -> Dict[str, Callable]:
@@ -32,17 +36,17 @@ def get_row_generators(rng: Generator) -> Dict[str, Callable]:
}


@pytest.fixture(scope='session')
@pytest.fixture(scope="session")
def categorical_features() -> List[str]:
return ["thal", "ca", "slope", "exang", "restecg", "fbs", "cp", "sex"]


@pytest.fixture(scope='session')
@pytest.fixture(scope="session")
def numerical_features() -> List[str]:
return ["age", "trestbps", "chol", "thalach", "oldpeak"]


@pytest.fixture(scope='session')
@pytest.fixture(scope="session")
def target_column() -> str:
return "target"

@@ -54,31 +58,48 @@ def generate_random_row(row_generators: Dict[str, Callable]) -> Dict[str, Union[
return row


@pytest.fixture(scope='session')
@pytest.fixture(scope="session")
def dataset_filename() -> str:
return "data.csv"


@pytest.fixture(scope='session')
@pytest.fixture(scope="session")
def dataset_size() -> int:
return 200


@pytest.fixture(scope='function')
def dataset_file(tmp_path: Path, dataset_filename: str, dataset_size: int) -> str:
@pytest.fixture(scope="session")
def dataset_file(tmp_path_factory, dataset_filename: str, dataset_size: int) -> str:
path = tmp_path_factory.mktemp("path")
rng = Generator(PCG64(12345))
data = pd.DataFrame.from_records([generate_random_row(get_row_generators(rng)) for _ in range(dataset_size)])
dataset_path = tmp_path / dataset_filename
dataset_path = path / dataset_filename
data.to_csv(dataset_path, index=False)
return str(dataset_path)


@pytest.fixture(scope='function')
@pytest.fixture(scope="session")
def test_dataset_file(tmp_path_factory, dataset_filename: str, dataset_size: int, target_column: str) -> str:
path = tmp_path_factory.mktemp("path")
rng = Generator(PCG64(12345))
data = pd.DataFrame.from_records([generate_random_row(get_row_generators(rng)) for _ in range(dataset_size)])
data.drop(columns=[target_column], inplace=True)
dataset_path = path / dataset_filename
data.to_csv(dataset_path, index=False)
return str(dataset_path)


@pytest.fixture(scope="session")
def dataset(dataset_file: str) -> pd.DataFrame:
return read_data(dataset_file)


@pytest.fixture(scope='function')
@pytest.fixture(scope="session")
def test_dataset(test_dataset_file: str) -> pd.DataFrame:
return read_data(test_dataset_file)


@pytest.fixture(scope="session")
def features(
dataset: pd.DataFrame,
categorical_features: List[str],
@@ -92,16 +113,16 @@ def features(
config = get_feature_config(target_column, categorical_features, n_clusters, numerical_features, polynomial_degree,
projection_features, statistics)
pipeline = build_feature_pipeline(config)
transformed_features = pipeline.fit_transform(dataset)
transformed_features = pipeline.fit_transform(extract_raw_features(dataset, config))
return transformed_features


@pytest.fixture(scope='function')
@pytest.fixture(scope="session")
def target(dataset: pd.DataFrame, target_column: str) -> np.ndarray:
return dataset[target_column].values


@pytest.fixture(scope='session')
@pytest.fixture(scope="session")
def statistics() -> OrderedDict[str, Callable]:
return OrderedDict(sum=np.sum, var=lambda x, **kwargs: np.var(x, ddof=1, **kwargs), median=np.median,
mean=np.mean, std=lambda x, **kwargs: np.std(x, ddof=1, **kwargs), max=np.max, min=np.min)
@@ -132,6 +153,71 @@ def get_feature_config(
return config


@pytest.fixture(scope='session')
@pytest.fixture(scope="session")
def metrics() -> List[str]:
return ["accuracy", "f1", "precision", "recall"]


@pytest.fixture(scope="session")
def metrics_path(tmp_path_factory) -> str:
return str(tmp_path_factory.mktemp("path") / "metrics.yaml")


@pytest.fixture(scope="session")
def model_save_path(tmp_path_factory) -> str:
return str(tmp_path_factory.mktemp("path") / "model.pkl")


@pytest.fixture(scope="session")
def pipeline_save_path(tmp_path_factory) -> str:
return str(tmp_path_factory.mktemp("path") / "pipeline.pkl")


@pytest.fixture(scope="session")
def metadata_path(tmp_path_factory) -> str:
return str(tmp_path_factory.mktemp("path") / "metadata.pkl")


@pytest.fixture(scope="session")
def train_artifacts(
categorical_features: List[str],
dataset_file: str,
metrics: List[str],
numerical_features: List[str],
statistics: OrderedDict[str, Callable],
model_save_path: str,
pipeline_save_path: str,
metrics_path: str,
metadata_path: str,
target_column: str
) -> Tuple[str, str, str, str]:
projection_features = 5
polynomial_degree = 2
n_clusters = 2
feature_config = get_feature_config(target_column, categorical_features, n_clusters, numerical_features,
polynomial_degree,
projection_features, statistics)
config = TrainingConfig(
data_load_config=DataLoadingConfig(
split_config=SplittingConfig(
random_state=42,
val_size=0.2
),
data_path=dataset_file
),
feature_config=feature_config,
model_config=TrainModelConfig(
model=ModelType.random_forest,
random_state=42,
params=dict(n_estimators=55)
),
evaluation_config=EvaluateModelConfig(
metrics=metrics,
metric_file_path=metrics_path
),
pipeline_save_path=pipeline_save_path,
model_save_path=model_save_path,
metadata_save_path=metadata_path
)
train_pipeline(config)
return metrics_path, model_save_path, pipeline_save_path, metadata_path
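A test consuming this fixture only needs to unpack the returned tuple; a minimal hypothetical example (not part of this diff):

```python
import os


def test_train_pipeline_writes_artifacts(train_artifacts):
    # The session-scoped fixture runs train_pipeline once and returns
    # the paths of the metrics, model, pipeline, and metadata it wrote.
    metrics_path, model_path, pipeline_path, metadata_path = train_artifacts
    for path in (metrics_path, model_path, pipeline_path, metadata_path):
        assert os.path.exists(path), f"missing artifact: {path}"
```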
2 changes: 1 addition & 1 deletion ml_project/Makefile
@@ -23,7 +23,7 @@ endif
## Install Python Dependencies
requirements: test_environment
$(PYTHON_INTERPRETER) -m pip install -U pip setuptools wheel
$(PYTHON_INTERPRETER) -m pip install -r requirements.txt
$(PYTHON_INTERPRETER) -m pip install -r dev_requirements.txt

## Build EDA report
eda_report: requirements
Empty file added ml_project/__init__.py
1 change: 1 addition & 0 deletions ml_project/config/experiment_1/train_config.yaml
@@ -1,5 +1,6 @@
model_save_path: models/model_experiment_1.pkl
pipeline_save_path: models/pipeline_experiment_1.pkl
metadata_save_path: models/metadata_experiment_1.pkl
data_load_config:
data_path: data/heart.csv
split_config:
1 change: 1 addition & 0 deletions ml_project/config/experiment_2/train_config.yaml
@@ -1,5 +1,6 @@
model_save_path: models/model_experiment_2.pkl
pipeline_save_path: models/pipeline_experiment_2.pkl
metadata_save_path: models/metadata_experiment_2.pkl
data_load_config:
data_path: data/heart.csv
split_config:
1 change: 1 addition & 0 deletions ml_project/config/train_config.yaml
@@ -1,5 +1,6 @@
model_save_path: models/model.pkl
pipeline_save_path: models/pipeline.pkl
metadata_save_path: models/metadata.pkl
data_load_config:
data_path: data/heart.csv
split_config:
4 changes: 4 additions & 0 deletions ml_project/dev_requirements.txt
@@ -0,0 +1,4 @@
# local package
-e .

-r requirements.txt
1 change: 1 addition & 0 deletions ml_project/heart_disease/entities/pipeline_config.py
@@ -15,6 +15,7 @@ class TrainingConfig:
evaluation_config: EvaluateModelConfig = field(default_factory=lambda: EvaluateModelConfig)
model_save_path: str = omegaconf.MISSING
pipeline_save_path: str = omegaconf.MISSING
metadata_save_path: str = omegaconf.MISSING


@dataclass
23 changes: 17 additions & 6 deletions ml_project/heart_disease/features/build_features.py
@@ -1,5 +1,4 @@
import pickle
from typing import List
from typing import List, Dict

import numpy as np
import pandas as pd
@@ -12,6 +11,7 @@
from sklearn.random_projection import SparseRandomProjection

from heart_disease.entities.feature_config import FeatureConfig
from heart_disease.utils import serialize_object, deserialize_object


class StatisticalFeaturesExtractor(TransformerMixin):
@@ -95,11 +95,22 @@ def extract_target(df: pd.DataFrame, config: FeatureConfig) -> pd.Series:
return target


def extract_raw_features(df: pd.DataFrame, config: FeatureConfig) -> pd.DataFrame:
return df[config.raw_features.numeric_features + config.raw_features.categorical_features]


def serialize_pipeline(pipeline: Pipeline, path: str):
with open(path, "wb") as f:
pickle.dump(pipeline, f)
serialize_object(pipeline, path)


def deserialize_pipeline(path: str) -> Pipeline:
with open(path, "rb") as f:
return pickle.load(f)
return deserialize_object(path)


def serialize_metadata(df: pd.DataFrame, config: FeatureConfig, path: str):
all_features = config.raw_features.numeric_features + config.raw_features.categorical_features
return serialize_object(df[all_features].dtypes.to_dict(), path)


def deserialize_metadata(path: str) -> Dict[str, np.dtype]:
return deserialize_object(path)
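`serialize_object` and `deserialize_object` are not shown in this diff; judging by the pickle-based code they replace here and in `model.py`, they are presumably thin pickle wrappers along these lines (a hypothetical reconstruction of `heart_disease/utils.py`):

```python
# Hypothetical sketch of heart_disease/utils.py; the real module is not
# part of this diff, but every call site suggests a pickle round-trip.
import pickle
from typing import Any


def serialize_object(obj: Any, path: str) -> None:
    with open(path, "wb") as f:
        pickle.dump(obj, f)


def deserialize_object(path: str) -> Any:
    with open(path, "rb") as f:
        return pickle.load(f)
```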
8 changes: 3 additions & 5 deletions ml_project/heart_disease/models/model.py
@@ -1,4 +1,3 @@
import pickle
from typing import Union, Dict, List

import numpy as np
@@ -7,6 +6,7 @@
from sklearn.metrics import get_scorer

from heart_disease.entities.model_config import TrainModelConfig, ModelType
from heart_disease.utils import deserialize_object, serialize_object

Classifier = Union[RandomForestClassifier, ExtraTreesClassifier]

@@ -41,10 +41,8 @@ def save_metrics(metrics: Dict[str, float], path: str):


def serialize_model(model: Classifier, path: str):
with open(path, "wb") as f:
pickle.dump(model, f)
serialize_object(model, path)


def deserialize_model(path: str) -> Classifier:
with open(path, "rb") as f:
return pickle.load(f)
return deserialize_object(path)