124 changes: 124 additions & 0 deletions notebooks/Data Engineering - Smooth .ipynb
@@ -0,0 +1,124 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Smoothing Dynamic Set"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"from transplant.config import * "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Dans ce notebook nous réalisons une simple fonction smoothing sur le set dynamique. Nous mesurons également le temps d'exécution du script. Pour un rolling mean sur 5 périodes, le script s'éxecute en "
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df = pd.read_csv(PATH_DYNAMIC_CLEAN)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df = df[DYNAMIC_HEADERS]"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"6.654979944229126\n"
]
}
],
"source": [
"import time\n",
"start = time.time()\n",
"melt = pd.melt(df, id_vars = ['id_patient', 'time']).groupby(['id_patient', 'variable']).rolling(5, min_periods =1, on='time').value.mean().reset_index()\n",
"end = time.time()\n",
"print(end - start)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"6.545935153961182\n"
]
}
],
"source": [
"import time\n",
"start = time.time()\n",
"melt.pivot_table(index=['id_patient', 'time'], columns='variable', values='value').reset_index()\n",
"end = time.time()\n",
"print(end - start)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
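As a quick illustration of what the rolling-mean smoothing above does, here is a minimal standalone sketch on a toy DataFrame. The column names `heart_rate` and `spo2` and all values are invented for illustration (the real columns come from `DYNAMIC_HEADERS`), and it uses a per-patient `groupby().transform()` rather than the notebook's melt → rolling → pivot round-trip; both give the same smoothed values when each patient's rows are sorted by `time`.

```python
import pandas as pd

# Toy "dynamic" set: two patients, four time steps, two invented measurements.
df = pd.DataFrame({
    'id_patient': [1, 1, 1, 1, 2, 2, 2, 2],
    'time':       [1, 2, 3, 4, 1, 2, 3, 4],
    'heart_rate': [80, 100, 90, 95, 60, 70, 65, 75],
    'spo2':       [98, 97, 99, 96, 95, 96, 94, 97],
})
value_cols = ['heart_rate', 'spo2']

# Rolling mean over 5 periods computed within each patient, never across patients.
# min_periods=1 keeps the first rows of each patient instead of producing NaN.
smoothed = df.copy()
smoothed[value_cols] = (
    df.sort_values(['id_patient', 'time'])
      .groupby('id_patient')[value_cols]
      .transform(lambda s: s.rolling(5, min_periods=1).mean())
)

print(smoothed)
```

The melt/pivot variant used in the notebook is handy when the measurement columns are not known in advance, since it groups on a generic `variable` column instead of listing them explicitly.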
2 changes: 2 additions & 0 deletions transplant/README.md
@@ -9,6 +9,8 @@ The `Dataset` class gives you access to the static and dynamic data
_Variables_

- **time_offset**: Determines the point (in minutes) at which the dynamic data are truncated. For example, a `time_offset` of 30 means the dynamic data stop 30 minutes before the last measurement recorded by the measuring instruments.
- **smooth**: `True / False`, determines whether the dynamic data should be smoothed.
- **smooth_periods**: `Int`, the number of periods over which the smoothing function is applied. Defaults to 5 periods (see the usage sketch below).
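
A minimal usage sketch of these options, assuming `Dataset` is importable from `transplant.data.dataset` as in this repository (the variable names are illustrative):

```python
from transplant.data.dataset import Dataset

# smooth=True applies a rolling mean over smooth_periods inside get_dynamic().
dataset = Dataset(time_offset=30, smooth=True, smooth_periods=5)
dynamic = dataset.get_dynamic()  # return shape follows the existing get_dynamic() API
```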

_Functions_

20 changes: 19 additions & 1 deletion transplant/data/dataset.py
@@ -1,6 +1,7 @@
import pandas as pd
import numpy as np
import datetime
import logging
from datetime import timedelta
from sklearn.model_selection import train_test_split

@@ -20,8 +21,10 @@ class Dataset:

_random_state = 1

def __init__(self, time_offset=30):
def __init__(self, time_offset=30, smooth=False, smooth_periods=5):
self.time_offset = time_offset
self.smooth = smooth
self.smooth_periods = smooth_periods

def get_static(self):

@@ -67,6 +70,21 @@ def get_dynamic(self):

df = df.groupby('id_patient').apply(self._truncate_datetime)

# Smooth dynamic dataset

if self.smooth:
alert_smooth = "Smoothing dynamic dataset with {} periods" \
.format(self.smooth_periods)
logging.warning(alert_smooth)
melt = pd.melt(df, id_vars=['id_patient', 'time']) \
.groupby(['id_patient', 'variable']) \
.rolling(self.smooth_periods, min_periods=3, on='time') \
.value.mean() \
.reset_index()
df = melt.pivot_table(index=['id_patient', 'time'],
columns='variable',
values='value').reset_index()

# Filter result based on static set

train, test = self.get_static()