124 changes: 124 additions & 0 deletions notebooks/Data Engineering - Smooth .ipynb
@@ -0,0 +1,124 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Smoothing Dynamic Set"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"from transplant.config import * "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Dans ce notebook nous réalisons une simple fonction smoothing sur le set dynamique. Nous mesurons également le temps d'exécution du script. Pour un rolling mean sur 5 périodes, le script s'éxecute en "
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df = pd.read_csv(PATH_DYNAMIC_CLEAN)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df = df[DYNAMIC_HEADERS]"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"6.654979944229126\n"
]
}
],
"source": [
"import time\n",
"start = time.time()\n",
"melt = pd.melt(df, id_vars = ['id_patient', 'time']).groupby(['id_patient', 'variable']).rolling(5, min_periods =1, on='time').value.mean().reset_index()\n",
"end = time.time()\n",
"print(end - start)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"6.545935153961182\n"
]
}
],
"source": [
"import time\n",
"start = time.time()\n",
"melt.pivot_table(index=['id_patient', 'time'], columns='variable', values='value').reset_index()\n",
"end = time.time()\n",
"print(end - start)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
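As a quick illustration of what the rolling-mean smoothing above does, here is a minimal standalone sketch on a toy DataFrame. The column names `heart_rate` and `spo2` and all values are invented for illustration (the real columns come from `DYNAMIC_HEADERS`), and it uses a per-patient `groupby().transform()` rather than the notebook's melt → rolling → pivot round-trip; both give the same smoothed values when each patient's rows are sorted by `time`.

```python
import pandas as pd

# Toy "dynamic" set: two patients, four time steps, two invented measurements.
df = pd.DataFrame({
    'id_patient': [1, 1, 1, 1, 2, 2, 2, 2],
    'time':       [1, 2, 3, 4, 1, 2, 3, 4],
    'heart_rate': [80, 100, 90, 95, 60, 70, 65, 75],
    'spo2':       [98, 97, 99, 96, 95, 96, 94, 97],
})
value_cols = ['heart_rate', 'spo2']

# Rolling mean over 5 periods computed within each patient, never across patients.
# min_periods=1 keeps the first rows of each patient instead of producing NaN.
smoothed = df.copy()
smoothed[value_cols] = (
    df.sort_values(['id_patient', 'time'])
      .groupby('id_patient')[value_cols]
      .transform(lambda s: s.rolling(5, min_periods=1).mean())
)

print(smoothed)
```

The melt/pivot variant used in the notebook is handy when the measurement columns are not known in advance, since it groups on a generic `variable` column instead of listing them explicitly.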
2 changes: 2 additions & 0 deletions transplant/README.md
@@ -9,6 +9,8 @@ The `Dataset` class gives you access to the static and dynamic data
_Variables_

- **time_offset**: Determines the point (in minutes) at which the dynamic data are truncated. For example, a `time_offset` of 30 means the dynamic data stop 30 minutes before the last measurement recorded by the measuring instruments.
- **smooth**: `True / False`, determines whether the dynamic data should be smoothed.
- **smooth_periods**: `Int`, the number of periods over which the smoothing function is applied. Defaults to 5 periods (see the usage sketch below).
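
A minimal usage sketch of these options, assuming `Dataset` is importable from `transplant.data.dataset` as in this repository (the variable names are illustrative):

```python
from transplant.data.dataset import Dataset

# smooth=True applies a rolling mean over smooth_periods inside get_dynamic().
dataset = Dataset(time_offset=30, smooth=True, smooth_periods=5)
dynamic = dataset.get_dynamic()  # return shape follows the existing get_dynamic() API
```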

_Functions_

20 changes: 19 additions & 1 deletion transplant/data/dataset.py
@@ -1,6 +1,7 @@
import pandas as pd
import numpy as np
import datetime
import logging
from datetime import timedelta
from sklearn.model_selection import train_test_split

@@ -20,8 +21,10 @@ class Dataset:

_random_state = 1

def __init__(self, time_offset=30):
def __init__(self, time_offset=30, smooth=False, smooth_periods=5):
self.time_offset = time_offset
self.smooth = smooth
self.smooth_periods = smooth_periods

def get_static(self):

@@ -67,6 +70,21 @@ def get_dynamic(self):

df = df.groupby('id_patient').apply(self._truncate_datetime)

# Smooth dynamic dataset

if self.smooth:
alert_smooth = "Smoothing dynamic dataset with {} periods" \
.format(self.smooth_periods)
logging.warning(alert_smooth)
melt = pd.melt(df, id_vars=['id_patient', 'time']) \
.groupby(['id_patient', 'variable']) \
.rolling(self.smooth_periods, min_periods=3, on='time') \
.value.mean() \
.reset_index()
df = melt.pivot_table(index=['id_patient', 'time'],
columns='variable',
values='value').reset_index()

# Filter result based on static set

train, test = self.get_static()