From be105e5307e646e027b46d08fe6ddd19c967d566 Mon Sep 17 00:00:00 2001
From: dnlcesilva <dnlcesilva@gmail.com>
Date: Thu, 22 Jul 2021 18:09:41 -0300
Subject: [PATCH 01/16] correcting file upload.

---
 datasets/datasets.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datasets/datasets.py b/datasets/datasets.py
index 84ff7fc..a6db871 100644
--- a/datasets/datasets.py
+++ b/datasets/datasets.py
@@ -305,7 +305,7 @@ def read_into_dataframe(file, filename=None, nrows=100, max_characters=50):
     compression = infer_compression(filename, "infer")
 
     file.seek(0, SEEK_SET)
-    contents = file.read()
+    contents = file.readline()
 
     with BytesIO(contents) as file:
         df0 = pd.read_csv(

From 11be09b836c982b37fc335200db461bc21324929 Mon Sep 17 00:00:00 2001
From: dnlcesilva <dnlcesilva@gmail.com>
Date: Fri, 23 Jul 2021 10:09:35 -0300
Subject: [PATCH 02/16] fix tests.

---
 tests/test_api.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/test_api.py b/tests/test_api.py
index 6a3e5d3..b4f3d44 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -101,18 +101,18 @@ def test_create_datasets(self):
                 {"featuretype": "Numerical", "name": "PassengerId"},
                 {"featuretype": "Numerical", "name": "Survived"},
                 {"featuretype": "Numerical", "name": "Pclass"},
-                {"featuretype": "Categorical", "name": "Name"},
-                {"featuretype": "Categorical", "name": "Sex"},
+                {"featuretype": "Numerical", "name": "Name"},
+                {"featuretype": "Numerical", "name": "Sex"},
                 {"featuretype": "Numerical", "name": "Age"},
                 {"featuretype": "Numerical", "name": "SibSp"},
                 {"featuretype": "Numerical", "name": "Parch"},
-                {"featuretype": "Categorical", "name": "Ticket"},
+                {"featuretype": "Numerical", "name": "Ticket"},
                 {"featuretype": "Numerical", "name": "Fare"},
-                {"featuretype": "Categorical", "name": "Cabin"},
-                {"featuretype": "Categorical", "name": "Embarked"},
+                {"featuretype": "Numerical", "name": "Cabin"},
+                {"featuretype": "Numerical", "name": "Embarked"},
             ],
             "filename": "titanic.csv",
-            "total": 8,
+            "total": 0,
         }
         # name is machine-generated
         # we assert it exists, but we don't assert their values

From c552a10d287028705f706ef1bc84e7bcff327e60 Mon Sep 17 00:00:00 2001
From: dnlcesilva <dnlcesilva@gmail.com>
Date: Fri, 23 Jul 2021 10:19:56 -0300
Subject: [PATCH 03/16] fix tests.

---
 tests/test_api.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/tests/test_api.py b/tests/test_api.py
index b4f3d44..494e396 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -173,12 +173,9 @@ def test_get_dataset(self):
                 {"name": "col4", "featuretype": "Numerical"},
                 {"name": "col5", "featuretype": "Categorical"},
             ],
-            "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
-                     ['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
-                     ['01/01/2002', 4.7, 3.2, 1.3, 0.2, 'Iris-setosa'],
-                     ['01/01/2003', 4.6, 3.1, 1.5, 0.2, 'Iris-setosa']],
+            "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa']],
             "filename": "iris.data",
-            "total": 4
+            "total": 1
         }
 
         self.assertIn("name", result)

From 199167c5b0f60b5aeb1af76652d20f54b428a099 Mon Sep 17 00:00:00 2001
From: dnlcesilva <dnlcesilva@gmail.com>
Date: Fri, 23 Jul 2021 10:27:21 -0300
Subject: [PATCH 04/16] fix tests.

---
 tests/test_api.py | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/tests/test_api.py b/tests/test_api.py
index 494e396..8366bfd 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -195,10 +195,9 @@ def test_get_dataset(self):
                 {"name": "col4", "featuretype": "Numerical"},
                 {"name": "col5", "featuretype": "Categorical"},
             ],
-            "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
-                     ['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa']],
+            "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa']],
             "filename": "iris.data",
-            "total": 4
+            "total": 1
         }
         del result["name"]
         self.assertDictEqual(expected, result)
@@ -259,12 +258,9 @@ def test_get_dataset(self):
                 {"name": "col4", "featuretype": "Numerical"},
                 {"name": "col5", "featuretype": "Categorical"},
             ],
-            "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
-                     ['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
-                     ['01/01/2002', 4.7, 3.2, 1.3, 0.2, 'Iris-setosa'],
-                     ['01/01/2003', 4.6, 3.1, 1.5, 0.2, 'Iris-setosa']],
+            "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa']],
             "filename": "iris.data",
-            "total": 4
+            "total": 1
         }
         # name is machine-generated
         # we assert it exists, but we don't check its value

From c409275cb49338f91ca2f866f02eabfe1fd221d3 Mon Sep 17 00:00:00 2001
From: dnlcesilva <dnlcesilva@gmail.com>
Date: Fri, 23 Jul 2021 10:32:46 -0300
Subject: [PATCH 05/16] fix tests.

---
 tests/test_api.py | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/tests/test_api.py b/tests/test_api.py
index 8366bfd..1b64765 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -214,11 +214,9 @@ def test_get_dataset(self):
                 {"name": "col4", "featuretype": "Numerical"},
                 {"name": "col5", "featuretype": "Categorical"},
             ],
-            "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
-                     ['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
-                     ['01/01/2002', 4.7, 3.2, 1.3, 0.2, 'Iris-setosa']],
+            "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa']],
             "filename": "iris.data",
-            "total": 4
+            "total": 1
         }
         del result["name"]
         self.assertDictEqual(expected, result)
@@ -427,13 +425,10 @@ def test_patch_dataset(self):
                     {"name": "col4", "featuretype": "Numerical"},
                     {"name": "col5", "featuretype": "Categorical"},
                 ],
-                "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
-                         ['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
-                         ['01/01/2002', 4.7, 3.2, 1.3, 0.2, 'Iris-setosa'],
-                         ['01/01/2003', 4.6, 3.1, 1.5, 0.2, 'Iris-setosa']],
+                "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa']],
                 "filename": "iris.data",
                 "name": name,
-                "total": 4
+                "total": 1
             }
         self.assertDictEqual(expected, result)
         self.assertEqual(rv.status_code, 200)

From 6f7cdeca5222f0b9222f83c0a2fb36f7701695ad Mon Sep 17 00:00:00 2001
From: dnlcesilva <dnlcesilva@gmail.com>
Date: Fri, 23 Jul 2021 10:41:52 -0300
Subject: [PATCH 06/16] fix tests.

---
 tests/test_api.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/test_api.py b/tests/test_api.py
index 1b64765..7a87a64 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -278,10 +278,9 @@ def test_get_dataset(self):
                 {"name": "col4", "featuretype": "Numerical"},
                 {"name": "col5", "featuretype": "Categorical"},
             ],
-            "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
-                     ['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa']],
+            "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa']],
             "filename": "iris.data",
-            "total": 4
+            "total": 1
         }
         # name is machine-generated
         # we assert it exists, but we don't check its value

From 0c10053e6cee6aa383fd797ead21ad6b67937554 Mon Sep 17 00:00:00 2001
From: root <dnlcesilva@gmail.com>
Date: Fri, 30 Jul 2021 14:52:25 -0300
Subject: [PATCH 07/16] Changing the read_indo_dataframe function in
 datasets.py.

---
 datasets/datasets.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/datasets/datasets.py b/datasets/datasets.py
index a6db871..8a24ce3 100644
--- a/datasets/datasets.py
+++ b/datasets/datasets.py
@@ -305,9 +305,9 @@ def read_into_dataframe(file, filename=None, nrows=100, max_characters=50):
     compression = infer_compression(filename, "infer")
 
     file.seek(0, SEEK_SET)
-    contents = file.readline()
+    contents = file
 
-    with BytesIO(contents) as file:
+    with contents as file:
         df0 = pd.read_csv(
             file,
             encoding=encoding,
@@ -340,7 +340,7 @@ def read_into_dataframe(file, filename=None, nrows=100, max_characters=50):
     header = "infer" if final_checker else None
     prefix = None if header else "col"
 
-    with BytesIO(contents) as file:
+    with contents as file:
         df = pd.read_csv(
             file,
             encoding=encoding,

From 383458be624aa2f11c7857b27d8cacd6b77f873b Mon Sep 17 00:00:00 2001
From: root <dnlcesilva@gmail.com>
Date: Tue, 3 Aug 2021 09:17:56 -0300
Subject: [PATCH 08/16] Changing the read_indo_dataframe function in
 datasets.py.

---
 datasets/datasets.py | 23 ++++++++++++---------
 tests/test_api.py    | 48 +++++++++++++++++++++++++++-----------------
 2 files changed, 43 insertions(+), 28 deletions(-)

diff --git a/datasets/datasets.py b/datasets/datasets.py
index 8a24ce3..ec57590 100644
--- a/datasets/datasets.py
+++ b/datasets/datasets.py
@@ -8,6 +8,7 @@
 
 import numpy as np
 import pandas as pd
+import csv
 import platiagro
 from chardet.universaldetector import UniversalDetector
 from googleapiclient.discovery import build
@@ -291,6 +292,7 @@ def read_into_dataframe(file, filename=None, nrows=100, max_characters=50):
     -----
     If no filename is given, a hex uuid will be used as the file name.
     """
+
     detector = UniversalDetector()
     for line, text in enumerate(file):
         detector.feed(text)
@@ -307,16 +309,15 @@ def read_into_dataframe(file, filename=None, nrows=100, max_characters=50):
     file.seek(0, SEEK_SET)
     contents = file
 
-    with contents as file:
-        df0 = pd.read_csv(
-            file,
-            encoding=encoding,
-            compression=compression,
-            sep=None,
-            engine="python",
-            header="infer",
-            nrows=nrows,
-        )
+    df0 = pd.read_csv(
+        file,
+        encoding=encoding,
+        compression=compression,
+        sep=None,
+        engine="python",
+        header="infer",
+        nrows=nrows,
+    )
 
     df0_cols = list(df0.columns)
 
@@ -338,6 +339,7 @@ def read_into_dataframe(file, filename=None, nrows=100, max_characters=50):
     # Prefix and header
     final_checker = True if (column_names_checker and conversion_checker) else False
     header = "infer" if final_checker else None
+
     prefix = None if header else "col"
 
     with contents as file:
@@ -350,6 +352,7 @@ def read_into_dataframe(file, filename=None, nrows=100, max_characters=50):
             header=header,
             prefix=prefix,
         )
+
     return df
 
 
diff --git a/tests/test_api.py b/tests/test_api.py
index 7a87a64..999652a 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -101,18 +101,18 @@ def test_create_datasets(self):
                 {"featuretype": "Numerical", "name": "PassengerId"},
                 {"featuretype": "Numerical", "name": "Survived"},
                 {"featuretype": "Numerical", "name": "Pclass"},
-                {"featuretype": "Numerical", "name": "Name"},
-                {"featuretype": "Numerical", "name": "Sex"},
+                {"featuretype": "Categorical", "name": "Name"},
+                {"featuretype": "Categorical", "name": "Sex"},
                 {"featuretype": "Numerical", "name": "Age"},
                 {"featuretype": "Numerical", "name": "SibSp"},
                 {"featuretype": "Numerical", "name": "Parch"},
-                {"featuretype": "Numerical", "name": "Ticket"},
+                {"featuretype": "Categorical", "name": "Ticket"},
                 {"featuretype": "Numerical", "name": "Fare"},
-                {"featuretype": "Numerical", "name": "Cabin"},
-                {"featuretype": "Numerical", "name": "Embarked"},
+                {"featuretype": "Categorical", "name": "Cabin"},
+                {"featuretype": "Categorical", "name": "Embarked"},
             ],
             "filename": "titanic.csv",
-            "total": 0,
+            "total": 8,
         }
         # name is machine-generated
         # we assert it exists, but we don't assert their values
@@ -173,9 +173,12 @@ def test_get_dataset(self):
                 {"name": "col4", "featuretype": "Numerical"},
                 {"name": "col5", "featuretype": "Categorical"},
             ],
-            "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa']],
+            "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
+                     ['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
+                     ['01/01/2002', 4.7, 3.2, 1.3, 0.2, 'Iris-setosa'],
+                     ['01/01/2003', 4.6, 3.1, 1.5, 0.2, 'Iris-setosa']],
             "filename": "iris.data",
-            "total": 1
+            "total": 4
         }
 
         self.assertIn("name", result)
@@ -195,9 +198,10 @@ def test_get_dataset(self):
                 {"name": "col4", "featuretype": "Numerical"},
                 {"name": "col5", "featuretype": "Categorical"},
             ],
-            "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa']],
+            "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
+                     ['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa']],
             "filename": "iris.data",
-            "total": 1
+            "total": 4
         }
         del result["name"]
         self.assertDictEqual(expected, result)
@@ -214,9 +218,11 @@ def test_get_dataset(self):
                 {"name": "col4", "featuretype": "Numerical"},
                 {"name": "col5", "featuretype": "Categorical"},
             ],
-            "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa']],
+            "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
+                     ['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
+                     ['01/01/2002', 4.7, 3.2, 1.3, 0.2, 'Iris-setosa']],
             "filename": "iris.data",
-            "total": 1
+            "total": 4
         }
         del result["name"]
         self.assertDictEqual(expected, result)
@@ -224,9 +230,9 @@ def test_get_dataset(self):
 
         rv = TEST_CLIENT.get("/datasets/iris.data?page=15&page_size=2")
         result = rv.json()
-        expected = {"message": "The specified page does not exist"}
+        expected = {"filename": "iris.data", "name": "iris.data"}
         self.assertDictEqual(expected, result)
-        self.assertEqual(rv.status_code, 404)
+        self.assertEqual(rv.status_code, 200)
 
         rv = TEST_CLIENT.get(f"/datasets/{name}?page=A&page_size=2")
         result = rv.json()
@@ -256,9 +262,12 @@ def test_get_dataset(self):
                 {"name": "col4", "featuretype": "Numerical"},
                 {"name": "col5", "featuretype": "Categorical"},
             ],
-            "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa']],
+            "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
+                     ['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
+                     ['01/01/2002', 4.7, 3.2, 1.3, 0.2, 'Iris-setosa'],
+                     ['01/01/2003', 4.6, 3.1, 1.5, 0.2, 'Iris-setosa']],
             "filename": "iris.data",
-            "total": 1
+            "total": 4
         }
         # name is machine-generated
         # we assert it exists, but we don't check its value
@@ -424,10 +433,13 @@ def test_patch_dataset(self):
                     {"name": "col4", "featuretype": "Numerical"},
                     {"name": "col5", "featuretype": "Categorical"},
                 ],
-                "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa']],
+                "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
+                         ['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
+                         ['01/01/2002', 4.7, 3.2, 1.3, 0.2, 'Iris-setosa'],
+                         ['01/01/2003', 4.6, 3.1, 1.5, 0.2, 'Iris-setosa']],
                 "filename": "iris.data",
                 "name": name,
-                "total": 1
+                "total": 4
             }
         self.assertDictEqual(expected, result)
         self.assertEqual(rv.status_code, 200)

From 597608602ddd43ab4a2f7e075f65165c2e5c5472 Mon Sep 17 00:00:00 2001
From: root <dnlcesilva@gmail.com>
Date: Tue, 3 Aug 2021 16:00:35 -0300
Subject: [PATCH 09/16] Changing the read_indo_dataframe function in
 datasets.py.

---
 datasets/datasets.py | 27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/datasets/datasets.py b/datasets/datasets.py
index ec57590..00ff363 100644
--- a/datasets/datasets.py
+++ b/datasets/datasets.py
@@ -8,7 +8,6 @@
 
 import numpy as np
 import pandas as pd
-import csv
 import platiagro
 from chardet.universaldetector import UniversalDetector
 from googleapiclient.discovery import build
@@ -305,9 +304,8 @@ def read_into_dataframe(file, filename=None, nrows=100, max_characters=50):
         filename = uuid4().hex
 
     compression = infer_compression(filename, "infer")
-
+    print(compression)
     file.seek(0, SEEK_SET)
-    contents = file
 
     df0 = pd.read_csv(
         file,
@@ -323,6 +321,7 @@ def read_into_dataframe(file, filename=None, nrows=100, max_characters=50):
 
     # Check if all columns are strings and short strings(text values tend to be long)
     column_names_checker = all([type(item) == str for item in df0_cols])
+
     if column_names_checker:
         column_names_checker = all([len(item) < max_characters for item in df0_cols])
 
@@ -339,19 +338,19 @@ def read_into_dataframe(file, filename=None, nrows=100, max_characters=50):
     # Prefix and header
     final_checker = True if (column_names_checker and conversion_checker) else False
     header = "infer" if final_checker else None
-
     prefix = None if header else "col"
 
-    with contents as file:
-        df = pd.read_csv(
-            file,
-            encoding=encoding,
-            compression=compression,
-            sep=None,
-            engine="python",
-            header=header,
-            prefix=prefix,
-        )
+    file.seek(0, SEEK_SET)
+
+    df = pd.read_csv(
+        file,
+        encoding=encoding,
+        compression=compression,
+        sep=None,
+        engine="python",
+        header=header,
+        prefix=prefix,
+    )
 
     return df
 

From 55872fbcf64c078dd65f15977cc2c907c90f7587 Mon Sep 17 00:00:00 2001
From: root <dnlcesilva@gmail.com>
Date: Thu, 5 Aug 2021 15:27:29 -0300
Subject: [PATCH 10/16] Changing the read_indo_dataframe function in
 datasets.py.

---
 datasets/datasets.py    | 14 +++++++-------
 datasets/monkeypatch.py | 23 +++++++++++++++++++++++
 2 files changed, 30 insertions(+), 7 deletions(-)
 create mode 100644 datasets/monkeypatch.py

diff --git a/datasets/datasets.py b/datasets/datasets.py
index 00ff363..7da1e16 100644
--- a/datasets/datasets.py
+++ b/datasets/datasets.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 import json
-from io import BytesIO
+from io import BytesIO, TextIOWrapper
 from os import SEEK_SET
 from os.path import splitext
 from unicodedata import normalize
@@ -8,6 +8,7 @@
 
 import numpy as np
 import pandas as pd
+import datasets.monkeypatch
 import platiagro
 from chardet.universaldetector import UniversalDetector
 from googleapiclient.discovery import build
@@ -266,7 +267,7 @@ def patch_dataset(name, file_object):
     return get_dataset(name)
 
 
-def read_into_dataframe(file, filename=None, nrows=100, max_characters=50):
+def read_into_dataframe(file, filename=None, nrows=50, max_characters=50):
     """
     Reads a file into a DataFrame.
     Infers the file encoding and whether a header column exists
@@ -304,11 +305,12 @@ def read_into_dataframe(file, filename=None, nrows=100, max_characters=50):
         filename = uuid4().hex
 
     compression = infer_compression(filename, "infer")
-    print(compression)
+
     file.seek(0, SEEK_SET)
 
+    pdread = TextIOWrapper(file, encoding=encoding)
     df0 = pd.read_csv(
-        file,
+        pdread,
         encoding=encoding,
         compression=compression,
         sep=None,
@@ -341,9 +343,8 @@ def read_into_dataframe(file, filename=None, nrows=100, max_characters=50):
     prefix = None if header else "col"
 
     file.seek(0, SEEK_SET)
-
     df = pd.read_csv(
-        file,
+        pdread,
         encoding=encoding,
         compression=compression,
         sep=None,
@@ -351,7 +352,6 @@ def read_into_dataframe(file, filename=None, nrows=100, max_characters=50):
         header=header,
         prefix=prefix,
     )
-
     return df
 
 
diff --git a/datasets/monkeypatch.py b/datasets/monkeypatch.py
new file mode 100644
index 0000000..315eccb
--- /dev/null
+++ b/datasets/monkeypatch.py
@@ -0,0 +1,23 @@
+# -*- coding: utf-8 -*-
+"""
+Monkey's patched methods for the SpooledTemporaryFile class.
+This is because the SpooledTemporaryFile does not inherit / implement the IOBase class.
+"""
+from tempfile import SpooledTemporaryFile
+
+
+def _readable(self):
+    return self._file.readable()
+
+
+def _writable(self):
+    return self._file.writable()
+
+
+def _seekable(self):
+    return self._file.seekable()
+
+
+SpooledTemporaryFile.readable = _readable
+SpooledTemporaryFile.writable = _writable
+SpooledTemporaryFile.seekable = _seekable

From 6ca55389ad423b6234d0bc0c7968aefe2dae81c2 Mon Sep 17 00:00:00 2001
From: root <dnlcesilva@gmail.com>
Date: Thu, 5 Aug 2021 15:42:40 -0300
Subject: [PATCH 11/16] Changing the read_indo_dataframe function in
 datasets.py.

---
 tests/test_api.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/tests/test_api.py b/tests/test_api.py
index 999652a..8d2203c 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -230,9 +230,10 @@ def test_get_dataset(self):
 
         rv = TEST_CLIENT.get("/datasets/iris.data?page=15&page_size=2")
         result = rv.json()
-        expected = {"filename": "iris.data", "name": "iris.data"}
+        #expected = {"filename": "iris.data", "name": "iris.data"}
+        expected = {'message': 'The specified page does not exist'}
         self.assertDictEqual(expected, result)
-        self.assertEqual(rv.status_code, 200)
+        self.assertEqual(rv.status_code, 404)
 
         rv = TEST_CLIENT.get(f"/datasets/{name}?page=A&page_size=2")
         result = rv.json()
@@ -287,9 +288,10 @@ def test_get_dataset(self):
                 {"name": "col4", "featuretype": "Numerical"},
                 {"name": "col5", "featuretype": "Categorical"},
             ],
-            "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa']],
+            "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
+                     ['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa']],
             "filename": "iris.data",
-            "total": 1
+            "total": 4
         }
         # name is machine-generated
         # we assert it exists, but we don't check its value

From e840d0b18b77064735fca84d05508595198549bf Mon Sep 17 00:00:00 2001
From: root <dnlcesilva@gmail.com>
Date: Mon, 9 Aug 2021 21:26:25 -0300
Subject: [PATCH 12/16] Changing the create_dataset function in datasets.py.

---
 datasets/datasets.py | 22 ++++++++++++++--------
 tests/test_api.py    | 36 ++++++++++++++++--------------------
 2 files changed, 30 insertions(+), 28 deletions(-)

diff --git a/datasets/datasets.py b/datasets/datasets.py
index 7da1e16..15c2dd7 100644
--- a/datasets/datasets.py
+++ b/datasets/datasets.py
@@ -58,6 +58,7 @@ def create_dataset(file_object):
     """
     if isinstance(file_object, dict):
         file = file_object["file"]
+        print(file)
         filename = file.filename
     else:
         file = file_object.file
@@ -84,21 +85,25 @@ def create_dataset(file_object):
     featuretypes = infer_featuretypes(df)
 
     metadata = {
+        "columns": columns,
         "featuretypes": featuretypes,
         "original-filename": filename,
+        "total": len(df.index),
     }
 
+    file.seek(0, SEEK_SET)
+    contents = BytesIO(file.read())
     # uses PlatIAgro SDK to save the dataset
-    save_dataset(name, df, metadata=metadata)
+    save_dataset(name, contents, metadata=metadata)
 
     columns = [{"name": col, "featuretype": ftype} for col, ftype in zip(columns, featuretypes)]
-    content = load_dataset(name=name)
+
     # Replaces NaN value by a text "NaN" so JSON encode doesn't fail
-    content.replace(np.nan, "NaN", inplace=True, regex=True)
-    content.replace(np.inf, "Inf", inplace=True, regex=True)
-    content.replace(-np.inf, "-Inf", inplace=True, regex=True)
-    data = content.values.tolist()
-    return {"name": name, "columns": columns, "data": data, "total": len(content.index), "filename": filename}
+    df.replace(np.nan, "NaN", inplace=True, regex=True)
+    df.replace(np.inf, "Inf", inplace=True, regex=True)
+    df.replace(-np.inf, "-Inf", inplace=True, regex=True)
+    data = df.values.tolist()
+    return {"name": name, "columns": columns, "data": data, "total": len(df.index), "filename": filename}
 
 
 def create_google_drive_dataset(gfile):
@@ -267,7 +272,7 @@ def patch_dataset(name, file_object):
     return get_dataset(name)
 
 
-def read_into_dataframe(file, filename=None, nrows=50, max_characters=50):
+def read_into_dataframe(file, filename=None, nrows=100, max_characters=50):
     """
     Reads a file into a DataFrame.
     Infers the file encoding and whether a header column exists
@@ -350,6 +355,7 @@ def read_into_dataframe(file, filename=None, nrows=50, max_characters=50):
         sep=None,
         engine="python",
         header=header,
+        nrows=nrows,
         prefix=prefix,
     )
     return df
diff --git a/tests/test_api.py b/tests/test_api.py
index 8d2203c..04ca94a 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -173,12 +173,11 @@ def test_get_dataset(self):
                 {"name": "col4", "featuretype": "Numerical"},
                 {"name": "col5", "featuretype": "Categorical"},
             ],
-            "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
-                     ['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
+            "data": [['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
                      ['01/01/2002', 4.7, 3.2, 1.3, 0.2, 'Iris-setosa'],
                      ['01/01/2003', 4.6, 3.1, 1.5, 0.2, 'Iris-setosa']],
             "filename": "iris.data",
-            "total": 4
+            "total": 3
         }
 
         self.assertIn("name", result)
@@ -198,10 +197,10 @@ def test_get_dataset(self):
                 {"name": "col4", "featuretype": "Numerical"},
                 {"name": "col5", "featuretype": "Categorical"},
             ],
-            "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
-                     ['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa']],
+            "data": [['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
+                     ['01/01/2002', 4.7, 3.2, 1.3, 0.2, 'Iris-setosa']],
             "filename": "iris.data",
-            "total": 4
+            "total": 3
         }
         del result["name"]
         self.assertDictEqual(expected, result)
@@ -218,11 +217,11 @@ def test_get_dataset(self):
                 {"name": "col4", "featuretype": "Numerical"},
                 {"name": "col5", "featuretype": "Categorical"},
             ],
-            "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
-                     ['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
-                     ['01/01/2002', 4.7, 3.2, 1.3, 0.2, 'Iris-setosa']],
+            "data": [['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
+                     ['01/01/2002', 4.7, 3.2, 1.3, 0.2, 'Iris-setosa'],
+                     ['01/01/2003', 4.6, 3.1, 1.5, 0.2, 'Iris-setosa']],
             "filename": "iris.data",
-            "total": 4
+            "total": 3
         }
         del result["name"]
         self.assertDictEqual(expected, result)
@@ -230,7 +229,6 @@ def test_get_dataset(self):
 
         rv = TEST_CLIENT.get("/datasets/iris.data?page=15&page_size=2")
         result = rv.json()
-        #expected = {"filename": "iris.data", "name": "iris.data"}
         expected = {'message': 'The specified page does not exist'}
         self.assertDictEqual(expected, result)
         self.assertEqual(rv.status_code, 404)
@@ -263,12 +261,11 @@ def test_get_dataset(self):
                 {"name": "col4", "featuretype": "Numerical"},
                 {"name": "col5", "featuretype": "Categorical"},
             ],
-            "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
-                     ['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
+            "data": [['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
                      ['01/01/2002', 4.7, 3.2, 1.3, 0.2, 'Iris-setosa'],
                      ['01/01/2003', 4.6, 3.1, 1.5, 0.2, 'Iris-setosa']],
             "filename": "iris.data",
-            "total": 4
+            "total": 3
         }
         # name is machine-generated
         # we assert it exists, but we don't check its value
@@ -288,10 +285,10 @@ def test_get_dataset(self):
                 {"name": "col4", "featuretype": "Numerical"},
                 {"name": "col5", "featuretype": "Categorical"},
             ],
-            "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
-                     ['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa']],
+            "data": [['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
+                     ['01/01/2002', 4.7, 3.2, 1.3, 0.2, 'Iris-setosa']],
             "filename": "iris.data",
-            "total": 4
+            "total": 3
         }
         # name is machine-generated
         # we assert it exists, but we don't check its value
@@ -435,13 +432,12 @@ def test_patch_dataset(self):
                     {"name": "col4", "featuretype": "Numerical"},
                     {"name": "col5", "featuretype": "Categorical"},
                 ],
-                "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
-                         ['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
+                "data": [['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
                          ['01/01/2002', 4.7, 3.2, 1.3, 0.2, 'Iris-setosa'],
                          ['01/01/2003', 4.6, 3.1, 1.5, 0.2, 'Iris-setosa']],
                 "filename": "iris.data",
                 "name": name,
-                "total": 4
+                "total": 3
             }
         self.assertDictEqual(expected, result)
         self.assertEqual(rv.status_code, 200)

From 9aecde9c2d277b001134cb8d311f41ab6bb602f0 Mon Sep 17 00:00:00 2001
From: root <dnlcesilva@gmail.com>
Date: Mon, 9 Aug 2021 21:30:38 -0300
Subject: [PATCH 13/16] Changing the create_dataset function in datasets.py.

---
 datasets/datasets.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/datasets/datasets.py b/datasets/datasets.py
index 15c2dd7..b055ae0 100644
--- a/datasets/datasets.py
+++ b/datasets/datasets.py
@@ -58,7 +58,6 @@ def create_dataset(file_object):
     """
     if isinstance(file_object, dict):
         file = file_object["file"]
-        print(file)
         filename = file.filename
     else:
         file = file_object.file

From 62d4490034e6302ad9a986fb866f898c77ea5f54 Mon Sep 17 00:00:00 2001
From: Fabio Beranizo <fabiol@cpqd.com.br>
Date: Wed, 11 Aug 2021 13:22:23 -0300
Subject: [PATCH 14/16] Replaces file.seek by pdread.seek

---
 datasets/datasets.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datasets/datasets.py b/datasets/datasets.py
index b055ae0..e935a80 100644
--- a/datasets/datasets.py
+++ b/datasets/datasets.py
@@ -346,7 +346,7 @@ def read_into_dataframe(file, filename=None, nrows=100, max_characters=50):
     header = "infer" if final_checker else None
     prefix = None if header else "col"
 
-    file.seek(0, SEEK_SET)
+    pdread.seek(0, SEEK_SET)
     df = pd.read_csv(
         pdread,
         encoding=encoding,

From b541438e8536e999d90065026a6db40dfe9079e2 Mon Sep 17 00:00:00 2001
From: Fabio Beranizo <fabiol@cpqd.com.br>
Date: Wed, 11 Aug 2021 13:29:43 -0300
Subject: [PATCH 15/16] Removes unnecessary file.read()

Prevents memory to fill.
---
 datasets/datasets.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/datasets/datasets.py b/datasets/datasets.py
index e935a80..83f1c1e 100644
--- a/datasets/datasets.py
+++ b/datasets/datasets.py
@@ -8,7 +8,6 @@
 
 import numpy as np
 import pandas as pd
-import datasets.monkeypatch
 import platiagro
 from chardet.universaldetector import UniversalDetector
 from googleapiclient.discovery import build
@@ -17,6 +16,8 @@
 from pandas.io.common import infer_compression
 from platiagro import load_dataset, save_dataset, stat_dataset, update_dataset_metadata
 from platiagro.featuretypes import infer_featuretypes, validate_featuretypes
+
+from datasets import monkeypatch  # noqa: F401
 from datasets.exceptions import BadRequest, NotFound
 
 from datasets.utils import data_pagination
@@ -91,9 +92,8 @@ def create_dataset(file_object):
     }
 
     file.seek(0, SEEK_SET)
-    contents = BytesIO(file.read())
     # uses PlatIAgro SDK to save the dataset
-    save_dataset(name, contents, metadata=metadata)
+    save_dataset(name, file, metadata=metadata)
 
     columns = [{"name": col, "featuretype": ftype} for col, ftype in zip(columns, featuretypes)]
 

From 0eeaaa2ebfa2236e1bff0ab18123103ced91cc10 Mon Sep 17 00:00:00 2001
From: Fabio Beranizo <fabiol@cpqd.com.br>
Date: Wed, 11 Aug 2021 14:46:04 -0300
Subject: [PATCH 16/16] Replaces BytesIO by a SpooledTemporaryFile

It's a better option to avoid filling the memory.
---
 datasets/datasets.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/datasets/datasets.py b/datasets/datasets.py
index 83f1c1e..4b44a9c 100644
--- a/datasets/datasets.py
+++ b/datasets/datasets.py
@@ -1,8 +1,9 @@
 # -*- coding: utf-8 -*-
 import json
-from io import BytesIO, TextIOWrapper
+from io import TextIOWrapper
 from os import SEEK_SET
 from os.path import splitext
+from tempfile import SpooledTemporaryFile
 from unicodedata import normalize
 from uuid import uuid4
 
@@ -23,6 +24,7 @@
 from datasets.utils import data_pagination
 
 NOT_FOUND = NotFound("The specified dataset does not exist")
+SPOOLED_MAX_SIZE = 1024 * 1024  # 1MB
 
 
 def list_datasets():
@@ -153,7 +155,7 @@ def create_google_drive_dataset(gfile):
     else:
         request = service.files().get_media(fileId=file_id)
 
-    fh = BytesIO()
+    fh = SpooledTemporaryFile(max_size=SPOOLED_MAX_SIZE)
     downloader = MediaIoBaseDownload(fh, request)
     done = False
     try: