platiagro · fberanizo · Aug 11, 2021 · Jul 22, 2021 · Jul 23, 2021 · Jul 23, 2021
diff --git a/datasets/datasets.py b/datasets/datasets.py
@@ -1,13 +1,14 @@
 # -*- coding: utf-8 -*-
 import json
-from io import BytesIO
+from io import BytesIO, TextIOWrapper
 from os import SEEK_SET
 from os.path import splitext
 from unicodedata import normalize
 from uuid import uuid4
 
 import numpy as np
 import pandas as pd
+import datasets.monkeypatch
 import platiagro
 from chardet.universaldetector import UniversalDetector
 from googleapiclient.discovery import build
@@ -266,7 +267,7 @@ def patch_dataset(name, file_object):
     return get_dataset(name)
 
 
-def read_into_dataframe(file, filename=None, nrows=100, max_characters=50):
+def read_into_dataframe(file, filename=None, nrows=50, max_characters=50):
     """
     Reads a file into a DataFrame.
     Infers the file encoding and whether a header column exists
@@ -291,6 +292,7 @@ def read_into_dataframe(file, filename=None, nrows=100, max_characters=50):
     -----
     If no filename is given, a hex uuid will be used as the file name.
     """
+
     detector = UniversalDetector()
     for line, text in enumerate(file):
         detector.feed(text)
@@ -305,23 +307,23 @@ def read_into_dataframe(file, filename=None, nrows=100, max_characters=50):
     compression = infer_compression(filename, "infer")
 
     file.seek(0, SEEK_SET)
-    contents = file.read()
-
-    with BytesIO(contents) as file:
-        df0 = pd.read_csv(
-            file,
-            encoding=encoding,
-            compression=compression,
-            sep=None,
-            engine="python",
-            header="infer",
-            nrows=nrows,
-        )
+
+    pdread = TextIOWrapper(file, encoding=encoding)
+    df0 = pd.read_csv(
+        pdread,
+        encoding=encoding,
+        compression=compression,
+        sep=None,
+        engine="python",
+        header="infer",
+        nrows=nrows,
+    )
 
     df0_cols = list(df0.columns)
 
     # Check if all columns are strings and short strings(text values tend to be long)
     column_names_checker = all([type(item) == str for item in df0_cols])
+
     if column_names_checker:
         column_names_checker = all([len(item) < max_characters for item in df0_cols])
 
@@ -340,16 +342,16 @@ def read_into_dataframe(file, filename=None, nrows=100, max_characters=50):
     header = "infer" if final_checker else None
     prefix = None if header else "col"
 
-    with BytesIO(contents) as file:
-        df = pd.read_csv(
-            file,
-            encoding=encoding,
-            compression=compression,
-            sep=None,
-            engine="python",
-            header=header,
-            prefix=prefix,
-        )
+    file.seek(0, SEEK_SET)
+    df = pd.read_csv(
+        pdread,
+        encoding=encoding,
+        compression=compression,
+        sep=None,
+        engine="python",
+        header=header,
+        prefix=prefix,
+    )
     return df
 
 

diff --git a/datasets/monkeypatch.py b/datasets/monkeypatch.py
@@ -0,0 +1,23 @@
+# -*- coding: utf-8 -*-
+"""
+Monkey-patched methods for the SpooledTemporaryFile class.
+They are needed because SpooledTemporaryFile does not inherit from / implement the IOBase class.
+"""
+from tempfile import SpooledTemporaryFile
+
+
+def _readable(self):  # assigned below as SpooledTemporaryFile.readable
+    return self._file.readable()  # delegate to the instance's underlying `_file` object
+
+
+def _writable(self):  # assigned below as SpooledTemporaryFile.writable
+    return self._file.writable()  # delegate to the instance's underlying `_file` object
+
+
+def _seekable(self):  # assigned below as SpooledTemporaryFile.seekable
+    return self._file.seekable()  # delegate to the instance's underlying `_file` object
+
+
+SpooledTemporaryFile.readable = _readable  # runs at import time; patches the class itself, so every instance is affected
+SpooledTemporaryFile.writable = _writable  # NOTE(review): presumably needed so io.TextIOWrapper accepts the object - confirm
+SpooledTemporaryFile.seekable = _seekable  # overrides any existing attribute of the same name on the class
diff --git a/tests/test_api.py b/tests/test_api.py
@@ -230,7 +230,8 @@ def test_get_dataset(self):
 
         rv = TEST_CLIENT.get("/datasets/iris.data?page=15&page_size=2")
         result = rv.json()
-        expected = {"message": "The specified page does not exist"}
+        #expected = {"filename": "iris.data", "name": "iris.data"}
+        expected = {'message': 'The specified page does not exist'}
         self.assertDictEqual(expected, result)
         self.assertEqual(rv.status_code, 404)