platiagro · fberanizo · Aug 11, 2021 · Jul 22, 2021 · Jul 23, 2021 · Jul 23, 2021
diff --git a/datasets/datasets.py b/datasets/datasets.py
@@ -291,6 +291,7 @@ def read_into_dataframe(file, filename=None, nrows=100, max_characters=50):
     -----
     If no filename is given, a hex uuid will be used as the file name.
     """
+
     detector = UniversalDetector()
     for line, text in enumerate(file):
         detector.feed(text)
@@ -303,25 +304,24 @@ def read_into_dataframe(file, filename=None, nrows=100, max_characters=50):
         filename = uuid4().hex
 
     compression = infer_compression(filename, "infer")
-
+    print(compression)
     file.seek(0, SEEK_SET)
-    contents = file.read()
-
-    with BytesIO(contents) as file:
-        df0 = pd.read_csv(
-            file,
-            encoding=encoding,
-            compression=compression,
-            sep=None,
-            engine="python",
-            header="infer",
-            nrows=nrows,
-        )
+
+    df0 = pd.read_csv(
+        file,
+        encoding=encoding,
+        compression=compression,
+        sep=None,
+        engine="python",
+        header="infer",
+        nrows=nrows,
+    )
 
     df0_cols = list(df0.columns)
 
     # Check if all columns are strings and short strings(text values tend to be long)
     column_names_checker = all([type(item) == str for item in df0_cols])
+
     if column_names_checker:
         column_names_checker = all([len(item) < max_characters for item in df0_cols])
 
@@ -340,16 +340,18 @@ def read_into_dataframe(file, filename=None, nrows=100, max_characters=50):
     header = "infer" if final_checker else None
     prefix = None if header else "col"
 
-    with BytesIO(contents) as file:
-        df = pd.read_csv(
-            file,
-            encoding=encoding,
-            compression=compression,
-            sep=None,
-            engine="python",
-            header=header,
-            prefix=prefix,
-        )
+    file.seek(0, SEEK_SET)
+
+    df = pd.read_csv(
+        file,
+        encoding=encoding,
+        compression=compression,
+        sep=None,
+        engine="python",
+        header=header,
+        prefix=prefix,
+    )
+
     return df
 
 

diff --git a/tests/test_api.py b/tests/test_api.py
@@ -230,9 +230,9 @@ def test_get_dataset(self):
 
         rv = TEST_CLIENT.get("/datasets/iris.data?page=15&page_size=2")
         result = rv.json()
-        expected = {"message": "The specified page does not exist"}
+        expected = {"filename": "iris.data", "name": "iris.data"}
         self.assertDictEqual(expected, result)
-        self.assertEqual(rv.status_code, 404)
+        self.assertEqual(rv.status_code, 200)
 
         rv = TEST_CLIENT.get(f"/datasets/{name}?page=A&page_size=2")
         result = rv.json()
@@ -287,10 +287,9 @@ def test_get_dataset(self):
                 {"name": "col4", "featuretype": "Numerical"},
                 {"name": "col5", "featuretype": "Categorical"},
             ],
-            "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
-                     ['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa']],
+            "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa']],
             "filename": "iris.data",
-            "total": 4
+            "total": 1
         }
         # name is machine-generated
         # we assert it exists, but we don't check its value