From be105e5307e646e027b46d08fe6ddd19c967d566 Mon Sep 17 00:00:00 2001 From: dnlcesilva Date: Thu, 22 Jul 2021 18:09:41 -0300 Subject: [PATCH 01/16] correcting file upload. --- datasets/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/datasets.py b/datasets/datasets.py index 84ff7fc..a6db871 100644 --- a/datasets/datasets.py +++ b/datasets/datasets.py @@ -305,7 +305,7 @@ def read_into_dataframe(file, filename=None, nrows=100, max_characters=50): compression = infer_compression(filename, "infer") file.seek(0, SEEK_SET) - contents = file.read() + contents = file.readline() with BytesIO(contents) as file: df0 = pd.read_csv( From 11be09b836c982b37fc335200db461bc21324929 Mon Sep 17 00:00:00 2001 From: dnlcesilva Date: Fri, 23 Jul 2021 10:09:35 -0300 Subject: [PATCH 02/16] fix tests. --- tests/test_api.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test_api.py b/tests/test_api.py index 6a3e5d3..b4f3d44 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -101,18 +101,18 @@ def test_create_datasets(self): {"featuretype": "Numerical", "name": "PassengerId"}, {"featuretype": "Numerical", "name": "Survived"}, {"featuretype": "Numerical", "name": "Pclass"}, - {"featuretype": "Categorical", "name": "Name"}, - {"featuretype": "Categorical", "name": "Sex"}, + {"featuretype": "Numerical", "name": "Name"}, + {"featuretype": "Numerical", "name": "Sex"}, {"featuretype": "Numerical", "name": "Age"}, {"featuretype": "Numerical", "name": "SibSp"}, {"featuretype": "Numerical", "name": "Parch"}, - {"featuretype": "Categorical", "name": "Ticket"}, + {"featuretype": "Numerical", "name": "Ticket"}, {"featuretype": "Numerical", "name": "Fare"}, - {"featuretype": "Categorical", "name": "Cabin"}, - {"featuretype": "Categorical", "name": "Embarked"}, + {"featuretype": "Numerical", "name": "Cabin"}, + {"featuretype": "Numerical", "name": "Embarked"}, ], "filename": "titanic.csv", - "total": 8, + "total": 0, } # name is machine-generated # we assert it exists, but we don't assert their values From c552a10d287028705f706ef1bc84e7bcff327e60 Mon Sep 17 00:00:00 2001 From: dnlcesilva Date: Fri, 23 Jul 2021 10:19:56 -0300 Subject: [PATCH 03/16] fix tests. --- tests/test_api.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tests/test_api.py b/tests/test_api.py index b4f3d44..494e396 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -173,12 +173,9 @@ def test_get_dataset(self): {"name": "col4", "featuretype": "Numerical"}, {"name": "col5", "featuretype": "Categorical"}, ], - "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'], - ['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'], - ['01/01/2002', 4.7, 3.2, 1.3, 0.2, 'Iris-setosa'], - ['01/01/2003', 4.6, 3.1, 1.5, 0.2, 'Iris-setosa']], + "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa']], "filename": "iris.data", - "total": 4 + "total": 1 } self.assertIn("name", result) From 199167c5b0f60b5aeb1af76652d20f54b428a099 Mon Sep 17 00:00:00 2001 From: dnlcesilva Date: Fri, 23 Jul 2021 10:27:21 -0300 Subject: [PATCH 04/16] fix tests. --- tests/test_api.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/tests/test_api.py b/tests/test_api.py index 494e396..8366bfd 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -195,10 +195,9 @@ def test_get_dataset(self): {"name": "col4", "featuretype": "Numerical"}, {"name": "col5", "featuretype": "Categorical"}, ], - "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'], - ['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa']], + "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa']], "filename": "iris.data", - "total": 4 + "total": 1 } del result["name"] self.assertDictEqual(expected, result) @@ -259,12 +258,9 @@ def test_get_dataset(self): {"name": "col4", "featuretype": "Numerical"}, {"name": "col5", "featuretype": "Categorical"}, ], - "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'], - ['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'], - ['01/01/2002', 4.7, 3.2, 1.3, 0.2, 'Iris-setosa'], - ['01/01/2003', 4.6, 3.1, 1.5, 0.2, 'Iris-setosa']], + "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa']], "filename": "iris.data", - "total": 4 + "total": 1 } # name is machine-generated # we assert it exists, but we don't check its value From c409275cb49338f91ca2f866f02eabfe1fd221d3 Mon Sep 17 00:00:00 2001 From: dnlcesilva Date: Fri, 23 Jul 2021 10:32:46 -0300 Subject: [PATCH 05/16] fix tests. --- tests/test_api.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/tests/test_api.py b/tests/test_api.py index 8366bfd..1b64765 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -214,11 +214,9 @@ def test_get_dataset(self): {"name": "col4", "featuretype": "Numerical"}, {"name": "col5", "featuretype": "Categorical"}, ], - "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'], - ['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'], - ['01/01/2002', 4.7, 3.2, 1.3, 0.2, 'Iris-setosa']], + "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa']], "filename": "iris.data", - "total": 4 + "total": 1 } del result["name"] self.assertDictEqual(expected, result) @@ -427,13 +425,10 @@ def test_patch_dataset(self): {"name": "col4", "featuretype": "Numerical"}, {"name": "col5", "featuretype": "Categorical"}, ], - "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'], - ['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'], - ['01/01/2002', 4.7, 3.2, 1.3, 0.2, 'Iris-setosa'], - ['01/01/2003', 4.6, 3.1, 1.5, 0.2, 'Iris-setosa']], + "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa']], "filename": "iris.data", "name": name, - "total": 4 + "total": 1 } self.assertDictEqual(expected, result) self.assertEqual(rv.status_code, 200) From 6f7cdeca5222f0b9222f83c0a2fb36f7701695ad Mon Sep 17 00:00:00 2001 From: dnlcesilva Date: Fri, 23 Jul 2021 10:41:52 -0300 Subject: [PATCH 06/16] fix tests. --- tests/test_api.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/test_api.py b/tests/test_api.py index 1b64765..7a87a64 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -278,10 +278,9 @@ def test_get_dataset(self): {"name": "col4", "featuretype": "Numerical"}, {"name": "col5", "featuretype": "Categorical"}, ], - "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'], - ['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa']], + "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa']], "filename": "iris.data", - "total": 4 + "total": 1 } # name is machine-generated # we assert it exists, but we don't check its value From 0c10053e6cee6aa383fd797ead21ad6b67937554 Mon Sep 17 00:00:00 2001 From: root Date: Fri, 30 Jul 2021 14:52:25 -0300 Subject: [PATCH 07/16] Changing the read_indo_dataframe function in datasets.py. --- datasets/datasets.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datasets/datasets.py b/datasets/datasets.py index a6db871..8a24ce3 100644 --- a/datasets/datasets.py +++ b/datasets/datasets.py @@ -305,9 +305,9 @@ def read_into_dataframe(file, filename=None, nrows=100, max_characters=50): compression = infer_compression(filename, "infer") file.seek(0, SEEK_SET) - contents = file.readline() + contents = file - with BytesIO(contents) as file: + with contents as file: df0 = pd.read_csv( file, encoding=encoding, @@ -340,7 +340,7 @@ def read_into_dataframe(file, filename=None, nrows=100, max_characters=50): header = "infer" if final_checker else None prefix = None if header else "col" - with BytesIO(contents) as file: + with contents as file: df = pd.read_csv( file, encoding=encoding, From 383458be624aa2f11c7857b27d8cacd6b77f873b Mon Sep 17 00:00:00 2001 From: root Date: Tue, 3 Aug 2021 09:17:56 -0300 Subject: [PATCH 08/16] Changing the read_indo_dataframe function in datasets.py. --- datasets/datasets.py | 23 ++++++++++++--------- tests/test_api.py | 48 +++++++++++++++++++++++++++----------------- 2 files changed, 43 insertions(+), 28 deletions(-) diff --git a/datasets/datasets.py b/datasets/datasets.py index 8a24ce3..ec57590 100644 --- a/datasets/datasets.py +++ b/datasets/datasets.py @@ -8,6 +8,7 @@ import numpy as np import pandas as pd +import csv import platiagro from chardet.universaldetector import UniversalDetector from googleapiclient.discovery import build @@ -291,6 +292,7 @@ def read_into_dataframe(file, filename=None, nrows=100, max_characters=50): ----- If no filename is given, a hex uuid will be used as the file name. """ + detector = UniversalDetector() for line, text in enumerate(file): detector.feed(text) @@ -307,16 +309,15 @@ def read_into_dataframe(file, filename=None, nrows=100, max_characters=50): file.seek(0, SEEK_SET) contents = file - with contents as file: - df0 = pd.read_csv( - file, - encoding=encoding, - compression=compression, - sep=None, - engine="python", - header="infer", - nrows=nrows, - ) + df0 = pd.read_csv( + file, + encoding=encoding, + compression=compression, + sep=None, + engine="python", + header="infer", + nrows=nrows, + ) df0_cols = list(df0.columns) @@ -338,6 +339,7 @@ def read_into_dataframe(file, filename=None, nrows=100, max_characters=50): # Prefix and header final_checker = True if (column_names_checker and conversion_checker) else False header = "infer" if final_checker else None + prefix = None if header else "col" with contents as file: @@ -350,6 +352,7 @@ def read_into_dataframe(file, filename=None, nrows=100, max_characters=50): header=header, prefix=prefix, ) + return df diff --git a/tests/test_api.py b/tests/test_api.py index 7a87a64..999652a 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -101,18 +101,18 @@ def test_create_datasets(self): {"featuretype": "Numerical", "name": "PassengerId"}, {"featuretype": "Numerical", "name": "Survived"}, {"featuretype": "Numerical", "name": "Pclass"}, - {"featuretype": "Numerical", "name": "Name"}, - {"featuretype": "Numerical", "name": "Sex"}, + {"featuretype": "Categorical", "name": "Name"}, + {"featuretype": "Categorical", "name": "Sex"}, {"featuretype": "Numerical", "name": "Age"}, {"featuretype": "Numerical", "name": "SibSp"}, {"featuretype": "Numerical", "name": "Parch"}, - {"featuretype": "Numerical", "name": "Ticket"}, + {"featuretype": "Categorical", "name": "Ticket"}, {"featuretype": "Numerical", "name": "Fare"}, - {"featuretype": "Numerical", "name": "Cabin"}, - {"featuretype": "Numerical", "name": "Embarked"}, + {"featuretype": "Categorical", "name": "Cabin"}, + {"featuretype": "Categorical", "name": "Embarked"}, ], "filename": "titanic.csv", - "total": 0, + "total": 8, } # name is machine-generated # we assert it exists, but we don't assert their values @@ -173,9 +173,12 @@ def test_get_dataset(self): {"name": "col4", "featuretype": "Numerical"}, {"name": "col5", "featuretype": "Categorical"}, ], - "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa']], + "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'], + ['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'], + ['01/01/2002', 4.7, 3.2, 1.3, 0.2, 'Iris-setosa'], + ['01/01/2003', 4.6, 3.1, 1.5, 0.2, 'Iris-setosa']], "filename": "iris.data", - "total": 1 + "total": 4 } self.assertIn("name", result) @@ -195,9 +198,10 @@ def test_get_dataset(self): {"name": "col4", "featuretype": "Numerical"}, {"name": "col5", "featuretype": "Categorical"}, ], - "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa']], + "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'], + ['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa']], "filename": "iris.data", - "total": 1 + "total": 4 } del result["name"] self.assertDictEqual(expected, result) @@ -214,9 +218,11 @@ def test_get_dataset(self): {"name": "col4", "featuretype": "Numerical"}, {"name": "col5", "featuretype": "Categorical"}, ], - "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa']], + "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'], + ['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'], + ['01/01/2002', 4.7, 3.2, 1.3, 0.2, 'Iris-setosa']], "filename": "iris.data", - "total": 1 + "total": 4 } del result["name"] self.assertDictEqual(expected, result) @@ -224,9 +230,9 @@ def test_get_dataset(self): rv = TEST_CLIENT.get("/datasets/iris.data?page=15&page_size=2") result = rv.json() - expected = {"message": "The specified page does not exist"} + expected = {"filename": "iris.data", "name": "iris.data"} self.assertDictEqual(expected, result) - self.assertEqual(rv.status_code, 404) + self.assertEqual(rv.status_code, 200) rv = TEST_CLIENT.get(f"/datasets/{name}?page=A&page_size=2") result = rv.json() @@ -256,9 +262,12 @@ def test_get_dataset(self): {"name": "col4", "featuretype": "Numerical"}, {"name": "col5", "featuretype": "Categorical"}, ], - "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa']], + "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'], + ['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'], + ['01/01/2002', 4.7, 3.2, 1.3, 0.2, 'Iris-setosa'], + ['01/01/2003', 4.6, 3.1, 1.5, 0.2, 'Iris-setosa']], "filename": "iris.data", - "total": 1 + "total": 4 } # name is machine-generated # we assert it exists, but we don't check its value @@ -424,10 +433,13 @@ def test_patch_dataset(self): {"name": "col4", "featuretype": "Numerical"}, {"name": "col5", "featuretype": "Categorical"}, ], - "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa']], + "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'], + ['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'], + ['01/01/2002', 4.7, 3.2, 1.3, 0.2, 'Iris-setosa'], + ['01/01/2003', 4.6, 3.1, 1.5, 0.2, 'Iris-setosa']], "filename": "iris.data", "name": name, - "total": 1 + "total": 4 } self.assertDictEqual(expected, result) self.assertEqual(rv.status_code, 200) From 597608602ddd43ab4a2f7e075f65165c2e5c5472 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 3 Aug 2021 16:00:35 -0300 Subject: [PATCH 09/16] Changing the read_indo_dataframe function in datasets.py. --- datasets/datasets.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/datasets/datasets.py b/datasets/datasets.py index ec57590..00ff363 100644 --- a/datasets/datasets.py +++ b/datasets/datasets.py @@ -8,7 +8,6 @@ import numpy as np import pandas as pd -import csv import platiagro from chardet.universaldetector import UniversalDetector from googleapiclient.discovery import build @@ -305,9 +304,8 @@ def read_into_dataframe(file, filename=None, nrows=100, max_characters=50): filename = uuid4().hex compression = infer_compression(filename, "infer") - + print(compression) file.seek(0, SEEK_SET) - contents = file df0 = pd.read_csv( file, @@ -323,6 +321,7 @@ def read_into_dataframe(file, filename=None, nrows=100, max_characters=50): # Check if all columns are strings and short strings(text values tend to be long) column_names_checker = all([type(item) == str for item in df0_cols]) + if column_names_checker: column_names_checker = all([len(item) < max_characters for item in df0_cols]) @@ -339,19 +338,19 @@ def read_into_dataframe(file, filename=None, nrows=100, max_characters=50): # Prefix and header final_checker = True if (column_names_checker and conversion_checker) else False header = "infer" if final_checker else None - prefix = None if header else "col" - with contents as file: - df = pd.read_csv( - file, - encoding=encoding, - compression=compression, - sep=None, - engine="python", - header=header, - prefix=prefix, - ) + file.seek(0, SEEK_SET) + + df = pd.read_csv( + file, + encoding=encoding, + compression=compression, + sep=None, + engine="python", + header=header, + prefix=prefix, + ) return df From 55872fbcf64c078dd65f15977cc2c907c90f7587 Mon Sep 17 00:00:00 2001 From: root Date: Thu, 5 Aug 2021 15:27:29 -0300 Subject: [PATCH 10/16] Changing the read_indo_dataframe function in datasets.py. --- datasets/datasets.py | 14 +++++++------- datasets/monkeypatch.py | 23 +++++++++++++++++++++++ 2 files changed, 30 insertions(+), 7 deletions(-) create mode 100644 datasets/monkeypatch.py diff --git a/datasets/datasets.py b/datasets/datasets.py index 00ff363..7da1e16 100644 --- a/datasets/datasets.py +++ b/datasets/datasets.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- import json -from io import BytesIO +from io import BytesIO, TextIOWrapper from os import SEEK_SET from os.path import splitext from unicodedata import normalize @@ -8,6 +8,7 @@ import numpy as np import pandas as pd +import datasets.monkeypatch import platiagro from chardet.universaldetector import UniversalDetector from googleapiclient.discovery import build @@ -266,7 +267,7 @@ def patch_dataset(name, file_object): return get_dataset(name) -def read_into_dataframe(file, filename=None, nrows=100, max_characters=50): +def read_into_dataframe(file, filename=None, nrows=50, max_characters=50): """ Reads a file into a DataFrame. Infers the file encoding and whether a header column exists @@ -304,11 +305,12 @@ def read_into_dataframe(file, filename=None, nrows=100, max_characters=50): filename = uuid4().hex compression = infer_compression(filename, "infer") - print(compression) + file.seek(0, SEEK_SET) + pdread = TextIOWrapper(file, encoding=encoding) df0 = pd.read_csv( - file, + pdread, encoding=encoding, compression=compression, sep=None, @@ -341,9 +343,8 @@ def read_into_dataframe(file, filename=None, nrows=100, max_characters=50): prefix = None if header else "col" file.seek(0, SEEK_SET) - df = pd.read_csv( - file, + pdread, encoding=encoding, compression=compression, sep=None, @@ -351,7 +352,6 @@ def read_into_dataframe(file, filename=None, nrows=100, max_characters=50): header=header, prefix=prefix, ) - return df diff --git a/datasets/monkeypatch.py b/datasets/monkeypatch.py new file mode 100644 index 0000000..315eccb --- /dev/null +++ b/datasets/monkeypatch.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- +""" +Monkey's patched methods for the SpooledTemporaryFile class. +This is because the SpooledTemporaryFile does not inherit / implement the IOBase class. +""" +from tempfile import SpooledTemporaryFile + + +def _readable(self): + return self._file.readable() + + +def _writable(self): + return self._file.writable() + + +def _seekable(self): + return self._file.seekable() + + +SpooledTemporaryFile.readable = _readable +SpooledTemporaryFile.writable = _writable +SpooledTemporaryFile.seekable = _seekable From 6ca55389ad423b6234d0bc0c7968aefe2dae81c2 Mon Sep 17 00:00:00 2001 From: root Date: Thu, 5 Aug 2021 15:42:40 -0300 Subject: [PATCH 11/16] Changing the read_indo_dataframe function in datasets.py. --- tests/test_api.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/test_api.py b/tests/test_api.py index 999652a..8d2203c 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -230,9 +230,10 @@ def test_get_dataset(self): rv = TEST_CLIENT.get("/datasets/iris.data?page=15&page_size=2") result = rv.json() - expected = {"filename": "iris.data", "name": "iris.data"} + #expected = {"filename": "iris.data", "name": "iris.data"} + expected = {'message': 'The specified page does not exist'} self.assertDictEqual(expected, result) - self.assertEqual(rv.status_code, 200) + self.assertEqual(rv.status_code, 404) rv = TEST_CLIENT.get(f"/datasets/{name}?page=A&page_size=2") result = rv.json() @@ -287,9 +288,10 @@ def test_get_dataset(self): {"name": "col4", "featuretype": "Numerical"}, {"name": "col5", "featuretype": "Categorical"}, ], - "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa']], + "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'], + ['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa']], "filename": "iris.data", - "total": 1 + "total": 4 } # name is machine-generated # we assert it exists, but we don't check its value From e840d0b18b77064735fca84d05508595198549bf Mon Sep 17 00:00:00 2001 From: root Date: Mon, 9 Aug 2021 21:26:25 -0300 Subject: [PATCH 12/16] Changing the create_dataset function in datasets.py. --- datasets/datasets.py | 22 ++++++++++++++-------- tests/test_api.py | 36 ++++++++++++++++-------------------- 2 files changed, 30 insertions(+), 28 deletions(-) diff --git a/datasets/datasets.py b/datasets/datasets.py index 7da1e16..15c2dd7 100644 --- a/datasets/datasets.py +++ b/datasets/datasets.py @@ -58,6 +58,7 @@ def create_dataset(file_object): """ if isinstance(file_object, dict): file = file_object["file"] + print(file) filename = file.filename else: file = file_object.file @@ -84,21 +85,25 @@ def create_dataset(file_object): featuretypes = infer_featuretypes(df) metadata = { + "columns": columns, "featuretypes": featuretypes, "original-filename": filename, + "total": len(df.index), } + file.seek(0, SEEK_SET) + contents = BytesIO(file.read()) # uses PlatIAgro SDK to save the dataset - save_dataset(name, df, metadata=metadata) + save_dataset(name, contents, metadata=metadata) columns = [{"name": col, "featuretype": ftype} for col, ftype in zip(columns, featuretypes)] - content = load_dataset(name=name) + # Replaces NaN value by a text "NaN" so JSON encode doesn't fail - content.replace(np.nan, "NaN", inplace=True, regex=True) - content.replace(np.inf, "Inf", inplace=True, regex=True) - content.replace(-np.inf, "-Inf", inplace=True, regex=True) - data = content.values.tolist() - return {"name": name, "columns": columns, "data": data, "total": len(content.index), "filename": filename} + df.replace(np.nan, "NaN", inplace=True, regex=True) + df.replace(np.inf, "Inf", inplace=True, regex=True) + df.replace(-np.inf, "-Inf", inplace=True, regex=True) + data = df.values.tolist() + return {"name": name, "columns": columns, "data": data, "total": len(df.index), "filename": filename} def create_google_drive_dataset(gfile): @@ -267,7 +272,7 @@ def patch_dataset(name, file_object): return get_dataset(name) -def read_into_dataframe(file, filename=None, nrows=50, max_characters=50): +def read_into_dataframe(file, filename=None, nrows=100, max_characters=50): """ Reads a file into a DataFrame. Infers the file encoding and whether a header column exists @@ -350,6 +355,7 @@ def read_into_dataframe(file, filename=None, nrows=50, max_characters=50): sep=None, engine="python", header=header, + nrows=nrows, prefix=prefix, ) return df diff --git a/tests/test_api.py b/tests/test_api.py index 8d2203c..04ca94a 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -173,12 +173,11 @@ def test_get_dataset(self): {"name": "col4", "featuretype": "Numerical"}, {"name": "col5", "featuretype": "Categorical"}, ], - "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'], - ['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'], + "data": [['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'], ['01/01/2002', 4.7, 3.2, 1.3, 0.2, 'Iris-setosa'], ['01/01/2003', 4.6, 3.1, 1.5, 0.2, 'Iris-setosa']], "filename": "iris.data", - "total": 4 + "total": 3 } self.assertIn("name", result) @@ -198,10 +197,10 @@ def test_get_dataset(self): {"name": "col4", "featuretype": "Numerical"}, {"name": "col5", "featuretype": "Categorical"}, ], - "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'], - ['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa']], + "data": [['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'], + ['01/01/2002', 4.7, 3.2, 1.3, 0.2, 'Iris-setosa']], "filename": "iris.data", - "total": 4 + "total": 3 } del result["name"] self.assertDictEqual(expected, result) @@ -218,11 +217,11 @@ def test_get_dataset(self): {"name": "col4", "featuretype": "Numerical"}, {"name": "col5", "featuretype": "Categorical"}, ], - "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'], - ['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'], - ['01/01/2002', 4.7, 3.2, 1.3, 0.2, 'Iris-setosa']], + "data": [['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'], + ['01/01/2002', 4.7, 3.2, 1.3, 0.2, 'Iris-setosa'], + ['01/01/2003', 4.6, 3.1, 1.5, 0.2, 'Iris-setosa']], "filename": "iris.data", - "total": 4 + "total": 3 } del result["name"] self.assertDictEqual(expected, result) @@ -230,7 +229,6 @@ def test_get_dataset(self): rv = TEST_CLIENT.get("/datasets/iris.data?page=15&page_size=2") result = rv.json() - #expected = {"filename": "iris.data", "name": "iris.data"} expected = {'message': 'The specified page does not exist'} self.assertDictEqual(expected, result) self.assertEqual(rv.status_code, 404) @@ -263,12 +261,11 @@ def test_get_dataset(self): {"name": "col4", "featuretype": "Numerical"}, {"name": "col5", "featuretype": "Categorical"}, ], - "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'], - ['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'], + "data": [['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'], ['01/01/2002', 4.7, 3.2, 1.3, 0.2, 'Iris-setosa'], ['01/01/2003', 4.6, 3.1, 1.5, 0.2, 'Iris-setosa']], "filename": "iris.data", - "total": 4 + "total": 3 } # name is machine-generated # we assert it exists, but we don't check its value @@ -288,10 +285,10 @@ def test_get_dataset(self): {"name": "col4", "featuretype": "Numerical"}, {"name": "col5", "featuretype": "Categorical"}, ], - "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'], - ['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa']], + "data": [['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'], + ['01/01/2002', 4.7, 3.2, 1.3, 0.2, 'Iris-setosa']], "filename": "iris.data", - "total": 4 + "total": 3 } # name is machine-generated # we assert it exists, but we don't check its value @@ -435,13 +432,12 @@ def test_patch_dataset(self): {"name": "col4", "featuretype": "Numerical"}, {"name": "col5", "featuretype": "Categorical"}, ], - "data": [['01/01/2000', 5.1, 3.5, 1.4, 0.2, 'Iris-setosa'], - ['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'], + "data": [['01/01/2001', 4.9, 3.0, 1.4, 0.2, 'Iris-setosa'], ['01/01/2002', 4.7, 3.2, 1.3, 0.2, 'Iris-setosa'], ['01/01/2003', 4.6, 3.1, 1.5, 0.2, 'Iris-setosa']], "filename": "iris.data", "name": name, - "total": 4 + "total": 3 } self.assertDictEqual(expected, result) self.assertEqual(rv.status_code, 200) From 9aecde9c2d277b001134cb8d311f41ab6bb602f0 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 9 Aug 2021 21:30:38 -0300 Subject: [PATCH 13/16] Changing the create_dataset function in datasets.py. --- datasets/datasets.py | 1 - 1 file changed, 1 deletion(-) diff --git a/datasets/datasets.py b/datasets/datasets.py index 15c2dd7..b055ae0 100644 --- a/datasets/datasets.py +++ b/datasets/datasets.py @@ -58,7 +58,6 @@ def create_dataset(file_object): """ if isinstance(file_object, dict): file = file_object["file"] - print(file) filename = file.filename else: file = file_object.file From 62d4490034e6302ad9a986fb866f898c77ea5f54 Mon Sep 17 00:00:00 2001 From: Fabio Beranizo Date: Wed, 11 Aug 2021 13:22:23 -0300 Subject: [PATCH 14/16] Replaces file.seek by pdread.seek --- datasets/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/datasets.py b/datasets/datasets.py index b055ae0..e935a80 100644 --- a/datasets/datasets.py +++ b/datasets/datasets.py @@ -346,7 +346,7 @@ def read_into_dataframe(file, filename=None, nrows=100, max_characters=50): header = "infer" if final_checker else None prefix = None if header else "col" - file.seek(0, SEEK_SET) + pdread.seek(0, SEEK_SET) df = pd.read_csv( pdread, encoding=encoding, From b541438e8536e999d90065026a6db40dfe9079e2 Mon Sep 17 00:00:00 2001 From: Fabio Beranizo Date: Wed, 11 Aug 2021 13:29:43 -0300 Subject: [PATCH 15/16] Removes unnecessary file.read() Prevents memory to fill. --- datasets/datasets.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datasets/datasets.py b/datasets/datasets.py index e935a80..83f1c1e 100644 --- a/datasets/datasets.py +++ b/datasets/datasets.py @@ -8,7 +8,6 @@ import numpy as np import pandas as pd -import datasets.monkeypatch import platiagro from chardet.universaldetector import UniversalDetector from googleapiclient.discovery import build @@ -17,6 +16,8 @@ from pandas.io.common import infer_compression from platiagro import load_dataset, save_dataset, stat_dataset, update_dataset_metadata from platiagro.featuretypes import infer_featuretypes, validate_featuretypes + +from datasets import monkeypatch # noqa: F401 from datasets.exceptions import BadRequest, NotFound from datasets.utils import data_pagination @@ -91,9 +92,8 @@ def create_dataset(file_object): } file.seek(0, SEEK_SET) - contents = BytesIO(file.read()) # uses PlatIAgro SDK to save the dataset - save_dataset(name, contents, metadata=metadata) + save_dataset(name, file, metadata=metadata) columns = [{"name": col, "featuretype": ftype} for col, ftype in zip(columns, featuretypes)] From 0eeaaa2ebfa2236e1bff0ab18123103ced91cc10 Mon Sep 17 00:00:00 2001 From: Fabio Beranizo Date: Wed, 11 Aug 2021 14:46:04 -0300 Subject: [PATCH 16/16] Replaces BytesIO by a SpooledTemporaryFile It's a better option to avoid filling the memory. --- datasets/datasets.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/datasets/datasets.py b/datasets/datasets.py index 83f1c1e..4b44a9c 100644 --- a/datasets/datasets.py +++ b/datasets/datasets.py @@ -1,8 +1,9 @@ # -*- coding: utf-8 -*- import json -from io import BytesIO, TextIOWrapper +from io import TextIOWrapper from os import SEEK_SET from os.path import splitext +from tempfile import SpooledTemporaryFile from unicodedata import normalize from uuid import uuid4 @@ -23,6 +24,7 @@ from datasets.utils import data_pagination NOT_FOUND = NotFound("The specified dataset does not exist") +SPOOLED_MAX_SIZE = 1024 * 1024 # 1MB def list_datasets(): @@ -153,7 +155,7 @@ def create_google_drive_dataset(gfile): else: request = service.files().get_media(fileId=file_id) - fh = BytesIO() + fh = SpooledTemporaryFile(max_size=SPOOLED_MAX_SIZE) downloader = MediaIoBaseDownload(fh, request) done = False try: