Skip to content
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
58da78f
Implement distributed groupby sum and apply_concat_apply function for…
kayibal Mar 7, 2018
51ae0a2
Merge branch 'master' into distributed-groupby-sum
kayibal Mar 7, 2018
3f3be49
update comments
kayibal Mar 9, 2018
63d4dc8
add test for different index datatypes
kayibal Mar 9, 2018
df110c9
implement sort_index
kayibal Mar 31, 2018
49cb182
implement __len__
kayibal Apr 1, 2018
fd10c00
implement rename, optimize groupby_sum and join
kayibal Apr 1, 2018
b89f7d4
implement distributed set_index
kayibal Apr 1, 2018
bb818ff
Merge branch 'optimization/distributed' into develop
kayibal Apr 1, 2018
f942a91
Merge branch 'master' into refactor/binning
kayibal Oct 8, 2018
d2b68e3
Formatting
kayibal Oct 8, 2018
2d2ab88
fix dask version until we have an update
kayibal Oct 8, 2018
8fc32b8
number of lines output in __repr__ changed.
kayibal Oct 8, 2018
47782ed
stop checking for number of lines as it adjusts to terminal size now...
kayibal Oct 8, 2018
9c352f9
Create folders when writing to local filesystem
kayibal Oct 9, 2018
cd2ce7b
Merge branch 'master' into refactor/binning
kayibal Oct 17, 2018
fcdd167
Fix empty dtype
kayibal Oct 10, 2018
18469ff
Implement distributed drop.
kayibal Oct 10, 2018
4876190
Always add npz extension when writing SparseFrame to npz format
kayibal Oct 10, 2018
4c18873
Fix metadata handling on set_index method
kayibal Oct 18, 2018
80aef8e
Correct dask requirement
kayibal Oct 22, 2018
414c69a
Add method for dask SparseFrame and tuple divisions type
kayibal Oct 29, 2018
f0aa2d8
Support empty divisions
kayibal Oct 30, 2018
3329492
Pass on divisions on sort_index
kayibal Oct 31, 2018
3223850
More restrictive pandas version as .drop method fails with pandas==0.…
kayibal Oct 31, 2018
a603d3d
Fix bug where empty dataframe would create wrongly sized shuffle array
kayibal Nov 1, 2018
92a8f5d
Fix bug where join with in memory sparse frame would return rows from…
kayibal Nov 15, 2018
c6fa6d0
update dask version
kayibal Dec 4, 2018
d60b758
Update dask version in setup.py
kayibal Dec 4, 2018
031d965
Update deprecated set_options call
kayibal Dec 4, 2018
734ff26
some fixes to tests
kayibal Dec 4, 2018
fbf0b17
Fix moto and boto versions
kayibal Dec 4, 2018
b836024
Update test dependencies
kayibal Dec 4, 2018
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,6 @@ jobs:
- image: drtools/dask:latest
steps:
- checkout
- run: pip install pytest pytest-cov moto .
- run: pip install pytest pytest-cov moto dask==0.19.2 .
- run: py.test --cov sparsity --cov-report xml sparsity
- run: bash <(curl -s https://codecov.io/bash)
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
'scipy>=0.18.1',
'numpy>=1.12.0',
's3fs>=0.1.0',
'dask<=0.19.2'
],
test_requires=[
'moto',
Expand Down
12 changes: 11 additions & 1 deletion sparsity/dask/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,19 +36,23 @@ def _make_meta(inp):
return sp.SparseFrame(meta)
return meta


def _meta_nonempty(x):
    """Build a placeholder SparseFrame shaped after ``x``.

    The index comes from ``_nonempty_index(x.index)``; the columns are
    taken from ``x`` unchanged. The backing matrix is a CSR matrix of
    the matching shape with no stored values.
    """
    placeholder_index = _nonempty_index(x.index)
    shape = (len(placeholder_index), len(x.columns))
    data = sparse.csr_matrix(shape)
    return sp.SparseFrame(data, index=placeholder_index, columns=x.columns)


def optimize(dsk, keys, **kwargs):
    """Return ``dsk`` reduced by ``cull`` to what ``keys`` require.

    ``cull`` returns a pair; only its first element (the graph) is
    returned. Extra keyword arguments are accepted but not used.
    """
    culled_graph, _unused = cull(dsk, keys)
    return culled_graph


def finalize(results):
    """Stack the computed partitions into a single SparseFrame.

    Partitions whose ``empty`` attribute is truthy are dropped before
    the remaining ones are handed to ``sp.SparseFrame.vstack``.
    """
    non_empty = []
    for part in results:
        if part.empty:
            continue
        non_empty.append(part)
    return sp.SparseFrame.vstack(non_empty)


class SparseFrame(dask.base.DaskMethodsMixin):

def __init__(self, dsk, name, meta, divisions=None):
Expand Down Expand Up @@ -172,7 +176,7 @@ def join(self, other, on=None, how='left', lsuffix='',
rsuffix='', npartitions=None):
from .multi import join_indexed_sparseframes

if isinstance(other, sp.SparseFrame) and how in ['left', 'inner']:
if isinstance(other, sp.SparseFrame):
meta = sp.SparseFrame.join(self._meta_nonempty,
other,
how=how)
Expand Down Expand Up @@ -261,6 +265,12 @@ def set_index(self, column=None, idx=None, level=None):
res.divisions = [None] * ( self.npartitions + 1)
return res

def rename(self, columns):
    """Rename columns on every partition.

    The metadata frame is renamed first so the result carries the new
    column labels; the same ``columns`` argument is then forwarded to
    ``sp.SparseFrame.rename`` for each partition via ``map_partitions``.
    """
    # TODO: add test
    renamed_meta = self._meta.rename(columns=columns)
    return self.map_partitions(
        sp.SparseFrame.rename,
        meta=renamed_meta,
        columns=columns,
    )

def __repr__(self):
return \
"""
Expand Down
3 changes: 2 additions & 1 deletion sparsity/io.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from io import BytesIO
from pathlib import PurePath
from pathlib import PurePath, Path
from urllib.parse import urlparse

import numpy as np
Expand Down Expand Up @@ -67,6 +67,7 @@ def _write_dict_npz(data, filename, block_size, storage_options):
filename = path2str(filename)
protocol = urlparse(filename).scheme or 'file'
if protocol == 'file':
Path(filename).parent.mkdir(parents=True, exist_ok=True)
with open(filename, 'wb') as fp:
np.savez(fp, **data)
else:
Expand Down
1 change: 0 additions & 1 deletion sparsity/test/test_dask_sparse_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
import pytest
import sparsity as sp
import sparsity.dask as dsp
from dask.local import get_sync
from sparsity.dask.reshape import one_hot_encode
import pandas.util.testing as pdt

Expand Down
2 changes: 0 additions & 2 deletions sparsity/test/test_sparse_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -705,12 +705,10 @@ def test_repr():
assert isinstance(res, str)
assert '10x10000' in res
assert '0 stored' in res
assert len(res.splitlines()) == 1 + 5 + 2
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why did the lines number change? I can't see any change that you made that should affect it...

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's related to pandas version...

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, I hope our __repr__ still makes sense then...

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think only the number of lines changes, depending on your terminal size? Don't know...


sf = SparseFrame(sparse.csr_matrix((10000, 10000)))
res = sf.__repr__()
assert isinstance(res, str)
assert len(res.splitlines()) == 1 + 5 + 2

sf = SparseFrame(np.array([]), index=[], columns=['A', 'B'])
res = sf.__repr__()
Expand Down