-
Notifications
You must be signed in to change notification settings - Fork 3
Update indexer instantiation. Allow loc from index with duplicates. #46
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 11 commits
21b6011
a610018
4be9628
6ec1810
9d74e6c
c9329fb
aa4bc8f
ecf3762
74075ca
f1866f1
c854ce6
c21824a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,11 @@ | ||
| version: 2 | ||
| jobs: | ||
| build: | ||
| working_directory: ~/sparsity | ||
| docker: | ||
| - image: drtools/dask:latest | ||
| steps: | ||
| - checkout | ||
| - run: pip install pytest pytest-cov moto . | ||
| - run: py.test --cov sparsity --cov-report xml sparsity | ||
| - run: bash <(curl -s https://codecov.io/bash) |
This file was deleted.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -10,7 +10,6 @@ | |
| import sparsity as sp | ||
| import sparsity.dask as dsp | ||
| from dask.local import get_sync | ||
| from sparsity import sparse_one_hot | ||
| from sparsity.dask.reshape import one_hot_encode | ||
| import pandas.util.testing as pdt | ||
|
|
||
|
|
@@ -63,29 +62,23 @@ def test_loc(iindexer, correct_shape): | |
| assert res.shape == correct_shape | ||
|
|
||
| def test_dask_loc(clickstream): | ||
| sf = dd.from_pandas(clickstream, npartitions=10) \ | ||
| .map_partitions( | ||
| sparse_one_hot, | ||
| categories={'page_id': list('ABCDE')}, | ||
| meta=list | ||
| ) | ||
|
|
||
| sf = one_hot_encode(dd.from_pandas(clickstream, npartitions=10), | ||
| categories={'page_id': list('ABCDE'), | ||
| 'other_categorical': list('FGHIJ')}, | ||
| index_col=['index', 'id']) | ||
| res = sf.loc['2016-01-15':'2016-02-15'] | ||
| res = sp.SparseFrame.concat(res.compute(get=get_sync).tolist()) | ||
| assert res.index.date.max() == dt.date(2016, 2, 15) | ||
| assert res.index.date.min() == dt.date(2016, 1, 15) | ||
| res = res.compute() | ||
| assert res.index.levels[0].max().date() == dt.date(2016, 2, 15) | ||
| assert res.index.levels[0].min().date() == dt.date(2016, 1, 15) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

No need to use datetime if you already have pandas as a dependency — but datetime is so lightweight I don't think it justifies changing it. |
||
|
|
||
|
|
||
| def test_dask_multi_index_loc(clickstream): | ||
| sf = dd.from_pandas(clickstream, npartitions=10) \ | ||
| .map_partitions( | ||
| sparse_one_hot, | ||
| index_col=['index', 'id'], | ||
| categories={'page_id': list('ABCDE')}, | ||
| meta=list | ||
| ) | ||
| sf = one_hot_encode(dd.from_pandas(clickstream, npartitions=10), | ||
| categories={'page_id': list('ABCDE'), | ||
| 'other_categorical': list('FGHIJ')}, | ||
| index_col=['index', 'id']) | ||
| res = sf.loc['2016-01-15':'2016-02-15'] | ||
| res = sp.SparseFrame.vstack(res.compute(get=get_sync).tolist()) | ||
| res = res.compute() | ||
| assert res.index.get_level_values(0).date.min() == dt.date(2016, 1, 15) | ||
| assert res.index.get_level_values(0).date.max() == dt.date(2016, 2, 15) | ||
|
|
||
|
|
@@ -234,7 +227,8 @@ def test_assign_column(): | |
| dsf = dsf.assign(new=ds) | ||
| assert dsf._meta.empty | ||
| sf = dsf.compute() | ||
| assert np.all(sf.todense() == f.assign(new=s)) | ||
| assert np.all((sf.todense() == f.assign(new=s)).values) | ||
|
|
||
|
|
||
| @pytest.mark.parametrize('arg_dict', [ | ||
| dict(divisions=[0, 30, 50, 70, 99]), | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2,7 +2,6 @@ | |
| import datetime as dt | ||
| import os | ||
|
|
||
| #import dask.dataframe as dd | ||
| from contextlib import contextmanager | ||
|
|
||
| import numpy as np | ||
|
|
@@ -202,8 +201,8 @@ def test_loc_multi_index(sf_midx, sf_midx_int): | |
| assert np.all(sf_midx.loc[dt_slice].data.todense() == | ||
| np.identity(5)[:3]) | ||
|
|
||
| assert np.all(sf_midx_int.loc[1].todense() == sf_midx.data[:4,:]) | ||
| assert np.all(sf_midx_int.loc[0].todense() == sf_midx.data[4, :]) | ||
| assert np.all(sf_midx_int.loc[1].todense().values == sf_midx.data[:4,:]) | ||
| assert np.all(sf_midx_int.loc[0].todense().values == sf_midx.data[4, :]) | ||
|
|
||
|
|
||
| def test_set_index(sf_midx): | ||
|
|
@@ -619,13 +618,18 @@ def test_npz_io_s3(complex_example): | |
|
|
||
|
|
||
| def test_getitem(): | ||
| sf = SparseFrame(np.identity(10), columns=list('abcdefghij')) | ||
| id_ = np.identity(10) | ||
| sf = SparseFrame(id_, columns=list('abcdefghij')) | ||
| assert sf['a'].data.todense()[0] == 1 | ||
| assert sf['j'].data.todense()[9] == 1 | ||
| assert np.all(sf[['a', 'b']].data.todense() == np.asmatrix(id_[:, [0, 1]])) | ||
| tmp = sf[['j', 'a']].data.todense() | ||
| assert tmp[9, 0] == 1 | ||
| assert tmp[0, 1] == 1 | ||
| assert (sf[list('abcdefghij')].data.todense() == np.identity(10)).all() | ||
| assert sf[[]].shape == (10, 0) | ||
| assert len(sf[[]].columns) == 0 | ||
| assert isinstance(sf.columns, type(sf[[]].columns)) | ||
|
|
||
|
|
||
| def test_vstack(): | ||
|
|
@@ -895,4 +899,17 @@ def test_empty_elemwise(): | |
| assert np.all(res == sf.data.todense()) | ||
|
|
||
| with pytest.raises(ValueError): | ||
| res = sf.add(sf_empty, fill_value=None) | ||
| res = sf.add(sf_empty, fill_value=None) | ||
|
|
||
|
|
||
| def test_loc_duplicate_index(): | ||
| sf = SparseFrame(np.identity(5), | ||
| columns=list('UUXYZ'), | ||
| index=list('AAABB')) | ||
| assert len(sf.loc['A'].index) == 3 | ||
| assert len(sf.loc['B'].index) == 2 | ||
| assert np.all(sf.loc['A'].todense().values == np.identity(5)[:3]) | ||
| assert np.all(sf.loc['B'].todense().values == np.identity(5)[3:]) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It looks like you wanted to test columns too, but didn't do it. |
||
|
|
||
| assert len(sf.loc[:, 'U'].columns) == 2 | ||
| assert np.all(sf.loc[:, 'U'].todense().values == np.identity(5)[:, :2]) | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@kayibal Previously you took care of the situation when
`None` is passed; now you don't. I'm not sure if handling it is necessary — just wanted to drop a hint.

There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It should raise an error
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Now it's great.