-
Notifications
You must be signed in to change notification settings - Fork 3
Update indexer instantiation. Allow loc from index with duplicates. #46
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 10 commits
21b6011
a610018
4be9628
6ec1810
9d74e6c
c9329fb
aa4bc8f
ecf3762
74075ca
f1866f1
c854ce6
c21824a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,11 @@ | ||
| version: 2 | ||
| jobs: | ||
| build: | ||
| working_directory: ~/sparsity | ||
| docker: | ||
| - image: drtools/dask:latest | ||
| steps: | ||
| - checkout | ||
| - run: pip install pytest pytest-cov moto . | ||
| - run: py.test --cov sparsity --cov-report xml sparsity | ||
| - run: bash <(curl -s https://codecov.io/bash) |
This file was deleted.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,5 @@ | ||
| # coding=utf-8 | ||
| import functools | ||
| import traceback | ||
| import uuid | ||
| import warnings | ||
|
|
@@ -22,7 +23,8 @@ | |
| trail_db = True | ||
| except: | ||
| trail_db = False | ||
| from sparsity.indexing import _CsrILocationIndexer, _CsrLocIndexer | ||
| from sparsity.indexing import _CsrILocationIndexer, _CsrLocIndexer, \ | ||
| get_indexers_list | ||
|
|
||
|
|
||
| def _is_empty(data): | ||
|
|
@@ -45,9 +47,6 @@ class SparseFrame(object): | |
| Simple sparse table based on scipy.sparse.csr_matrix | ||
| """ | ||
|
|
||
| __slots__ = ["_index", "_columns", "_data", "shape", | ||
| 'ndim', 'iloc', 'loc', 'empty'] | ||
|
|
||
| def __init__(self, data, index=None, columns=None, **kwargs): | ||
| if len(data.shape) > 2: | ||
| raise ValueError("Only two dimensional data supported") | ||
|
|
@@ -86,8 +85,17 @@ def __init__(self, data, index=None, columns=None, **kwargs): | |
|
|
||
| # register indexers | ||
| self.ndim = 2 | ||
| self.iloc = _CsrILocationIndexer(self, 'iloc') | ||
| self.loc = _CsrLocIndexer(self, 'loc') | ||
|
|
||
| @classmethod | ||
| def _create_indexer(cls, name, indexer): | ||
| """Create an indexer like _name in the class.""" | ||
| if getattr(cls, name, None) is None: | ||
| _v = int(pd.__version__.split('.')[1]) | ||
| if _v >= 23: | ||
| _indexer = functools.partial(indexer, name) | ||
| else: | ||
| _indexer = functools.partial(indexer, name=name) | ||
| setattr(cls, name, property(_indexer, doc=indexer.__doc__)) | ||
|
|
||
| def _init_values(self, data, kwargs): | ||
| if isinstance(data, pd.DataFrame): | ||
|
|
@@ -219,10 +227,21 @@ def _take(self, *args, **kwargs): | |
| """ | ||
| return self.take(*args, **kwargs) | ||
|
|
||
| def _xs(self, key, *args, **kwargs): | ||
| def _xs(self, key, *args, axis=0, **kwargs): | ||
| """Used for label based indexing.""" | ||
| loc = self.index.get_loc(key) | ||
| return SparseFrame(self.data[loc], index=[key], columns=self.columns) | ||
| if axis == 0: | ||
| loc = self.index.get_loc(key) | ||
| new_data = self.data[loc] | ||
| return SparseFrame(new_data, | ||
| index=[key] * new_data.shape[0], | ||
| columns=self.columns) | ||
| else: | ||
| loc = self.columns.get_loc(key) | ||
| new_data = self.data[:, loc] | ||
| return SparseFrame(new_data, | ||
| columns=[key] * new_data.shape[1], | ||
| index=self.index) | ||
|
|
||
|
|
||
| @property | ||
| def index(self): | ||
|
|
@@ -558,7 +577,7 @@ def drop(self, labels, axis=0): | |
| labels = [labels] | ||
| if axis == 1: | ||
| mask = np.logical_not(self.columns.isin(labels)) | ||
| sf = self[self.columns[mask].tolist()] | ||
| sf = self.loc[:, self.columns[mask].tolist()] | ||
| else: | ||
| raise NotImplementedError | ||
| return sf | ||
|
|
@@ -572,7 +591,10 @@ def drop_duplicate_idx(self, **kwargs): | |
| def __getitem__(self, item): | ||
| if not isinstance(item, (tuple, list)): | ||
| item = [item] | ||
| return self.reindex_axis(item, axis=1) | ||
| if item is not None and len(item) > 0: | ||
| return self.reindex_axis(item, axis=1) | ||
| else: | ||
| return self | ||
|
||
|
|
||
| def dropna(self): | ||
| """Drop nans from index.""" | ||
|
|
@@ -609,7 +631,7 @@ def set_index(self, column=None, idx=None, level=None, inplace=False): | |
| isinstance(self._index, pd.MultiIndex): | ||
| new_idx = self.index.get_level_values(level) | ||
| elif column is not None: | ||
| new_idx = np.asarray(self[column].data.todense()).reshape(-1) | ||
| new_idx = np.asarray(self.loc[:, column].data.todense()).reshape(-1) | ||
|
|
||
| if inplace: | ||
| self._index = _ensure_index(new_idx) | ||
|
|
@@ -647,6 +669,30 @@ def _get_axis_name(self, axis): | |
| raise ValueError('No axis named {} for {}' | ||
| .format(axis, self.__class__)) | ||
|
|
||
| def _reindex_with_indexers(self, reindexers, **kwargs): | ||
| """allow_dups indicates an internal call here """ | ||
|
|
||
| # reindex doing multiple operations on different axes if indicated | ||
| new_data = self.copy() | ||
| for axis in sorted(reindexers.keys()): | ||
| index, indexer = reindexers[axis] | ||
|
|
||
| if index is None: | ||
| continue | ||
|
|
||
| if axis == 0: | ||
| new_mat = new_data.data[indexer, :] | ||
| new_data = SparseFrame(new_mat, index=index, | ||
| columns=self.columns) | ||
| elif axis == 1: | ||
| new_mat = new_data.data[:, indexer] | ||
| new_data = SparseFrame(new_mat, columns=index, | ||
| index=self.index) | ||
| else: | ||
| raise ValueError('Only supported axes are 0 and 1.') | ||
|
||
|
|
||
| return new_data | ||
|
|
||
| def reindex(self, labels=None, index=None, columns=None, axis=None, | ||
| *args, **kwargs): | ||
| """Conform SparseFrame to new index. | ||
|
|
@@ -923,3 +969,6 @@ def _check_categories_order(categories1, categories2, categorical_column_name, | |
| mismatch_type=mismatch_type | ||
| ) | ||
| ) | ||
|
|
||
| for _name, _indexer in get_indexers_list(): | ||
| SparseFrame._create_indexer(_name, _indexer) | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -10,7 +10,6 @@ | |
| import sparsity as sp | ||
| import sparsity.dask as dsp | ||
| from dask.local import get_sync | ||
| from sparsity import sparse_one_hot | ||
| from sparsity.dask.reshape import one_hot_encode | ||
| import pandas.util.testing as pdt | ||
|
|
||
|
|
@@ -63,29 +62,23 @@ def test_loc(iindexer, correct_shape): | |
| assert res.shape == correct_shape | ||
|
|
||
| def test_dask_loc(clickstream): | ||
| sf = dd.from_pandas(clickstream, npartitions=10) \ | ||
| .map_partitions( | ||
| sparse_one_hot, | ||
| categories={'page_id': list('ABCDE')}, | ||
| meta=list | ||
| ) | ||
|
|
||
| sf = one_hot_encode(dd.from_pandas(clickstream, npartitions=10), | ||
| categories={'page_id': list('ABCDE'), | ||
| 'other_categorical': list('FGHIJ')}, | ||
| index_col=['index', 'id']) | ||
| res = sf.loc['2016-01-15':'2016-02-15'] | ||
| res = sp.SparseFrame.concat(res.compute(get=get_sync).tolist()) | ||
| assert res.index.date.max() == dt.date(2016, 2, 15) | ||
| assert res.index.date.min() == dt.date(2016, 1, 15) | ||
| res = res.compute(get=get_sync) | ||
|
||
| assert res.index.levels[0].max().date() == dt.date(2016, 2, 15) | ||
| assert res.index.levels[0].min().date() == dt.date(2016, 1, 15) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. No need to use datetime if you already need/have pandas, but that's so lightweight I don't think it justifies changing it. |
||
|
|
||
|
|
||
| def test_dask_multi_index_loc(clickstream): | ||
| sf = dd.from_pandas(clickstream, npartitions=10) \ | ||
| .map_partitions( | ||
| sparse_one_hot, | ||
| index_col=['index', 'id'], | ||
| categories={'page_id': list('ABCDE')}, | ||
| meta=list | ||
| ) | ||
| sf = one_hot_encode(dd.from_pandas(clickstream, npartitions=10), | ||
| categories={'page_id': list('ABCDE'), | ||
| 'other_categorical': list('FGHIJ')}, | ||
| index_col=['index', 'id']) | ||
| res = sf.loc['2016-01-15':'2016-02-15'] | ||
| res = sp.SparseFrame.vstack(res.compute(get=get_sync).tolist()) | ||
| res = res.compute(get=get_sync) | ||
| assert res.index.get_level_values(0).date.min() == dt.date(2016, 1, 15) | ||
| assert res.index.get_level_values(0).date.max() == dt.date(2016, 2, 15) | ||
|
|
||
|
|
@@ -234,7 +227,8 @@ def test_assign_column(): | |
| dsf = dsf.assign(new=ds) | ||
| assert dsf._meta.empty | ||
| sf = dsf.compute() | ||
| assert np.all(sf.todense() == f.assign(new=s)) | ||
| assert np.all((sf.todense() == f.assign(new=s)).values) | ||
|
|
||
|
|
||
| @pytest.mark.parametrize('arg_dict', [ | ||
| dict(divisions=[0, 30, 50, 70, 99]), | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -202,8 +202,8 @@ def test_loc_multi_index(sf_midx, sf_midx_int): | |
| assert np.all(sf_midx.loc[dt_slice].data.todense() == | ||
| np.identity(5)[:3]) | ||
|
|
||
| assert np.all(sf_midx_int.loc[1].todense() == sf_midx.data[:4,:]) | ||
| assert np.all(sf_midx_int.loc[0].todense() == sf_midx.data[4, :]) | ||
| assert np.all(sf_midx_int.loc[1].todense().values == sf_midx.data[:4,:]) | ||
| assert np.all(sf_midx_int.loc[0].todense().values == sf_midx.data[4, :]) | ||
|
|
||
|
|
||
| def test_set_index(sf_midx): | ||
|
|
@@ -619,9 +619,11 @@ def test_npz_io_s3(complex_example): | |
|
|
||
|
|
||
| def test_getitem(): | ||
| sf = SparseFrame(np.identity(10), columns=list('abcdefghij')) | ||
| id_ = np.identity(10) | ||
| sf = SparseFrame(id_, columns=list('abcdefghij')) | ||
| assert sf['a'].data.todense()[0] == 1 | ||
| assert sf['j'].data.todense()[9] == 1 | ||
| assert np.all(sf[['a', 'b']].data.todense() == np.asmatrix(id_[:, [0, 1]])) | ||
| tmp = sf[['j', 'a']].data.todense() | ||
| assert tmp[9, 0] == 1 | ||
| assert tmp[0, 1] == 1 | ||
|
|
@@ -895,4 +897,14 @@ def test_empty_elemwise(): | |
| assert np.all(res == sf.data.todense()) | ||
|
|
||
| with pytest.raises(ValueError): | ||
| res = sf.add(sf_empty, fill_value=None) | ||
| res = sf.add(sf_empty, fill_value=None) | ||
|
|
||
|
|
||
| def test_loc_duplicate_index(): | ||
| sf = SparseFrame(np.identity(5), | ||
| columns=list('UUXYZ'), | ||
| index=list('AAABB')) | ||
| assert len(sf.loc['A'].index) == 3 | ||
| assert len(sf.loc['B'].index) == 2 | ||
| assert np.all(sf.loc['A'].todense().values == np.identity(5)[:3]) | ||
| assert np.all(sf.loc['B'].todense().values == np.identity(5)[3:]) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It looks like you wanted to test columns too, but didn't do it. |
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am okay with this for now, but it would be prettier to do it like this: