Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 18 additions & 11 deletions sparsity/sparse_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,10 @@ def _append_zero_row(csr):
)


def _batch_get_loc(index: pd.Index, keys: list):
    """Look up the location of several keys in an index.

    Parameters
    ----------
    index: pd.Index
        Index that is searched.
    keys: list
        Labels to look up; ``index.get_loc`` is called once per label.

    Returns
    -------
    locs: list
        The ``index.get_loc`` results, in the same order as ``keys``.
    """
    return [index.get_loc(key) for key in keys]


class SparseFrame(object):
"""
Simple sparse table based on scipy.sparse.csr_matrix
Expand Down Expand Up @@ -282,14 +286,15 @@ def groupby(self, by=None, level=0):
return self.groupby_sum(by, level)

def groupby_agg(self, by=None, level=None, agg_func=None):
    """Group the rows and aggregate each group with ``agg_func``.

    Parameters
    ----------
    by: optional
        Grouping key, forwarded to ``_get_groupby_col``.
    level: optional
        Index level to group by, forwarded to ``_get_groupby_col``.
    agg_func: callable
        Called with the sparse rows that belong to one group; its
        result is stored as that group's row in the output.

    Returns
    -------
    df: SparseFrame
        One row per group, indexed by the group label and restricted
        to the columns reported by ``_get_groupby_col``.
    """
    by, cols = self._get_groupby_col(by, level)
    # Group row *positions* (not labels) so they can slice the matrix.
    groups = pd.Index(np.arange(self.shape[0])).groupby(by)
    res = sparse.csr_matrix((len(groups), self.shape[1]))
    new_idx = []
    for i, (name, positions) in enumerate(groups.items()):
        # The group label becomes the index entry of the result row.
        new_idx.append(name)
        res[i] = agg_func(self.data[positions.values, :])
    # Keep the column labels so the grouping column can be dropped by name.
    res = SparseFrame(res, index=new_idx, columns=self.columns)
    return res[cols]

def groupby_sum(self, by=None, level=0):
"""
Expand All @@ -310,34 +315,36 @@ def groupby_sum(self, by=None, level=0):
df: sparsity.SparseFrame
Grouped by and summed SparseFrame.
"""
by = self._get_groupby_col(by, level)
by, cols = self._get_groupby_col(by, level)
group_idx = by.argsort()
gm = _create_group_matrix(by[group_idx])
grouped_data = self._data[group_idx, :].T.dot(gm).T
return SparseFrame(grouped_data, index=np.unique(by), columns=self._columns)
res = SparseFrame(grouped_data, index=np.unique(by),
columns=self._columns)
return res[cols]


def _get_groupby_col(self, by, level):
    """Resolve the grouping key and the columns left after grouping.

    Parameters
    ----------
    by: optional
        Either a column label of this frame or a sequence holding one
        group label per row.
    level: int or None
        Index level to group by. Only consulted when ``by`` is None.

    Returns
    -------
    by:
        Group labels: the selected column, the given sequence as an
        array, or the values of the requested index level.
    other_cols: list
        Column labels of this frame, without ``by`` when ``by`` names
        one of the columns.

    Raises
    ------
    ValueError
        If both ``by`` and ``level`` are None, or if ``level`` > 0 is
        requested on a frame whose index is not a MultiIndex.
    """
    if by is None and level is None:
        raise ValueError("You have to supply one of 'by' and 'level'")
    other_cols = self._columns.tolist()
    if by is not None:
        try:
            if by in self._columns:
                # Grouping by a column: drop it from the result columns.
                other_cols.remove(by)
                by = self[by].toarray()
        except TypeError:
            # Unhashable ``by`` (e.g. a list): treat it as row-wise labels.
            assert len(by) == self.data.shape[0]
            by = np.array(by)
    else:
        if level and isinstance(self._index, pd.MultiIndex):
            by = self.index.get_level_values(level).values
        elif level > 0:
            raise ValueError(
                "Cannot use level > 0 in a non-MultiIndex Frame.")
        else:  # level == 0
            # NOTE(review): level 0 of a MultiIndex also lands here and
            # yields the full index entries -- confirm this is intended.
            by = np.asarray(self._index)
    return by, other_cols

def join(self, other, axis=1, how='outer', level=None):
"""
Expand Down
26 changes: 24 additions & 2 deletions sparsity/test/test_sparse_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -745,8 +745,30 @@ def test_groupby_agg(groupby_frame):
res = groupby_frame.groupby_agg(
level=0,
agg_func=lambda x: x.mean(axis=0)
).data.todense()
assert np.all(res.round() == np.identity(10))
)
assert np.all(res.data.todense().round() == np.identity(10))

assert np.all(res.columns == groupby_frame.columns)
assert np.all(res.index == groupby_frame.index.unique().sort_values())


def test_groupby_agg_multiindex():
    """groupby_agg labels its result like pandas for a level and a column."""
    df = pd.DataFrame({'X': [1, 1, 1, 0],
                       'Y': [0, 1, 0, 1],
                       'gr': ['a', 'a', 'b', 'b'],
                       'day': [10, 11, 11, 12]})
    df = df.set_index(['day', 'gr'])
    sf = SparseFrame(df)

    # Grouping by the second index level.
    correct = df.groupby(level=1).mean()
    res = sf.groupby_agg(level=1, agg_func=lambda x: x.mean(axis=0))
    assert np.all(res.index == correct.index)
    assert np.all(res.columns == correct.columns)

    # Grouping by a column; compare labels against the pandas result.
    correct = df.groupby(by='Y').mean()
    res = sf.groupby_agg(by='Y', agg_func=lambda x: x.mean(axis=0))
    assert np.all(res.index == correct.index)
    assert np.all(res.columns == correct.columns)


def test_init_with_pandas():
Expand Down