Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 14 additions & 11 deletions sparsity/sparse_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,14 +282,15 @@ def groupby(self, by=None, level=0):
return self.groupby_sum(by, level)

def groupby_agg(self, by=None, level=None, agg_func=None):
    """
    Aggregate the rows of each group with a user supplied function.

    Parameters
    ----------
    by: hashable or array-like, optional
        Column label or per-row grouping values; forwarded to
        ``_get_groupby_col``.
    level: int or str, optional
        Index level to group on when ``by`` is not given; forwarded to
        ``_get_groupby_col``.
    agg_func: callable
        Called once per group with ``self.data[rows, :]`` (the rows that
        belong to the group). Its return value is assigned to one row of
        the result matrix, so it has to reduce the group to a single row,
        e.g. ``lambda x: x.mean(axis=0)``.

    Returns
    -------
    df: sparsity.SparseFrame
        One row per group, indexed by the group keys. A column that was
        used as the grouping key is not part of the result.

    Raises
    ------
    TypeError
        If ``agg_func`` is not callable.
    """
    # Fail early with a readable message instead of the opaque
    # "'NoneType' object is not callable" raised from inside the loop.
    if not callable(agg_func):
        raise TypeError(
            "agg_func must be a callable, got {!r}".format(agg_func))

    by, cols = self._get_groupby_col(by, level)
    # Map every group key to the positional row numbers of its members.
    groups = pd.Index(np.arange(self.shape[0])).groupby(by)
    res = sparse.csr_matrix((len(groups), self.shape[1]))
    new_idx = []
    for i, (name, indizes) in enumerate(groups.items()):
        # Label each output row with its group key (exactly one label per
        # group, so the index length matches the number of result rows).
        new_idx.append(name)
        res[i] = agg_func(self.data[indizes.values, :])
    res = SparseFrame(res, index=new_idx, columns=self.columns)
    # Drop the grouping column (if `by` named one) from the output.
    return res[cols]

def groupby_sum(self, by=None, level=0):
"""
Expand All @@ -310,34 +311,36 @@ def groupby_sum(self, by=None, level=0):
df: sparsity.SparseFrame
Grouped by and summed SparseFrame.
"""
by = self._get_groupby_col(by, level)
by, cols = self._get_groupby_col(by, level)
group_idx = by.argsort()
gm = _create_group_matrix(by[group_idx])
grouped_data = self._data[group_idx, :].T.dot(gm).T
return SparseFrame(grouped_data, index=np.unique(by), columns=self._columns)
res = SparseFrame(grouped_data, index=np.unique(by),
columns=self._columns)
return res[cols]


def _get_groupby_col(self, by, level):
    """
    Resolve the grouping key of a groupby call into per-row values.

    Parameters
    ----------
    by: hashable or array-like, optional
        Either the label of a column of this frame or a sequence holding
        one grouping value per row. Used in preference to ``level``.
    level: int or str, optional
        Index level to group on. Only consulted when ``by`` is None.

    Returns
    -------
    by: array-like
        The grouping values, one per row.
    other_cols: list
        The column labels of the frame; the grouping column is removed
        when ``by`` named a column.

    Raises
    ------
    ValueError
        If neither ``by`` nor ``level`` is given, if an array-like ``by``
        does not have one entry per row, or if ``level > 0`` is requested
        on a frame whose index is not a MultiIndex.
    KeyError
        If ``by`` is a scalar that is not a column label.
    """
    if by is None and level is None:
        raise ValueError("You have to supply one of 'by' and 'level'")
    other_cols = self._columns.tolist()

    if by is not None:
        try:
            is_column = by in self._columns
        except TypeError:
            # Unhashable objects (lists, arrays, ...) can never be column
            # labels; treat them as explicit grouping values.
            is_column = False

        if is_column:
            other_cols.remove(by)
            return self[by].toarray(), other_cols

        if np.ndim(by) == 0:
            # A scalar that is not a column cannot describe a grouping.
            raise KeyError(
                "{!r} is not a column of this frame".format(by))
        # Explicit check instead of `assert`, which is stripped under -O.
        if len(by) != self.data.shape[0]:
            raise ValueError(
                "Length of 'by' ({}) does not match the number of rows "
                "({}).".format(len(by), self.data.shape[0]))
        return np.array(by), other_cols

    # NOTE(review): `level == 0` is falsy, so a MultiIndex frame grouped
    # with level=0 takes the last branch and groups on the whole index
    # rather than on its first level -- confirm this is intended.
    if level and isinstance(self._index, pd.MultiIndex):
        by = self.index.get_level_values(level).values
    elif level > 0:
        raise ValueError(
            "Cannot use level > 0 in a non-MultiIndex Frame.")
    else:  # level == 0
        by = np.asarray(self._index)
    return by, other_cols

def join(self, other, axis=1, how='outer', level=None):
"""
Expand Down
26 changes: 24 additions & 2 deletions sparsity/test/test_sparse_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -745,8 +745,30 @@ def test_groupby_agg(groupby_frame):
res = groupby_frame.groupby_agg(
level=0,
agg_func=lambda x: x.mean(axis=0)
).data.todense()
assert np.all(res.round() == np.identity(10))
)
assert np.all(res.data.todense().round() == np.identity(10))

assert np.all(res.columns == groupby_frame.columns)
assert np.all(res.index == groupby_frame.index.unique().sort_values())


def test_groupby_agg_multiindex():
    """groupby_agg must agree with pandas for a MultiIndex level and a column key."""
    df = pd.DataFrame({'X': [1, 1, 1, 0],
                       'Y': [0, 1, 0, 1],
                       'gr': ['a', 'a', 'b', 'b'],
                       'day': [10, 11, 11, 12]})
    df = df.set_index(['day', 'gr'])
    sf = SparseFrame(df)

    # Group on the second index level.
    correct = df.groupby(level=1).mean()
    res = sf.groupby_agg(level=1, agg_func=lambda x: x.mean(axis=0))
    assert np.all(res.index == correct.index)
    assert np.all(res.columns == correct.columns)
    # Labels alone do not prove the aggregation; compare the values too.
    assert np.allclose(res.data.todense(), correct.values)

    # Group on a column; that column must not appear in the result.
    correct = df.groupby(by='Y').mean()
    res = sf.groupby_agg(by='Y', agg_func=lambda x: x.mean(axis=0))
    assert np.all(res.index == correct.index)
    assert np.all(res.columns == correct.columns)
    assert np.allclose(res.data.todense(), correct.values)


def test_init_with_pandas():
Expand Down