Store a MultiIndex flag in npz archives and rebuild the index on load.

Review notes (text before the first "diff --git" is ignored by patch tools):
* Line breaks and indentation were rebuilt from a whitespace-collapsed copy
  of this patch. Context lines whose exact layout could not be recovered
  (continuation lines and some blank lines) were trimmed and the hunk
  ranges adjusted accordingly; run `git apply --check` before relying on it.
* sparsity/io.py differs from the original submission: the metadata flag is
  stored as the plain isinstance() result, and the legacy fallback in
  _load_idx_from_npz no longer treats an empty index as a MultiIndex.
  The post-image blob id on that file's `index` line is therefore stale.

diff --git a/sparsity/io.py b/sparsity/io.py
index bd4f585..6f5c0d0 100644
--- a/sparsity/io.py
+++ b/sparsity/io.py
@@ -26,15 +26,18 @@ def traildb_to_coo(db, fieldname):
 
 def to_npz(sf, filename):
     data = _csr_to_dict(sf.data)
+    data['metadata'] = {'multiindex': isinstance(sf.index, pd.MultiIndex)}
     data['frame_index'] = sf.index.values
     data['frame_columns'] = sf.columns.values
     np.savez(filename, **data)
 
+
 def read_npz(filename):
     loader = np.load(filename)
     csr_mat = _load_csr(loader)
-    idx = loader['frame_index']
+    idx = _load_idx_from_npz(loader)
     cols = loader['frame_columns']
     return (csr_mat, idx, cols)
 
+
 def _csr_to_dict(array):
@@ -49,5 +52,28 @@ def _load_csr(loader):
 
 
+def _load_idx_from_npz(loader):
+    """Return the frame index stored in an open npz archive.
+
+    Archives that carry a ``metadata`` entry record in its ``multiindex``
+    flag whether the stored tuples must be rebuilt into a
+    ``pd.MultiIndex``. Archives without ``metadata`` (written before the
+    flag existed) get a MultiIndex only when the stored index is
+    non-empty and consists solely of tuples.
+    """
+    idx = loader['frame_index']
+    try:
+        is_multiindex = loader['metadata'][()]['multiindex']
+    except KeyError:
+        # Legacy archive without metadata. The length check is needed
+        # because all() is vacuously true for an empty index, and
+        # MultiIndex.from_tuples() cannot infer levels from no tuples.
+        is_multiindex = (len(idx) > 0 and
+                         all(isinstance(entry, tuple) for entry in idx))
+    if is_multiindex:
+        idx = pd.MultiIndex.from_tuples(idx)
+    return idx
+
+
 def _just_read_array(path):
     if path.endswith('hdf') or path.endswith('hdf5'):
         return pd.read_hdf(path, '/df').values
diff --git a/sparsity/test/test_sparse_frame.py b/sparsity/test/test_sparse_frame.py
index 25654ad..f7529e3 100644
--- a/sparsity/test/test_sparse_frame.py
+++ b/sparsity/test/test_sparse_frame.py
@@ -11,2 +11,3 @@ from scipy import sparse
 from sparsity import SparseFrame, sparse_one_hot
+from sparsity.io import _csr_to_dict
 from .conftest import tmpdir
@@ -218,5 +219,25 @@ def test_set_index(sf_midx):
 
 
+def test_save_load_multiindex(sf_midx):
+    with tmpdir() as tmp:
+        # test new
+        path = os.path.join(tmp, 'sf.npz')
+        sf_midx.to_npz(path)
+        res = SparseFrame.read_npz(path)
+        assert isinstance(res.index, pd.MultiIndex)
+
+        # test backwards compatibility
+        def _to_npz_legacy(sf, filename):
+            data = _csr_to_dict(sf.data)
+            data['frame_index'] = sf.index.values
+            data['frame_columns'] = sf.columns.values
+            np.savez(filename, **data)
+
+        _to_npz_legacy(sf_midx, path)
+        res = SparseFrame.read_npz(path)
+        assert isinstance(res.index, pd.MultiIndex)
+
+
 def test_new_column_assign_array():
     sf = SparseFrame(np.identity(5))
     sf[6] = np.ones(5)