Skip to content

Commit e484862

Browse files
bjude (Ben Jude) and william-silversmith
authored
fix: handle cases where compressed representation is larger than input array (#21)
The fpzip algorithm can sometimes require a larger buffer than the input array size (pigeonhole principle). Instead of always using the input array size plus the header size as the compression buffer size, try with this size first and incrementally expand the buffer if it is not enough.

Co-authored-by: Ben Jude <[email protected]>
Co-authored-by: William Silversmith <[email protected]>
1 parent 29ded07 commit e484862

File tree

2 files changed

+33
-5
lines changed

2 files changed

+33
-5
lines changed

automated_test.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,15 @@ def test_basic_conformation():
127127
fpzip.decompress(six_fpz) == fpzip.decompress(compressed)[0,:,:,:]
128128
)
129129

130+
def test_oversize_compression():
131+
# This array compresses to a larger size than the input array
132+
# This test ensures that compressing this array does not lead to buffer
133+
# overflows
134+
arr = np.array([1e-12, 0, 1e-12])
135+
compressed = fpzip.compress(arr)
136+
assert np.all(
137+
fpzip.decompress(compressed).flatten() == arr
138+
)
130139

131140
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
132141
def test_buffer_overflow(dtype):

fpzip.pyx

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,26 @@ def compress(data, precision=0, order='C'):
8888
order is 'C' or 'F' (row major vs column major memory layout) and
8989
should correspond to the underlying orientation of the input array.
9090
"""
91+
MAX_ATTEMPTS = 5
92+
BUFFER_GROWTH_FACTOR = 1.5
93+
# Occasionally the fpzip compression algorithm produces larger 'compressed'
94+
# representation than the size of the input data array (particularly for very
95+
# short input arrays) so we gradually grow the compression buffer if a buffer
96+
# size of `data.nbytes` is insufficient.
97+
buffer_size = data.nbytes
98+
for i in range(MAX_ATTEMPTS):
99+
try:
100+
return _try_compress(data, buffer_size, precision=precision, order=order)
101+
except:
102+
# Only increase the buffer size if the exception was caused by buffer
103+
# overflow and we aren't on our final iteration.
104+
if fpzip_errno == fpzipErrorBufferOverflow and i < (MAX_ATTEMPTS - 1):
105+
buffer_size = int(BUFFER_GROWTH_FACTOR * buffer_size)
106+
continue
107+
else:
108+
raise
109+
110+
def _try_compress(data, buffer_size, precision, order):
91111
if data.dtype not in (np.float32, np.float64):
92112
raise ValueError("Data type {} must be a floating type.".format(data.dtype))
93113

@@ -102,19 +122,18 @@ def compress(data, precision=0, order='C'):
102122
if not data.flags['C_CONTIGUOUS'] and not data.flags['F_CONTIGUOUS']:
103123
data = np.copy(data, order=order)
104124

105-
header_bytes = 28 # read.cpp:fpzip_read_header + 4 for some reason
125+
header_bytes = 24 # read.cpp:fpzip_read_header
106126

107127
cdef char fptype = b'f' if data.dtype == np.float32 else b'd'
108128

109129
# some compressed data can be bigger than the original data
110-
extra_fraction = 1.25
111-
cdef array.array compression_buf = allocate(fptype, int(extra_fraction * data.size) + header_bytes)
130+
cdef array.array compression_buf = allocate(fptype, data.size + header_bytes)
112131

113132
cdef FPZ* fpz_ptr
114133
if fptype == b'f':
115-
fpz_ptr = fpzip_write_to_buffer(compression_buf.data.as_floats, int(extra_fraction * data.nbytes) + header_bytes)
134+
fpz_ptr = fpzip_write_to_buffer(compression_buf.data.as_floats, buffer_size + header_bytes)
116135
else:
117-
fpz_ptr = fpzip_write_to_buffer(compression_buf.data.as_doubles, int(extra_fraction * data.nbytes) + header_bytes)
136+
fpz_ptr = fpzip_write_to_buffer(compression_buf.data.as_doubles, buffer_size + header_bytes)
118137

119138
if data.dtype == np.float32:
120139
fpz_ptr[0].type = 0 # float

0 commit comments

Comments (0)