Skip to content

Commit d693bc8

Browse files
authored
inflate: 10-15% faster decompression (#293)
* inflate: 10-15% faster decompression

```
λ benchcmp before.txt after.txt
benchmark                                old ns/op     new ns/op     delta
BenchmarkDecodeDigitsSpeed1e4-32         56514         52065         -7.87%
BenchmarkDecodeDigitsSpeed1e5-32         640875        554792        -13.43%
BenchmarkDecodeDigitsSpeed1e6-32         6402174       5513890       -13.87%
BenchmarkDecodeDigitsDefault1e4-32       61343         54619         -10.96%
BenchmarkDecodeDigitsDefault1e5-32       622027        537596        -13.57%
BenchmarkDecodeDigitsDefault1e6-32       6236843       5297780       -15.06%
BenchmarkDecodeDigitsCompress1e4-32      60749         54412         -10.43%
BenchmarkDecodeDigitsCompress1e5-32      676871        596456        -11.88%
BenchmarkDecodeDigitsCompress1e6-32      6749999       5935333       -12.07%
BenchmarkDecodeTwainSpeed1e4-32          59802         53984         -9.73%
BenchmarkDecodeTwainSpeed1e5-32          629727        547914        -12.99%
BenchmarkDecodeTwainSpeed1e6-32          6247368       5438356       -12.95%
BenchmarkDecodeTwainDefault1e4-32        59156         52760         -10.81%
BenchmarkDecodeTwainDefault1e5-32        584153        512330        -12.30%
BenchmarkDecodeTwainDefault1e6-32        5725959       4933883       -13.83%
BenchmarkDecodeTwainCompress1e4-32       56871         51282         -9.83%
BenchmarkDecodeTwainCompress1e5-32       535344        469828        -12.24%
BenchmarkDecodeTwainCompress1e6-32       5266667       4636718       -11.96%
BenchmarkDecodeRandomSpeed1e4-32         314           311           -0.96%
BenchmarkDecodeRandomSpeed1e5-32         1962          1955          -0.36%
BenchmarkDecodeRandomSpeed1e6-32         20149         20369         +1.09%

benchmark                                old MB/s     new MB/s     speedup
BenchmarkDecodeDigitsSpeed1e4-32         176.95       192.07       1.09x
BenchmarkDecodeDigitsSpeed1e5-32         156.04       180.25       1.16x
BenchmarkDecodeDigitsSpeed1e6-32         156.20       181.36       1.16x
BenchmarkDecodeDigitsDefault1e4-32       163.02       183.09       1.12x
BenchmarkDecodeDigitsDefault1e5-32       160.76       186.01       1.16x
BenchmarkDecodeDigitsDefault1e6-32       160.34       188.76       1.18x
BenchmarkDecodeDigitsCompress1e4-32      164.61       183.78       1.12x
BenchmarkDecodeDigitsCompress1e5-32      147.74       167.66       1.13x
BenchmarkDecodeDigitsCompress1e6-32      148.15       168.48       1.14x
BenchmarkDecodeTwainSpeed1e4-32          167.22       185.24       1.11x
BenchmarkDecodeTwainSpeed1e5-32          158.80       182.51       1.15x
BenchmarkDecodeTwainSpeed1e6-32          160.07       183.88       1.15x
BenchmarkDecodeTwainDefault1e4-32        169.04       189.54       1.12x
BenchmarkDecodeTwainDefault1e5-32        171.19       195.19       1.14x
BenchmarkDecodeTwainDefault1e6-32        174.64       202.68       1.16x
BenchmarkDecodeTwainCompress1e4-32       175.84       195.00       1.11x
BenchmarkDecodeTwainCompress1e5-32       186.80       212.84       1.14x
BenchmarkDecodeTwainCompress1e6-32       189.87       215.67       1.14x
BenchmarkDecodeRandomSpeed1e4-32         31888.50     32136.37     1.01x
BenchmarkDecodeRandomSpeed1e5-32         50976.74     51150.89     1.00x
BenchmarkDecodeRandomSpeed1e6-32         49629.09     49094.25     0.99x
```
1 parent 3d6de88 commit d693bc8

File tree

3 files changed

+337
-240
lines changed

3 files changed

+337
-240
lines changed

flate/gen_inflate.go

Lines changed: 66 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -42,16 +42,6 @@ func (f *decompressor) $FUNCNAME$() {
4242
stateDict
4343
)
4444
fr := f.r.($TYPE$)
45-
moreBits := func() error {
46-
c, err := fr.ReadByte()
47-
if err != nil {
48-
return noEOF(err)
49-
}
50-
f.roffset++
51-
f.b |= uint32(c) << f.nb
52-
f.nb += 8
53-
return nil
54-
}
5545
5646
switch f.stepState {
5747
case stateInit:
@@ -112,9 +102,7 @@ readLiteral:
112102
}
113103
}
114104
115-
var n uint // number of bits extra
116105
var length int
117-
var err error
118106
switch {
119107
case v < 256:
120108
f.dict.writeByte(byte(v))
@@ -131,71 +119,97 @@ readLiteral:
131119
// otherwise, reference to older data
132120
case v < 265:
133121
length = v - (257 - 3)
134-
n = 0
135-
case v < 269:
136-
length = v*2 - (265*2 - 11)
137-
n = 1
138-
case v < 273:
139-
length = v*4 - (269*4 - 19)
140-
n = 2
141-
case v < 277:
142-
length = v*8 - (273*8 - 35)
143-
n = 3
144-
case v < 281:
145-
length = v*16 - (277*16 - 67)
146-
n = 4
147-
case v < 285:
148-
length = v*32 - (281*32 - 131)
149-
n = 5
150122
case v < maxNumLit:
151-
length = 258
152-
n = 0
153-
default:
154-
if debugDecode {
155-
fmt.Println(v, ">= maxNumLit")
156-
}
157-
f.err = CorruptInputError(f.roffset)
158-
return
159-
}
160-
if n > 0 {
123+
val := decCodeToLen[(v - 257)]
124+
length = int(val.length) + 3
125+
n := uint(val.extra)
161126
for f.nb < n {
162-
if err = moreBits(); err != nil {
127+
c, err := fr.ReadByte()
128+
if err != nil {
163129
if debugDecode {
164130
fmt.Println("morebits n>0:", err)
165131
}
166132
f.err = err
167133
return
168134
}
135+
f.roffset++
136+
f.b |= uint32(c) << f.nb
137+
f.nb += 8
169138
}
170139
length += int(f.b & uint32(1<<(n&regSizeMaskUint32)-1))
171140
f.b >>= n & regSizeMaskUint32
172141
f.nb -= n
142+
default:
143+
if debugDecode {
144+
fmt.Println(v, ">= maxNumLit")
145+
}
146+
f.err = CorruptInputError(f.roffset)
147+
return
173148
}
174149
175150
var dist uint32
176151
if f.hd == nil {
177152
for f.nb < 5 {
178-
if err = f.moreBits(); err != nil {
153+
c, err := fr.ReadByte()
154+
if err != nil {
179155
if debugDecode {
180156
fmt.Println("morebits f.nb<5:", err)
181157
}
182158
f.err = err
183159
return
184160
}
161+
f.roffset++
162+
f.b |= uint32(c) << f.nb
163+
f.nb += 8
185164
}
186165
dist = uint32(bits.Reverse8(uint8(f.b & 0x1F << 3)))
187166
f.b >>= 5
188167
f.nb -= 5
189168
} else {
190-
sym, err := f.huffSym(f.hd)
191-
if err != nil {
192-
if debugDecode {
193-
fmt.Println("huffsym:", err)
169+
// Since a huffmanDecoder can be empty or be composed of a degenerate tree
170+
// with single element, huffSym must error on these two edge cases. In both
171+
// cases, the chunks slice will be 0 for the invalid sequence, leading it
172+
// to satisfy the n == 0 check below.
173+
n := uint(f.hd.maxRead)
174+
// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
175+
// but is smart enough to keep local variables in registers, so use nb and b,
176+
// inline call to moreBits and reassign b,nb back to f on return.
177+
nb, b := f.nb, f.b
178+
for {
179+
for nb < n {
180+
c, err := fr.ReadByte()
181+
if err != nil {
182+
f.b = b
183+
f.nb = nb
184+
f.err = noEOF(err)
185+
return
186+
}
187+
f.roffset++
188+
b |= uint32(c) << (nb & regSizeMaskUint32)
189+
nb += 8
190+
}
191+
chunk := f.hd.chunks[b&(huffmanNumChunks-1)]
192+
n = uint(chunk & huffmanCountMask)
193+
if n > huffmanChunkBits {
194+
chunk = f.hd.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hd.linkMask]
195+
n = uint(chunk & huffmanCountMask)
196+
}
197+
if n <= nb {
198+
if n == 0 {
199+
f.b = b
200+
f.nb = nb
201+
if debugDecode {
202+
fmt.Println("huffsym: n==0")
203+
}
204+
f.err = CorruptInputError(f.roffset)
205+
return
206+
}
207+
f.b = b >> (n & regSizeMaskUint32)
208+
f.nb = nb - n
209+
dist = uint32(chunk >> huffmanValueShift)
210+
break
194211
}
195-
f.err = err
196-
return
197212
}
198-
dist = uint32(sym)
199213
}
200214
201215
switch {
@@ -206,13 +220,17 @@ readLiteral:
206220
// have 1 bit in bottom of dist, need nb more.
207221
extra := (dist & 1) << (nb & regSizeMaskUint32)
208222
for f.nb < nb {
209-
if err = f.moreBits(); err != nil {
223+
c, err := fr.ReadByte()
224+
if err != nil {
210225
if debugDecode {
211226
fmt.Println("morebits f.nb<nb:", err)
212227
}
213228
f.err = err
214229
return
215230
}
231+
f.roffset++
232+
f.b |= uint32(c) << f.nb
233+
f.nb += 8
216234
}
217235
extra |= f.b & uint32(1<<(nb&regSizeMaskUint32)-1)
218236
f.b >>= nb & regSizeMaskUint32

flate/inflate.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,13 @@ const (
2929
debugDecode = false
3030
)
3131

32+
// Value of length - 3 and extra bits.
33+
type lengthExtra struct {
34+
length, extra uint8
35+
}
36+
37+
var decCodeToLen = [32]lengthExtra{{length: 0x0, extra: 0x0}, {length: 0x1, extra: 0x0}, {length: 0x2, extra: 0x0}, {length: 0x3, extra: 0x0}, {length: 0x4, extra: 0x0}, {length: 0x5, extra: 0x0}, {length: 0x6, extra: 0x0}, {length: 0x7, extra: 0x0}, {length: 0x8, extra: 0x1}, {length: 0xa, extra: 0x1}, {length: 0xc, extra: 0x1}, {length: 0xe, extra: 0x1}, {length: 0x10, extra: 0x2}, {length: 0x14, extra: 0x2}, {length: 0x18, extra: 0x2}, {length: 0x1c, extra: 0x2}, {length: 0x20, extra: 0x3}, {length: 0x28, extra: 0x3}, {length: 0x30, extra: 0x3}, {length: 0x38, extra: 0x3}, {length: 0x40, extra: 0x4}, {length: 0x50, extra: 0x4}, {length: 0x60, extra: 0x4}, {length: 0x70, extra: 0x4}, {length: 0x80, extra: 0x5}, {length: 0xa0, extra: 0x5}, {length: 0xc0, extra: 0x5}, {length: 0xe0, extra: 0x5}, {length: 0xff, extra: 0x0}, {length: 0x0, extra: 0x0}, {length: 0x0, extra: 0x0}, {length: 0x0, extra: 0x0}}
38+
3239
// Initialize the fixedHuffmanDecoder only once upon first use.
3340
var fixedOnce sync.Once
3441
var fixedHuffmanDecoder huffmanDecoder

0 commit comments

Comments
 (0)