Skip to content

Commit 4b3cc06

Browse files
authored
huff0: Pass a single bitReader pointer to asm (#634)
This makes the context object smaller and frees up three registers, which we can use to replace the limitPtr and bufferOrigin stack variables.

Benchmark results show a tiny win (Go 1.19beta, Core i7-3770K):

name  old speed  new speed  delta
Decompress1XTable/digits-8  347MB/s ± 0%  347MB/s ± 0%  ~  (p=0.650 n=8+10)
Decompress1XTable/gettysburg-8  268MB/s ± 0%  268MB/s ± 0%  ~  (p=0.400 n=9+9)
Decompress1XTable/twain-8  327MB/s ± 0%  327MB/s ± 1%  ~  (p=0.339 n=7+9)
Decompress1XTable/low-ent.10k-8  385MB/s ± 0%  385MB/s ± 1%  ~  (p=0.510 n=9+10)
Decompress1XTable/superlow-ent-10k-8  376MB/s ± 0%  376MB/s ± 0%  ~  (p=0.712 n=8+10)
Decompress1XTable/crash2-8  17.3MB/s ± 1%  17.3MB/s ± 1%  ~  (p=0.926 n=10+10)
Decompress1XTable/endzerobits-8  52.9MB/s ± 1%  52.4MB/s ± 0%  -0.94%  (p=0.000 n=10+10)
Decompress1XTable/endnonzero-8  11.4MB/s ± 0%  11.4MB/s ± 1%  ~  (p=0.343 n=10+10)
Decompress1XTable/case1-8  22.0MB/s ± 0%  22.0MB/s ± 0%  ~  (p=0.618 n=9+9)
Decompress1XTable/case2-8  18.1MB/s ± 0%  18.1MB/s ± 0%  ~  (p=0.348 n=9+9)
Decompress1XTable/case3-8  19.1MB/s ± 0%  19.1MB/s ± 0%  +0.21%  (p=0.048 n=10+10)
Decompress1XTable/pngdata.001-8  374MB/s ± 0%  374MB/s ± 0%  ~  (p=0.861 n=9+10)
Decompress1XTable/normcount2-8  54.3MB/s ± 1%  54.5MB/s ± 1%  ~  (p=0.093 n=10+10)
Decompress1XNoTable/digits/100-8  279MB/s ± 0%  280MB/s ± 0%  +0.30%  (p=0.003 n=10+9)
Decompress1XNoTable/digits/10000-8  366MB/s ± 0%  365MB/s ± 0%  ~  (p=0.113 n=10+9)
Decompress1XNoTable/digits/262143-8  347MB/s ± 0%  347MB/s ± 1%  ~  (p=0.739 n=10+10)
Decompress1XNoTable/gettysburg/100-8  278MB/s ± 1%  277MB/s ± 1%  ~  (p=0.676 n=10+9)
Decompress1XNoTable/gettysburg/10000-8  363MB/s ± 1%  362MB/s ± 0%  -0.50%  (p=0.001 n=10+9)
Decompress1XNoTable/gettysburg/262143-8  350MB/s ± 0%  347MB/s ± 0%  -0.90%  (p=0.000 n=10+8)
Decompress1XNoTable/twain/100-8  268MB/s ± 0%  267MB/s ± 0%  ~  (p=0.384 n=9+8)
Decompress1XNoTable/twain/10000-8  363MB/s ± 0%  362MB/s ± 0%  -0.32%  (p=0.000 n=9+9)
Decompress1XNoTable/twain/262143-8  328MB/s ± 0%  329MB/s ± 0%  ~  (p=0.063 n=9+10)
Decompress1XNoTable/low-ent.10k/100-8  180MB/s ± 0%  181MB/s ± 0%  ~  (p=0.225 n=10+10)
Decompress1XNoTable/low-ent.10k/10000-8  385MB/s ± 0%  385MB/s ± 0%  ~  (p=0.289 n=10+10)
Decompress1XNoTable/low-ent.10k/262143-8  389MB/s ± 1%  389MB/s ± 1%  ~  (p=0.971 n=10+10)
Decompress1XNoTable/superlow-ent-10k/262143-8  389MB/s ± 0%  390MB/s ± 0%  +0.27%  (p=0.017 n=9+10)
Decompress1XNoTable/crash2/100-8  278MB/s ± 0%  279MB/s ± 1%  ~  (p=0.163 n=9+10)
Decompress1XNoTable/crash2/10000-8  373MB/s ± 1%  373MB/s ± 0%  ~  (p=0.370 n=10+8)
Decompress1XNoTable/crash2/262143-8  375MB/s ± 0%  375MB/s ± 0%  ~  (p=0.604 n=9+10)
Decompress1XNoTable/endzerobits/100-8  180MB/s ± 0%  181MB/s ± 0%  +0.26%  (p=0.005 n=10+9)
Decompress1XNoTable/endzerobits/10000-8  384MB/s ± 0%  385MB/s ± 0%  ~  (p=0.914 n=8+10)
Decompress1XNoTable/endzerobits/262143-8  389MB/s ± 0%  390MB/s ± 0%  ~  (p=0.739 n=10+10)
Decompress1XNoTable/endnonzero/100-8  180MB/s ± 1%  180MB/s ± 1%  ~  (p=0.926 n=10+10)
Decompress1XNoTable/endnonzero/10000-8  384MB/s ± 0%  384MB/s ± 0%  ~  (p=0.965 n=10+8)
Decompress1XNoTable/endnonzero/262143-8  390MB/s ± 0%  390MB/s ± 0%  ~  (p=0.633 n=8+10)
Decompress1XNoTable/case1/100-8  282MB/s ± 0%  283MB/s ± 0%  +0.34%  (p=0.005 n=10+10)
Decompress1XNoTable/case1/10000-8  372MB/s ± 0%  373MB/s ± 0%  ~  (p=0.113 n=9+9)
Decompress1XNoTable/case1/262143-8  374MB/s ± 0%  374MB/s ± 0%  ~  (p=0.448 n=10+10)
Decompress1XNoTable/case2/100-8  274MB/s ± 1%  274MB/s ± 0%  ~  (p=0.927 n=10+10)
Decompress1XNoTable/case2/10000-8  376MB/s ± 0%  376MB/s ± 0%  ~  (p=0.408 n=10+8)
Decompress1XNoTable/case2/262143-8  376MB/s ± 1%  377MB/s ± 0%  ~  (p=1.000 n=10+10)
Decompress1XNoTable/case3/100-8  266MB/s ± 0%  265MB/s ± 0%  ~  (p=0.113 n=9+10)
Decompress1XNoTable/case3/10000-8  372MB/s ± 0%  372MB/s ± 0%  ~  (p=0.075 n=10+9)
Decompress1XNoTable/case3/262143-8  374MB/s ± 0%  374MB/s ± 0%  ~  (p=0.172 n=10+10)
Decompress1XNoTable/pngdata.001/100-8  238MB/s ± 0%  238MB/s ± 0%  ~  (p=0.438 n=9+8)
Decompress1XNoTable/pngdata.001/10000-8  384MB/s ± 0%  384MB/s ± 0%  ~  (p=0.448 n=10+10)
Decompress1XNoTable/pngdata.001/262143-8  378MB/s ± 0%  378MB/s ± 0%  ~  (p=0.836 n=10+10)
Decompress1XNoTable/normcount2/100-8  281MB/s ± 0%  282MB/s ± 1%  ~  (p=0.122 n=8+10)
Decompress1XNoTable/normcount2/10000-8  369MB/s ± 1%  369MB/s ± 0%  ~  (p=0.912 n=10+10)
Decompress1XNoTable/normcount2/262143-8  370MB/s ± 0%  370MB/s ± 1%  ~  (p=0.342 n=10+10)
Decompress4XNoTable/digits/100-8  197MB/s ± 0%  197MB/s ± 1%  ~  (p=0.764 n=10+9)
Decompress4XNoTable/digits/10000-8  594MB/s ± 0%  602MB/s ± 1%  +1.35%  (p=0.000 n=10+10)
Decompress4XNoTable/digits/262143-8  570MB/s ± 1%  578MB/s ± 0%  +1.30%  (p=0.000 n=10+8)
Decompress4XNoTable/gettysburg/100-8  258MB/s ± 1%  260MB/s ± 0%  +0.59%  (p=0.001 n=10+10)
Decompress4XNoTable/gettysburg/10000-8  638MB/s ± 0%  641MB/s ± 0%  +0.44%  (p=0.000 n=9+9)
Decompress4XNoTable/gettysburg/262143-8  573MB/s ± 1%  574MB/s ± 0%  ~  (p=0.353 n=10+10)
Decompress4XNoTable/twain/100-8  214MB/s ± 2%  214MB/s ± 2%  ~  (p=0.853 n=10+10)
Decompress4XNoTable/twain/10000-8  634MB/s ± 1%  638MB/s ± 0%  +0.62%  (p=0.000 n=10+10)
Decompress4XNoTable/twain/262143-8  513MB/s ± 1%  517MB/s ± 0%  +0.85%  (p=0.000 n=10+10)
Decompress4XNoTable/low-ent.10k/100-8  195MB/s ± 0%  194MB/s ± 0%  ~  (p=0.130 n=9+9)
Decompress4XNoTable/low-ent.10k/10000-8  635MB/s ± 0%  642MB/s ± 0%  +1.19%  (p=0.000 n=10+10)
Decompress4XNoTable/low-ent.10k/262143-8  675MB/s ± 0%  685MB/s ± 0%  +1.51%  (p=0.000 n=10+10)
Decompress4XNoTable/superlow-ent-10k/262143-8  673MB/s ± 1%  684MB/s ± 0%  +1.70%  (p=0.000 n=10+10)
Decompress4XNoTable/case1/100-8  206MB/s ± 1%  206MB/s ± 0%  ~  (p=0.189 n=10+9)
Decompress4XNoTable/case1/10000-8  593MB/s ± 0%  601MB/s ± 0%  +1.47%  (p=0.000 n=10+10)
Decompress4XNoTable/case1/262143-8  603MB/s ± 0%  613MB/s ± 0%  +1.64%  (p=0.000 n=10+10)
Decompress4XNoTable/case2/100-8  201MB/s ± 0%  202MB/s ± 1%  ~  (p=0.053 n=9+10)
Decompress4XNoTable/case2/10000-8  610MB/s ± 0%  618MB/s ± 0%  +1.30%  (p=0.000 n=9+10)
Decompress4XNoTable/case2/262143-8  622MB/s ± 1%  634MB/s ± 0%  +1.90%  (p=0.000 n=9+8)
Decompress4XNoTable/case3/100-8  197MB/s ± 1%  198MB/s ± 0%  +0.53%  (p=0.001 n=9+10)
Decompress4XNoTable/case3/10000-8  606MB/s ± 0%  615MB/s ± 0%  +1.49%  (p=0.000 n=8+10)
Decompress4XNoTable/case3/262143-8  613MB/s ± 1%  622MB/s ± 0%  +1.48%  (p=0.000 n=10+10)
Decompress4XNoTable/pngdata.001/100-8  212MB/s ± 1%  211MB/s ± 0%  ~  (p=0.136 n=9+9)
Decompress4XNoTable/pngdata.001/10000-8  645MB/s ± 1%  649MB/s ± 1%  +0.65%  (p=0.000 n=9+10)
Decompress4XNoTable/pngdata.001/262143-8  640MB/s ± 1%  649MB/s ± 0%  +1.44%  (p=0.000 n=10+10)
Decompress4XNoTable/normcount2/100-8  260MB/s ± 1%  261MB/s ± 1%  ~  (p=0.211 n=10+9)
Decompress4XNoTable/normcount2/10000-8  584MB/s ± 1%  591MB/s ± 0%  +1.33%  (p=0.000 n=9+9)
Decompress4XNoTable/normcount2/262143-8  588MB/s ± 1%  596MB/s ± 1%  +1.39%  (p=0.000 n=10+9)
Decompress4XNoTableTableLog8/digits-8  583MB/s ± 1%  592MB/s ± 0%  +1.48%  (p=0.000 n=10+10)
Decompress4XTable/digits-8  580MB/s ± 0%  588MB/s ± 0%  +1.33%  (p=0.000 n=8+10)
Decompress4XTable/gettysburg-8  368MB/s ± 1%  370MB/s ± 0%  +0.59%  (p=0.017 n=10+9)
Decompress4XTable/twain-8  510MB/s ± 0%  515MB/s ± 0%  +0.99%  (p=0.000 n=9+10)
Decompress4XTable/low-ent.10k-8  657MB/s ± 0%  665MB/s ± 0%  +1.24%  (p=0.000 n=10+10)
Decompress4XTable/superlow-ent-10k-8  608MB/s ± 0%  617MB/s ± 1%  +1.48%  (p=0.000 n=8+10)
Decompress4XTable/case1-8  21.1MB/s ± 1%  21.0MB/s ± 2%  ~  (p=0.223 n=10+10)
Decompress4XTable/case2-8  17.6MB/s ± 0%  17.6MB/s ± 0%  ~  (p=0.199 n=9+10)
Decompress4XTable/case3-8  18.7MB/s ± 0%  18.7MB/s ± 0%  ~  (p=0.557 n=10+8)
Decompress4XTable/pngdata.001-8  633MB/s ± 1%  645MB/s ± 0%  +1.90%  (p=0.000 n=9+10)
Decompress4XTable/normcount2-8  49.9MB/s ± 1%  49.5MB/s ± 1%  -0.64%  (p=0.002 n=10+10)
[Geo mean]  270MB/s  271MB/s  +0.36%
1 parent b16a9af commit 4b3cc06

File tree

3 files changed

+382
-422
lines changed

3 files changed

+382
-422
lines changed

huff0/_generate/gen.go

Lines changed: 46 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -49,49 +49,41 @@ func (d decompress4x) generateProcedure(name string) {
4949
exhausted := GP64()
5050
XORQ(exhausted.As64(), exhausted.As64()) // exhausted = false
5151

52-
limitPtr := AllocLocal(8)
52+
limit := GP64()
5353

5454
bufferOrigin := GP64()
5555
peekBits := GP64()
5656
buffer := GP64()
5757
dstEvery := GP64()
5858
table := GP64()
5959

60-
br0 := GP64()
61-
br1 := GP64()
62-
br2 := GP64()
63-
br3 := GP64()
60+
br := GP64()
6461

6562
Comment("Preload values")
6663
{
6764
ctx := Dereference(Param("ctx"))
6865
Load(ctx.Field("peekBits"), peekBits)
69-
Load(ctx.Field("out"), buffer)
70-
MOVQ(buffer, bufferOrigin)
71-
limit := Load(ctx.Field("limit"), GP64())
72-
MOVQ(limit, limitPtr)
66+
Load(ctx.Field("out"), bufferOrigin)
67+
Load(ctx.Field("limit"), limit)
7368
Load(ctx.Field("dstEvery"), dstEvery)
7469
Load(ctx.Field("tbl"), table)
75-
Load(ctx.Field("pbr0"), br0)
76-
Load(ctx.Field("pbr1"), br1)
77-
Load(ctx.Field("pbr2"), br2)
78-
Load(ctx.Field("pbr3"), br3)
70+
Load(ctx.Field("pbr"), br)
7971
}
8072

8173
Comment("Main loop")
8274
Label("main_loop")
8375

8476
MOVQ(bufferOrigin, buffer)
8577
// Check if we have space
86-
CMPQ(buffer, limitPtr)
78+
CMPQ(buffer, limit)
8779
SETGE(exhausted.As8())
88-
d.decodeTwoValues(0, br0, peekBits, table, buffer, exhausted)
80+
d.decodeTwoValues(0, br, peekBits, table, buffer, exhausted)
8981
ADDQ(dstEvery, buffer)
90-
d.decodeTwoValues(1, br1, peekBits, table, buffer, exhausted)
82+
d.decodeTwoValues(1, br, peekBits, table, buffer, exhausted)
9183
ADDQ(dstEvery, buffer)
92-
d.decodeTwoValues(2, br2, peekBits, table, buffer, exhausted)
84+
d.decodeTwoValues(2, br, peekBits, table, buffer, exhausted)
9385
ADDQ(dstEvery, buffer)
94-
d.decodeTwoValues(3, br3, peekBits, table, buffer, exhausted)
86+
d.decodeTwoValues(3, br, peekBits, table, buffer, exhausted)
9587

9688
ADDQ(U8(2), bufferOrigin) // off += 2
9789

@@ -100,10 +92,9 @@ func (d decompress4x) generateProcedure(name string) {
10092

10193
{
10294
ctx := Dereference(Param("ctx"))
103-
tmp := Load(ctx.Field("out"), GP64())
104-
decoded := GP64()
105-
MOVQ(bufferOrigin, decoded)
106-
SUBQ(tmp, decoded)
95+
ctxout, _ := ctx.Field("out").Resolve()
96+
decoded := bufferOrigin
97+
SUBQ(ctxout.Addr, decoded)
10798
SHLQ(U8(2), decoded) // decoded *= 4
10899

109100
Store(decoded, ctx.Field("decoded"))
@@ -118,6 +109,7 @@ const bitReader_in = 0
118109
const bitReader_off = bitReader_in + 3*8 // {ptr, len, cap}
119110
const bitReader_value = bitReader_off + 8
120111
const bitReader_bitsRead = bitReader_value + 8
112+
const bitReader__size = bitReader_bitsRead + 8
121113

122114
func (d decompress4x) decodeTwoValues(id int, br, peekBits, table, buffer, exhausted reg.GPVirtual) {
123115
brValue, brBitsRead := d.fillFast32(id, 32, br, exhausted)
@@ -157,9 +149,10 @@ func (d decompress4x) decodeTwoValues(id int, br, peekBits, table, buffer, exhau
157149
Comment("out[id * dstEvery + 1] = uint8(v1.entry >> 8)")
158150
MOVW(out.As16(), Mem{Base: buffer})
159151

160-
Comment("update the bitrader reader structure")
161-
MOVQ(brValue, Mem{Base: br, Disp: bitReader_value})
162-
MOVB(brBitsRead.As8(), Mem{Base: br, Disp: bitReader_bitsRead})
152+
Comment("update the bitreader structure")
153+
offset := id * bitReader__size
154+
MOVQ(brValue, Mem{Base: br, Disp: offset + bitReader_value})
155+
MOVB(brBitsRead.As8(), Mem{Base: br, Disp: offset + bitReader_bitsRead})
163156
}
164157

165158
func (d decompress4x) generateProcedure4x8bit(name string) {
@@ -171,49 +164,41 @@ func (d decompress4x) generateProcedure4x8bit(name string) {
171164
exhausted := GP64() // Fixed since we need 8H
172165
XORQ(exhausted.As64(), exhausted.As64()) // exhausted = false
173166

174-
bufferOrigin := AllocLocal(8)
175-
limitPtr := AllocLocal(8)
167+
bufferOrigin := GP64()
168+
limit := GP64()
176169

177170
peekBits := GP64()
178171
buffer := GP64()
179172
dstEvery := GP64()
180173
table := GP64()
181174

182-
br0 := GP64()
183-
br1 := GP64()
184-
br2 := GP64()
185-
br3 := GP64()
175+
br := GP64()
186176

187177
Comment("Preload values")
188178
{
189179
ctx := Dereference(Param("ctx"))
190180
Load(ctx.Field("peekBits"), peekBits)
191-
Load(ctx.Field("out"), buffer)
192-
MOVQ(buffer, bufferOrigin)
193-
limit := Load(ctx.Field("limit"), GP64())
194-
MOVQ(limit, limitPtr)
181+
Load(ctx.Field("out"), bufferOrigin)
182+
Load(ctx.Field("limit"), limit)
195183
Load(ctx.Field("dstEvery"), dstEvery)
196184
Load(ctx.Field("tbl"), table)
197-
Load(ctx.Field("pbr0"), br0)
198-
Load(ctx.Field("pbr1"), br1)
199-
Load(ctx.Field("pbr2"), br2)
200-
Load(ctx.Field("pbr3"), br3)
185+
Load(ctx.Field("pbr"), br)
201186
}
202187

203188
Comment("Main loop")
204189
Label("main_loop")
205190

206191
MOVQ(bufferOrigin, buffer)
207192
// Check if we have space
208-
CMPQ(buffer, limitPtr)
193+
CMPQ(buffer, limit)
209194
SETGE(exhausted.As8())
210-
d.decodeFourValues(0, br0, peekBits, table, buffer, exhausted)
195+
d.decodeFourValues(0, br, peekBits, table, buffer, exhausted)
211196
ADDQ(dstEvery, buffer)
212-
d.decodeFourValues(1, br1, peekBits, table, buffer, exhausted)
197+
d.decodeFourValues(1, br, peekBits, table, buffer, exhausted)
213198
ADDQ(dstEvery, buffer)
214-
d.decodeFourValues(2, br2, peekBits, table, buffer, exhausted)
199+
d.decodeFourValues(2, br, peekBits, table, buffer, exhausted)
215200
ADDQ(dstEvery, buffer)
216-
d.decodeFourValues(3, br3, peekBits, table, buffer, exhausted)
201+
d.decodeFourValues(3, br, peekBits, table, buffer, exhausted)
217202

218203
ADDQ(U8(4), bufferOrigin) // off += 4
219204

@@ -222,10 +207,9 @@ func (d decompress4x) generateProcedure4x8bit(name string) {
222207

223208
{
224209
ctx := Dereference(Param("ctx"))
225-
tmp := Load(ctx.Field("out"), GP64())
226-
decoded := GP64()
227-
MOVQ(bufferOrigin, decoded)
228-
SUBQ(tmp, decoded)
210+
ctxout, _ := ctx.Field("out").Resolve()
211+
decoded := bufferOrigin
212+
SUBQ(ctxout.Addr, decoded)
229213
SHLQ(U8(2), decoded) // decoded *= 4
230214

231215
Store(decoded, ctx.Field("decoded"))
@@ -234,7 +218,7 @@ func (d decompress4x) generateProcedure4x8bit(name string) {
234218
}
235219

236220
func (d decompress4x) decodeFourValues(id int, br, peekBits, table, buffer, exhausted reg.GPVirtual) {
237-
brValue, brBitsRead := d.fillFast32(id+1000, 32, br, exhausted)
221+
brValue, brBitsRead := d.fillFast32(id, 32, br, exhausted)
238222

239223
decompress := func(valID int, outByte reg.Register) {
240224
CX := reg.CL
@@ -269,9 +253,10 @@ func (d decompress4x) decodeFourValues(id int, br, peekBits, table, buffer, exha
269253
Comment("out[id * dstEvery + 4] = uint8(v3.entry >> 8)")
270254
MOVL(out.As32(), Mem{Base: buffer})
271255

272-
Comment("update the bitreader reader structure")
273-
MOVQ(brValue, Mem{Base: br, Disp: bitReader_value})
274-
MOVB(brBitsRead.As8(), Mem{Base: br, Disp: bitReader_bitsRead})
256+
Comment("update the bitreader structure")
257+
offset := id * bitReader__size
258+
MOVQ(brValue, Mem{Base: br, Disp: offset + bitReader_value})
259+
MOVB(brBitsRead.As8(), Mem{Base: br, Disp: offset + bitReader_bitsRead})
275260
}
276261

277262
func (d decompress4x) fillFast32(id, atLeast int, br, exhausted reg.GPVirtual) (brValue, brBitsRead reg.GPVirtual) {
@@ -281,14 +266,15 @@ func (d decompress4x) fillFast32(id, atLeast int, br, exhausted reg.GPVirtual) (
281266
Commentf("br%d.fillFast32()", id)
282267
brValue = GP64()
283268
brBitsRead = GP64()
284-
MOVQ(Mem{Base: br, Disp: bitReader_value}, brValue)
285-
MOVBQZX(Mem{Base: br, Disp: bitReader_bitsRead}, brBitsRead)
269+
offset := bitReader__size * id
270+
MOVQ(Mem{Base: br, Disp: offset + bitReader_value}, brValue)
271+
MOVBQZX(Mem{Base: br, Disp: offset + bitReader_bitsRead}, brBitsRead)
286272

287273
// We must have at least 2 * max tablelog left
288274
CMPQ(brBitsRead, U8(64-atLeast))
289275
JBE(LabelRef("skip_fill" + strconv.Itoa(id)))
290276
brOffset := GP64()
291-
MOVQ(Mem{Base: br, Disp: bitReader_off}, brOffset)
277+
MOVQ(Mem{Base: br, Disp: offset + bitReader_off}, brOffset)
292278

293279
SUBQ(U8(32), brBitsRead) // b.bitsRead -= 32
294280
SUBQ(U8(4), brOffset) // b.off -= 4
@@ -297,7 +283,7 @@ func (d decompress4x) fillFast32(id, atLeast int, br, exhausted reg.GPVirtual) (
297283
// v = v[:4]
298284
// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
299285
tmp := GP64()
300-
MOVQ(Mem{Base: br, Disp: bitReader_in}, tmp)
286+
MOVQ(Mem{Base: br, Disp: offset + bitReader_in}, tmp)
301287

302288
Comment("b.value |= uint64(low) << (b.bitsRead & 63)")
303289
addr := Mem{Base: brOffset, Index: tmp.As64(), Scale: 1}
@@ -306,7 +292,7 @@ func (d decompress4x) fillFast32(id, atLeast int, br, exhausted reg.GPVirtual) (
306292
MOVQ(brBitsRead, CX.As64())
307293
SHLQ(CX, tmp.As64())
308294

309-
MOVQ(brOffset, Mem{Base: br, Disp: bitReader_off})
295+
MOVQ(brOffset, Mem{Base: br, Disp: offset + bitReader_off})
310296
ORQ(tmp.As64(), brValue)
311297
{
312298
Commentf("exhausted = exhausted || (br%d.off < 4)", id)
@@ -474,11 +460,9 @@ func (d decompress1x) generateProcedure(name string) {
474460
{
475461
// calculate decoded as current `out` - initial `out`
476462
ctx := Dereference(Param("ctx"))
477-
decoded := GP64()
478-
tmp := GP64()
479-
MOVQ(buffer, decoded)
480-
Load(ctx.Field("out"), tmp)
481-
SUBQ(tmp, decoded)
463+
ctxout, _ := ctx.Field("out").Resolve()
464+
decoded := buffer
465+
SUBQ(ctxout.Addr, decoded)
482466
Store(decoded, ctx.Field("decoded"))
483467

484468
pbr := Dereference(ctx.Field("pbr"))

huff0/decompress_amd64.go

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -27,10 +27,7 @@ func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
2727
const fallback8BitSize = 800
2828

2929
type decompress4xContext struct {
30-
pbr0 *bitReaderShifted
31-
pbr1 *bitReaderShifted
32-
pbr2 *bitReaderShifted
33-
pbr3 *bitReaderShifted
30+
pbr *[4]bitReaderShifted
3431
peekBits uint8
3532
out *byte
3633
dstEvery int
@@ -89,10 +86,7 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
8986

9087
if len(out) > 4*4 && !(br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4) {
9188
ctx := decompress4xContext{
92-
pbr0: &br[0],
93-
pbr1: &br[1],
94-
pbr2: &br[2],
95-
pbr3: &br[3],
89+
pbr: &br,
9690
peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast()
9791
out: &out[0],
9892
dstEvery: dstEvery,

0 commit comments

Comments
 (0)