Skip to content

Commit 9dd9b26

Browse files
committed
igzip/riscv64: Optimize adler32_rvv for VLEN=128
Signed-off-by: WenLei <[email protected]>
1 parent cd58e41 commit 9dd9b26

2 files changed

Lines changed: 18 additions & 11 deletions

File tree

Release_notes.txt

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -146,6 +146,7 @@ v2.32
146146

147147
* Igzip compression improvements:
148148
- Added new RVV adler32 implementations.
149+
- Added optimized RVV adler32 for VLEN=128.
149150

150151
* Igzip:
151152
- Added experimental ISA-L shim library to provide drop-in compatibility with zlib.

igzip/riscv64/igzip_isal_adler32_rvv128.S

Lines changed: 17 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -40,7 +40,8 @@ adler32_rvv128:
4040
srliw t6, a0, 16 // t6: B = adler32 >> 16
4141
li t0, 32
4242
bltu a2, t0, tail_bytes
43-
43+
vsetvli zero, t0, e32, m4, ta, ma
44+
vmv.v.i v24, 0
4445
vsetvli zero, t0, e8, m2, ta, ma
4546
la a7, factors
4647
vle8.v v0, (a7)
@@ -49,6 +50,7 @@ adler32_rvv128:
4950

5051
outer_loop:
5152
beqz t1, tail_bytes
53+
vmv.v.i v24, 0
5254
li t2, 173
5355
bgeu t1, t2, 1f
5456
mv t2, t1
@@ -57,25 +59,29 @@ outer_loop:
5759
add a7, a1, a7
5860

5961
inner_loop:
60-
vle8.v v2, (a1)
61-
addi a1, a1, 32
6262
slli a5, t5, 5
6363
add t6, t6, a5
64+
vsetvli zero, t0, e8, m1, ta, ma
65+
vle8.v v2, (a1)
66+
addi a1, a1, 16
67+
vle8.v v3, (a1)
68+
addi a1, a1, 16
6469
vwredsumu.vs v12, v2, v4
70+
vwredsumu.vs v12, v3, v12
6571
vwmulu.vv v16, v2, v0
72+
vwmulu.vv v18, v3, v1
6673

67-
vsetvli zero, t0, e16, m4, ta, ma
74+
vsetvli zero, t0, e16, m2, ta, ma
75+
vwaddu.wv v24, v24, v16
76+
vwaddu.wv v24, v24, v18
6877
vmv.x.s a6, v12
6978
add t5, t5, a6
70-
vwredsumu.vs v20, v16, v4
71-
72-
vsetvli zero, t0, e32, m4, ta, ma
73-
vmv.x.s a6, v20
74-
add t6, t6, a6 // B += weighted_sum
75-
76-
vsetvli zero, t0, e8, m2, ta, ma
7779
bne a1, a7, inner_loop
7880

81+
vsetvli zero, t0, e32, m4, ta, ma
82+
vredsum.vs v16, v24, v4
83+
vmv.x.s a6, v16
84+
add t6, t6, a6
7985
mul a3, t5, t3
8086
srli a3, a3, 47
8187
mul a4, a3, t4

0 commit comments

Comments
 (0)