@@ -13,75 +13,130 @@ TEXT ·xorBytes(SB), NOSPLIT, $0
1313 MOVD b+16 (FP), R5 // R5 = b
1414 MOVD n+24 (FP), R6 // R6 = n
1515
16- CMPU R6 , $ 32 , CR7 // Check if n ≥ 32 bytes
16+ CMPU R6, $64 , CR7 // Check if n ≥ 64 bytes
1717 MOVD R0, R8 // R8 = index
18- CMPU R6 , $ 8 , CR6 // Check if 8 ≤ n < 32 bytes
19- BLT CR6 , small // Smaller than 8
20- BLT CR7 , xor16 // Case for 16 ≤ n < 32 bytes
18+ CMPU R6, $8 , CR6 // Check if 8 ≤ n < 64 bytes
19+ BLE CR6, small // <= 8
20+ BLT CR7, xor32 // Case for 32 ≤ n < 64 bytes
2121
22- // Case for n ≥ 32 bytes
23- preloop32 :
24- SRD $ 5 , R6 , R7 // Setup loop counter
22+ // Case for n ≥ 64 bytes
23+ preloop64 :
24+ SRD $6 , R6, R7 // Set up loop counter
2525 MOVD R7, CTR
2626 MOVD $16 , R10
27- ANDCC $ 31 , R6 , R9 // Check for tailing bytes for later
28- loop32:
29- LXVD2X (R4)( R8 ) , VS32 // VS32 = a [ i , ... , i + 15 ]
30- LXVD2X (R4)( R10 ) , VS34
31- LXVD2X (R5)( R8 ) , VS33 // VS33 = b [ i , ... , i + 15 ]
32- LXVD2X (R5)( R10 ) , VS35
33- XXLXOR VS32 , VS33 , VS32 // VS34 = a [] ^ b []
34- XXLXOR VS34 , VS35 , VS34
35- STXVD2X VS32 , (R3)( R8 ) // Store to dst
36- STXVD2X VS34 , (R3)( R10 )
37- ADD $ 32 , R8 // Update index
38- ADD $ 32 , R10
39- BC 16 , 0 , loop32 // bdnz loop16
40-
41- BEQ CR0 , done
42-
43- MOVD R9 , R6
44- CMP R6 , $ 8
45- BLT small
27+ MOVD $32 , R14
28+ MOVD $48 , R15
29+ ANDCC $63 , R6, R9 // Check for tailing bytes for later
30+ PCALIGN $16
31+ // Case for >= 64 bytes
32+ // Process 64 bytes per iteration
33+ // Load 4 vectors of a and b
34+ // XOR the corresponding vectors
35+ // from a and b and store the result
36+ loop64:
37+ LXVD2X (R4)(R8), VS32
38+ LXVD2X (R4)(R10), VS34
39+ LXVD2X (R4)(R14), VS36
40+ LXVD2X (R4)(R15), VS38
41+ LXVD2X (R5)(R8), VS33
42+ LXVD2X (R5)(R10), VS35
43+ LXVD2X (R5)(R14), VS37
44+ LXVD2X (R5)(R15), VS39
45+ XXLXOR VS32, VS33, VS32
46+ XXLXOR VS34, VS35, VS34
47+ XXLXOR VS36, VS37, VS36
48+ XXLXOR VS38, VS39, VS38
49+ STXVD2X VS32, (R3)(R8)
50+ STXVD2X VS34, (R3)(R10)
51+ STXVD2X VS36, (R3)(R14)
52+ STXVD2X VS38, (R3)(R15)
53+ ADD $64 , R8
54+ ADD $64 , R10
55+ ADD $64 , R14
56+ ADD $64 , R15
57+ BDNZ loop64
58+ BC 12 ,2 ,LR // BEQLR
59+ MOVD R9, R6
60+ CMP R6, $8
61+ BLE small
62+ // Case for 8 <= n < 64 bytes
63+ // Process 32 bytes if available
64+ xor32:
65+ CMP R6, $32
66+ BLT xor16
67+ ADD $16 , R8, R9
68+ LXVD2X (R4)(R8), VS32
69+ LXVD2X (R4)(R9), VS33
70+ LXVD2X (R5)(R8), VS34
71+ LXVD2X (R5)(R9), VS35
72+ XXLXOR VS32, VS34, VS32
73+ XXLXOR VS33, VS35, VS33
74+ STXVD2X VS32, (R3)(R8)
75+ STXVD2X VS33, (R3)(R9)
76+ ADD $32 , R8
77+ ADD $-32 , R6
78+ CMP R6, $8
79+ BLE small
80+ // Case for 8 <= n < 32 bytes
81+ // Process 16 bytes if available
4682xor16:
47- CMP R6 , $ 16
48- BLT xor8
49- LXVD2X (R4)( R8 ) , VS32
50- LXVD2X (R5)( R8 ) , VS33
51- XXLXOR VS32 , VS33 , VS32
52- STXVD2X VS32 , (R3)( R8 )
53- ADD $ 16 , R8
54- ADD $ - 16 , R6
55- CMP R6 , $ 8
56- BLT small
83+ CMP R6, $16
84+ BLT xor8
85+ LXVD2X (R4)(R8), VS32
86+ LXVD2X (R5)(R8), VS33
87+ XXLXOR VS32, VS33, VS32
88+ STXVD2X VS32, (R3)(R8)
89+ ADD $16 , R8
90+ ADD $-16 , R6
91+ small:
92+ CMP R6, R0
93+ BC 12 ,2 ,LR // BEQLR
5794xor8:
58- // Case for 8 ≤ n < 16 bytes
59- MOVD (R4)( R8 ) , R14 // R14 = a [ i , ... , i + 7 ]
60- MOVD (R5)( R8 ) , R15 // R15 = b [ i , ... , i + 7 ]
61- XOR R14 , R15 , R16 // R16 = a [] ^ b []
62- SUB $ 8 , R6 // n = n - 8
63- MOVD R16 , (R3)( R8 ) // Store to dst
64- ADD $ 8 , R8
65-
66- // Check if we're finished
67- CMP R6 , R0
68- BGT small
95+ #ifdef GOPPC64_power10
96+ SLD $56 ,R6,R17
97+ ADD R4,R8,R18
98+ ADD R5,R8,R19
99+ ADD R3,R8,R20
100+ LXVL R18,R17,V0
101+ LXVL R19,R17,V1
102+ VXOR V0,V1,V1
103+ STXVL V1,R20,R17
69104 RET
70-
71- // Case for n < 8 bytes and tailing bytes from the
72- // previous cases.
73- small:
105+ #else
106+ CMP R6, $8
107+ BLT xor4
108+ // Case for 8 ≤ n < 16 bytes
109+ MOVD (R4)(R8), R14 // R14 = a[i,...,i+7]
110+ MOVD (R5)(R8), R15 // R15 = b[i,...,i+7]
111+ XOR R14, R15, R16 // R16 = a[] ^ b[]
112+ SUB $8 , R6 // n = n - 8
113+ MOVD R16, (R3)(R8) // Store to dst
114+ ADD $8 , R8
115+ xor4:
116+ CMP R6, $4
117+ BLT xor2
118+ MOVWZ (R4)(R8), R14
119+ MOVWZ (R5)(R8), R15
120+ XOR R14, R15, R16
121+ MOVW R16, (R3)(R8)
122+ ADD $4 ,R8
123+ ADD $-4 ,R6
124+ xor2:
125+ CMP R6, $2
126+ BLT xor1
127+ MOVHZ (R4)(R8), R14
128+ MOVHZ (R5)(R8), R15
129+ XOR R14, R15, R16
130+ MOVH R16, (R3)(R8)
131+ ADD $2 ,R8
132+ ADD $-2 ,R6
133+ xor1:
74134 CMP R6, R0
75- BEQ done
76- MOVD R6 , CTR // Setup loop counter
77-
78- loop :
135+ BC 12 ,2 ,LR // BEQLR
79136 MOVBZ (R4)(R8), R14 // R14 = a[i]
80137 MOVBZ (R5)(R8), R15 // R15 = b[i]
81138 XOR R14, R15, R16 // R16 = a[i] ^ b[i]
82139 MOVB R16, (R3)(R8) // Store to dst
83- ADD $ 1 , R8
84- BC 16 , 0 , loop // bdnz loop
85-
140+ #endif
86141done:
87142 RET
0 commit comments