@@ -490,43 +490,38 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
490490;
491491; AVX512F-LABEL: var_funnnel_v32i8:
492492; AVX512F: # %bb.0:
493- ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2
494- ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3
493+ ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2
494+ ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm3
495495; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm3
496- ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
497- ; AVX512F-NEXT: vpsubb %ymm1, %ymm2, %ymm1
498496; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1
499497; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
500- ; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm2
501- ; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm3
498+ ; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm2
499+ ; AVX512F-NEXT: vpsllw $6, %ymm0, %ymm3
502500; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm3
503501; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
504502; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
505- ; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm2
506- ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
507- ; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm3
508- ; AVX512F-NEXT: vpor %ymm2, %ymm3, %ymm2
503+ ; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm2
504+ ; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm3
505+ ; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm3
509506; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
510- ; AVX512F-NEXT: vpblendvb %ymm1, %ymm2 , %ymm0, %ymm0
507+ ; AVX512F-NEXT: vpblendvb %ymm1, %ymm3 , %ymm0, %ymm0
511508; AVX512F-NEXT: retq
512509;
513510; AVX512VL-LABEL: var_funnnel_v32i8:
514511; AVX512VL: # %bb.0:
515- ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2
516- ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3
512+ ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2
513+ ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm3
517514; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm3
518- ; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
519- ; AVX512VL-NEXT: vpsubb %ymm1, %ymm2, %ymm1
520515; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
521516; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
522- ; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm2
523- ; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3
517+ ; AVX512VL-NEXT: vpsrlw $2, %ymm0, %ymm2
518+ ; AVX512VL-NEXT: vpsllw $6, %ymm0, %ymm3
524519; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm3
525520; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
526521; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
527- ; AVX512VL-NEXT: vpsrlw $7 , %ymm0, %ymm2
528- ; AVX512VL-NEXT: vpaddb %ymm0 , %ymm0, %ymm3
529- ; AVX512VL-NEXT: vpternlogq $248 , {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3
522+ ; AVX512VL-NEXT: vpsrlw $1 , %ymm0, %ymm2
523+ ; AVX512VL-NEXT: vpsllw $7 , %ymm0, %ymm3
524+ ; AVX512VL-NEXT: vpternlogq $216 , {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4} , %ymm2, %ymm3
530525; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
531526; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
532527; AVX512VL-NEXT: retq
@@ -975,70 +970,65 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw
975970define <32 x i8 > @splatvar_funnnel_v32i8 (<32 x i8 > %x , <32 x i8 > %amt ) nounwind {
976971; AVX1-LABEL: splatvar_funnnel_v32i8:
977972; AVX1: # %bb.0:
978- ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
979- ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
980- ; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
981973; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
982974; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
983975; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
984- ; AVX1-NEXT: vpsllw %xmm1, %xmm3, %xmm3
985- ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
976+ ; AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm3
977+ ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
978+ ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
986979; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
987- ; AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm2
988- ; AVX1-NEXT: vpsrlw $8 , %xmm2, %xmm2
980+ ; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
981+ ; AVX1-NEXT: vpand %xmm4 , %xmm2, %xmm2
989982; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
990983; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
991- ; AVX1-NEXT: vpsllw %xmm1, %xmm3, %xmm3
992- ; AVX1-NEXT: vpsrlw $8 , %xmm3, %xmm3
984+ ; AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm3
985+ ; AVX1-NEXT: vpand %xmm4 , %xmm3, %xmm3
993986; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
994- ; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0
995- ; AVX1-NEXT: vpsrlw $8 , %xmm0, %xmm0
987+ ; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
988+ ; AVX1-NEXT: vpand %xmm4 , %xmm0, %xmm0
996989; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
997990; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
998991; AVX1-NEXT: retq
999992;
1000993; AVX2-LABEL: splatvar_funnnel_v32i8:
1001994; AVX2: # %bb.0:
1002- ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
1003- ; AVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1
995+ ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
1004996; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1005997; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1006- ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
1007- ; AVX2-NEXT: vpsllw %xmm1, %ymm2, %ymm2
1008- ; AVX2-NEXT: vpsrlw $8 , %ymm2, %ymm2
998+ ; AVX2-NEXT: vpsrlw %xmm1, % ymm2, %ymm2
999+ ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1000+ ; AVX2-NEXT: vpand %ymm3 , %ymm2, %ymm2
10091001; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
1010- ; AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0
1011- ; AVX2-NEXT: vpsrlw $8 , %ymm0, %ymm0
1002+ ; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
1003+ ; AVX2-NEXT: vpand %ymm3 , %ymm0, %ymm0
10121004; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
10131005; AVX2-NEXT: retq
10141006;
10151007; AVX512F-LABEL: splatvar_funnnel_v32i8:
10161008; AVX512F: # %bb.0:
1017- ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
1018- ; AVX512F-NEXT: vpsubb %xmm1, %xmm2, %xmm1
1009+ ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
10191010; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
10201011; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1021- ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
1022- ; AVX512F-NEXT: vpsllw %xmm1, %ymm2, %ymm2
1023- ; AVX512F-NEXT: vpsrlw $8 , %ymm2, %ymm2
1012+ ; AVX512F-NEXT: vpsrlw %xmm1, % ymm2, %ymm2
1013+ ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1014+ ; AVX512F-NEXT: vpand %ymm3 , %ymm2, %ymm2
10241015; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
1025- ; AVX512F-NEXT: vpsllw %xmm1, %ymm0, %ymm0
1026- ; AVX512F-NEXT: vpsrlw $8 , %ymm0, %ymm0
1016+ ; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
1017+ ; AVX512F-NEXT: vpand %ymm3 , %ymm0, %ymm0
10271018; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
10281019; AVX512F-NEXT: retq
10291020;
10301021; AVX512VL-LABEL: splatvar_funnnel_v32i8:
10311022; AVX512VL: # %bb.0:
1032- ; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
1033- ; AVX512VL-NEXT: vpsubb %xmm1, %xmm2, %xmm1
1023+ ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
10341024; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
10351025; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1036- ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
1037- ; AVX512VL-NEXT: vpsllw %xmm1, %ymm2, %ymm2
1038- ; AVX512VL-NEXT: vpsrlw $8 , %ymm2, %ymm2
1026+ ; AVX512VL-NEXT: vpsrlw %xmm1, % ymm2, %ymm2
1027+ ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1028+ ; AVX512VL-NEXT: vpand %ymm3 , %ymm2, %ymm2
10391029; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
1040- ; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0
1041- ; AVX512VL-NEXT: vpsrlw $8 , %ymm0, %ymm0
1030+ ; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
1031+ ; AVX512VL-NEXT: vpand %ymm3 , %ymm0, %ymm0
10421032; AVX512VL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
10431033; AVX512VL-NEXT: retq
10441034;
0 commit comments