@@ -740,16 +740,15 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX512-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
 ; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512-NEXT: # ymm3 = mem[0,1,0,1]
 ; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512-NEXT: vmovdqa %xmm0, 16(%rcx)
+; AVX512-NEXT: vmovdqa %xmm1, (%rcx)
 ; AVX512-NEXT: vmovdqa %xmm2, 32(%rcx)
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
 ;
 ; AVX512-FCP-LABEL: store_i8_stride3_vf16:
@@ -763,16 +762,15 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512-FCP-NEXT: # ymm3 = mem[0,1,0,1]
 ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512-FCP-NEXT: vmovdqa %xmm0, 16(%rcx)
+; AVX512-FCP-NEXT: vmovdqa %xmm1, (%rcx)
 ; AVX512-FCP-NEXT: vmovdqa %xmm2, 32(%rcx)
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512-FCP-NEXT: vzeroupper
 ; AVX512-FCP-NEXT: retq
 ;
 ; AVX512DQ-LABEL: store_i8_stride3_vf16:
@@ -786,16 +784,15 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512DQ-NEXT: vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512DQ-NEXT: vpshufb %xmm3, %xmm0, %xmm0
 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1]
 ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512DQ-NEXT: vmovdqa %xmm0, 16(%rcx)
+; AVX512DQ-NEXT: vmovdqa %xmm1, (%rcx)
 ; AVX512DQ-NEXT: vmovdqa %xmm2, 32(%rcx)
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512DQ-NEXT: vzeroupper
 ; AVX512DQ-NEXT: retq
 ;
 ; AVX512DQ-FCP-LABEL: store_i8_stride3_vf16:
@@ -809,16 +806,15 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,0,1]
 ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, 16(%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, (%rcx)
 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, 32(%rcx)
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512DQ-FCP-NEXT: vzeroupper
 ; AVX512DQ-FCP-NEXT: retq
 ;
 ; AVX512BW-LABEL: store_i8_stride3_vf16:
@@ -832,16 +828,15 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512BW-NEXT: # ymm3 = mem[0,1,0,1]
 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vmovdqa %xmm0, 16(%rcx)
+; AVX512BW-NEXT: vmovdqa %xmm1, (%rcx)
 ; AVX512BW-NEXT: vmovdqa %xmm2, 32(%rcx)
-; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512BW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512BW-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512BW-FCP-LABEL: store_i8_stride3_vf16:
@@ -855,16 +850,15 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512BW-FCP-NEXT: # ymm3 = mem[0,1,0,1]
 ; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-FCP-NEXT: vmovdqa %xmm0, 16(%rcx)
+; AVX512BW-FCP-NEXT: vmovdqa %xmm1, (%rcx)
 ; AVX512BW-FCP-NEXT: vmovdqa %xmm2, 32(%rcx)
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512BW-FCP-NEXT: vzeroupper
 ; AVX512BW-FCP-NEXT: retq
 ;
 ; AVX512DQ-BW-LABEL: store_i8_stride3_vf16:
@@ -878,16 +872,15 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512DQ-BW-NEXT: # ymm3 = mem[0,1,0,1]
 ; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512DQ-BW-NEXT: vmovdqa %xmm0, 16(%rcx)
+; AVX512DQ-BW-NEXT: vmovdqa %xmm1, (%rcx)
 ; AVX512DQ-BW-NEXT: vmovdqa %xmm2, 32(%rcx)
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-BW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512DQ-BW-NEXT: vzeroupper
 ; AVX512DQ-BW-NEXT: retq
 ;
 ; AVX512DQ-BW-FCP-LABEL: store_i8_stride3_vf16:
@@ -901,16 +894,15 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512DQ-BW-FCP-NEXT: # ymm3 = mem[0,1,0,1]
 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, 16(%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm1, (%rcx)
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, 32(%rcx)
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vzeroupper
 ; AVX512DQ-BW-FCP-NEXT: retq
   %in.vec0 = load <16 x i8>, ptr %in.vecptr0, align 64
   %in.vec1 = load <16 x i8>, ptr %in.vecptr1, align 64