diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 0c7e20fc1ebf3..67fb68a3eee83 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -211,6 +211,10 @@ multiclass VOP2Inst_e64_t16 { let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in { defm NAME : VOP2Inst; + let SubtargetPredicate = isGFX10Only in { + def _vop3_e64 : VOP3InstBase , node, 1>, + Commutable_REV; + } } let SubtargetPredicate = UseRealTrue16Insts in { defm _t16 : VOP2Inst_e64, node, revOp#"_t16">; diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 0252c4f1b0929..597202d47591d 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -1932,16 +1932,14 @@ defm V_DIV_FIXUP_F16 : defm V_ADD_NC_U16 : VOP3OpSel_Real_gfx10<0x303>; defm V_SUB_NC_U16 : VOP3OpSel_Real_gfx10<0x304>; -// FIXME-GFX10-OPSEL: Need to add "selective" opsel support to some of these -// (they do not support SDWA or DPP). -defm V_MUL_LO_U16 : VOP3_Real_gfx10_with_name<0x305, "V_MUL_LO_U16", "v_mul_lo_u16">; -defm V_LSHRREV_B16 : VOP3_Real_gfx10_with_name<0x307, "V_LSHRREV_B16", "v_lshrrev_b16">; -defm V_ASHRREV_I16 : VOP3_Real_gfx10_with_name<0x308, "V_ASHRREV_I16", "v_ashrrev_i16">; -defm V_MAX_U16 : VOP3_Real_gfx10_with_name<0x309, "V_MAX_U16", "v_max_u16">; -defm V_MAX_I16 : VOP3_Real_gfx10_with_name<0x30a, "V_MAX_I16", "v_max_i16">; -defm V_MIN_U16 : VOP3_Real_gfx10_with_name<0x30b, "V_MIN_U16", "v_min_u16">; -defm V_MIN_I16 : VOP3_Real_gfx10_with_name<0x30c, "V_MIN_I16", "v_min_i16">; -defm V_LSHLREV_B16 : VOP3_Real_gfx10_with_name<0x314, "V_LSHLREV_B16", "v_lshlrev_b16">; +defm V_MUL_LO_U16 : VOP3OpSel_Real_gfx10_with_name<0x305, "V_MUL_LO_U16_vop3", "v_mul_lo_u16">; +defm V_LSHRREV_B16 : VOP3OpSel_Real_gfx10_with_name<0x307, "V_LSHRREV_B16_vop3", "v_lshrrev_b16">; +defm V_ASHRREV_I16 : VOP3OpSel_Real_gfx10_with_name<0x308, "V_ASHRREV_I16_vop3", "v_ashrrev_i16">; +defm V_MAX_U16 : VOP3OpSel_Real_gfx10_with_name<0x309, "V_MAX_U16_vop3", "v_max_u16">; +defm V_MAX_I16 : VOP3OpSel_Real_gfx10_with_name<0x30a, "V_MAX_I16_vop3", "v_max_i16">; +defm V_MIN_U16 : VOP3OpSel_Real_gfx10_with_name<0x30b, "V_MIN_U16_vop3", "v_min_u16">; +defm V_MIN_I16 : VOP3OpSel_Real_gfx10_with_name<0x30c, "V_MIN_I16_vop3", "v_min_i16">; +defm V_LSHLREV_B16 : VOP3OpSel_Real_gfx10_with_name<0x314, "V_LSHLREV_B16_vop3", "v_lshlrev_b16">; defm V_PERMLANE16_B32 : VOP3OpSel_Real_gfx10<0x377>; defm V_PERMLANEX16_B32 : VOP3OpSel_Real_gfx10<0x378>; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll index fc81e16d68e98..81153dbefb360 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -864,25 +864,25 @@ define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX10-NEXT: v_xor_b32_e32 v7, -1, v2 ; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX10-NEXT: v_xor_b32_e32 v6, -1, v4 +; GFX10-NEXT: v_xor_b32_e32 v5, -1, v4 ; GFX10-NEXT: v_and_b32_e32 v4, 7, v4 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1 ; GFX10-NEXT: v_lshrrev_b16 v3, 1, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 7, v6 -; GFX10-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX10-NEXT: v_lshlrev_b16 v4, v4, v5 +; GFX10-NEXT: v_and_b32_e32 v5, 7, v5 +; GFX10-NEXT: v_lshlrev_b16 v4, v4, v6 ; GFX10-NEXT: v_lshlrev_b16 v0, v2, v0 -; GFX10-NEXT: v_lshrrev_b16 v3, v6, v3 -; GFX10-NEXT: v_lshrrev_b16 v1, v7, v1 -; GFX10-NEXT: v_or_b32_e32 v2, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, 0xff +; GFX10-NEXT: v_lshrrev_b16 v3, v5, v3 +; GFX10-NEXT: v_and_b32_e32 v5, 7, v7 +; GFX10-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX10-NEXT: v_lshrrev_b16 v1, v5, v1 +; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX10-NEXT: v_and_b32_sdwa v1, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b16 v1, 8, v2 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll index 238cc06fc7f7c..c5078c2283203 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -864,25 +864,25 @@ define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX10-NEXT: v_xor_b32_e32 v7, -1, v2 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 -; GFX10-NEXT: v_xor_b32_e32 v6, -1, v3 +; GFX10-NEXT: v_xor_b32_e32 v5, -1, v3 ; GFX10-NEXT: v_lshlrev_b16 v4, 1, v4 ; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 -; GFX10-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX10-NEXT: v_and_b32_e32 v6, 0xff, v6 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX10-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 7, v5 ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX10-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX10-NEXT: v_lshrrev_b16 v3, v3, v5 -; GFX10-NEXT: v_lshlrev_b16 v4, v6, v4 +; GFX10-NEXT: v_lshrrev_b16 v3, v3, v6 +; GFX10-NEXT: v_lshlrev_b16 v4, v5, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 7, v7 ; GFX10-NEXT: v_lshrrev_b16 v1, v2, v1 -; GFX10-NEXT: v_lshlrev_b16 v0, v7, v0 -; GFX10-NEXT: v_or_b32_e32 v2, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, 0xff +; GFX10-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX10-NEXT: v_lshlrev_b16 v0, v5, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX10-NEXT: v_and_b32_sdwa v1, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b16 v1, 8, v2 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll index 1701a9cc7f09b..5874cebe46a37 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll @@ -71,17 +71,17 @@ define amdgpu_ps void @insertelement_s_v2i8_s_s(ptr addrspace(4) inreg %ptr, i8 ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, 1 -; GFX10-NEXT: v_mov_b32_e32 v2, 0xff ; GFX10-NEXT: global_load_ushort v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s4, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, 0 -; GFX10-NEXT: v_and_b32_sdwa v2, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_cndmask_b32_e64 v3, v0, s4, s0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s4, s0 +; GFX10-NEXT: v_lshlrev_b16 v3, 8, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: global_store_short v[0:1], v2, off ; GFX10-NEXT: s_endpgm ; @@ -175,16 +175,16 @@ define amdgpu_ps void @insertelement_v_v2i8_s_s(ptr addrspace(1) %ptr, i8 inreg ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_ushort v0, v[0:1], off ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, 1 -; GFX10-NEXT: v_mov_b32_e32 v2, 0xff ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s2, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, 0 -; GFX10-NEXT: v_and_b32_sdwa v2, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_cndmask_b32_e64 v3, v0, s2, s0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s2, s0 +; GFX10-NEXT: v_lshlrev_b16 v3, 8, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: global_store_short v[0:1], v2, off ; GFX10-NEXT: s_endpgm ; @@ -277,17 +277,17 @@ define amdgpu_ps void @insertelement_s_v2i8_v_s(ptr addrspace(4) inreg %ptr, i8 ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 1 -; GFX10-NEXT: v_mov_b32_e32 v3, 0xff ; GFX10-NEXT: global_load_ushort v1, v1, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 0 -; GFX10-NEXT: v_and_b32_sdwa v2, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b16 v2, 8, v2 +; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: global_store_short v[0:1], v2, off ; GFX10-NEXT: s_endpgm ; @@ -383,17 +383,17 @@ define amdgpu_ps void @insertelement_s_v2i8_s_v(ptr addrspace(4) inreg %ptr, i8 ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, 0xff ; GFX10-NEXT: global_load_ushort v1, v1, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s4, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: v_and_b32_sdwa v2, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_cndmask_b32_e64 v4, v1, s4, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v1, s4, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b16 v2, 8, v2 +; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: global_store_short v[0:1], v2, off ; GFX10-NEXT: s_endpgm ; @@ -487,16 +487,16 @@ define amdgpu_ps void @insertelement_s_v2i8_v_v(ptr addrspace(4) inreg %ptr, i8 ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, 0xff ; GFX10-NEXT: global_load_ushort v2, v2, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX10-NEXT: v_and_b32_sdwa v3, v3, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_lshlrev_b16 v3, 8, v3 ; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: global_store_short v[0:1], v2, off ; GFX10-NEXT: s_endpgm @@ -590,13 +590,13 @@ define amdgpu_ps void @insertelement_v_v2i8_s_v(ptr addrspace(1) %ptr, i8 inreg ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_ushort v0, v[0:1], off ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 -; GFX10-NEXT: v_mov_b32_e32 v3, 0xff ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX10-NEXT: v_and_b32_sdwa v3, v1, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s2, vcc_lo +; GFX10-NEXT: v_lshlrev_b16 v3, 8, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -689,13 +689,13 @@ define amdgpu_ps void @insertelement_v_v2i8_v_s(ptr addrspace(1) %ptr, i8 %val, ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_ushort v0, v[0:1], off ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 -; GFX10-NEXT: v_mov_b32_e32 v3, 0xff ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0 -; GFX10-NEXT: v_and_b32_sdwa v3, v1, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo +; GFX10-NEXT: v_lshlrev_b16 v3, 8, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -788,13 +788,13 @@ define amdgpu_ps void @insertelement_v_v2i8_v_v(ptr addrspace(1) %ptr, i8 %val, ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_ushort v0, v[0:1], off ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 -; GFX10-NEXT: v_mov_b32_e32 v4, 0xff ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX10-NEXT: v_and_b32_sdwa v3, v1, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo +; GFX10-NEXT: v_lshlrev_b16 v3, 8, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.s16.mir index 4c3f4d9b06ed1..461021112cfef 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.s16.mir @@ -100,7 +100,7 @@ body: | ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX10-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX10-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_vop3_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_ASHRREV_I16_e64_]] ; GFX11-LABEL: name: ashr_s16_s16_vs ; GFX11: liveins: $sgpr0, $vgpr0 @@ -193,7 +193,7 @@ body: | ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX10-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX10-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_vop3_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_ASHRREV_I16_e64_]] ; GFX11-LABEL: name: ashr_s16_s16_vv ; GFX11: liveins: $vgpr0, $vgpr1 @@ -238,7 +238,7 @@ body: | ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX10-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX10-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_vop3_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 ; GFX10-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_]], [[V_ASHRREV_I16_e64_]], implicit $exec ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_AND_B32_e64_]] @@ -292,7 +292,7 @@ body: | ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX10-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX10-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_vop3_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 ; GFX10-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_1]], [[V_ASHRREV_I16_e64_]], implicit $exec @@ -442,7 +442,7 @@ body: | ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX10-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX10-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_vop3_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_ASHRREV_I16_e64_]] ; GFX11-LABEL: name: ashr_s16_s16_sv ; GFX11: liveins: $sgpr0, $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.s16.mir index 4769b5f77e3b2..c17b32d5c1676 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.s16.mir @@ -98,7 +98,7 @@ body: | ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX10-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX10-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_vop3_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_LSHRREV_B16_e64_]] ; GFX11-LABEL: name: lshr_s16_s16_vs ; GFX11: liveins: $sgpr0, $vgpr0 @@ -191,7 +191,7 @@ body: | ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX10-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX10-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_vop3_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_LSHRREV_B16_e64_]] ; GFX11-LABEL: name: lshr_s16_s16_vv ; GFX11: liveins: $vgpr0, $vgpr1 @@ -236,7 +236,7 @@ body: | ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX10-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX10-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_vop3_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 ; GFX10-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_]], [[V_LSHRREV_B16_e64_]], implicit $exec ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_AND_B32_e64_]] @@ -290,7 +290,7 @@ body: | ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX10-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX10-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_vop3_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 ; GFX10-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_1]], [[V_LSHRREV_B16_e64_]], implicit $exec @@ -440,7 +440,7 @@ body: | ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX10-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX10-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_vop3_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_LSHRREV_B16_e64_]] ; GFX11-LABEL: name: lshr_s16_s16_sv ; GFX11: liveins: $sgpr0, $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-smed3.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-smed3.s16.mir index 19143c52b3f43..db5490ac7b90c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-smed3.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-smed3.s16.mir @@ -1,7 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX8 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX9 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX10 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GFX11 %s --- @@ -34,6 +34,15 @@ body: | ; GFX9-NEXT: [[V_MED3_I16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_I16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec ; GFX9-NEXT: S_ENDPGM 0, implicit [[V_MED3_I16_e64_]] ; + ; GFX10-LABEL: name: smed3_s16_vvv + ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX10-NEXT: [[V_MED3_I16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_I16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec + ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_MED3_I16_e64_]] + ; ; GFX11-LABEL: name: smed3_s16_vvv ; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} @@ -88,6 +97,16 @@ body: | ; GFX9-NEXT: [[V_MED3_I16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_I16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec ; GFX9-NEXT: S_ENDPGM 0, implicit [[V_MED3_I16_e64_]], implicit [[V_MAX_I16_e64_]] ; + ; GFX10-LABEL: name: smed3_s16_vvv_multiuse0 + ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX10-NEXT: [[V_MAX_I16_vop3_e64_:%[0-9]+]]:vgpr_32 = V_MAX_I16_vop3_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec + ; GFX10-NEXT: [[V_MED3_I16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_I16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec + ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_MED3_I16_e64_]], implicit [[V_MAX_I16_vop3_e64_]] + ; ; GFX11-LABEL: name: smed3_s16_vvv_multiuse0 ; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} @@ -143,6 +162,16 @@ body: | ; GFX9-NEXT: [[V_MED3_I16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_I16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec ; GFX9-NEXT: S_ENDPGM 0, implicit [[V_MED3_I16_e64_]], implicit [[V_MIN_I16_e64_]] ; + ; GFX10-LABEL: name: smed3_s16_vvv_multiuse1 + ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX10-NEXT: [[V_MIN_I16_vop3_e64_:%[0-9]+]]:vgpr_32 = V_MIN_I16_vop3_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec + ; GFX10-NEXT: [[V_MED3_I16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_I16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec + ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_MED3_I16_e64_]], implicit [[V_MIN_I16_vop3_e64_]] + ; ; GFX11-LABEL: name: smed3_s16_vvv_multiuse1 ; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} @@ -199,6 +228,17 @@ body: | ; GFX9-NEXT: [[V_MED3_I16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_I16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec ; GFX9-NEXT: S_ENDPGM 0, implicit [[V_MED3_I16_e64_]], implicit [[V_MAX_I16_e64_]] ; + ; GFX10-LABEL: name: smed3_s16_vvv_multiuse2 + ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX10-NEXT: [[V_MIN_I16_vop3_e64_:%[0-9]+]]:vgpr_32 = V_MIN_I16_vop3_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec + ; GFX10-NEXT: [[V_MAX_I16_vop3_e64_:%[0-9]+]]:vgpr_32 = V_MAX_I16_vop3_e64 0, [[V_MIN_I16_vop3_e64_]], 0, [[COPY2]], 0, 0, implicit $exec + ; GFX10-NEXT: [[V_MED3_I16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_I16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec + ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_MED3_I16_e64_]], implicit [[V_MAX_I16_vop3_e64_]] + ; ; GFX11-LABEL: name: smed3_s16_vvv_multiuse2 ; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-umed3.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-umed3.s16.mir index b7f48d34b8f96..c3dd6e8e521db 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-umed3.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-umed3.s16.mir @@ -1,7 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX8 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX9 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX10 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GFX11 %s --- @@ -34,6 +34,15 @@ body: | ; GFX9-NEXT: [[V_MED3_U16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_U16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec ; GFX9-NEXT: S_ENDPGM 0, implicit [[V_MED3_U16_e64_]] ; + ; GFX10-LABEL: name: umed3_s16_vvv + ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX10-NEXT: [[V_MED3_U16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_U16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec + ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_MED3_U16_e64_]] + ; ; GFX11-LABEL: name: umed3_s16_vvv ; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} @@ -88,6 +97,16 @@ body: | ; GFX9-NEXT: [[V_MED3_U16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_U16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec ; GFX9-NEXT: S_ENDPGM 0, implicit [[V_MED3_U16_e64_]], implicit [[V_MAX_U16_e64_]] ; + ; GFX10-LABEL: name: umed3_s16_vvv_multiuse0 + ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX10-NEXT: [[V_MAX_U16_vop3_e64_:%[0-9]+]]:vgpr_32 = V_MAX_U16_vop3_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec + ; GFX10-NEXT: [[V_MED3_U16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_U16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec + ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_MED3_U16_e64_]], implicit [[V_MAX_U16_vop3_e64_]] + ; ; GFX11-LABEL: name: umed3_s16_vvv_multiuse0 ; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} @@ -143,6 +162,16 @@ body: | ; GFX9-NEXT: [[V_MED3_U16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_U16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec ; GFX9-NEXT: S_ENDPGM 0, implicit [[V_MED3_U16_e64_]], implicit [[V_MIN_U16_e64_]] ; + ; GFX10-LABEL: name: umed3_s16_vvv_multiuse1 + ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX10-NEXT: [[V_MIN_U16_vop3_e64_:%[0-9]+]]:vgpr_32 = V_MIN_U16_vop3_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec + ; GFX10-NEXT: [[V_MED3_U16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_U16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec + ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_MED3_U16_e64_]], implicit [[V_MIN_U16_vop3_e64_]] + ; ; GFX11-LABEL: name: umed3_s16_vvv_multiuse1 ; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} @@ -199,6 +228,17 @@ body: | ; GFX9-NEXT: [[V_MED3_U16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_U16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec ; GFX9-NEXT: S_ENDPGM 0, implicit [[V_MED3_U16_e64_]], implicit [[V_MAX_U16_e64_]] ; + ; GFX10-LABEL: name: umed3_s16_vvv_multiuse2 + ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX10-NEXT: [[V_MIN_U16_vop3_e64_:%[0-9]+]]:vgpr_32 = V_MIN_U16_vop3_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec + ; GFX10-NEXT: [[V_MAX_U16_vop3_e64_:%[0-9]+]]:vgpr_32 = V_MAX_U16_vop3_e64 0, [[V_MIN_U16_vop3_e64_]], 0, [[COPY2]], 0, 0, implicit $exec + ; GFX10-NEXT: [[V_MED3_U16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_U16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec + ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_MED3_U16_e64_]], implicit [[V_MAX_U16_vop3_e64_]] + ; ; GFX11-LABEL: name: umed3_s16_vvv_multiuse2 ; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-shl.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-shl.s16.mir index 73f164ed10df1..632b68fe80b2c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-shl.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-shl.s16.mir @@ -36,6 +36,7 @@ body: | ; GFX8-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY1]](s32) ; GFX8-NEXT: [[SHL:%[0-9]+]]:sgpr(s16) = G_SHL [[TRUNC]], [[TRUNC1]](s16) ; GFX8-NEXT: S_ENDPGM 0, implicit [[SHL]](s16) + ; ; GFX9-LABEL: name: shl_s16_s16_ss ; GFX9: liveins: $sgpr0, $sgpr1 ; GFX9-NEXT: {{ $}} @@ -45,6 +46,7 @@ body: | ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY1]](s32) ; GFX9-NEXT: [[SHL:%[0-9]+]]:sgpr(s16) = G_SHL [[TRUNC]], [[TRUNC1]](s16) ; GFX9-NEXT: S_ENDPGM 0, implicit [[SHL]](s16) + ; ; GFX10-LABEL: name: shl_s16_s16_ss ; GFX10: liveins: $sgpr0, $sgpr1 ; GFX10-NEXT: {{ $}} @@ -54,6 +56,7 @@ body: | ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY1]](s32) ; GFX10-NEXT: [[SHL:%[0-9]+]]:sgpr(s16) = G_SHL [[TRUNC]], [[TRUNC1]](s16) ; GFX10-NEXT: S_ENDPGM 0, implicit [[SHL]](s16) + ; ; GFX11-LABEL: name: shl_s16_s16_ss ; GFX11: liveins: $sgpr0, $sgpr1 ; GFX11-NEXT: {{ $}} @@ -86,6 +89,7 @@ body: | ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 ; GFX8-NEXT: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec ; GFX8-NEXT: S_ENDPGM 0, implicit [[V_LSHLREV_B16_e64_]] + ; ; GFX9-LABEL: name: shl_s16_s16_vs ; GFX9: liveins: $sgpr0, $vgpr0 ; GFX9-NEXT: {{ $}} @@ -93,13 +97,15 @@ body: | ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 ; GFX9-NEXT: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec ; GFX9-NEXT: S_ENDPGM 0, implicit [[V_LSHLREV_B16_e64_]] + ; ; GFX10-LABEL: name: shl_s16_s16_vs ; GFX10: liveins: $sgpr0, $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX10-NEXT: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec - ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_LSHLREV_B16_e64_]] + ; GFX10-NEXT: [[V_LSHLREV_B16_vop3_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_vop3_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec + ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_LSHLREV_B16_vop3_e64_]] + ; ; GFX11-LABEL: name: shl_s16_s16_vs ; GFX11: liveins: $sgpr0, $vgpr0 ; GFX11-NEXT: {{ $}} @@ -132,6 +138,7 @@ body: | ; GFX8-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) ; GFX8-NEXT: [[SHL:%[0-9]+]]:vgpr(s16) = G_SHL [[TRUNC]], [[COPY1]](s32) ; GFX8-NEXT: S_ENDPGM 0, implicit [[SHL]](s16) + ; ; GFX9-LABEL: name: shl_s16_s32_vv ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} @@ -140,6 +147,7 @@ body: | ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) ; GFX9-NEXT: [[SHL:%[0-9]+]]:vgpr(s16) = G_SHL [[TRUNC]], [[COPY1]](s32) ; GFX9-NEXT: S_ENDPGM 0, implicit [[SHL]](s16) + ; ; GFX10-LABEL: name: shl_s16_s32_vv ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10-NEXT: {{ $}} @@ -148,6 +156,7 @@ body: | ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) ; GFX10-NEXT: [[SHL:%[0-9]+]]:vgpr(s16) = G_SHL [[TRUNC]], [[COPY1]](s32) ; GFX10-NEXT: S_ENDPGM 0, implicit [[SHL]](s16) + ; ; GFX11-LABEL: name: shl_s16_s32_vv ; GFX11: liveins: $vgpr0, $vgpr1 ; GFX11-NEXT: {{ $}} @@ -179,6 +188,7 @@ body: | ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8-NEXT: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec ; GFX8-NEXT: S_ENDPGM 0, implicit [[V_LSHLREV_B16_e64_]] + ; ; GFX9-LABEL: name: shl_s16_s16_vv ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} @@ -186,13 +196,15 @@ body: | ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9-NEXT: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec ; GFX9-NEXT: S_ENDPGM 0, implicit [[V_LSHLREV_B16_e64_]] + ; ; GFX10-LABEL: name: shl_s16_s16_vv ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX10-NEXT: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec - ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_LSHLREV_B16_e64_]] + ; GFX10-NEXT: [[V_LSHLREV_B16_vop3_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_vop3_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec + ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_LSHLREV_B16_vop3_e64_]] + ; ; GFX11-LABEL: name: shl_s16_s16_vv ; GFX11: liveins: $vgpr0, $vgpr1 ; GFX11-NEXT: {{ $}} @@ -224,6 +236,7 @@ body: | ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8-NEXT: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec ; GFX8-NEXT: S_ENDPGM 0, implicit [[V_LSHLREV_B16_e64_]] + ; ; GFX9-LABEL: name: shl_s16_s16_vv_zext_to_s32 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} @@ -231,15 +244,17 @@ body: | ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9-NEXT: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec ; GFX9-NEXT: S_ENDPGM 0, implicit [[V_LSHLREV_B16_e64_]] + ; ; GFX10-LABEL: name: shl_s16_s16_vv_zext_to_s32 ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX10-NEXT: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX10-NEXT: [[V_LSHLREV_B16_vop3_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_vop3_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 - ; GFX10-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_]], [[V_LSHLREV_B16_e64_]], implicit $exec + ; GFX10-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_]], [[V_LSHLREV_B16_vop3_e64_]], implicit $exec ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_AND_B32_e64_]] + ; ; GFX11-LABEL: name: shl_s16_s16_vv_zext_to_s32 ; GFX11: liveins: $vgpr0, $vgpr1 ; GFX11-NEXT: {{ $}} @@ -276,6 +291,7 @@ body: | ; GFX8-NEXT: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_LSHLREV_B16_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1 ; GFX8-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]] + ; ; GFX9-LABEL: name: shl_s16_vv_zext_to_s64 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} @@ -285,18 +301,20 @@ body: | ; GFX9-NEXT: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_LSHLREV_B16_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1 ; GFX9-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]] + ; ; GFX10-LABEL: name: shl_s16_vv_zext_to_s64 ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX10-NEXT: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX10-NEXT: [[V_LSHLREV_B16_vop3_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_vop3_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 - ; GFX10-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_1]], [[V_LSHLREV_B16_e64_]], implicit $exec + ; GFX10-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_1]], [[V_LSHLREV_B16_vop3_e64_]], implicit $exec ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1 ; GFX10-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]] + ; ; GFX11-LABEL: name: shl_s16_vv_zext_to_s64 ; GFX11: liveins: $vgpr0, $vgpr1 ; GFX11-NEXT: {{ $}} @@ -335,6 +353,7 @@ body: | ; GFX8-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) ; GFX8-NEXT: [[SHL:%[0-9]+]]:sgpr(s16) = G_SHL [[TRUNC]], [[COPY1]](s32) ; GFX8-NEXT: S_ENDPGM 0, implicit [[SHL]](s16) + ; ; GFX9-LABEL: name: shl_s16_s32_ss ; GFX9: liveins: $sgpr0, $sgpr1 ; GFX9-NEXT: {{ $}} @@ -343,6 +362,7 @@ body: | ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) ; GFX9-NEXT: [[SHL:%[0-9]+]]:sgpr(s16) = G_SHL [[TRUNC]], [[COPY1]](s32) ; GFX9-NEXT: S_ENDPGM 0, implicit [[SHL]](s16) + ; ; GFX10-LABEL: name: shl_s16_s32_ss ; GFX10: liveins: $sgpr0, $sgpr1 ; GFX10-NEXT: {{ $}} @@ -351,6 +371,7 @@ body: | ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) ; GFX10-NEXT: [[SHL:%[0-9]+]]:sgpr(s16) = G_SHL [[TRUNC]], [[COPY1]](s32) ; GFX10-NEXT: S_ENDPGM 0, implicit [[SHL]](s16) + ; ; GFX11-LABEL: name: shl_s16_s32_ss ; GFX11: liveins: $sgpr0, $sgpr1 ; GFX11-NEXT: {{ $}} @@ -382,6 +403,7 @@ body: | ; GFX8-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) ; GFX8-NEXT: [[SHL:%[0-9]+]]:vgpr(s16) = G_SHL [[TRUNC]], [[COPY1]](s32) ; GFX8-NEXT: S_ENDPGM 0, implicit [[SHL]](s16) + ; ; GFX9-LABEL: name: shl_s16_s32_sv ; GFX9: liveins: $sgpr0, $vgpr0 ; GFX9-NEXT: {{ $}} @@ -390,6 +412,7 @@ body: | ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) ; GFX9-NEXT: [[SHL:%[0-9]+]]:vgpr(s16) = G_SHL [[TRUNC]], [[COPY1]](s32) ; GFX9-NEXT: S_ENDPGM 0, implicit [[SHL]](s16) + ; ; GFX10-LABEL: name: shl_s16_s32_sv ; GFX10: liveins: $sgpr0, $vgpr0 ; GFX10-NEXT: {{ $}} @@ -398,6 +421,7 @@ body: | ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) ; GFX10-NEXT: [[SHL:%[0-9]+]]:vgpr(s16) = G_SHL [[TRUNC]], [[COPY1]](s32) ; GFX10-NEXT: S_ENDPGM 0, implicit [[SHL]](s16) + ; ; GFX11-LABEL: name: shl_s16_s32_sv ; GFX11: liveins: $sgpr0, $vgpr0 ; GFX11-NEXT: {{ $}} @@ -428,6 +452,7 @@ body: | ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8-NEXT: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec ; GFX8-NEXT: S_ENDPGM 0, implicit [[V_LSHLREV_B16_e64_]] + ; ; GFX9-LABEL: name: shl_s16_s16_sv ; GFX9: liveins: $sgpr0, $vgpr0 ; GFX9-NEXT: {{ $}} @@ -435,13 +460,15 @@ body: | ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec ; GFX9-NEXT: S_ENDPGM 0, implicit [[V_LSHLREV_B16_e64_]] + ; ; GFX10-LABEL: name: shl_s16_s16_sv ; GFX10: liveins: $sgpr0, $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX10-NEXT: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec - ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_LSHLREV_B16_e64_]] + ; GFX10-NEXT: [[V_LSHLREV_B16_vop3_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_vop3_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec + ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_LSHLREV_B16_vop3_e64_]] + ; ; GFX11-LABEL: name: shl_s16_s16_sv ; GFX11: liveins: $sgpr0, $vgpr0 ; GFX11-NEXT: {{ $}} @@ -473,6 +500,7 @@ body: | ; GFX8-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) ; GFX8-NEXT: [[SHL:%[0-9]+]]:vgpr(s16) = G_SHL [[TRUNC]], [[COPY1]](s32) ; GFX8-NEXT: S_ENDPGM 0, implicit [[SHL]](s16) + ; ; GFX9-LABEL: name: shl_s16_s32_vs ; GFX9: liveins: $sgpr0, $vgpr0 ; GFX9-NEXT: {{ $}} @@ -481,6 +509,7 @@ body: | ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) ; GFX9-NEXT: [[SHL:%[0-9]+]]:vgpr(s16) = G_SHL [[TRUNC]], [[COPY1]](s32) ; GFX9-NEXT: S_ENDPGM 0, implicit [[SHL]](s16) + ; ; GFX10-LABEL: name: shl_s16_s32_vs ; GFX10: liveins: $sgpr0, $vgpr0 ; GFX10-NEXT: {{ $}} @@ -489,6 +518,7 @@ body: | ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) ; GFX10-NEXT: [[SHL:%[0-9]+]]:vgpr(s16) = G_SHL [[TRUNC]], [[COPY1]](s32) ; GFX10-NEXT: S_ENDPGM 0, implicit [[SHL]](s16) + ; ; GFX11-LABEL: name: shl_s16_s32_vs ; GFX11: liveins: $sgpr0, $vgpr0 ; GFX11-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll index 832f066adaa84..a9397b3c33b88 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -315,7 +315,8 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX10-NEXT: v_pk_add_i16 v0, v0, v1 clamp ; GFX10-NEXT: v_mov_b32_e32 v1, 0xff ; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -458,7 +459,8 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX10-NEXT: v_mov_b32_e32 v1, 0xff ; GFX10-NEXT: v_pk_add_i16 v0, s0, s1 clamp ; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll index 02f8d0bf3c3df..e5ec9e48b9a63 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -2579,26 +2579,27 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v2 ; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s1, v2 +; GFX10-NEXT: s_xor_b32 s1, s11, s2 ; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v1 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s3, v3 -; GFX10-NEXT: s_xor_b32 s1, s11, s2 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo -; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s3, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo +; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s3, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 ; GFX10-NEXT: v_xor_b32_e32 v0, s1, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v4, s0 ; GFX10-NEXT: v_xor_b32_e32 v2, s11, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v4, s0 ; GFX10-NEXT: s_xor_b32 s0, s12, s10 -; GFX10-NEXT: v_mov_b32_e32 v4, 0xff -; GFX10-NEXT: v_xor_b32_e32 v1, s0, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s1, v0 -; GFX10-NEXT: v_xor_b32_e32 v3, s12, v3 ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s11, v2 +; GFX10-NEXT: v_xor_b32_e32 v1, s0, v1 +; GFX10-NEXT: v_xor_b32_e32 v3, s12, v3 +; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s12, v3 -; GFX10-NEXT: v_and_b32_sdwa v2, v2, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0 +; GFX10-NEXT: v_lshlrev_b16 v2, 8, v2 ; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll index 2673ac4fb5bae..4a58a6ae62657 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -316,7 +316,8 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX10-NEXT: v_pk_sub_i16 v0, v0, v1 clamp ; GFX10-NEXT: v_mov_b32_e32 v1, 0xff ; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -459,7 +460,8 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX10-NEXT: v_mov_b32_e32 v1, 0xff ; GFX10-NEXT: v_pk_sub_i16 v0, s0, s1 clamp ; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll index d9158e3558395..18938b8afc353 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll @@ -315,7 +315,8 @@ define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 clamp ; GFX10-NEXT: v_mov_b32_e32 v1, 0xff ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -439,7 +440,8 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX10-NEXT: v_mov_b32_e32 v1, 0xff ; GFX10-NEXT: v_pk_add_u16 v0, s0, s1 clamp ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll index 1aaf3122cc00d..c5a3800b2d6ca 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -2032,7 +2032,9 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; ; GFX10-LABEL: udivrem_v2i8: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dword s0, s[8:9], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v0, s0 ; GFX10-NEXT: s_bfe_u32 s1, s0, 0x80010 @@ -2040,17 +2042,16 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, s1 ; GFX10-NEXT: s_sub_i32 s3, 0, s2 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX10-NEXT: s_sub_i32 s4, 0, s1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_lo_u32 v2, s3, v0 +; GFX10-NEXT: s_sub_i32 s3, 0, s1 +; GFX10-NEXT: v_mul_lo_u32 v3, s3, v1 ; GFX10-NEXT: s_bfe_u32 s3, s0, 0x80008 ; GFX10-NEXT: s_and_b32 s0, s0, 0xff -; GFX10-NEXT: v_mul_lo_u32 v3, s4, v1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 @@ -2060,34 +2061,34 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_mul_lo_u32 v2, v0, s2 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX10-NEXT: v_mul_lo_u32 v3, v1, s1 -; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, s3, v2 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, s0, v3 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s2, v2 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s1, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s1, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v5, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s0 -; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s2, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s1, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s1, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s2, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v4, 0xff -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v5, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s0 -; GFX10-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_and_b32_sdwa v2, v2, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo +; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s1, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v0 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v7, s0 +; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s1, v3 +; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo +; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0 +; GFX10-NEXT: v_lshlrev_b16 v2, 8, v2 ; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_short v1, v0, s[4:5] ; GFX10-NEXT: global_store_short v1, v2, s[6:7] ; GFX10-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll index 1fd139b06417f..c3c3eb3165167 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll @@ -309,7 +309,8 @@ define i16 @v_usubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX10-NEXT: v_pk_sub_u16 v0, v0, v1 clamp ; GFX10-NEXT: v_mov_b32_e32 v1, 0xff ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -431,7 +432,8 @@ define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX10-NEXT: v_mov_b32_e32 v1, 0xff ; GFX10-NEXT: v_pk_sub_u16 v0, s0, s1 clamp ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/freeze.ll b/llvm/test/CodeGen/AMDGPU/freeze.ll index 2040aedc250e6..98ff6214da3f8 100644 --- a/llvm/test/CodeGen/AMDGPU/freeze.ll +++ b/llvm/test/CodeGen/AMDGPU/freeze.ll @@ -12294,10 +12294,10 @@ define void @freeze_v2i8(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_ushort v0, v[0:1], off -; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0xff ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GFX10-GISEL-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-GISEL-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX10-GISEL-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX10-GISEL-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-GISEL-NEXT: global_store_short v[2:3], v0, off ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -12485,13 +12485,14 @@ define void @freeze_v3i8(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v0, v[0:1], off -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0xff +; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0xff ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v4, 8, v0 -; GFX10-GISEL-NEXT: v_and_b32_sdwa v4, v4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-GISEL-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-GISEL-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX10-GISEL-NEXT: v_and_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-GISEL-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX10-GISEL-NEXT: v_lshlrev_b16 v1, 8, v1 +; GFX10-GISEL-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v4 ; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-GISEL-NEXT: global_store_short v[2:3], v0, off diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll index b4e5fa088b533..7111ef037897e 100644 --- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll +++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll @@ -536,7 +536,8 @@ define hidden void @insertUsesOr(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: v_lshlrev_b16 v1, 8, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b16 v0, 8, v0 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: global_store_dword v[5:6], v0, off ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -993,17 +994,18 @@ define hidden void @and_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: global_load_dword v4, v[2:3], off ; GFX10-NEXT: global_load_dword v9, v[0:1], off -; GFX10-NEXT: v_mov_b32_e32 v0, 2 -; GFX10-NEXT: v_mov_b32_e32 v1, 1 +; GFX10-NEXT: v_mov_b32_e32 v0, 1 +; GFX10-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_sdwa v2, v4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_and_b32_e32 v2, 2, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_and_b32_e32 v3, 0x100, v9 -; GFX10-NEXT: v_and_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX10-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b16 v2, 8, v2 +; GFX10-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x5070006 ; GFX10-NEXT: global_store_dword v[5:6], v0, off ; GFX10-NEXT: global_store_dword v[7:8], v1, off @@ -1060,18 +1062,18 @@ define hidden void @ashr_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX10-NEXT: global_load_dword v9, v[0:1], off ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v2, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v2, 26 ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_bfe_i32 v1, v9, 0, 8 -; GFX10-NEXT: v_ashrrev_i32_sdwa v2, v2, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_ashrrev_i32_e32 v2, 26, v9 ; GFX10-NEXT: v_ashrrev_i32_e32 v3, 25, v9 ; GFX10-NEXT: v_lshlrev_b16 v1, 7, v1 -; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b16 v2, 8, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ashrrev_i16 v4, 10, v0 ; GFX10-NEXT: v_perm_b32 v0, v9, v0, 0x4010707 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffffff00, v1 +; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-NEXT: global_store_dword v[5:6], v1, off @@ -1233,16 +1235,16 @@ define hidden void @ive_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: global_load_dword v9, v[0:1], off ; GFX10-NEXT: global_load_dword v10, v[2:3], off -; GFX10-NEXT: v_mov_b32_e32 v0, 16 -; GFX10-NEXT: v_mov_b32_e32 v1, 0xff +; GFX10-NEXT: v_mov_b32_e32 v0, 0xff ; GFX10-NEXT: v_lshlrev_b16 v2, 8, v4 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, v0, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v9 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX10-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-NEXT: v_perm_b32 v1, v10, v9, 0x2000706 ; GFX10-NEXT: global_store_dword v[5:6], v0, off ; GFX10-NEXT: global_store_dword v[7:8], v1, off @@ -1294,24 +1296,24 @@ define hidden void @lhsr_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo -; GFX10-NEXT: global_load_dword v4, v[0:1], off -; GFX10-NEXT: global_load_dword v9, v[2:3], off -; GFX10-NEXT: v_mov_b32_e32 v0, 26 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: global_load_dword v4, v[2:3], off +; GFX10-NEXT: global_load_dword v9, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_lshrrev_b16 v1, 1, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 26, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, v0, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 25, v9 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 26, v4 +; GFX10-NEXT: v_lshrrev_b16 v1, 1, v9 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 25, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 26, v9 +; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 0x7f00, v1 ; GFX10-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX10-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x1030707 +; GFX10-NEXT: v_perm_b32 v1, v4, v9, 0x1030707 ; GFX10-NEXT: global_store_dword v[5:6], v0, off ; GFX10-NEXT: global_store_dword v[7:8], v1, off ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -1432,6 +1434,7 @@ define hidden void @or_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX10-NEXT: v_bfrev_b32_e32 v10, 4.0 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo @@ -1439,16 +1442,17 @@ define hidden void @or_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: global_load_dword v4, v[2:3], off ; GFX10-NEXT: global_load_dword v9, v[0:1], off -; GFX10-NEXT: v_mov_b32_e32 v0, 16 -; GFX10-NEXT: v_bfrev_b32_e32 v2, 4.0 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_lshlrev_b16 v1, 8, v4 -; GFX10-NEXT: v_lshrrev_b32_sdwa v0, v0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b16 v1, 8, v9 +; GFX10-NEXT: v_lshrrev_b16 v2, 8, v4 +; GFX10-NEXT: v_lshlrev_b16 v3, 8, v4 +; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0 +; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX10-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX10-NEXT: v_or_b32_e32 v1, 0x201, v1 -; GFX10-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x2010005 ; GFX10-NEXT: global_store_dword v[5:6], v0, off @@ -1508,59 +1512,61 @@ define hidden void @sdiv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX10-NEXT: global_load_dword v4, v[2:3], off ; GFX10-NEXT: global_load_dword v9, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v10, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v14, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v19, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v12, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 -; GFX10-NEXT: v_cvt_f32_i32_sdwa v14, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v15, v1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v16, v10 -; GFX10-NEXT: v_cvt_f32_i32_sdwa v19, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v17, v12 -; GFX10-NEXT: v_xor_b32_sdwa v0, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v18, v14 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v15, v1 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 ; GFX10-NEXT: v_xor_b32_sdwa v3, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 -; GFX10-NEXT: v_xor_b32_sdwa v11, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v17, v12 ; GFX10-NEXT: v_xor_b32_sdwa v13, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3 -; GFX10-NEXT: v_ashrrev_i32_e32 v0, 30, v0 -; GFX10-NEXT: v_mul_f32_e32 v15, v2, v15 -; GFX10-NEXT: v_mul_f32_e32 v16, v19, v16 +; GFX10-NEXT: v_xor_b32_sdwa v0, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 +; GFX10-NEXT: v_xor_b32_sdwa v11, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX10-NEXT: v_ashrrev_i32_e32 v3, 30, v3 +; GFX10-NEXT: v_mul_f32_e32 v16, v19, v16 +; GFX10-NEXT: v_mul_f32_e32 v18, v1, v18 +; GFX10-NEXT: v_mul_f32_e32 v15, v2, v15 +; GFX10-NEXT: v_ashrrev_i32_e32 v13, 30, v13 +; GFX10-NEXT: v_or_b32_e32 v3, 1, v3 +; GFX10-NEXT: v_trunc_f32_e32 v16, v16 +; GFX10-NEXT: v_trunc_f32_e32 v18, v18 ; GFX10-NEXT: v_mul_f32_e32 v17, v2, v17 -; GFX10-NEXT: v_or_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_trunc_f32_e32 v15, v15 -; GFX10-NEXT: v_trunc_f32_e32 v16, v16 -; GFX10-NEXT: v_mul_f32_e32 v18, v1, v18 +; GFX10-NEXT: v_ashrrev_i32_e32 v0, 30, v0 +; GFX10-NEXT: v_mad_f32 v19, -v16, v10, v19 +; GFX10-NEXT: v_mad_f32 v21, -v18, v14, v1 +; GFX10-NEXT: v_or_b32_e32 v13, 1, v13 ; GFX10-NEXT: v_trunc_f32_e32 v17, v17 -; GFX10-NEXT: v_ashrrev_i32_e32 v11, 30, v11 ; GFX10-NEXT: v_mad_f32 v20, -v15, v1, v2 -; GFX10-NEXT: v_mad_f32 v19, -v16, v10, v19 -; GFX10-NEXT: v_or_b32_e32 v3, 1, v3 -; GFX10-NEXT: v_trunc_f32_e32 v18, v18 +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v19|, |v10| +; GFX10-NEXT: v_ashrrev_i32_e32 v11, 30, v11 +; GFX10-NEXT: v_or_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_mad_f32 v2, -v17, v12, v2 -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v20|, |v1| -; GFX10-NEXT: v_ashrrev_i32_e32 v13, 30, v13 +; GFX10-NEXT: v_cvt_i32_f32_e32 v16, v16 +; GFX10-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc_lo +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v21|, |v14| ; GFX10-NEXT: v_or_b32_e32 v11, 1, v11 -; GFX10-NEXT: v_mad_f32 v21, -v18, v14, v1 +; GFX10-NEXT: v_cvt_i32_f32_e32 v18, v18 ; GFX10-NEXT: v_cvt_i32_f32_e32 v15, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v19|, |v10| -; GFX10-NEXT: v_or_b32_e32 v13, 1, v13 -; GFX10-NEXT: v_cvt_i32_f32_e32 v16, v16 ; GFX10-NEXT: v_cvt_i32_f32_e32 v17, v17 -; GFX10-NEXT: v_cvt_i32_f32_e32 v18, v18 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v10, 0, v13, vcc_lo +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v20|, |v1| +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v2|, |v12| +; GFX10-NEXT: v_add_nc_u32_e32 v2, v16, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v3, v18, v10 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v15, v0 -; GFX10-NEXT: v_add_nc_u32_sdwa v1, v16, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_cndmask_b32_e32 v2, 0, v11, vcc_lo -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v21|, |v14| -; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-NEXT: v_add_nc_u32_e32 v2, v17, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v3, 0, v13, vcc_lo -; GFX10-NEXT: v_add_nc_u32_sdwa v3, v18, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v11, vcc_lo +; GFX10-NEXT: v_lshlrev_b16 v2, 8, v2 +; GFX10-NEXT: v_lshlrev_b16 v3, 8, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v1, v17, v1 +; GFX10-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x60706 ; GFX10-NEXT: global_store_dword v[5:6], v0, off @@ -1861,70 +1867,72 @@ define hidden void @srem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX10-NEXT: global_load_dword v4, v[2:3], off ; GFX10-NEXT: global_load_dword v9, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 -; GFX10-NEXT: v_cvt_f32_i32_sdwa v13, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 -; GFX10-NEXT: v_cvt_f32_i32_sdwa v3, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cvt_f32_i32_sdwa v12, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 -; GFX10-NEXT: v_cvt_f32_i32_sdwa v15, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v17, v2 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v18, v13 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v19, v3 -; GFX10-NEXT: v_xor_b32_sdwa v1, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v20, v15 -; GFX10-NEXT: v_xor_b32_sdwa v11, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v12, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v14, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cvt_f32_i32_sdwa v11, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v18, v12 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v20, v14 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v17, v1 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v21, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 -; GFX10-NEXT: v_xor_b32_sdwa v14, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_2 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 30, v1 +; GFX10-NEXT: v_xor_b32_sdwa v10, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v19, v2 ; GFX10-NEXT: v_xor_b32_sdwa v16, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_3 -; GFX10-NEXT: v_mul_f32_e32 v17, v3, v17 -; GFX10-NEXT: v_mul_f32_e32 v18, v12, v18 -; GFX10-NEXT: v_mul_f32_e32 v19, v15, v19 -; GFX10-NEXT: v_ashrrev_i32_e32 v11, 30, v11 -; GFX10-NEXT: v_or_b32_e32 v1, 1, v1 -; GFX10-NEXT: v_trunc_f32_e32 v17, v17 -; GFX10-NEXT: v_trunc_f32_e32 v18, v18 +; GFX10-NEXT: v_xor_b32_sdwa v0, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 +; GFX10-NEXT: v_xor_b32_sdwa v13, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_2 +; GFX10-NEXT: v_ashrrev_i32_e32 v10, 30, v10 +; GFX10-NEXT: v_mul_f32_e32 v18, v11, v18 ; GFX10-NEXT: v_mul_f32_e32 v20, v21, v20 -; GFX10-NEXT: v_trunc_f32_e32 v19, v19 -; GFX10-NEXT: v_ashrrev_i32_e32 v14, 30, v14 -; GFX10-NEXT: v_mad_f32 v22, -v17, v2, v3 -; GFX10-NEXT: v_mad_f32 v12, -v18, v13, v12 -; GFX10-NEXT: v_or_b32_e32 v11, 1, v11 -; GFX10-NEXT: v_trunc_f32_e32 v20, v20 -; GFX10-NEXT: v_mad_f32 v23, -v19, v3, v15 -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v22|, |v2| +; GFX10-NEXT: v_mul_f32_e32 v17, v2, v17 ; GFX10-NEXT: v_ashrrev_i32_e32 v16, 30, v16 -; GFX10-NEXT: v_or_b32_e32 v14, 1, v14 -; GFX10-NEXT: v_mad_f32 v21, -v20, v15, v21 -; GFX10-NEXT: v_cvt_i32_f32_e32 v17, v17 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v12|, |v13| +; GFX10-NEXT: v_or_b32_e32 v10, 1, v10 +; GFX10-NEXT: v_trunc_f32_e32 v18, v18 +; GFX10-NEXT: v_trunc_f32_e32 v20, v20 +; GFX10-NEXT: v_mul_f32_e32 v19, v14, v19 +; GFX10-NEXT: v_trunc_f32_e32 v17, v17 +; GFX10-NEXT: v_ashrrev_i32_e32 v0, 30, v0 +; GFX10-NEXT: v_mad_f32 v11, -v18, v12, v11 +; GFX10-NEXT: v_mad_f32 v21, -v20, v14, v21 ; GFX10-NEXT: v_or_b32_e32 v16, 1, v16 +; GFX10-NEXT: v_trunc_f32_e32 v19, v19 +; GFX10-NEXT: v_mad_f32 v22, -v17, v1, v2 +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v11|, |v12| +; GFX10-NEXT: v_ashrrev_i32_e32 v13, 30, v13 +; GFX10-NEXT: v_or_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_mad_f32 v23, -v19, v2, v14 ; GFX10-NEXT: v_cvt_i32_f32_e32 v18, v18 -; GFX10-NEXT: v_cvt_i32_f32_e32 v19, v19 +; GFX10-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc_lo +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v21|, |v14| +; GFX10-NEXT: v_or_b32_e32 v13, 1, v13 ; GFX10-NEXT: v_cvt_i32_f32_e32 v20, v20 -; GFX10-NEXT: v_cndmask_b32_e32 v2, 0, v11, vcc_lo -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v23|, |v3| -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 8, v4 -; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v4 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v17, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v3, 0, v14, vcc_lo -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v21|, |v15| -; GFX10-NEXT: v_add_nc_u32_e32 v2, v18, v2 -; GFX10-NEXT: v_mul_lo_u32 v1, v1, v4 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v19, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v15, 24, v4 ; GFX10-NEXT: v_cndmask_b32_e32 v11, 0, v16, vcc_lo -; GFX10-NEXT: v_mul_lo_u32 v2, v2, v10 -; GFX10-NEXT: v_mul_lo_u32 v3, v3, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v11, v20, v11 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v1 -; GFX10-NEXT: v_sub_nc_u32_sdwa v1, v9, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX10-NEXT: v_mul_lo_u32 v10, v11, v12 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, v12, v3 -; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-NEXT: v_sub_nc_u32_sdwa v3, v9, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v22|, |v1| +; GFX10-NEXT: v_cvt_i32_f32_e32 v17, v17 +; GFX10-NEXT: v_cvt_i32_f32_e32 v19, v19 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v23|, |v2| +; GFX10-NEXT: v_add_nc_u32_e32 v2, v18, v10 +; GFX10-NEXT: v_add_nc_u32_e32 v10, v20, v11 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v0, v17, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v13, vcc_lo +; GFX10-NEXT: v_mul_lo_u32 v2, v2, v3 +; GFX10-NEXT: v_mul_lo_u32 v3, v10, v15 +; GFX10-NEXT: v_mul_lo_u32 v0, v0, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v1, v19, v1 +; GFX10-NEXT: v_sub_nc_u32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX10-NEXT: v_mul_lo_u32 v1, v1, v11 +; GFX10-NEXT: v_sub_nc_u32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_sub_nc_u32_e32 v0, v11, v0 +; GFX10-NEXT: v_lshlrev_b16 v2, 8, v2 +; GFX10-NEXT: v_lshlrev_b16 v3, 8, v3 +; GFX10-NEXT: v_sub_nc_u32_e32 v1, v15, v1 +; GFX10-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-NEXT: v_perm_b32 v1, v4, v9, 0x2070306 ; GFX10-NEXT: global_store_dword v[5:6], v0, off @@ -2149,24 +2157,25 @@ define hidden void @trunc_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo -; GFX10-NEXT: global_load_dword v4, v[0:1], off -; GFX10-NEXT: global_load_dword v9, v[2:3], off +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: global_load_dword v4, v[2:3], off +; GFX10-NEXT: global_load_dword v9, v[0:1], off ; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_sdwa v1, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b16 v1, 8, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX10-NEXT: v_lshlrev_b16 v1, 1, v1 -; GFX10-NEXT: v_lshlrev_b16 v2, 2, v0 +; GFX10-NEXT: v_and_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b16 v2, 3, v9 +; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX10-NEXT: v_lshlrev_b16 v1, 2, v1 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX10-NEXT: v_lshlrev_b16 v1, 3, v4 +; GFX10-NEXT: v_perm_b32 v1, v4, v9, 0x50205 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x50205 ; GFX10-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX10-NEXT: global_store_byte v[7:8], v0, off ; GFX10-NEXT: global_store_dword v[5:6], v1, off @@ -2416,51 +2425,53 @@ define hidden void @urem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v2 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v3, v2 -; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v4, v2 ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v9, v2 +; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v4, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v15, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v10, v1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v11, v3 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v12, v4 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v13, v9 -; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v12, v4 ; GFX10-NEXT: v_lshrrev_b32_e32 v16, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v17, 24, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v2 ; GFX10-NEXT: v_mul_f32_e32 v10, v3, v10 ; GFX10-NEXT: v_mul_f32_e32 v11, v3, v11 -; GFX10-NEXT: v_mul_f32_e32 v12, v3, v12 ; GFX10-NEXT: v_mul_f32_e32 v13, v15, v13 +; GFX10-NEXT: v_mul_f32_e32 v12, v3, v12 ; GFX10-NEXT: v_trunc_f32_e32 v10, v10 ; GFX10-NEXT: v_trunc_f32_e32 v11, v11 -; GFX10-NEXT: v_trunc_f32_e32 v12, v12 ; GFX10-NEXT: v_trunc_f32_e32 v13, v13 +; GFX10-NEXT: v_trunc_f32_e32 v12, v12 ; GFX10-NEXT: v_mad_f32 v18, -v10, v1, v3 ; GFX10-NEXT: v_cvt_u32_f32_e32 v10, v10 ; GFX10-NEXT: v_mad_f32 v19, -v11, v3, v3 ; GFX10-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GFX10-NEXT: v_mad_f32 v20, -v12, v4, v3 -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v18|, v1 -; GFX10-NEXT: v_cvt_u32_f32_e32 v12, v12 ; GFX10-NEXT: v_mad_f32 v15, -v13, v9, v15 +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v18|, v1 +; GFX10-NEXT: v_mad_f32 v20, -v12, v4, v3 ; GFX10-NEXT: v_cvt_u32_f32_e32 v13, v13 +; GFX10-NEXT: v_cvt_u32_f32_e32 v12, v12 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v10, vcc_lo ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v19|, v3 ; GFX10-NEXT: v_mul_lo_u32 v1, v1, v2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v11, vcc_lo -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v20|, v4 +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v15|, v9 ; GFX10-NEXT: v_mul_lo_u32 v3, v3, v16 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, v16, v1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v13, vcc_lo +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v20|, v4 +; GFX10-NEXT: v_mul_lo_u32 v9, v9, v17 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, v16, v3 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v12, vcc_lo -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v15|, v9 +; GFX10-NEXT: v_lshlrev_b16 v3, 8, v3 ; GFX10-NEXT: v_mul_lo_u32 v4, v4, v14 -; GFX10-NEXT: v_sub_nc_u32_sdwa v3, v16, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v13, vcc_lo +; GFX10-NEXT: v_sub_nc_u32_sdwa v9, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x2050505 ; GFX10-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-NEXT: v_mul_lo_u32 v9, v9, v17 +; GFX10-NEXT: v_lshlrev_b16 v9, 8, v9 ; GFX10-NEXT: v_sub_nc_u32_e32 v4, v16, v4 -; GFX10-NEXT: v_sub_nc_u32_sdwa v9, v0, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x2050505 ; GFX10-NEXT: v_or_b32_sdwa v3, v4, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-NEXT: global_store_dword v[5:6], v1, off diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll index 1c2d07c2f7af5..2d1c0af14ca37 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll @@ -25,11 +25,11 @@ define void @quux(i32 %arg, i1 %arg1, i1 %arg2) { ; CHECK-NEXT: v_mov_b32_e32 v2, 0xffff ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_and_b32_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; CHECK-NEXT: v_mov_b32_e32 v1, 24 ; CHECK-NEXT: v_mov_b32_e32 v2, 0xff ; CHECK-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo -; CHECK-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; CHECK-NEXT: v_lshrrev_b32_e32 v1, 24, v0 ; CHECK-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; CHECK-NEXT: v_lshlrev_b16 v1, 8, v1 ; CHECK-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; CHECK-NEXT: .LBB0_2: ; %bb9 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4 diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-and.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-and.ll index 801324eec454e..03226d8df2d94 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-and.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-and.ll @@ -63,8 +63,10 @@ define i8 @test_vector_reduce_and_v2i8(<2 x i8> %v) { ; GFX10-SDAG-LABEL: test_vector_reduce_and_v2i8: ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_lshrrev_b16 v2, 8, v1 ; GFX10-SDAG-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX10-SDAG-NEXT: v_and_b32_sdwa v1, v1, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-SDAG-NEXT: v_and_b32_e32 v2, v1, v2 +; GFX10-SDAG-NEXT: v_lshlrev_b16 v1, 8, v2 ; GFX10-SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-or.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-or.ll index bdb1c22ce7267..0830f6957f03f 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-or.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-or.ll @@ -64,8 +64,10 @@ define i8 @test_vector_reduce_or_v2i8(<2 x i8> %v) { ; GFX10-SDAG-LABEL: test_vector_reduce_or_v2i8: ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_lshrrev_b16 v2, 8, v1 ; GFX10-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX10-SDAG-NEXT: v_or_b32_sdwa v1, v1, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-SDAG-NEXT: v_or_b32_e32 v2, v1, v2 +; GFX10-SDAG-NEXT: v_lshlrev_b16 v1, 8, v2 ; GFX10-SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-xor.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-xor.ll index cf344ea9b92d4..a25138b53aa74 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-xor.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-xor.ll @@ -63,8 +63,10 @@ define i8 @test_vector_reduce_xor_v2i8(<2 x i8> %v) { ; GFX10-SDAG-LABEL: test_vector_reduce_xor_v2i8: ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_lshrrev_b16 v2, 8, v1 ; GFX10-SDAG-NEXT: v_xor_b32_e32 v0, v0, v1 -; GFX10-SDAG-NEXT: v_xor_b32_sdwa v1, v1, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-SDAG-NEXT: v_xor_b32_e32 v2, v1, v2 +; GFX10-SDAG-NEXT: v_lshlrev_b16 v1, 8, v2 ; GFX10-SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s index c151bf99b76c5..6bb0f4b1dff2d 100644 --- a/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s +++ b/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s @@ -8974,6 +8974,9 @@ v_mul_lo_u16 v5, v1, 0.5 v_mul_lo_u16 v5, v1, -4.0 // GFX10: encoding: [0x05,0x00,0x05,0xd7,0x01,0xef,0x01,0x00] +v_mul_lo_u16 v5, v1, v2 op_sel:[1,1,1] +// GFX10: encoding: [0x05,0x58,0x05,0xd7,0x01,0x05,0x02,0x00] + v_lshrrev_b16 v5, v1, v2 // GFX10: encoding: [0x05,0x00,0x07,0xd7,0x01,0x05,0x02,0x00] @@ -9052,6 +9055,9 @@ v_lshrrev_b16 v5, v1, 0.5 v_lshrrev_b16 v5, v1, -4.0 // GFX10: encoding: [0x05,0x00,0x07,0xd7,0x01,0xef,0x01,0x00] +v_lshrrev_b16 v5, v1, v2 op_sel:[1,1,1] +// GFX10: encoding: [0x05,0x58,0x07,0xd7,0x01,0x05,0x02,0x00] + v_ashrrev_i16 v5, v1, v2 // GFX10: encoding: [0x05,0x00,0x08,0xd7,0x01,0x05,0x02,0x00] @@ -9130,6 +9136,9 @@ v_ashrrev_i16 v5, v1, 0.5 v_ashrrev_i16 v5, v1, -4.0 // GFX10: encoding: [0x05,0x00,0x08,0xd7,0x01,0xef,0x01,0x00] +v_ashrrev_i16 v5, v1, v2 op_sel:[1,1,1] +// GFX10: encoding: [0x05,0x58,0x08,0xd7,0x01,0x05,0x02,0x00] + v_max_u16 v5, v1, v2 // GFX10: encoding: [0x05,0x00,0x09,0xd7,0x01,0x05,0x02,0x00] @@ -9208,6 +9217,9 @@ v_max_u16 v5, v1, 0.5 v_max_u16 v5, v1, -4.0 // GFX10: encoding: [0x05,0x00,0x09,0xd7,0x01,0xef,0x01,0x00] +v_max_u16 v5, v1, v2 op_sel:[1,1,1] +// GFX10: encoding: [0x05,0x58,0x09,0xd7,0x01,0x05,0x02,0x00] + v_max_i16 v5, v1, v2 // GFX10: encoding: [0x05,0x00,0x0a,0xd7,0x01,0x05,0x02,0x00] @@ -9286,6 +9298,9 @@ v_max_i16 v5, v1, 0.5 v_max_i16 v5, v1, -4.0 // GFX10: encoding: [0x05,0x00,0x0a,0xd7,0x01,0xef,0x01,0x00] +v_max_i16 v5, v1, v2 op_sel:[1,1,1] +// GFX10: encoding: [0x05,0x58,0x0a,0xd7,0x01,0x05,0x02,0x00] + v_min_u16 v5, v1, v2 // GFX10: encoding: [0x05,0x00,0x0b,0xd7,0x01,0x05,0x02,0x00] @@ -9364,6 +9379,9 @@ v_min_u16 v5, v1, 0.5 v_min_u16 v5, v1, -4.0 // GFX10: encoding: [0x05,0x00,0x0b,0xd7,0x01,0xef,0x01,0x00] +v_min_u16 v5, v1, v2 op_sel:[1,1,1] +// GFX10: encoding: [0x05,0x58,0x0b,0xd7,0x01,0x05,0x02,0x00] + v_min_i16 v5, v1, v2 // GFX10: encoding: [0x05,0x00,0x0c,0xd7,0x01,0x05,0x02,0x00] @@ -9442,6 +9460,9 @@ v_min_i16 v5, v1, 0.5 v_min_i16 v5, v1, -4.0 // GFX10: encoding: [0x05,0x00,0x0c,0xd7,0x01,0xef,0x01,0x00] +v_min_i16 v5, v1, v2 op_sel:[1,1,1] +// GFX10: encoding: [0x05,0x58,0x0c,0xd7,0x01,0x05,0x02,0x00] + v_add_nc_i16 v5, v1, v2 // GFX10: encoding: [0x05,0x00,0x0d,0xd7,0x01,0x05,0x02,0x00] @@ -10009,6 +10030,9 @@ v_lshlrev_b16 v5, v1, 0.5 v_lshlrev_b16 v5, v1, -4.0 // GFX10: encoding: [0x05,0x00,0x14,0xd7,0x01,0xef,0x01,0x00] +v_lshlrev_b16 v5, v1, v2 op_sel:[1,1,1] +// GFX10: encoding: [0x05,0x58,0x14,0xd7,0x01,0x05,0x02,0x00] + v_mad_u16 v5, 0, v2, v3 // GFX10: encoding: [0x05,0x00,0x40,0xd7,0x80,0x04,0x0e,0x04] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3.txt index 6da1423fe8278..721babdd64245 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3.txt @@ -1503,6 +1503,9 @@ # GFX10: v_ashrrev_i16 v5, vcc_lo, v2 ; encoding: [0x05,0x00,0x08,0xd7,0x6a,0x04,0x02,0x00] 0x05,0x00,0x08,0xd7,0x6a,0x04,0x02,0x00 +# GFX10: v_ashrrev_i16 v5, v1, v2 op_sel:[1,1,1] ; encoding: [0x05,0x58,0x08,0xd7,0x01,0x05,0x02,0x00] +0x05,0x58,0x08,0xd7,0x01,0x05,0x02,0x00 + # GFX10: v_ashrrev_i32_e64 v255, v1, v2 ; encoding: [0xff,0x00,0x18,0xd5,0x01,0x05,0x02,0x00] 0xff,0x00,0x18,0xd5,0x01,0x05,0x02,0x00 @@ -8309,6 +8312,9 @@ # GFX10: v_lshlrev_b16 v5, vcc_lo, v2 ; encoding: [0x05,0x00,0x14,0xd7,0x6a,0x04,0x02,0x00] 0x05,0x00,0x14,0xd7,0x6a,0x04,0x02,0x00 +# GFX10: v_lshlrev_b16 v5, v1, v2 op_sel:[1,1,1] ; encoding: [0x05,0x58,0x14,0xd7,0x01,0x05,0x02,0x00] +0x05,0x58,0x14,0xd7,0x01,0x05,0x02,0x00 + # GFX10: v_lshlrev_b32_e64 v255, v1, v2 ; encoding: [0xff,0x00,0x1a,0xd5,0x01,0x05,0x02,0x00] 0xff,0x00,0x1a,0xd5,0x01,0x05,0x02,0x00 @@ -8537,6 +8543,9 @@ # GFX10: v_lshrrev_b16 v5, vcc_lo, v2 ; encoding: [0x05,0x00,0x07,0xd7,0x6a,0x04,0x02,0x00] 0x05,0x00,0x07,0xd7,0x6a,0x04,0x02,0x00 +# GFX10: v_lshrrev_b16 v5, v1, v2 op_sel:[1,1,1] ; encoding: [0x05,0x58,0x07,0xd7,0x01,0x05,0x02,0x00] +0x05,0x58,0x07,0xd7,0x01,0x05,0x02,0x00 + # GFX10: v_lshrrev_b32_e64 v255, v1, v2 ; encoding: [0xff,0x00,0x16,0xd5,0x01,0x05,0x02,0x00] 0xff,0x00,0x16,0xd5,0x01,0x05,0x02,0x00 @@ -11292,6 +11301,9 @@ # GFX10: v_max_i16 v5, vcc_lo, v2 ; encoding: [0x05,0x00,0x0a,0xd7,0x6a,0x04,0x02,0x00] 0x05,0x00,0x0a,0xd7,0x6a,0x04,0x02,0x00 +# GFX10: v_max_i16 v5, v1, v2 op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0a,0xd7,0x01,0x05,0x02,0x00] +0x05,0x58,0x0a,0xd7,0x01,0x05,0x02,0x00 + # GFX10: v_max_i32_e64 v255, v1, v2 ; encoding: [0xff,0x00,0x12,0xd5,0x01,0x05,0x02,0x00] 0xff,0x00,0x12,0xd5,0x01,0x05,0x02,0x00 @@ -11448,6 +11460,9 @@ # GFX10: v_max_u16 v5, vcc_lo, v2 ; encoding: [0x05,0x00,0x09,0xd7,0x6a,0x04,0x02,0x00] 0x05,0x00,0x09,0xd7,0x6a,0x04,0x02,0x00 +# GFX10: v_max_u16 v5, v1, v2 op_sel:[1,1,1] ; encoding: [0x05,0x58,0x09,0xd7,0x01,0x05,0x02,0x00] +0x05,0x58,0x09,0xd7,0x01,0x05,0x02,0x00 + # GFX10: v_max_u32_e64 v255, v1, v2 ; encoding: [0xff,0x00,0x14,0xd5,0x01,0x05,0x02,0x00] 0xff,0x00,0x14,0xd5,0x01,0x05,0x02,0x00 @@ -13728,6 +13743,9 @@ # GFX10: v_min_i16 v5, vcc_lo, v2 ; encoding: [0x05,0x00,0x0c,0xd7,0x6a,0x04,0x02,0x00] 0x05,0x00,0x0c,0xd7,0x6a,0x04,0x02,0x00 +# GFX10: v_min_i16 v5, v1, v2 op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0c,0xd7,0x01,0x05,0x02,0x00] +0x05,0x58,0x0c,0xd7,0x01,0x05,0x02,0x00 + # GFX10: v_min_i32_e64 v255, v1, v2 ; encoding: [0xff,0x00,0x11,0xd5,0x01,0x05,0x02,0x00] 0xff,0x00,0x11,0xd5,0x01,0x05,0x02,0x00 @@ -13884,6 +13902,9 @@ # GFX10: v_min_u16 v5, vcc_lo, v2 ; encoding: [0x05,0x00,0x0b,0xd7,0x6a,0x04,0x02,0x00] 0x05,0x00,0x0b,0xd7,0x6a,0x04,0x02,0x00 +# GFX10: v_min_u16 v5, v1, v2 op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0b,0xd7,0x01,0x05,0x02,0x00] +0x05,0x58,0x0b,0xd7,0x01,0x05,0x02,0x00 + # GFX10: v_min_u32_e64 v255, v1, v2 ; encoding: [0xff,0x00,0x13,0xd5,0x01,0x05,0x02,0x00] 0xff,0x00,0x13,0xd5,0x01,0x05,0x02,0x00 @@ -15228,6 +15249,9 @@ # GFX10: v_mul_lo_u16 v5, vcc_lo, v2 ; encoding: [0x05,0x00,0x05,0xd7,0x6a,0x04,0x02,0x00] 0x05,0x00,0x05,0xd7,0x6a,0x04,0x02,0x00 +# GFX10: v_mul_lo_u16 v5, v1, v2 op_sel:[1,1,1] ; encoding: [0x05,0x58,0x05,0xd7,0x01,0x05,0x02,0x00] +0x05,0x58,0x05,0xd7,0x01,0x05,0x02,0x00 + # GFX10: v_mul_lo_u32 v255, v1, v2 ; encoding: [0xff,0x00,0x69,0xd5,0x01,0x05,0x02,0x00] 0xff,0x00,0x69,0xd5,0x01,0x05,0x02,0x00