Skip to content

Commit fd3cc20

Browse files
authored
[SelectionDAG] Fold undemanded operand to UNDEF for VECTOR_SHUFFLE (#145524)
Always let SimplifyDemandedVectorElts fold either side of a VECTOR_SHUFFLE to UNDEF if no elements are demanded from that side. For a single-use operand, SimplifyDemandedVectorElts could already do this, but when the operand had multiple uses we did not eliminate the use.
1 parent 41457bc commit fd3cc20

File tree

6 files changed

+50
-49
lines changed

6 files changed

+50
-49
lines changed

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3587,6 +3587,19 @@ bool TargetLowering::SimplifyDemandedVectorElts(
35873587
DemandedRHS.setBit(M - NumElts);
35883588
}
35893589

3590+
// If either side isn't demanded, replace it by UNDEF. We handle this
3591+
// explicitly here to also simplify in case of multiple uses (unlike
3592+
// the SimplifyDemandedVectorElts calls below).
3593+
bool FoldLHS = !DemandedLHS && !LHS.isUndef();
3594+
bool FoldRHS = !DemandedRHS && !RHS.isUndef();
3595+
if (FoldLHS || FoldRHS) {
3596+
LHS = FoldLHS ? TLO.DAG.getUNDEF(LHS.getValueType()) : LHS;
3597+
RHS = FoldRHS ? TLO.DAG.getUNDEF(RHS.getValueType()) : RHS;
3598+
SDValue NewOp =
3599+
TLO.DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, ShuffleMask);
3600+
return TLO.CombineTo(Op, NewOp);
3601+
}
3602+
35903603
// See if we can simplify either shuffle operand.
35913604
APInt UndefLHS, ZeroLHS;
35923605
APInt UndefRHS, ZeroRHS;

llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v2bf16.ll

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1228,51 +1228,49 @@ define void @v_shuffle_v3bf16_v2bf16__3_u_1(ptr addrspace(1) inreg %ptr) {
12281228
; GFX900-LABEL: v_shuffle_v3bf16_v2bf16__3_u_1:
12291229
; GFX900: ; %bb.0:
12301230
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1231+
; GFX900-NEXT: v_mov_b32_e32 v0, 0
12311232
; GFX900-NEXT: ;;#ASMSTART
12321233
; GFX900-NEXT: ; def v1
12331234
; GFX900-NEXT: ;;#ASMEND
12341235
; GFX900-NEXT: ;;#ASMSTART
12351236
; GFX900-NEXT: ; def v2
12361237
; GFX900-NEXT: ;;#ASMEND
1237-
; GFX900-NEXT: v_mov_b32_e32 v0, 0
12381238
; GFX900-NEXT: v_alignbit_b32 v2, s4, v2, 16
1239-
; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1239+
; GFX900-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4
12401240
; GFX900-NEXT: global_store_dword v0, v2, s[16:17]
1241-
; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4
12421241
; GFX900-NEXT: s_waitcnt vmcnt(0)
12431242
; GFX900-NEXT: s_setpc_b64 s[30:31]
12441243
;
12451244
; GFX90A-LABEL: v_shuffle_v3bf16_v2bf16__3_u_1:
12461245
; GFX90A: ; %bb.0:
12471246
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1247+
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
12481248
; GFX90A-NEXT: ;;#ASMSTART
12491249
; GFX90A-NEXT: ; def v1
12501250
; GFX90A-NEXT: ;;#ASMEND
12511251
; GFX90A-NEXT: ;;#ASMSTART
12521252
; GFX90A-NEXT: ; def v2
12531253
; GFX90A-NEXT: ;;#ASMEND
1254-
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
12551254
; GFX90A-NEXT: v_alignbit_b32 v2, s4, v2, 16
1256-
; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1255+
; GFX90A-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4
12571256
; GFX90A-NEXT: global_store_dword v0, v2, s[16:17]
1258-
; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4
12591257
; GFX90A-NEXT: s_waitcnt vmcnt(0)
12601258
; GFX90A-NEXT: s_setpc_b64 s[30:31]
12611259
;
12621260
; GFX942-LABEL: v_shuffle_v3bf16_v2bf16__3_u_1:
12631261
; GFX942: ; %bb.0:
12641262
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1263+
; GFX942-NEXT: v_mov_b32_e32 v0, 0
12651264
; GFX942-NEXT: ;;#ASMSTART
12661265
; GFX942-NEXT: ; def v1
12671266
; GFX942-NEXT: ;;#ASMEND
12681267
; GFX942-NEXT: ;;#ASMSTART
12691268
; GFX942-NEXT: ; def v2
12701269
; GFX942-NEXT: ;;#ASMEND
1271-
; GFX942-NEXT: v_mov_b32_e32 v0, 0
1270+
; GFX942-NEXT: s_nop 0
12721271
; GFX942-NEXT: v_alignbit_b32 v2, s0, v2, 16
1273-
; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1272+
; GFX942-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:4
12741273
; GFX942-NEXT: global_store_dword v0, v2, s[0:1]
1275-
; GFX942-NEXT: global_store_short v0, v1, s[0:1] offset:4
12761274
; GFX942-NEXT: s_waitcnt vmcnt(0)
12771275
; GFX942-NEXT: s_setpc_b64 s[30:31]
12781276
%vec0 = call <2 x bfloat> asm "; def $0", "=v"()

llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v3bf16.ll

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1928,48 +1928,45 @@ define void @v_shuffle_v3bf16_v3bf16__5_u_1(ptr addrspace(1) inreg %ptr) {
19281928
; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_u_1:
19291929
; GFX900: ; %bb.0:
19301930
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1931+
; GFX900-NEXT: v_mov_b32_e32 v3, 0
19311932
; GFX900-NEXT: ;;#ASMSTART
19321933
; GFX900-NEXT: ; def v[0:1]
19331934
; GFX900-NEXT: ;;#ASMEND
1934-
; GFX900-NEXT: v_mov_b32_e32 v3, 0
1935-
; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
19361935
; GFX900-NEXT: ;;#ASMSTART
19371936
; GFX900-NEXT: ; def v[1:2]
19381937
; GFX900-NEXT: ;;#ASMEND
1938+
; GFX900-NEXT: global_store_short_d16_hi v3, v0, s[16:17] offset:4
19391939
; GFX900-NEXT: global_store_dword v3, v2, s[16:17]
1940-
; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4
19411940
; GFX900-NEXT: s_waitcnt vmcnt(0)
19421941
; GFX900-NEXT: s_setpc_b64 s[30:31]
19431942
;
19441943
; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_u_1:
19451944
; GFX90A: ; %bb.0:
19461945
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1946+
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
19471947
; GFX90A-NEXT: ;;#ASMSTART
19481948
; GFX90A-NEXT: ; def v[0:1]
19491949
; GFX90A-NEXT: ;;#ASMEND
1950-
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
1951-
; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0
19521950
; GFX90A-NEXT: ;;#ASMSTART
19531951
; GFX90A-NEXT: ; def v[2:3]
19541952
; GFX90A-NEXT: ;;#ASMEND
1953+
; GFX90A-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4
19551954
; GFX90A-NEXT: global_store_dword v4, v3, s[16:17]
1956-
; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4
19571955
; GFX90A-NEXT: s_waitcnt vmcnt(0)
19581956
; GFX90A-NEXT: s_setpc_b64 s[30:31]
19591957
;
19601958
; GFX942-LABEL: v_shuffle_v3bf16_v3bf16__5_u_1:
19611959
; GFX942: ; %bb.0:
19621960
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1961+
; GFX942-NEXT: v_mov_b32_e32 v4, 0
19631962
; GFX942-NEXT: ;;#ASMSTART
19641963
; GFX942-NEXT: ; def v[0:1]
19651964
; GFX942-NEXT: ;;#ASMEND
1966-
; GFX942-NEXT: v_mov_b32_e32 v4, 0
1967-
; GFX942-NEXT: v_lshrrev_b32_e32 v0, 16, v0
19681965
; GFX942-NEXT: ;;#ASMSTART
19691966
; GFX942-NEXT: ; def v[2:3]
19701967
; GFX942-NEXT: ;;#ASMEND
1968+
; GFX942-NEXT: global_store_short_d16_hi v4, v0, s[0:1] offset:4
19711969
; GFX942-NEXT: global_store_dword v4, v3, s[0:1]
1972-
; GFX942-NEXT: global_store_short v4, v0, s[0:1] offset:4
19731970
; GFX942-NEXT: s_waitcnt vmcnt(0)
19741971
; GFX942-NEXT: s_setpc_b64 s[30:31]
19751972
%vec0 = call <4 x bfloat> asm "; def $0", "=v"()

llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v2f16.ll

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1228,51 +1228,49 @@ define void @v_shuffle_v3f16_v2f16__3_u_1(ptr addrspace(1) inreg %ptr) {
12281228
; GFX900-LABEL: v_shuffle_v3f16_v2f16__3_u_1:
12291229
; GFX900: ; %bb.0:
12301230
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1231+
; GFX900-NEXT: v_mov_b32_e32 v0, 0
12311232
; GFX900-NEXT: ;;#ASMSTART
12321233
; GFX900-NEXT: ; def v1
12331234
; GFX900-NEXT: ;;#ASMEND
12341235
; GFX900-NEXT: ;;#ASMSTART
12351236
; GFX900-NEXT: ; def v2
12361237
; GFX900-NEXT: ;;#ASMEND
1237-
; GFX900-NEXT: v_mov_b32_e32 v0, 0
12381238
; GFX900-NEXT: v_alignbit_b32 v2, s4, v2, 16
1239-
; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1239+
; GFX900-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4
12401240
; GFX900-NEXT: global_store_dword v0, v2, s[16:17]
1241-
; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4
12421241
; GFX900-NEXT: s_waitcnt vmcnt(0)
12431242
; GFX900-NEXT: s_setpc_b64 s[30:31]
12441243
;
12451244
; GFX90A-LABEL: v_shuffle_v3f16_v2f16__3_u_1:
12461245
; GFX90A: ; %bb.0:
12471246
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1247+
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
12481248
; GFX90A-NEXT: ;;#ASMSTART
12491249
; GFX90A-NEXT: ; def v1
12501250
; GFX90A-NEXT: ;;#ASMEND
12511251
; GFX90A-NEXT: ;;#ASMSTART
12521252
; GFX90A-NEXT: ; def v2
12531253
; GFX90A-NEXT: ;;#ASMEND
1254-
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
12551254
; GFX90A-NEXT: v_alignbit_b32 v2, s4, v2, 16
1256-
; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1255+
; GFX90A-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4
12571256
; GFX90A-NEXT: global_store_dword v0, v2, s[16:17]
1258-
; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4
12591257
; GFX90A-NEXT: s_waitcnt vmcnt(0)
12601258
; GFX90A-NEXT: s_setpc_b64 s[30:31]
12611259
;
12621260
; GFX942-LABEL: v_shuffle_v3f16_v2f16__3_u_1:
12631261
; GFX942: ; %bb.0:
12641262
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1263+
; GFX942-NEXT: v_mov_b32_e32 v0, 0
12651264
; GFX942-NEXT: ;;#ASMSTART
12661265
; GFX942-NEXT: ; def v1
12671266
; GFX942-NEXT: ;;#ASMEND
12681267
; GFX942-NEXT: ;;#ASMSTART
12691268
; GFX942-NEXT: ; def v2
12701269
; GFX942-NEXT: ;;#ASMEND
1271-
; GFX942-NEXT: v_mov_b32_e32 v0, 0
1270+
; GFX942-NEXT: s_nop 0
12721271
; GFX942-NEXT: v_alignbit_b32 v2, s0, v2, 16
1273-
; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1272+
; GFX942-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:4
12741273
; GFX942-NEXT: global_store_dword v0, v2, s[0:1]
1275-
; GFX942-NEXT: global_store_short v0, v1, s[0:1] offset:4
12761274
; GFX942-NEXT: s_waitcnt vmcnt(0)
12771275
; GFX942-NEXT: s_setpc_b64 s[30:31]
12781276
%vec0 = call <2 x half> asm "; def $0", "=v"()

llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v3f16.ll

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1928,48 +1928,45 @@ define void @v_shuffle_v3f16_v3f16__5_u_1(ptr addrspace(1) inreg %ptr) {
19281928
; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_u_1:
19291929
; GFX900: ; %bb.0:
19301930
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1931+
; GFX900-NEXT: v_mov_b32_e32 v3, 0
19311932
; GFX900-NEXT: ;;#ASMSTART
19321933
; GFX900-NEXT: ; def v[0:1]
19331934
; GFX900-NEXT: ;;#ASMEND
1934-
; GFX900-NEXT: v_mov_b32_e32 v3, 0
1935-
; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
19361935
; GFX900-NEXT: ;;#ASMSTART
19371936
; GFX900-NEXT: ; def v[1:2]
19381937
; GFX900-NEXT: ;;#ASMEND
1938+
; GFX900-NEXT: global_store_short_d16_hi v3, v0, s[16:17] offset:4
19391939
; GFX900-NEXT: global_store_dword v3, v2, s[16:17]
1940-
; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4
19411940
; GFX900-NEXT: s_waitcnt vmcnt(0)
19421941
; GFX900-NEXT: s_setpc_b64 s[30:31]
19431942
;
19441943
; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_u_1:
19451944
; GFX90A: ; %bb.0:
19461945
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1946+
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
19471947
; GFX90A-NEXT: ;;#ASMSTART
19481948
; GFX90A-NEXT: ; def v[0:1]
19491949
; GFX90A-NEXT: ;;#ASMEND
1950-
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
1951-
; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0
19521950
; GFX90A-NEXT: ;;#ASMSTART
19531951
; GFX90A-NEXT: ; def v[2:3]
19541952
; GFX90A-NEXT: ;;#ASMEND
1953+
; GFX90A-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4
19551954
; GFX90A-NEXT: global_store_dword v4, v3, s[16:17]
1956-
; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4
19571955
; GFX90A-NEXT: s_waitcnt vmcnt(0)
19581956
; GFX90A-NEXT: s_setpc_b64 s[30:31]
19591957
;
19601958
; GFX942-LABEL: v_shuffle_v3f16_v3f16__5_u_1:
19611959
; GFX942: ; %bb.0:
19621960
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1961+
; GFX942-NEXT: v_mov_b32_e32 v4, 0
19631962
; GFX942-NEXT: ;;#ASMSTART
19641963
; GFX942-NEXT: ; def v[0:1]
19651964
; GFX942-NEXT: ;;#ASMEND
1966-
; GFX942-NEXT: v_mov_b32_e32 v4, 0
1967-
; GFX942-NEXT: v_lshrrev_b32_e32 v0, 16, v0
19681965
; GFX942-NEXT: ;;#ASMSTART
19691966
; GFX942-NEXT: ; def v[2:3]
19701967
; GFX942-NEXT: ;;#ASMEND
1968+
; GFX942-NEXT: global_store_short_d16_hi v4, v0, s[0:1] offset:4
19711969
; GFX942-NEXT: global_store_dword v4, v3, s[0:1]
1972-
; GFX942-NEXT: global_store_short v4, v0, s[0:1] offset:4
19731970
; GFX942-NEXT: s_waitcnt vmcnt(0)
19741971
; GFX942-NEXT: s_setpc_b64 s[30:31]
19751972
%vec0 = call <4 x half> asm "; def $0", "=v"()

llvm/test/CodeGen/X86/vec_int_to_fp.ll

Lines changed: 11 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2099,21 +2099,19 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) {
20992099
; SSE41-NEXT: movdqa %xmm0, %xmm2
21002100
; SSE41-NEXT: psrlq $1, %xmm2
21012101
; SSE41-NEXT: por %xmm1, %xmm2
2102-
; SSE41-NEXT: movdqa %xmm0, %xmm1
2103-
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
2104-
; SSE41-NEXT: pextrq $1, %xmm1, %rax
2102+
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
2103+
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm0
2104+
; SSE41-NEXT: pextrq $1, %xmm0, %rax
2105+
; SSE41-NEXT: cvtsi2ss %rax, %xmm3
2106+
; SSE41-NEXT: movq %xmm0, %rax
21052107
; SSE41-NEXT: xorps %xmm2, %xmm2
21062108
; SSE41-NEXT: cvtsi2ss %rax, %xmm2
2107-
; SSE41-NEXT: movq %xmm1, %rax
2108-
; SSE41-NEXT: xorps %xmm1, %xmm1
2109-
; SSE41-NEXT: cvtsi2ss %rax, %xmm1
2110-
; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero
2111-
; SSE41-NEXT: movaps %xmm1, %xmm2
2112-
; SSE41-NEXT: addps %xmm1, %xmm2
2113-
; SSE41-NEXT: xorps %xmm3, %xmm3
2114-
; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[2,3]
2115-
; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1
2116-
; SSE41-NEXT: movaps %xmm1, %xmm0
2109+
; SSE41-NEXT: insertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],zero,zero
2110+
; SSE41-NEXT: movaps %xmm2, %xmm3
2111+
; SSE41-NEXT: addps %xmm2, %xmm3
2112+
; SSE41-NEXT: movdqa %xmm1, %xmm0
2113+
; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm2
2114+
; SSE41-NEXT: movaps %xmm2, %xmm0
21172115
; SSE41-NEXT: retq
21182116
;
21192117
; AVX1-LABEL: uitofp_4i64_to_4f32_undef:

0 commit comments

Comments
 (0)