[DAG] Refactor X86 combineVSelectWithAllOnesOrZeros fold into a generic DAG Combine #145298
Conversation
@llvm/pr-subscribers-backend-powerpc @llvm/pr-subscribers-backend-systemz

Author: woruyu (woruyu)

Changes

This PR resolves #144513

The modification includes five patterns:
1. vselect Cond, 0, 0 → 0
2. vselect Cond, -1, 0 → bitcast Cond
3. vselect Cond, -1, x → or Cond, x
4. vselect Cond, x, 0 → and Cond, x
5. vselect Cond, 000..., X → andn Cond, X

Patterns 1-4 have been migrated to DAGCombine; pattern 5 stays in the x86 code. The reason is that the andn instruction cannot be used directly in DAGCombine; only and+xor can be used, which introduces optimization-order issues. For example, in the x86 backend, for select Cond, 0, x → (~Cond) & x, the backend first checks whether the cond node of (~Cond) is a setcc node, and if so it modifies the comparison operator of the condition. So the x86 backend cannot complete the andn optimization. In short, I think keeping the vselect Cond, 000..., X pattern in DAGCombine, rather than and+xor, is the better choice.

For the commits, the first contains the code changes and the x86 tests (note 1); the second contains the tests for the other backends (note 2).

Patch is 74.50 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/145298.diff 20 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 56a5643e13442..0dce13035f33a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -12945,6 +12945,85 @@ SDValue DAGCombiner::visitVP_SELECT(SDNode *N) {
return SDValue();
}
+static SDValue combineVSelectWithAllOnesOrZeros(SDValue Cond, SDValue TVal,
+ SDValue FVal,
+ const TargetLowering &TLI,
+ SelectionDAG &DAG,
+ const SDLoc &DL) {
+ if (!TLI.isTypeLegal(TVal.getValueType()))
+ return SDValue();
+
+ EVT VT = TVal.getValueType();
+ EVT CondVT = Cond.getValueType();
+
+ assert(CondVT.isVector() && "Vector select expects a vector selector!");
+
+ // Classify TVal/FVal content
+ bool IsTAllZero = ISD::isBuildVectorAllZeros(TVal.getNode());
+ bool IsTAllOne = ISD::isBuildVectorAllOnes(TVal.getNode());
+ bool IsFAllZero = ISD::isBuildVectorAllZeros(FVal.getNode());
+ bool IsFAllOne = ISD::isBuildVectorAllOnes(FVal.getNode());
+
+ // no vselect(cond, 0/-1, X) or vselect(cond, X, 0/-1), return
+ if (!(IsTAllZero || IsTAllOne || IsFAllZero || IsFAllOne))
+ return SDValue();
+
+ // select Cond, 0, 0 → 0
+ if (IsTAllZero && IsFAllZero) {
+ return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, DL, VT)
+ : DAG.getConstant(0, DL, VT);
+ }
+
+ // To use the condition operand as a bitwise mask, it must have elements that
+ // are the same size as the select elements. Ie, the condition operand must
+ // have already been promoted from the IR select condition type <N x i1>.
+ // Don't check if the types themselves are equal because that excludes
+ // vector floating-point selects.
+ if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
+ return SDValue();
+
+ // Try inverting Cond and swapping T/F if it gives all-ones/all-zeros form
+ if (!IsTAllOne && !IsFAllZero && Cond.hasOneUse() &&
+ Cond.getOpcode() == ISD::SETCC &&
+ TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
+ CondVT) {
+ if (IsTAllZero || IsFAllOne) {
+ SDValue CC = Cond.getOperand(2);
+ ISD::CondCode InverseCC = ISD::getSetCCInverse(
+ cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());
+ Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
+ InverseCC);
+ std::swap(TVal, FVal);
+ std::swap(IsTAllOne, IsFAllOne);
+ std::swap(IsTAllZero, IsFAllZero);
+ }
+ }
+
+ // Cond value must be 'sign splat' to be converted to a logical op.
+ if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
+ return SDValue();
+
+ // select Cond, -1, 0 → bitcast Cond
+ if (IsTAllOne && IsFAllZero)
+ return DAG.getBitcast(VT, Cond);
+
+ // select Cond, -1, x → or Cond, x
+ if (IsTAllOne) {
+ SDValue X = DAG.getBitcast(CondVT, FVal);
+ SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, X);
+ return DAG.getBitcast(VT, Or);
+ }
+
+ // select Cond, x, 0 → and Cond, x
+ if (IsFAllZero) {
+ SDValue X = DAG.getBitcast(CondVT, TVal);
+ SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, X);
+ return DAG.getBitcast(VT, And);
+ }
+
+ return SDValue();
+}
+
SDValue DAGCombiner::visitVSELECT(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
@@ -13213,6 +13292,9 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) {
if (SimplifyDemandedVectorElts(SDValue(N, 0)))
return SDValue(N, 0);
+ if (SDValue V = combineVSelectWithAllOnesOrZeros(N0, N1, N2, TLI, DAG, DL))
+ return V;
+
return SDValue();
}
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 2541182de1208..ed462d9692358 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -47264,13 +47264,14 @@ static SDValue combineToExtendBoolVectorInReg(
DAG.getConstant(EltSizeInBits - 1, DL, VT));
}
-/// If a vector select has an operand that is -1 or 0, try to simplify the
+/// If a vector select has a left operand that is 0, try to simplify the
/// select to a bitwise logic operation.
-/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
-static SDValue
-combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget &Subtarget) {
+/// TODO: Move to DAGCombiner.combineVSelectWithAllOnesOrZeros, possibly using
+/// TargetLowering::hasAndNot()?
+static SDValue combineVSelectWithLastZeros(SDNode *N, SelectionDAG &DAG,
+ const SDLoc &DL,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
SDValue Cond = N->getOperand(0);
SDValue LHS = N->getOperand(1);
SDValue RHS = N->getOperand(2);
@@ -47283,20 +47284,6 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
assert(CondVT.isVector() && "Vector select expects a vector selector!");
- // TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
- // TODO: Can we assert that both operands are not zeros (because that should
- // get simplified at node creation time)?
- bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
- bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
-
- // If both inputs are 0/undef, create a complete zero vector.
- // FIXME: As noted above this should be handled by DAGCombiner/getNode.
- if (TValIsAllZeros && FValIsAllZeros) {
- if (VT.isFloatingPoint())
- return DAG.getConstantFP(0.0, DL, VT);
- return DAG.getConstant(0, DL, VT);
- }
-
// To use the condition operand as a bitwise mask, it must have elements that
// are the same size as the select elements. Ie, the condition operand must
// have already been promoted from the IR select condition type <N x i1>.
@@ -47305,56 +47292,15 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
return SDValue();
- // Try to invert the condition if true value is not all 1s and false value is
- // not all 0s. Only do this if the condition has one use.
- bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
- if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&
- // Check if the selector will be produced by CMPP*/PCMP*.
- Cond.getOpcode() == ISD::SETCC &&
- // Check if SETCC has already been promoted.
- TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
- CondVT) {
- bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
-
- if (TValIsAllZeros || FValIsAllOnes) {
- SDValue CC = Cond.getOperand(2);
- ISD::CondCode NewCC = ISD::getSetCCInverse(
- cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());
- Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
- NewCC);
- std::swap(LHS, RHS);
- TValIsAllOnes = FValIsAllOnes;
- FValIsAllZeros = TValIsAllZeros;
- }
- }
-
// Cond value must be 'sign splat' to be converted to a logical op.
if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
return SDValue();
- // vselect Cond, 111..., 000... -> Cond
- if (TValIsAllOnes && FValIsAllZeros)
- return DAG.getBitcast(VT, Cond);
-
if (!TLI.isTypeLegal(CondVT))
return SDValue();
- // vselect Cond, 111..., X -> or Cond, X
- if (TValIsAllOnes) {
- SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
- SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
- return DAG.getBitcast(VT, Or);
- }
-
- // vselect Cond, X, 000... -> and Cond, X
- if (FValIsAllZeros) {
- SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
- SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
- return DAG.getBitcast(VT, And);
- }
-
// vselect Cond, 000..., X -> andn Cond, X
- if (TValIsAllZeros) {
+ if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
SDValue AndN;
// The canonical form differs for i1 vectors - x86andnp is not used
@@ -48117,7 +48063,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
if (!TLI.isTypeLegal(VT) || isSoftF16(VT, Subtarget))
return SDValue();
- if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DL, DCI, Subtarget))
+ if (SDValue V = combineVSelectWithLastZeros(N, DAG, DL, DCI, Subtarget))
return V;
if (SDValue V = combineVSelectToBLENDV(N, DAG, DL, DCI, Subtarget))
diff --git a/llvm/test/CodeGen/AArch64/arm64-zip.ll b/llvm/test/CodeGen/AArch64/arm64-zip.ll
index b24e54a68fb42..20d0c7f1b7085 100644
--- a/llvm/test/CodeGen/AArch64/arm64-zip.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-zip.ll
@@ -413,7 +413,7 @@ define <4 x float> @shuffle_zip1(<4 x float> %arg) {
; CHECK-NEXT: fmov.4s v1, #1.00000000
; CHECK-NEXT: zip1.4h v0, v0, v0
; CHECK-NEXT: sshll.4s v0, v0, #0
-; CHECK-NEXT: and.16b v0, v1, v0
+; CHECK-NEXT: and.16b v0, v0, v1
; CHECK-NEXT: ret
bb:
%inst = fcmp olt <4 x float> zeroinitializer, %arg
diff --git a/llvm/test/CodeGen/AArch64/cmp-select-sign.ll b/llvm/test/CodeGen/AArch64/cmp-select-sign.ll
index b4f179e992a0d..6bbbcf88167d8 100644
--- a/llvm/test/CodeGen/AArch64/cmp-select-sign.ll
+++ b/llvm/test/CodeGen/AArch64/cmp-select-sign.ll
@@ -114,9 +114,10 @@ define i64 @not_sign_i64_4(i64 %a) {
define <7 x i8> @sign_7xi8(<7 x i8> %a) {
; CHECK-LABEL: sign_7xi8:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v1.8b, #1
-; CHECK-NEXT: cmlt v0.8b, v0.8b, #0
-; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: movi v1.2d, #0xffffffffffffffff
+; CHECK-NEXT: movi v2.8b, #1
+; CHECK-NEXT: cmge v0.8b, v1.8b, v0.8b
+; CHECK-NEXT: orr v0.8b, v0.8b, v2.8b
; CHECK-NEXT: ret
%c = icmp sgt <7 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
%res = select <7 x i1> %c, <7 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, <7 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
@@ -150,7 +151,8 @@ define <16 x i8> @sign_16xi8(<16 x i8> %a) {
define <3 x i32> @sign_3xi32(<3 x i32> %a) {
; CHECK-LABEL: sign_3xi32:
; CHECK: // %bb.0:
-; CHECK-NEXT: cmlt v0.4s, v0.4s, #0
+; CHECK-NEXT: movi v1.2d, #0xffffffffffffffff
+; CHECK-NEXT: cmge v0.4s, v1.4s, v0.4s
; CHECK-NEXT: orr v0.4s, #1
; CHECK-NEXT: ret
%c = icmp sgt <3 x i32> %a, <i32 -1, i32 -1, i32 -1>
@@ -197,11 +199,9 @@ define <4 x i32> @not_sign_4xi32(<4 x i32> %a) {
; CHECK-LABEL: not_sign_4xi32:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI16_0
-; CHECK-NEXT: movi v2.4s, #1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_0]
-; CHECK-NEXT: cmgt v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: and v1.16b, v0.16b, v2.16b
-; CHECK-NEXT: orn v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: cmge v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: orr v0.4s, #1
; CHECK-NEXT: ret
%c = icmp sgt <4 x i32> %a, <i32 1, i32 -1, i32 -1, i32 -1>
%res = select <4 x i1> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
diff --git a/llvm/test/CodeGen/AArch64/concatbinop.ll b/llvm/test/CodeGen/AArch64/concatbinop.ll
index 828182d18b38c..062a5a8c35b2c 100644
--- a/llvm/test/CodeGen/AArch64/concatbinop.ll
+++ b/llvm/test/CodeGen/AArch64/concatbinop.ll
@@ -179,7 +179,7 @@ define <16 x i8> @signOf_neon(ptr nocapture noundef readonly %a, ptr nocapture n
; CHECK-NEXT: uzp1 v3.16b, v5.16b, v6.16b
; CHECK-NEXT: uzp1 v1.16b, v1.16b, v2.16b
; CHECK-NEXT: and v0.16b, v3.16b, v0.16b
-; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
; CHECK-NEXT: ret
entry:
%0 = load <8 x i16>, ptr %a, align 2
diff --git a/llvm/test/CodeGen/AArch64/sat-add.ll b/llvm/test/CodeGen/AArch64/sat-add.ll
index 2deb19be24821..ecd48d6b7c65b 100644
--- a/llvm/test/CodeGen/AArch64/sat-add.ll
+++ b/llvm/test/CodeGen/AArch64/sat-add.ll
@@ -530,7 +530,7 @@ define <16 x i8> @unsigned_sat_variable_v16i8_using_cmp_notval(<16 x i8> %x, <16
; CHECK-NEXT: mvn v2.16b, v1.16b
; CHECK-NEXT: add v1.16b, v0.16b, v1.16b
; CHECK-NEXT: cmhi v0.16b, v0.16b, v2.16b
-; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%noty = xor <16 x i8> %y, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
%a = add <16 x i8> %x, %y
@@ -570,7 +570,7 @@ define <8 x i16> @unsigned_sat_variable_v8i16_using_cmp_notval(<8 x i16> %x, <8
; CHECK-NEXT: mvn v2.16b, v1.16b
; CHECK-NEXT: add v1.8h, v0.8h, v1.8h
; CHECK-NEXT: cmhi v0.8h, v0.8h, v2.8h
-; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%noty = xor <8 x i16> %y, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
%a = add <8 x i16> %x, %y
@@ -610,7 +610,7 @@ define <4 x i32> @unsigned_sat_variable_v4i32_using_cmp_notval(<4 x i32> %x, <4
; CHECK-NEXT: mvn v2.16b, v1.16b
; CHECK-NEXT: add v1.4s, v0.4s, v1.4s
; CHECK-NEXT: cmhi v0.4s, v0.4s, v2.4s
-; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%noty = xor <4 x i32> %y, <i32 -1, i32 -1, i32 -1, i32 -1>
%a = add <4 x i32> %x, %y
@@ -651,7 +651,7 @@ define <2 x i64> @unsigned_sat_variable_v2i64_using_cmp_notval(<2 x i64> %x, <2
; CHECK-NEXT: mvn v2.16b, v1.16b
; CHECK-NEXT: add v1.2d, v0.2d, v1.2d
; CHECK-NEXT: cmhi v0.2d, v0.2d, v2.2d
-; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%noty = xor <2 x i64> %y, <i64 -1, i64 -1>
%a = add <2 x i64> %x, %y
diff --git a/llvm/test/CodeGen/AArch64/select_cc.ll b/llvm/test/CodeGen/AArch64/select_cc.ll
index 73e4d4c7f0aeb..483f6c26af8c1 100644
--- a/llvm/test/CodeGen/AArch64/select_cc.ll
+++ b/llvm/test/CodeGen/AArch64/select_cc.ll
@@ -88,7 +88,7 @@ define <2 x double> @select_olt_load_cmp(<2 x double> %a, ptr %src) {
; CHECK-SD-NEXT: ldr d1, [x0]
; CHECK-SD-NEXT: fcmgt v1.2s, v1.2s, #0.0
; CHECK-SD-NEXT: sshll v1.2d, v1.2s, #0
-; CHECK-SD-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT: and v0.16b, v1.16b, v0.16b
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: select_olt_load_cmp:
diff --git a/llvm/test/CodeGen/AArch64/selectcc-to-shiftand.ll b/llvm/test/CodeGen/AArch64/selectcc-to-shiftand.ll
index 32fc9c1377704..0d4a636446164 100644
--- a/llvm/test/CodeGen/AArch64/selectcc-to-shiftand.ll
+++ b/llvm/test/CodeGen/AArch64/selectcc-to-shiftand.ll
@@ -249,9 +249,6 @@ define <16 x i8> @sel_shift_bool_v16i8(<16 x i1> %t) {
; CHECK-SD-LABEL: sel_shift_bool_v16i8:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: shl v0.16b, v0.16b, #7
-; CHECK-SD-NEXT: movi v1.16b, #128
-; CHECK-SD-NEXT: cmlt v0.16b, v0.16b, #0
-; CHECK-SD-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: sel_shift_bool_v16i8:
diff --git a/llvm/test/CodeGen/AArch64/tbl-loops.ll b/llvm/test/CodeGen/AArch64/tbl-loops.ll
index b5d64112db727..aa0a163b96ac8 100644
--- a/llvm/test/CodeGen/AArch64/tbl-loops.ll
+++ b/llvm/test/CodeGen/AArch64/tbl-loops.ll
@@ -31,12 +31,12 @@ define void @loop1(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
; CHECK-NEXT: add x13, x13, #32
; CHECK-NEXT: fcmgt v3.4s, v1.4s, v0.4s
; CHECK-NEXT: fcmgt v4.4s, v2.4s, v0.4s
-; CHECK-NEXT: fcmlt v5.4s, v1.4s, #0.0
-; CHECK-NEXT: fcmlt v6.4s, v2.4s, #0.0
-; CHECK-NEXT: bit v1.16b, v0.16b, v3.16b
-; CHECK-NEXT: bit v2.16b, v0.16b, v4.16b
-; CHECK-NEXT: bic v1.16b, v1.16b, v5.16b
-; CHECK-NEXT: bic v2.16b, v2.16b, v6.16b
+; CHECK-NEXT: bsl v3.16b, v0.16b, v1.16b
+; CHECK-NEXT: bsl v4.16b, v0.16b, v2.16b
+; CHECK-NEXT: fcmlt v1.4s, v1.4s, #0.0
+; CHECK-NEXT: fcmlt v2.4s, v2.4s, #0.0
+; CHECK-NEXT: bic v1.16b, v3.16b, v1.16b
+; CHECK-NEXT: bic v2.16b, v4.16b, v2.16b
; CHECK-NEXT: fcvtzs v1.4s, v1.4s
; CHECK-NEXT: fcvtzs v2.4s, v2.4s
; CHECK-NEXT: xtn v1.4h, v1.4s
diff --git a/llvm/test/CodeGen/AArch64/vselect-constants.ll b/llvm/test/CodeGen/AArch64/vselect-constants.ll
index a7cf5ece5d270..fe125c9626ea3 100644
--- a/llvm/test/CodeGen/AArch64/vselect-constants.ll
+++ b/llvm/test/CodeGen/AArch64/vselect-constants.ll
@@ -146,10 +146,8 @@ define <4 x i32> @cmp_sel_0_or_minus1_vec(<4 x i32> %x, <4 x i32> %y) {
define <4 x i32> @sel_1_or_0_vec(<4 x i1> %cond) {
; CHECK-LABEL: sel_1_or_0_vec:
; CHECK: // %bb.0:
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: shl v0.4s, v0.4s, #31
-; CHECK-NEXT: cmlt v0.4s, v0.4s, #0
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%add = select <4 x i1> %cond, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
diff --git a/llvm/test/CodeGen/AArch64/vselect-ext.ll b/llvm/test/CodeGen/AArch64/vselect-ext.ll
index 76b7f3d9dfc0e..4f2b9c5a62669 100644
--- a/llvm/test/CodeGen/AArch64/vselect-ext.ll
+++ b/llvm/test/CodeGen/AArch64/vselect-ext.ll
@@ -12,10 +12,10 @@ define <16 x i32> @no_existing_zext(<16 x i8> %a, <16 x i32> %op) {
; CHECK-NEXT: sshll.4s v6, v5, #0
; CHECK-NEXT: sshll.4s v7, v0, #0
; CHECK-NEXT: sshll2.4s v5, v5, #0
-; CHECK-NEXT: and.16b v4, v4, v16
-; CHECK-NEXT: and.16b v0, v1, v6
-; CHECK-NEXT: and.16b v1, v2, v5
-; CHECK-NEXT: and.16b v2, v3, v7
+; CHECK-NEXT: and.16b v4, v16, v4
+; CHECK-NEXT: and.16b v0, v6, v1
+; CHECK-NEXT: and.16b v1, v5, v2
+; CHECK-NEXT: and.16b v2, v7, v3
; CHECK-NEXT: mov.16b v3, v4
; CHECK-NEXT: ret
entry:
@@ -40,10 +40,10 @@ define <16 x i32> @second_compare_operand_not_splat(<16 x i8> %a, <16 x i8> %b)
; CHECK-NEXT: sshll.4s v7, v1, #0
; CHECK-NEXT: sshll2.4s v16, v3, #0
; CHECK-NEXT: sshll2.4s v1, v1, #0
-; CHECK-NEXT: and.16b v0, v4, v0
-; CHECK-NEXT: and.16b v3, v6, v1
-; CHECK-NEXT: and.16b v1, v2, v16
-; CHECK-NEXT: and.16b v2, v5, v7
+; CHECK-NEXT: and.16b v0, v0, v4
+; CHECK-NEXT: and.16b v3, v1, v6
+; CHECK-NEXT: and.16b v1, v16, v2
+; CHECK-NEXT: and.16b v2, v7, v5
; CHECK-NEXT: ret
entry:
%ext = zext <16 x i8> %a to <16 x i32>
@@ -69,10 +69,10 @@ define <16 x i32> @same_zext_used_in_cmp_signed_pred_and_select(<16 x i8> %a) {
; CHECK-NEXT: sshll.4s v7, v1, #0
; CHECK-NEXT: sshll2.4s v16, v3, #0
; CHECK-NEXT: sshll2.4s v1, v1, #0
-; CHECK-NEXT: and.16b v0, v4, v0
-; CHECK-NEXT: and.16b v3, v6, v1
-; CHECK-NEXT: and.16b v1, v2, v16
-; CHECK-NEXT: and.16b v2, v5, v7
+; CHECK-NEXT: and.16b v0, v0, v4
+; CHECK-NEXT: and.16b v3, v1, v6
+; CHECK-NEXT: and.16b v1, v16, v2
+; CHECK-NEXT: and.16b v2, v7, v5
; CHECK-NEXT: ret
entry:
%ext = zext <16 x i8> %a to <16 x i32>
@@ -97,10 +97,10 @@ define <8 x i64> @same_zext_used_in_cmp_unsigned_pred_and_select_v8i64(<8 x i8>
; CHECK-NEXT: cmhi.2d v7, v1, v2
; CHECK-NEXT: cmhi.2d v6, v5, v2
; CHECK-NEXT: cmhi.2d v2, v4, v2
-; CHECK-NEXT: and.16b v0, v3, v0
-; CHECK-NEXT: and.16b v1, v1, v7
-; CHECK-NEXT: and.16b v3, v4, v2
-; CHECK-NEXT: and.16b v2, v5, v6
+; CHECK-NEXT: and.16b v0, v0, v3
+; CHECK-NEXT: and.16b v1, v7, v1
+; CHECK-NEXT: and.16b v3, v2, v4
+; CHECK-NEXT: and.16b v2, v6, v5
; CHECK-NEXT: ret
%ext = zext <8 x i8> %a to <8 x i64>
%cmp = icmp ugt <8 x i8> %a, <i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10>
@@ -123,10 +123,10 @@ define <16 x i32> @same_zext_used_in_cmp_unsigned_pred_and_select_v16i32(<16 x i
; CHECK-NEXT: cmhi.4s v7, v2, v1
; CHECK-NEXT: cmhi.4s v6, v5, v1
; CHECK-NEXT: cmhi.4s v1, v4, v1
-; CHECK-NEXT: and.16b v0, v3, v0
-; CHECK-NEXT: and.16b v3, v4, v1
-; CHECK-NEXT: and.16b v1, v2, v7
-; CHECK-NEXT: and.16b v2, v5, v6
+; CHECK-NEXT: and.16b v0, v0, v3
+; CHECK-N...
[truncated]
@llvm/pr-subscribers-backend-loongarch
@llvm/pr-subscribers-backend-arm
Note 1:
The reason is the execution order of the code: after moving to DAGCombine, combineVSelectWithAllOnesOrZeros is executed first. It is a matter of optimization order, but I think the result in this case should be faster (fewer instructions).
Note 2:
@RKSimon, hello, any suggestions or ideas for modifications?
This PR resolves #144513
The modification includes five patterns:
1. vselect Cond, 0, 0 → 0
2. vselect Cond, -1, 0 → bitcast Cond
3. vselect Cond, -1, x → or Cond, x
4. vselect Cond, x, 0 → and Cond, x
5. vselect Cond, 000..., X → andn Cond, X
Patterns 1-4 have been migrated to DAGCombine; pattern 5 stays in the x86 code.
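For reference, a condensed sketch of how folds 2-4 come out in the new generic helper (this mirrors the DAGCombiner code in the patch above; Cond, TVal, FVal are the vselect operands, after the type-legality and sign-splat checks have passed):

// select Cond, -1, 0 -> bitcast Cond
if (IsTAllOne && IsFAllZero)
  return DAG.getBitcast(VT, Cond);

// select Cond, -1, x -> or Cond, x (the OR is built in CondVT, then cast back)
if (IsTAllOne)
  return DAG.getBitcast(
      VT, DAG.getNode(ISD::OR, DL, CondVT, Cond, DAG.getBitcast(CondVT, FVal)));

// select Cond, x, 0 -> and Cond, x
if (IsFAllZero)
  return DAG.getBitcast(
      VT, DAG.getNode(ISD::AND, DL, CondVT, Cond, DAG.getBitcast(CondVT, TVal)));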
The reason is that the andn instruction cannot be used directly in DAGCombine; only and+xor can be used, which introduces optimization-order issues. For example, in the x86 backend, for select Cond, 0, x → (~Cond) & x, the backend first checks whether the cond node of (~Cond) is a setcc node, and if so it modifies the comparison operator of the condition. So the x86 backend cannot complete the andn optimization. In short, I think keeping the vselect Cond, 000..., X pattern in DAGCombine, rather than and+xor, is the better choice.
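To make the ordering problem concrete, here is a minimal sketch of what the generic expansion of pattern 5 would have to look like in DAGCombine (this code is not part of the patch; it only illustrates the rejected alternative):

// Hypothetical: select Cond, 0, x -> and (xor Cond, -1), x
SDValue NotCond = DAG.getNOT(DL, Cond, CondVT); // emitted as xor Cond, all-ones
SDValue AndN = DAG.getNode(ISD::AND, DL, CondVT, NotCond,
                           DAG.getBitcast(CondVT, FVal));
return DAG.getBitcast(VT, AndN);

Before the x86 backend can pair that xor and and back into ANDNP/PANDN, other combines see the xor of a setcc and fold the inversion into the comparison operator, so the andn form is never matched. Keeping the vselect Cond, 000..., X node until x86 lowering avoids this.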
For the commits, the first contains the code changes and the x86 tests (note 1); the second contains the tests for the other backends (note 2).