@@ -852,13 +852,6 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
   if (STI.allowFP16Math() || STI.hasBF16Math())
     setTargetDAGCombine(ISD::SETCC);

-  // Combine reduction operations on packed types (e.g. fadd.f16x2) with vector
-  // shuffles when one of their lanes is a no-op.
-  if (STI.allowFP16Math() || STI.hasBF16Math())
-    // already added above: FADD, ADD, AND
-    setTargetDAGCombine({ISD::FMUL, ISD::FMINIMUM, ISD::FMAXIMUM, ISD::UMIN,
-                         ISD::UMAX, ISD::SMIN, ISD::SMAX, ISD::OR, ISD::XOR});
-
   // Promote fp16 arithmetic if fp16 hardware isn't available or the
   // user passed --nvptx-no-fp16-math. The flag is useful because,
   // although sm_53+ GPUs have some sort of FP16 support in
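The removed registration fed ISD::FMUL and the min/max/bitwise opcodes into the same shuffle-folding combine already wired up for FADD, ADD, and AND. For a concrete picture of the reduction shape that combine targeted, here is a minimal scalar sketch (plain C++, not LLVM code; float values stand in for f16 lanes):

```cpp
#include <cstdio>

// Scalar stand-in for reducing v4f16 <a b c d> with fadd, mirroring the
// shape described in the removed doc comment below: one packed add with
// both lanes live, then a scalar add instead of a second packed add
// whose other lane would be poison.
int main() {
  float V[4] = {1.0f, 2.0f, 3.0f, 4.0f}; // <a b c d>
  // v1 = fadd <a b>, <c d>  ==>  <a+c, b+d>  (packed op, both lanes live)
  float v1[2] = {V[0] + V[2], V[1] + V[3]};
  // Final step scalarized: vR = v1[1] + v1[0]  (== a+b+c+d)
  float vR = v1[1] + v1[0];
  std::printf("vR = %f\n", vR); // prints 10.0
  return 0;
}
```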
@@ -5076,102 +5069,20 @@ static SDValue PerformStoreRetvalCombine(SDNode *N) {
   return PerformStoreCombineHelper(N, 2, 0);
 }

-/// For vector reductions, the final result needs to be a scalar. The default
-/// expansion will use packed ops (ex. fadd.f16x2) even for the final operation.
-/// This requires a packed operation where one of the lanes is undef.
-///
-/// ex: lowering of vecreduce_fadd(V) where V = v4f16<a b c d>
-///
-/// v1: v2f16 = fadd reassoc v2f16<a b>, v2f16<c d> (== <a+c b+d>)
-/// v2: v2f16 = vector_shuffle<1,u> v1, undef:v2f16 (== <b+d undef>)
-/// v3: v2f16 = fadd reassoc v2, v1 (== <b+d+a+c undef>)
-/// vR: f16 = extractelt v3, 1
-///
-/// We wish to replace vR, v3, and v2 with:
-/// vR: f16 = fadd reassoc (extractelt v1, 1) (extractelt v1, 0)
-///
-/// ...so that we get:
-/// v1: v2f16 = fadd reassoc v2f16<a b>, v2f16<c d> (== <a+c b+d>)
-/// s1: f16 = extractelt v1, 1
-/// s2: f16 = extractelt v1, 0
-/// vR: f16 = fadd reassoc s1, s2 (== a+c+b+d)
-///
-/// So for this example, this rule will replace v3 and v2, returning a vector
-/// with the result in lane 0 and an undef in lane 1, which we expect will be
-/// folded into the extractelt in vR.
-static SDValue PerformPackedOpCombine(SDNode *N,
-                                      TargetLowering::DAGCombinerInfo &DCI) {
-  // Convert:
-  // (fop.x2 (vector_shuffle<i,u> A), B) -> ((fop A:i, B:0), undef)
-  // ...or...
-  // (fop.x2 (vector_shuffle<u,i> A), B) -> (undef, (fop A:i, B:1))
-  // ...where i is a valid index and u is poison.
-  const EVT VectorVT = N->getValueType(0);
-  if (!Isv2x16VT(VectorVT))
-    return SDValue();
-
-  SDLoc DL(N);
-
-  SDValue ShufOp = N->getOperand(0);
-  SDValue VectOp = N->getOperand(1);
-  bool Swapped = false;
-
-  // canonicalize shuffle to op0
-  if (VectOp.getOpcode() == ISD::VECTOR_SHUFFLE) {
-    std::swap(ShufOp, VectOp);
-    Swapped = true;
-  }
-
-  if (ShufOp.getOpcode() != ISD::VECTOR_SHUFFLE)
-    return SDValue();
-
-  auto *ShuffleOp = cast<ShuffleVectorSDNode>(ShufOp);
-  int LiveLane; // exclusively live lane
-  for (LiveLane = 0; LiveLane < 2; ++LiveLane) {
-    // check if the current lane is live and the other lane is dead
-    if (ShuffleOp->getMaskElt(LiveLane) != PoisonMaskElem &&
-        ShuffleOp->getMaskElt(!LiveLane) == PoisonMaskElem)
-      break;
-  }
-  if (LiveLane == 2)
-    return SDValue();
-
-  int ElementIdx = ShuffleOp->getMaskElt(LiveLane);
-  const EVT ScalarVT = VectorVT.getScalarType();
-  SDValue Lanes[2] = {};
-  for (auto [LaneID, LaneVal] : enumerate(Lanes)) {
-    if (LaneID == (unsigned)LiveLane) {
-      SDValue Operands[2] = {
-          DCI.DAG.getExtractVectorElt(DL, ScalarVT, ShufOp.getOperand(0),
-                                      ElementIdx),
-          DCI.DAG.getExtractVectorElt(DL, ScalarVT, VectOp, LiveLane)};
-      // preserve the order of operands
-      if (Swapped)
-        std::swap(Operands[0], Operands[1]);
-      LaneVal = DCI.DAG.getNode(N->getOpcode(), DL, ScalarVT, Operands);
-    } else {
-      LaneVal = DCI.DAG.getUNDEF(ScalarVT);
-    }
-  }
-  return DCI.DAG.getBuildVector(VectorVT, DL, Lanes);
-}
-
 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
 ///
 static SDValue PerformADDCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  CodeGenOptLevel OptLevel) {
+  if (OptLevel == CodeGenOptLevel::None)
+    return SDValue();
+
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);

   // Skip non-integer, non-scalar case
   EVT VT = N0.getValueType();
-  if (VT.isVector())
-    return PerformPackedOpCombine(N, DCI);
-  if (VT != MVT::i32)
-    return SDValue();
-
-  if (OptLevel == CodeGenOptLevel::None)
+  if (VT.isVector() || VT != MVT::i32)
     return SDValue();

   // First try with the default operand order.
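The heart of the removed PerformPackedOpCombine is the scan for an exclusively live lane in a 2-lane shuffle mask; only masks of the form <i,u> or <u,i> qualify for the fold. A self-contained sketch of that check (plain C++, with -1 as a stand-in for LLVM's PoisonMaskElem):

```cpp
#include <cstdio>

constexpr int PoisonMaskElem = -1; // stand-in for LLVM's poison mask value

// Return the single live lane (0 or 1) of a 2-lane shuffle mask, or -1 if
// the mask does not have exactly one live lane -- the same bail-out the
// removed combine expressed as "LiveLane == 2".
int exclusivelyLiveLane(const int Mask[2]) {
  for (int LiveLane = 0; LiveLane < 2; ++LiveLane)
    if (Mask[LiveLane] != PoisonMaskElem && Mask[!LiveLane] == PoisonMaskElem)
      return LiveLane;
  return -1;
}

int main() {
  int OneLive[2] = {1, PoisonMaskElem}; // <1,u>: lane 0 live, reads element 1
  int BothLive[2] = {0, 1};             // no poison lane: no fold possible
  std::printf("%d %d\n", exclusivelyLiveLane(OneLive),
              exclusivelyLiveLane(BothLive)); // prints "0 -1"
  return 0;
}
```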
@@ -5191,10 +5102,7 @@ static SDValue PerformFADDCombine(SDNode *N,
   SDValue N1 = N->getOperand(1);

   EVT VT = N0.getValueType();
-  if (VT.isVector())
-    return PerformPackedOpCombine(N, DCI);
-
-  if (!(VT == MVT::f32 || VT == MVT::f64))
+  if (VT.isVector() || !(VT == MVT::f32 || VT == MVT::f64))
     return SDValue();

   // First try with the default operand order.
@@ -5297,7 +5205,7 @@ static SDValue PerformANDCombine(SDNode *N,
     DCI.CombineTo(N, Val, AddTo);
   }

-  return PerformPackedOpCombine(N, DCI);
+  return SDValue();
 }

 static SDValue PerformREMCombine(SDNode *N,
@@ -5778,16 +5686,6 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
     return PerformADDCombine(N, DCI, OptLevel);
   case ISD::FADD:
     return PerformFADDCombine(N, DCI, OptLevel);
-  case ISD::FMUL:
-  case ISD::FMINNUM:
-  case ISD::FMAXIMUM:
-  case ISD::UMIN:
-  case ISD::UMAX:
-  case ISD::SMIN:
-  case ISD::SMAX:
-  case ISD::OR:
-  case ISD::XOR:
-    return PerformPackedOpCombine(N, DCI);
   case ISD::MUL:
     return PerformMULCombine(N, DCI, OptLevel);
   case ISD::SHL:
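These removed cases plugged into the opcode switch in NVPTXTargetLowering::PerformDAGCombine; after this change, FMUL and the min/max/bitwise opcodes fall through to the default (no target-specific combine) instead of routing to PerformPackedOpCombine. A minimal sketch of that dispatch shape (hypothetical stand-in types, not the LLVM API):

```cpp
#include <cstdio>

// Hypothetical stand-ins for LLVM's node/opcode machinery, just to show
// the switch-on-opcode dispatch shape used by PerformDAGCombine.
enum class Opcode { ADD, FADD, FMUL, MUL };

struct Node { Opcode Op; };

const char *combineFor(const Node &N) {
  switch (N.Op) {
  case Opcode::ADD:
    return "PerformADDCombine";
  case Opcode::FADD:
    return "PerformFADDCombine";
  // FMUL and friends no longer have a dedicated case: they take the
  // default path, meaning no target combine fires for them.
  default:
    return "no target combine";
  }
}

int main() {
  Node N{Opcode::FMUL};
  std::printf("%s\n", combineFor(N)); // prints "no target combine"
  return 0;
}
```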