Skip to content

Commit d90add0

Browse files
committed
[NVPTX] update how loads are optimized and disable on O0
1 parent 440750f commit d90add0

File tree

4 files changed

+113
-28
lines changed

4 files changed

+113
-28
lines changed

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1184,11 +1184,25 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
11841184
return false;
11851185
}
11861186

1187+
LLVM_DEBUG({
1188+
dbgs() << "tryLoadVector on " << TLI->getTargetNodeName(N->getOpcode())
1189+
<< ":\n";
1190+
dbgs() << " load type: " << MemVT << "\n";
1191+
dbgs() << " total load width: " << TotalWidth << " bits\n";
1192+
dbgs() << " from type width: " << FromTypeWidth << " bits\n";
1193+
dbgs() << " element type: " << EltVT << "\n";
1194+
});
1195+
11871196
if (isSubVectorPackedInInteger(EltVT)) {
11881197
assert(ExtensionType == ISD::NON_EXTLOAD);
11891198
FromTypeWidth = EltVT.getSizeInBits();
11901199
EltVT = MVT::getIntegerVT(FromTypeWidth);
11911200
FromType = NVPTX::PTXLdStInstCode::Untyped;
1201+
LLVM_DEBUG({
1202+
dbgs() << " packed integers detected:\n";
1203+
dbgs() << " from type width: " << FromTypeWidth << " (new)\n";
1204+
dbgs() << " element type: " << EltVT << " (new)\n";
1205+
});
11921206
}
11931207

11941208
assert(isPowerOf2_32(FromTypeWidth) && FromTypeWidth >= 8 &&
@@ -1501,9 +1515,23 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
15011515
return false;
15021516
}
15031517

1518+
LLVM_DEBUG({
1519+
dbgs() << "tryStoreVector on " << TLI->getTargetNodeName(N->getOpcode())
1520+
<< ":\n";
1521+
dbgs() << " store type: " << StoreVT << "\n";
1522+
dbgs() << " total store width: " << TotalWidth << " bits\n";
1523+
dbgs() << " to type width: " << ToTypeWidth << " bits\n";
1524+
dbgs() << " element type: " << EltVT << "\n";
1525+
});
1526+
15041527
if (isSubVectorPackedInInteger(EltVT)) {
15051528
ToTypeWidth = EltVT.getSizeInBits();
15061529
EltVT = MVT::getIntegerVT(ToTypeWidth);
1530+
LLVM_DEBUG({
1531+
dbgs() << " packed integers detected:\n";
1532+
dbgs() << " to type width: " << ToTypeWidth << " (new)\n";
1533+
dbgs() << " element type: " << EltVT << " (new)\n";
1534+
});
15071535
}
15081536

15091537
assert(isPowerOf2_32(ToTypeWidth) && ToTypeWidth >= 8 && ToTypeWidth <= 128 &&

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5196,8 +5196,19 @@ convertVectorLoad(SDNode *N, SelectionDAG &DAG, bool BuildVector,
51965196
return {{NewLD, LoadChain}};
51975197
}
51985198

5199+
static MachineMemOperand *
5200+
getMachineMemOperandForType(const SelectionDAG &DAG,
5201+
const MachineMemOperand *MMO,
5202+
const MachinePointerInfo &PointerInfo, MVT VT) {
5203+
return DAG.getMachineFunction().getMachineMemOperand(MMO, PointerInfo,
5204+
LLT(VT));
5205+
}
5206+
51995207
static SDValue PerformLoadCombine(SDNode *N,
52005208
TargetLowering::DAGCombinerInfo &DCI) {
5209+
if (DCI.DAG.getOptLevel() == CodeGenOptLevel::None)
5210+
return {};
5211+
52015212
auto *MemN = cast<MemSDNode>(N);
52025213
// only operate on vectors of f32s / i64s
52035214
if (EVT MemVT = MemN->getMemoryVT();
@@ -5278,9 +5289,13 @@ static SDValue PerformLoadCombine(SDNode *N,
52785289
// Do we have to tweak the opcode for an NVPTXISD::Load* or do we have to
52795290
// rewrite an ISD::LOAD?
52805291
std::optional<NVPTXISD::NodeType> NewOpcode;
5292+
5293+
// LoadV's are handled slightly differently in ISelDAGToDAG.
5294+
bool IsLoadV = false;
52815295
switch (N->getOpcode()) {
52825296
case NVPTXISD::LoadV2:
52835297
NewOpcode = NVPTXISD::LoadV4;
5298+
IsLoadV = true;
52845299
break;
52855300
case NVPTXISD::LoadParam:
52865301
NewOpcode = NVPTXISD::LoadParamV2;
@@ -5321,9 +5336,22 @@ static SDValue PerformLoadCombine(SDNode *N,
53215336
}
53225337
}
53235338

5339+
MVT LoadVT = MVT::f32;
5340+
MachineMemOperand *MMO = MemN->getMemOperand();
5341+
5342+
if (IsLoadV) {
5343+
// Some loads must have an operand type that matches the number of results
5344+
// and the type of each result. Because we changed a vNi64 to v(N*2)f32 we
5345+
// have to update it here. Note that LoadParam is not handled the same way
5346+
// in NVPTXISelDAGToDAG so we only do this for LoadV*.
5347+
LoadVT = MVT::getVectorVT(MVT::f32, NumElts);
5348+
MMO = getMachineMemOperandForType(DCI.DAG, MMO, MemN->getPointerInfo(),
5349+
LoadVT);
5350+
}
5351+
53245352
NewLoad = DCI.DAG.getMemIntrinsicNode(
53255353
*NewOpcode, SDLoc(N), DCI.DAG.getVTList(VTs),
5326-
SmallVector<SDValue>(N->ops()), MVT::f32, MemN->getMemOperand());
5354+
SmallVector<SDValue>(N->ops()), LoadVT, MMO);
53275355
NewChain = NewLoad.getValue(*NewChainIdx);
53285356
if (NewGlueIdx)
53295357
NewGlue = NewLoad.getValue(*NewGlueIdx);
@@ -5422,6 +5450,9 @@ static SDValue PerformStoreCombineHelper(SDNode *N,
54225450
// as the previous value will become unused and eliminated later.
54235451
return N->getOperand(0);
54245452

5453+
if (DCI.DAG.getOptLevel() == CodeGenOptLevel::None)
5454+
return {};
5455+
54255456
auto *MemN = cast<MemSDNode>(N);
54265457
if (MemN->getMemoryVT() == MVT::v2f32) {
54275458
// try to fold, and expand:
@@ -5453,13 +5484,15 @@ static SDValue PerformStoreCombineHelper(SDNode *N,
54535484
if (NewOpcode) {
54545485
// copy chain, offset from existing store
54555486
SmallVector<SDValue> NewOps = {N->getOperand(0), N->getOperand(1)};
5487+
unsigned NumElts = 0;
54565488
// gather all operands to expand
54575489
for (unsigned I = 2, E = N->getNumOperands(); I < E; ++I) {
54585490
SDValue CurrentOp = N->getOperand(I);
54595491
if (CurrentOp->getOpcode() == ISD::BUILD_VECTOR) {
54605492
assert(CurrentOp.getValueType() == MVT::v2f32);
54615493
NewOps.push_back(CurrentOp.getOperand(0));
54625494
NewOps.push_back(CurrentOp.getOperand(1));
5495+
NumElts += 2;
54635496
} else {
54645497
NewOps.clear();
54655498
break;

llvm/test/CodeGen/NVPTX/f16x2-instructions.ll

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -624,7 +624,7 @@ define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b,
624624
; CHECK-F16-NEXT: .reg .pred %p<3>;
625625
; CHECK-F16-NEXT: .reg .b32 %r<3>;
626626
; CHECK-F16-NEXT: .reg .b32 %f<7>;
627-
; CHECK-F16-NEXT: .reg .b64 %rd<3>;
627+
; CHECK-F16-NEXT: .reg .b64 %rd<4>;
628628
; CHECK-F16-EMPTY:
629629
; CHECK-F16-NEXT: // %bb.0:
630630
; CHECK-F16-NEXT: ld.param.b32 %r2, [test_select_cc_f32_f16_param_3];
@@ -636,7 +636,8 @@ define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b,
636636
; CHECK-F16-NEXT: mov.b64 {%f3, %f4}, %rd1;
637637
; CHECK-F16-NEXT: selp.f32 %f5, %f4, %f2, %p2;
638638
; CHECK-F16-NEXT: selp.f32 %f6, %f3, %f1, %p1;
639-
; CHECK-F16-NEXT: st.param.v2.b32 [func_retval0], {%f6, %f5};
639+
; CHECK-F16-NEXT: mov.b64 %rd3, {%f6, %f5};
640+
; CHECK-F16-NEXT: st.param.b64 [func_retval0], %rd3;
640641
; CHECK-F16-NEXT: ret;
641642
;
642643
; CHECK-NOF16-LABEL: test_select_cc_f32_f16(
@@ -645,7 +646,7 @@ define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b,
645646
; CHECK-NOF16-NEXT: .reg .b16 %rs<5>;
646647
; CHECK-NOF16-NEXT: .reg .b32 %r<3>;
647648
; CHECK-NOF16-NEXT: .reg .b32 %f<11>;
648-
; CHECK-NOF16-NEXT: .reg .b64 %rd<3>;
649+
; CHECK-NOF16-NEXT: .reg .b64 %rd<4>;
649650
; CHECK-NOF16-EMPTY:
650651
; CHECK-NOF16-NEXT: // %bb.0:
651652
; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_select_cc_f32_f16_param_3];
@@ -664,7 +665,8 @@ define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b,
664665
; CHECK-NOF16-NEXT: mov.b64 {%f7, %f8}, %rd1;
665666
; CHECK-NOF16-NEXT: selp.f32 %f9, %f8, %f6, %p2;
666667
; CHECK-NOF16-NEXT: selp.f32 %f10, %f7, %f5, %p1;
667-
; CHECK-NOF16-NEXT: st.param.v2.b32 [func_retval0], {%f10, %f9};
668+
; CHECK-NOF16-NEXT: mov.b64 %rd3, {%f10, %f9};
669+
; CHECK-NOF16-NEXT: st.param.b64 [func_retval0], %rd3;
668670
; CHECK-NOF16-NEXT: ret;
669671
<2 x half> %c, <2 x half> %d) #0 {
670672
%cc = fcmp une <2 x half> %c, %d
@@ -1593,13 +1595,15 @@ define <2 x float> @test_fpext_2xfloat(<2 x half> %a) #0 {
15931595
; CHECK-NEXT: .reg .b16 %rs<3>;
15941596
; CHECK-NEXT: .reg .b32 %r<2>;
15951597
; CHECK-NEXT: .reg .b32 %f<3>;
1598+
; CHECK-NEXT: .reg .b64 %rd<2>;
15961599
; CHECK-EMPTY:
15971600
; CHECK-NEXT: // %bb.0:
15981601
; CHECK-NEXT: ld.param.b32 %r1, [test_fpext_2xfloat_param_0];
15991602
; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1;
16001603
; CHECK-NEXT: cvt.f32.f16 %f1, %rs2;
16011604
; CHECK-NEXT: cvt.f32.f16 %f2, %rs1;
1602-
; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%f2, %f1};
1605+
; CHECK-NEXT: mov.b64 %rd1, {%f2, %f1};
1606+
; CHECK-NEXT: st.param.b64 [func_retval0], %rd1;
16031607
; CHECK-NEXT: ret;
16041608
%r = fpext <2 x half> %a to <2 x float>
16051609
ret <2 x float> %r
@@ -2097,6 +2101,7 @@ define <2 x float> @test_copysign_extended(<2 x half> %a, <2 x half> %b) #0 {
20972101
; CHECK-F16-NEXT: .reg .b16 %rs<3>;
20982102
; CHECK-F16-NEXT: .reg .b32 %r<6>;
20992103
; CHECK-F16-NEXT: .reg .b32 %f<3>;
2104+
; CHECK-F16-NEXT: .reg .b64 %rd<2>;
21002105
; CHECK-F16-EMPTY:
21012106
; CHECK-F16-NEXT: // %bb.0:
21022107
; CHECK-F16-NEXT: ld.param.b32 %r2, [test_copysign_extended_param_1];
@@ -2107,14 +2112,16 @@ define <2 x float> @test_copysign_extended(<2 x half> %a, <2 x half> %b) #0 {
21072112
; CHECK-F16-NEXT: mov.b32 {%rs1, %rs2}, %r5;
21082113
; CHECK-F16-NEXT: cvt.f32.f16 %f1, %rs2;
21092114
; CHECK-F16-NEXT: cvt.f32.f16 %f2, %rs1;
2110-
; CHECK-F16-NEXT: st.param.v2.b32 [func_retval0], {%f2, %f1};
2115+
; CHECK-F16-NEXT: mov.b64 %rd1, {%f2, %f1};
2116+
; CHECK-F16-NEXT: st.param.b64 [func_retval0], %rd1;
21112117
; CHECK-F16-NEXT: ret;
21122118
;
21132119
; CHECK-NOF16-LABEL: test_copysign_extended(
21142120
; CHECK-NOF16: {
21152121
; CHECK-NOF16-NEXT: .reg .b16 %rs<11>;
21162122
; CHECK-NOF16-NEXT: .reg .b32 %r<3>;
21172123
; CHECK-NOF16-NEXT: .reg .b32 %f<3>;
2124+
; CHECK-NOF16-NEXT: .reg .b64 %rd<2>;
21182125
; CHECK-NOF16-EMPTY:
21192126
; CHECK-NOF16-NEXT: // %bb.0:
21202127
; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_copysign_extended_param_1];
@@ -2129,7 +2136,8 @@ define <2 x float> @test_copysign_extended(<2 x half> %a, <2 x half> %b) #0 {
21292136
; CHECK-NOF16-NEXT: or.b16 %rs10, %rs9, %rs8;
21302137
; CHECK-NOF16-NEXT: cvt.f32.f16 %f1, %rs10;
21312138
; CHECK-NOF16-NEXT: cvt.f32.f16 %f2, %rs7;
2132-
; CHECK-NOF16-NEXT: st.param.v2.b32 [func_retval0], {%f2, %f1};
2139+
; CHECK-NOF16-NEXT: mov.b64 %rd1, {%f2, %f1};
2140+
; CHECK-NOF16-NEXT: st.param.b64 [func_retval0], %rd1;
21332141
; CHECK-NOF16-NEXT: ret;
21342142
%r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %b)
21352143
%xr = fpext <2 x half> %r to <2 x float>

0 commit comments

Comments
 (0)