Skip to content

Commit 20a2f0b

Browse files
committed
[NVPTX] fold v2f32 = bitcast (i64,i64,... = NVPTXISD::Load*)
Fold i64->v2f32 bitcasts on the results of a NVPTXISD::Load* op.
1 parent d3c8dc8 commit 20a2f0b

File tree

4 files changed

+121
-62
lines changed

4 files changed

+121
-62
lines changed

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1179,7 +1179,9 @@ static bool isVectorElementTypeUpsized(EVT EltVT) {
11791179
// In order to load/store such vectors efficiently, in Type Legalization
11801180
// we split the vector into word-sized chunks (v2x16/v4i8). Now, we will
11811181
// lower to PTX as vectors of b32.
1182-
return Isv2x16VT(EltVT) || EltVT == MVT::v4i8;
1182+
// We also consider v2f32 as an upsized type, which may be used in packed
1183+
// (f32x2) instructions.
1184+
return Isv2x16VT(EltVT) || EltVT == MVT::v4i8 || EltVT == MVT::v2f32;
11831185
}
11841186

11851187
bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
@@ -1235,9 +1237,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
12351237
EVT EltVT = N->getValueType(0);
12361238

12371239
if (isVectorElementTypeUpsized(EltVT)) {
1238-
EltVT = MVT::i32;
1240+
FromTypeWidth = EltVT.getSizeInBits();
1241+
EltVT = MVT::getIntegerVT(FromTypeWidth);
12391242
FromType = NVPTX::PTXLdStInstCode::Untyped;
1240-
FromTypeWidth = 32;
12411243
}
12421244

12431245
SDValue Offset, Base;
@@ -1562,9 +1564,9 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
15621564
}
15631565

15641566
if (isVectorElementTypeUpsized(EltVT)) {
1565-
EltVT = MVT::i32;
1567+
ToTypeWidth = EltVT.getSizeInBits();
1568+
EltVT = MVT::getIntegerVT(ToTypeWidth);
15661569
ToType = NVPTX::PTXLdStInstCode::Untyped;
1567-
ToTypeWidth = 32;
15681570
}
15691571

15701572
SDValue Offset, Base;

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

Lines changed: 58 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -833,7 +833,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
833833
setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD,
834834
ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM, ISD::VSELECT,
835835
ISD::BUILD_VECTOR, ISD::ADDRSPACECAST, ISD::FP_ROUND,
836-
ISD::TRUNCATE, ISD::LOAD});
836+
ISD::TRUNCATE, ISD::LOAD, ISD::BITCAST});
837837

838838
// setcc for f16x2 and bf16x2 needs special handling to prevent
839839
// legalizer's attempt to scalarize it due to v2i1 not being legal.
@@ -6144,6 +6144,61 @@ static SDValue PerformTRUNCATECombine(SDNode *N,
61446144
return SDValue();
61456145
}
61466146

6147+
/// Fold `v2f32 = bitcast (i64 result of an NVPTXISD load)` into the load.
///
/// When \p N is a bitcast from i64 to v2f32 whose operand is produced by an
/// NVPTXISD::LoadV2, NVPTXISD::LoadParam or NVPTXISD::LoadParamV2 node, and
/// every use of that load is such a bitcast, the load is re-created with each
/// i64 result retyped to v2f32 and all the bitcasts are replaced by the
/// corresponding results of the new load.
///
/// \param N   The ISD::BITCAST node being combined.
/// \param DCI Combiner state; supplies the DAG and the CombineTo helper.
/// \returns SDValue(N, 0) when the fold was applied (N has already been
///          replaced through DCI.CombineTo), or an empty SDValue when the
///          pattern does not match and nothing was changed.
static SDValue PerformBITCASTCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI) {
  if (N->getValueType(0) != MVT::v2f32)
    return SDValue();

  SDValue Operand = N->getOperand(0);
  if (Operand.getValueType() != MVT::i64)
    return SDValue();

  // DAGCombiner handles bitcast(ISD::LOAD) already. For these, we'll do the
  // same thing, by changing their output values from i64 to v2f32. Then the
  // rule for combining loads (see PerformLoadCombine) may split these loads
  // further.
  const unsigned Opc = Operand.getOpcode();
  if (Opc != NVPTXISD::LoadV2 && Opc != NVPTXISD::LoadParam &&
      Opc != NVPTXISD::LoadParamV2)
    return SDValue();

  // Every use of the load must be an i64 -> v2f32 bitcast; otherwise the
  // load's i64 results are still needed as-is and we leave everything alone.
  // Remember each bitcast together with the load result number it consumes.
  // NOTE(review): if uses() also enumerates users of the non-i64 (chain/glue)
  // results, any such user makes us bail out here, and the chain/glue
  // replacement loop below never has anything to rewrite. Confirm whether
  // that is the intended restriction.
  SmallVector<std::pair<SDNode *, unsigned /* resno */>> OldUses;
  for (SDUse &U : Operand->uses()) {
    SDNode *User = U.getUser();
    if (!(User->getOpcode() == ISD::BITCAST &&
          User->getValueType(0) == MVT::v2f32 &&
          U.getValueType() == MVT::i64))
      return SDValue(); // unhandled pattern
    OldUses.push_back({User, U.getResNo()});
  }

  // Result types of the new load: i64 becomes v2f32, all others are kept.
  auto *MemN = cast<MemSDNode>(Operand);
  SmallVector<EVT> VTs;
  for (const auto &VT : Operand->values()) {
    if (VT == MVT::i64)
      VTs.push_back(MVT::v2f32);
    else
      VTs.push_back(VT);
  }

  // Same opcode, operands, memory VT and memory operand as the old load; only
  // the result types differ.
  SDValue NewLoad = DCI.DAG.getMemIntrinsicNode(
      Operand.getOpcode(), SDLoc(Operand), DCI.DAG.getVTList(VTs),
      SmallVector<SDValue>(Operand->ops()), MemN->getMemoryVT(),
      MemN->getMemOperand());

  // replace all chain/glue uses of the old load
  for (unsigned I = 0, E = Operand->getNumValues(); I != E; ++I)
    if (Operand->getValueType(I) != MVT::i64)
      DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(MemN, I),
                                        NewLoad.getValue(I));

  // replace all bitcasts with values from the new load
  for (auto &[BC, ResultNum] : OldUses)
    DCI.CombineTo(BC, NewLoad.getValue(ResultNum), false);

  // N is one of the bitcasts replaced above. Returning N itself tells the
  // combiner that N was already replaced via CombineTo and needs no further
  // processing; an empty SDValue would instead claim that nothing changed.
  return SDValue(N, 0);
}
6201+
61476202
SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
61486203
DAGCombinerInfo &DCI) const {
61496204
CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
@@ -6189,6 +6244,8 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
61896244
return PerformFP_ROUNDCombine(N, DCI);
61906245
case ISD::TRUNCATE:
61916246
return PerformTRUNCATECombine(N, DCI);
6247+
case ISD::BITCAST:
6248+
return PerformBITCASTCombine(N, DCI);
61926249
}
61936250
return SDValue();
61946251
}

llvm/test/CodeGen/NVPTX/f32x2-instructions.ll

Lines changed: 40 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -117,14 +117,14 @@ define <2 x float> @test_fadd_imm_1(<2 x float> %a) #0 {
117117
define <4 x float> @test_fadd_v4(<4 x float> %a, <4 x float> %b) #0 {
118118
; CHECK-LABEL: test_fadd_v4(
119119
; CHECK: {
120-
; CHECK-NEXT: .reg .b64 %rd<11>;
120+
; CHECK-NEXT: .reg .b64 %rd<7>;
121121
; CHECK-EMPTY:
122122
; CHECK-NEXT: // %bb.0:
123-
; CHECK-NEXT: ld.param.v2.u64 {%rd5, %rd6}, [test_fadd_v4_param_1];
124-
; CHECK-NEXT: ld.param.v2.u64 {%rd7, %rd8}, [test_fadd_v4_param_0];
125-
; CHECK-NEXT: add.rn.f32x2 %rd9, %rd8, %rd6;
126-
; CHECK-NEXT: add.rn.f32x2 %rd10, %rd7, %rd5;
127-
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd10, %rd9};
123+
; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_fadd_v4_param_1];
124+
; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_v4_param_0];
125+
; CHECK-NEXT: add.rn.f32x2 %rd5, %rd2, %rd4;
126+
; CHECK-NEXT: add.rn.f32x2 %rd6, %rd1, %rd3;
127+
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd5};
128128
; CHECK-NEXT: ret;
129129
%r = fadd <4 x float> %a, %b
130130
ret <4 x float> %r
@@ -134,19 +134,19 @@ define <4 x float> @test_fadd_imm_0_v4(<4 x float> %a) #0 {
134134
; CHECK-LABEL: test_fadd_imm_0_v4(
135135
; CHECK: {
136136
; CHECK-NEXT: .reg .f32 %f<5>;
137-
; CHECK-NEXT: .reg .b64 %rd<9>;
137+
; CHECK-NEXT: .reg .b64 %rd<7>;
138138
; CHECK-EMPTY:
139139
; CHECK-NEXT: // %bb.0:
140-
; CHECK-NEXT: ld.param.v2.u64 {%rd3, %rd4}, [test_fadd_imm_0_v4_param_0];
140+
; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_0_v4_param_0];
141141
; CHECK-NEXT: mov.f32 %f1, 0f40800000;
142142
; CHECK-NEXT: mov.f32 %f2, 0f40400000;
143-
; CHECK-NEXT: mov.b64 %rd5, {%f2, %f1};
144-
; CHECK-NEXT: add.rn.f32x2 %rd6, %rd4, %rd5;
143+
; CHECK-NEXT: mov.b64 %rd3, {%f2, %f1};
144+
; CHECK-NEXT: add.rn.f32x2 %rd4, %rd2, %rd3;
145145
; CHECK-NEXT: mov.f32 %f3, 0f40000000;
146146
; CHECK-NEXT: mov.f32 %f4, 0f3F800000;
147-
; CHECK-NEXT: mov.b64 %rd7, {%f4, %f3};
148-
; CHECK-NEXT: add.rn.f32x2 %rd8, %rd3, %rd7;
149-
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd6};
147+
; CHECK-NEXT: mov.b64 %rd5, {%f4, %f3};
148+
; CHECK-NEXT: add.rn.f32x2 %rd6, %rd1, %rd5;
149+
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd4};
150150
; CHECK-NEXT: ret;
151151
%r = fadd <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %a
152152
ret <4 x float> %r
@@ -156,19 +156,19 @@ define <4 x float> @test_fadd_imm_1_v4(<4 x float> %a) #0 {
156156
; CHECK-LABEL: test_fadd_imm_1_v4(
157157
; CHECK: {
158158
; CHECK-NEXT: .reg .f32 %f<5>;
159-
; CHECK-NEXT: .reg .b64 %rd<9>;
159+
; CHECK-NEXT: .reg .b64 %rd<7>;
160160
; CHECK-EMPTY:
161161
; CHECK-NEXT: // %bb.0:
162-
; CHECK-NEXT: ld.param.v2.u64 {%rd3, %rd4}, [test_fadd_imm_1_v4_param_0];
162+
; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_1_v4_param_0];
163163
; CHECK-NEXT: mov.f32 %f1, 0f40800000;
164164
; CHECK-NEXT: mov.f32 %f2, 0f40400000;
165-
; CHECK-NEXT: mov.b64 %rd5, {%f2, %f1};
166-
; CHECK-NEXT: add.rn.f32x2 %rd6, %rd4, %rd5;
165+
; CHECK-NEXT: mov.b64 %rd3, {%f2, %f1};
166+
; CHECK-NEXT: add.rn.f32x2 %rd4, %rd2, %rd3;
167167
; CHECK-NEXT: mov.f32 %f3, 0f40000000;
168168
; CHECK-NEXT: mov.f32 %f4, 0f3F800000;
169-
; CHECK-NEXT: mov.b64 %rd7, {%f4, %f3};
170-
; CHECK-NEXT: add.rn.f32x2 %rd8, %rd3, %rd7;
171-
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd6};
169+
; CHECK-NEXT: mov.b64 %rd5, {%f4, %f3};
170+
; CHECK-NEXT: add.rn.f32x2 %rd6, %rd1, %rd5;
171+
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd4};
172172
; CHECK-NEXT: ret;
173173
%r = fadd <4 x float> %a, <float 1.0, float 2.0, float 3.0, float 4.0>
174174
ret <4 x float> %r
@@ -340,14 +340,14 @@ define <2 x float> @test_fadd_imm_1_ftz(<2 x float> %a) #2 {
340340
define <4 x float> @test_fadd_v4_ftz(<4 x float> %a, <4 x float> %b) #2 {
341341
; CHECK-LABEL: test_fadd_v4_ftz(
342342
; CHECK: {
343-
; CHECK-NEXT: .reg .b64 %rd<11>;
343+
; CHECK-NEXT: .reg .b64 %rd<7>;
344344
; CHECK-EMPTY:
345345
; CHECK-NEXT: // %bb.0:
346-
; CHECK-NEXT: ld.param.v2.u64 {%rd5, %rd6}, [test_fadd_v4_ftz_param_1];
347-
; CHECK-NEXT: ld.param.v2.u64 {%rd7, %rd8}, [test_fadd_v4_ftz_param_0];
348-
; CHECK-NEXT: add.rn.ftz.f32x2 %rd9, %rd8, %rd6;
349-
; CHECK-NEXT: add.rn.ftz.f32x2 %rd10, %rd7, %rd5;
350-
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd10, %rd9};
346+
; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_fadd_v4_ftz_param_1];
347+
; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_v4_ftz_param_0];
348+
; CHECK-NEXT: add.rn.ftz.f32x2 %rd5, %rd2, %rd4;
349+
; CHECK-NEXT: add.rn.ftz.f32x2 %rd6, %rd1, %rd3;
350+
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd5};
351351
; CHECK-NEXT: ret;
352352
%r = fadd <4 x float> %a, %b
353353
ret <4 x float> %r
@@ -357,19 +357,19 @@ define <4 x float> @test_fadd_imm_0_v4_ftz(<4 x float> %a) #2 {
357357
; CHECK-LABEL: test_fadd_imm_0_v4_ftz(
358358
; CHECK: {
359359
; CHECK-NEXT: .reg .f32 %f<5>;
360-
; CHECK-NEXT: .reg .b64 %rd<9>;
360+
; CHECK-NEXT: .reg .b64 %rd<7>;
361361
; CHECK-EMPTY:
362362
; CHECK-NEXT: // %bb.0:
363-
; CHECK-NEXT: ld.param.v2.u64 {%rd3, %rd4}, [test_fadd_imm_0_v4_ftz_param_0];
363+
; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_0_v4_ftz_param_0];
364364
; CHECK-NEXT: mov.f32 %f1, 0f40800000;
365365
; CHECK-NEXT: mov.f32 %f2, 0f40400000;
366-
; CHECK-NEXT: mov.b64 %rd5, {%f2, %f1};
367-
; CHECK-NEXT: add.rn.ftz.f32x2 %rd6, %rd4, %rd5;
366+
; CHECK-NEXT: mov.b64 %rd3, {%f2, %f1};
367+
; CHECK-NEXT: add.rn.ftz.f32x2 %rd4, %rd2, %rd3;
368368
; CHECK-NEXT: mov.f32 %f3, 0f40000000;
369369
; CHECK-NEXT: mov.f32 %f4, 0f3F800000;
370-
; CHECK-NEXT: mov.b64 %rd7, {%f4, %f3};
371-
; CHECK-NEXT: add.rn.ftz.f32x2 %rd8, %rd3, %rd7;
372-
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd6};
370+
; CHECK-NEXT: mov.b64 %rd5, {%f4, %f3};
371+
; CHECK-NEXT: add.rn.ftz.f32x2 %rd6, %rd1, %rd5;
372+
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd4};
373373
; CHECK-NEXT: ret;
374374
%r = fadd <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %a
375375
ret <4 x float> %r
@@ -379,19 +379,19 @@ define <4 x float> @test_fadd_imm_1_v4_ftz(<4 x float> %a) #2 {
379379
; CHECK-LABEL: test_fadd_imm_1_v4_ftz(
380380
; CHECK: {
381381
; CHECK-NEXT: .reg .f32 %f<5>;
382-
; CHECK-NEXT: .reg .b64 %rd<9>;
382+
; CHECK-NEXT: .reg .b64 %rd<7>;
383383
; CHECK-EMPTY:
384384
; CHECK-NEXT: // %bb.0:
385-
; CHECK-NEXT: ld.param.v2.u64 {%rd3, %rd4}, [test_fadd_imm_1_v4_ftz_param_0];
385+
; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_1_v4_ftz_param_0];
386386
; CHECK-NEXT: mov.f32 %f1, 0f40800000;
387387
; CHECK-NEXT: mov.f32 %f2, 0f40400000;
388-
; CHECK-NEXT: mov.b64 %rd5, {%f2, %f1};
389-
; CHECK-NEXT: add.rn.ftz.f32x2 %rd6, %rd4, %rd5;
388+
; CHECK-NEXT: mov.b64 %rd3, {%f2, %f1};
389+
; CHECK-NEXT: add.rn.ftz.f32x2 %rd4, %rd2, %rd3;
390390
; CHECK-NEXT: mov.f32 %f3, 0f40000000;
391391
; CHECK-NEXT: mov.f32 %f4, 0f3F800000;
392-
; CHECK-NEXT: mov.b64 %rd7, {%f4, %f3};
393-
; CHECK-NEXT: add.rn.ftz.f32x2 %rd8, %rd3, %rd7;
394-
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd6};
392+
; CHECK-NEXT: mov.b64 %rd5, {%f4, %f3};
393+
; CHECK-NEXT: add.rn.ftz.f32x2 %rd6, %rd1, %rd5;
394+
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd4};
395395
; CHECK-NEXT: ret;
396396
%r = fadd <4 x float> %a, <float 1.0, float 2.0, float 3.0, float 4.0>
397397
ret <4 x float> %r

llvm/test/CodeGen/NVPTX/vec-param-load.ll

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -5,40 +5,40 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
55

66
define <16 x float> @test_v16f32(<16 x float> %a) {
77
; CHECK-LABEL: test_v16f32(
8-
; CHECK-DAG: ld.param.v4.f32 {[[V_12_15:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+48];
9-
; CHECK-DAG: ld.param.v4.f32 {[[V_8_11:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+32];
10-
; CHECK-DAG: ld.param.v4.f32 {[[V_4_7:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+16];
11-
; CHECK-DAG: ld.param.v4.f32 {[[V_0_3:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0];
12-
; CHECK-DAG: st.param.v4.f32 [func_retval0], {[[V_0_3]]}
13-
; CHECK-DAG: st.param.v4.f32 [func_retval0+16], {[[V_4_7]]}
14-
; CHECK-DAG: st.param.v4.f32 [func_retval0+32], {[[V_8_11]]}
15-
; CHECK-DAG: st.param.v4.f32 [func_retval0+48], {[[V_12_15]]}
8+
; CHECK-DAG: ld.param.v2.b64 {[[V_12_15:(%rd[0-9]+[, ]*){2}]]}, [test_v16f32_param_0+48];
9+
; CHECK-DAG: ld.param.v2.b64 {[[V_8_11:(%rd[0-9]+[, ]*){2}]]}, [test_v16f32_param_0+32];
10+
; CHECK-DAG: ld.param.v2.b64 {[[V_4_7:(%rd[0-9]+[, ]*){2}]]}, [test_v16f32_param_0+16];
11+
; CHECK-DAG: ld.param.v2.b64 {[[V_0_3:(%rd[0-9]+[, ]*){2}]]}, [test_v16f32_param_0];
12+
; CHECK-DAG: st.param.v2.b64 [func_retval0], {[[V_0_3]]}
13+
; CHECK-DAG: st.param.v2.b64 [func_retval0+16], {[[V_4_7]]}
14+
; CHECK-DAG: st.param.v2.b64 [func_retval0+32], {[[V_8_11]]}
15+
; CHECK-DAG: st.param.v2.b64 [func_retval0+48], {[[V_12_15]]}
1616
; CHECK: ret;
1717
ret <16 x float> %a
1818
}
1919

2020
define <8 x float> @test_v8f32(<8 x float> %a) {
2121
; CHECK-LABEL: test_v8f32(
22-
; CHECK-DAG: ld.param.v4.f32 {[[V_4_7:(%f[0-9]+[, ]*){4}]]}, [test_v8f32_param_0+16];
23-
; CHECK-DAG: ld.param.v4.f32 {[[V_0_3:(%f[0-9]+[, ]*){4}]]}, [test_v8f32_param_0];
24-
; CHECK-DAG: st.param.v4.f32 [func_retval0], {[[V_0_3]]}
25-
; CHECK-DAG: st.param.v4.f32 [func_retval0+16], {[[V_4_7]]}
22+
; CHECK-DAG: ld.param.v2.b64 {[[V_4_7:(%rd[0-9]+[, ]*){2}]]}, [test_v8f32_param_0+16];
23+
; CHECK-DAG: ld.param.v2.b64 {[[V_0_3:(%rd[0-9]+[, ]*){2}]]}, [test_v8f32_param_0];
24+
; CHECK-DAG: st.param.v2.b64 [func_retval0], {[[V_0_3]]}
25+
; CHECK-DAG: st.param.v2.b64 [func_retval0+16], {[[V_4_7]]}
2626
; CHECK: ret;
2727
ret <8 x float> %a
2828
}
2929

3030
define <4 x float> @test_v4f32(<4 x float> %a) {
3131
; CHECK-LABEL: test_v4f32(
32-
; CHECK-DAG: ld.param.v4.f32 {[[V_0_3:(%f[0-9]+[, ]*){4}]]}, [test_v4f32_param_0];
33-
; CHECK-DAG: st.param.v4.f32 [func_retval0], {[[V_0_3]]}
32+
; CHECK-DAG: ld.param.v2.b64 {[[V_0_3:(%rd[0-9]+[, ]*){2}]]}, [test_v4f32_param_0];
33+
; CHECK-DAG: st.param.v2.b64 [func_retval0], {[[V_0_3]]}
3434
; CHECK: ret;
3535
ret <4 x float> %a
3636
}
3737

3838
define <2 x float> @test_v2f32(<2 x float> %a) {
3939
; CHECK-LABEL: test_v2f32(
40-
; CHECK-DAG: ld.param.v2.f32 {[[V_0_3:(%f[0-9]+[, ]*){2}]]}, [test_v2f32_param_0];
41-
; CHECK-DAG: st.param.v2.f32 [func_retval0], {[[V_0_3]]}
40+
; CHECK-DAG: ld.param.b64 [[V_0_3:%rd[0-9]+]], [test_v2f32_param_0];
41+
; CHECK-DAG: st.param.b64 [func_retval0], [[V_0_3]]
4242
; CHECK: ret;
4343
ret <2 x float> %a
4444
}

0 commit comments

Comments
 (0)