diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index ff10eea371049..533c54f4285cf 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -471,8 +471,17 @@ bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) { // We only care about 16x2 as it's the only real vector type we // need to deal with. MVT VT = Vector.getSimpleValueType(); - if (!Isv2x16VT(VT)) + if (!isPackedVectorTy(VT) || VT.getVectorNumElements() != 2) return false; + + unsigned Opcode; + if (VT.is32BitVector()) + Opcode = NVPTX::I32toV2I16; + else if (VT.is64BitVector()) + Opcode = NVPTX::I64toV2I32; + else + llvm_unreachable("Unhandled packed type"); + // Find and record all uses of this vector that extract element 0 or 1. SmallVector E0, E1; for (auto *U : Vector.getNode()->users()) { @@ -496,11 +505,11 @@ bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) { if (E0.empty() || E1.empty()) return false; - // Merge (f16 extractelt(V, 0), f16 extractelt(V,1)) - // into f16,f16 SplitF16x2(V) + // Merge (EltTy extractelt(V, 0), EltTy extractelt(V,1)) + // into EltTy,EltTy Split[EltTy]x2(V) MVT EltVT = VT.getVectorElementType(); SDNode *ScatterOp = - CurDAG->getMachineNode(NVPTX::I32toV2I16, SDLoc(N), EltVT, EltVT, Vector); + CurDAG->getMachineNode(Opcode, SDLoc(N), EltVT, EltVT, Vector); for (auto *Node : E0) ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 0)); for (auto *Node : E1) @@ -1035,6 +1044,7 @@ pickOpcodeForVT(MVT::SimpleValueType VT, std::optional Opcode_i8, case MVT::i32: case MVT::f32: return Opcode_i32; + case MVT::v2f32: case MVT::i64: case MVT::f64: return Opcode_i64; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 492f4ab76fdbb..22c568b297bb7 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -195,11 +195,6 @@ static bool IsPTXVectorType(MVT 
VT) { } } -static bool Is16bitsType(MVT VT) { - return (VT.SimpleTy == MVT::f16 || VT.SimpleTy == MVT::bf16 || - VT.SimpleTy == MVT::i16); -} - // When legalizing vector loads/stores, this function is called, which does two // things: // 1. Determines Whether the vector is something we want to custom lower, @@ -330,11 +325,8 @@ static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, } ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset); - for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) { - EVT VT = TempVTs[i]; - uint64_t Off = TempOffsets[i]; - // Split vectors into individual elements, except for v2f16, which - // we will pass as a single scalar. + for (auto [VT, Off] : zip(TempVTs, TempOffsets)) { + // Split vectors into individual elements, except for packed types if (VT.isVector()) { unsigned NumElts = VT.getVectorNumElements(); EVT EltVT = VT.getVectorElementType(); @@ -342,10 +334,21 @@ static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, // TargetLoweringBase::getVectorTypeBreakdown() which is invoked in // ComputePTXValueVTs() cannot currently break down non-power-of-2 sized // vectors. - if ((Is16bitsType(EltVT.getSimpleVT())) && NumElts % 2 == 0 && - isPowerOf2_32(NumElts)) { - // Vectors with an even number of f16 elements will be passed to - // us as an array of v2f16/v2bf16 elements. We must match this so we + + // Special case handling for packed i8s. + if (EltVT.getSimpleVT() == MVT::i8 && + ((NumElts % 4 == 0 && isPowerOf2_32(NumElts)) || NumElts == 3)) { + // v*i8 are formally lowered as v4i8 + EltVT = MVT::v4i8; + NumElts = (NumElts + 3) / 4; + } else if (EltVT.getSimpleVT() == MVT::i8 && NumElts == 2) { + // v2i8 is promoted to v2i16 + NumElts = 1; + EltVT = MVT::v2i16; + } else if (isPackedElementTy(EltVT) && NumElts % 2 == 0 && + isPowerOf2_32(NumElts)) { + // Vectors with an even number of elements will be passed to + // us as an array of pairs of 2 elements. 
We must match this so we // stay in sync with Ins/Outs. switch (EltVT.getSimpleVT().SimpleTy) { case MVT::f16: @@ -357,20 +360,13 @@ static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, case MVT::i16: EltVT = MVT::v2i16; break; + case MVT::f32: + EltVT = MVT::v2f32; + break; default: llvm_unreachable("Unexpected type"); } NumElts /= 2; - } else if (EltVT.getSimpleVT() == MVT::i8 && - ((NumElts % 4 == 0 && isPowerOf2_32(NumElts)) || - NumElts == 3)) { - // v*i8 are formally lowered as v4i8 - EltVT = MVT::v4i8; - NumElts = (NumElts + 3) / 4; - } else if (EltVT.getSimpleVT() == MVT::i8 && NumElts == 2) { - // v2i8 is promoted to v2i16 - NumElts = 1; - EltVT = MVT::v2i16; } for (unsigned j = 0; j != NumElts; ++j) { ValueVTs.push_back(EltVT); @@ -601,6 +597,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, addRegisterClass(MVT::v2f16, &NVPTX::Int32RegsRegClass); addRegisterClass(MVT::bf16, &NVPTX::Int16RegsRegClass); addRegisterClass(MVT::v2bf16, &NVPTX::Int32RegsRegClass); + addRegisterClass(MVT::v2f32, &NVPTX::Int64RegsRegClass); // Conversion to/from FP16/FP16x2 is always legal. setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom); @@ -637,6 +634,10 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i8, Custom); + // No support for these operations with v2f32. + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f32, Expand); + // Custom conversions to/from v2i8. setOperationAction(ISD::BITCAST, MVT::v2i8, Custom); @@ -662,12 +663,16 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, // Operations not directly supported by NVPTX. 
for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32, - MVT::f64, MVT::i1, MVT::i8, MVT::i16, MVT::v2i16, MVT::v4i8, - MVT::i32, MVT::i64}) { + MVT::v2f32, MVT::f64, MVT::i1, MVT::i8, MVT::i16, MVT::v2i16, + MVT::v4i8, MVT::i32, MVT::i64}) { setOperationAction(ISD::SELECT_CC, VT, Expand); setOperationAction(ISD::BR_CC, VT, Expand); } + // Not directly supported. TLI would attempt to expand operations like + // FMINIMUM(v2f32) using invalid SETCC and VSELECT nodes. + setOperationAction(ISD::VSELECT, MVT::v2f32, Expand); + // Some SIGN_EXTEND_INREG can be done using cvt instruction. // For others we will expand to a SHL/SRA pair. setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal); @@ -866,6 +871,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setBF16OperationAction(Op, MVT::bf16, Legal, Promote); if (getOperationAction(Op, MVT::bf16) == Promote) AddPromotedToType(Op, MVT::bf16, MVT::f32); + setOperationAction(Op, MVT::v2f32, + STI.hasF32x2Instructions() ? Legal : Expand); } // On SM80, we select add/mul/sub as fma to avoid promotion to float @@ -887,6 +894,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setBF16OperationAction(ISD::FNEG, MVT::bf16, Legal, Expand); setBF16OperationAction(ISD::FNEG, MVT::v2bf16, Legal, Expand); + setOperationAction(ISD::FNEG, MVT::v2f32, Expand); // (would be) Library functions. // These map to conversion instructions for scalar FP types. 
@@ -897,6 +905,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction(Op, MVT::f64, Legal); setOperationAction(Op, MVT::v2f16, Expand); setOperationAction(Op, MVT::v2bf16, Expand); + setOperationAction(Op, MVT::v2f32, Expand); setBF16OperationAction(Op, MVT::bf16, Legal, Promote); if (getOperationAction(Op, MVT::bf16) == Promote) AddPromotedToType(Op, MVT::bf16, MVT::f32); @@ -912,6 +921,11 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, } } + // Expand v2f32 = fp_extend + setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand); + // Expand v2[b]f16 = fp_round v2f32 + setOperationAction(ISD::FP_ROUND, {MVT::v2bf16, MVT::v2f16}, Expand); + // sm_80 only has conversions between f32 and bf16. Custom lower all other // bf16 conversions. if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) { @@ -949,14 +963,14 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction(Op, MVT::f16, Promote); setOperationAction(Op, MVT::f32, Legal); setOperationAction(Op, MVT::f64, Legal); - setOperationAction(Op, MVT::v2f16, Expand); - setOperationAction(Op, MVT::v2bf16, Expand); + setOperationAction(Op, {MVT::v2f16, MVT::v2bf16, MVT::v2f32}, Expand); setOperationAction(Op, MVT::bf16, Promote); AddPromotedToType(Op, MVT::bf16, MVT::f32); } setOperationAction(ISD::FREM, {MVT::f32, MVT::f64}, Custom); setOperationAction(ISD::FABS, {MVT::f32, MVT::f64}, Legal); + setOperationAction(ISD::FABS, MVT::v2f32, Expand); if (STI.getPTXVersion() >= 65) { setFP16OperationAction(ISD::FABS, MVT::f16, Legal, Promote); setFP16OperationAction(ISD::FABS, MVT::v2f16, Legal, Expand); @@ -978,6 +992,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setBF16OperationAction(Op, MVT::bf16, Legal, Promote); if (getOperationAction(Op, MVT::bf16) == Promote) AddPromotedToType(Op, MVT::bf16, MVT::f32); + setOperationAction(Op, MVT::v2f32, Expand); } bool SupportsF32MinMaxNaN = 
STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70; @@ -987,6 +1002,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setFP16OperationAction(Op, MVT::v2f16, Legal, Expand); setBF16OperationAction(Op, MVT::bf16, Legal, Expand); setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand); + setOperationAction(Op, MVT::v2f32, Expand); } // Custom lowering for inline asm with 128-bit operands @@ -999,6 +1015,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, // - bf16/bf16x2 (sm_90+, PTX 7.8+) // When f16/bf16 types aren't supported, they are promoted/expanded to f32. setOperationAction(ISD::FEXP2, MVT::f32, Legal); + setOperationAction(ISD::FEXP2, MVT::v2f32, Expand); setFP16OperationAction(ISD::FEXP2, MVT::f16, Legal, Promote); setFP16OperationAction(ISD::FEXP2, MVT::v2f16, Legal, Expand); setBF16OperationAction(ISD::FEXP2, MVT::bf16, Legal, Promote); @@ -1010,7 +1027,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction(ISD::FLOG2, MVT::f32, Legal); setOperationPromotedToType(ISD::FLOG2, MVT::f16, MVT::f32); setOperationPromotedToType(ISD::FLOG2, MVT::bf16, MVT::f32); - setOperationAction(ISD::FLOG2, {MVT::v2f16, MVT::v2bf16}, Expand); + setOperationAction(ISD::FLOG2, {MVT::v2f16, MVT::v2bf16, MVT::v2f32}, + Expand); } setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom); @@ -2139,7 +2157,7 @@ SDValue NVPTXTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const { SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op->getValueType(0); - if (!(Isv2x16VT(VT) || VT == MVT::v4i8)) + if (!isPackedVectorTy(VT) || !VT.is32BitVector()) return Op; SDLoc DL(Op); @@ -2189,15 +2207,10 @@ SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op, Value = Value.trunc(8); return Value.zext(32); }; - APInt Value; - if (Isv2x16VT(VT)) { - Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(16); - } else if (VT == MVT::v4i8) { - 
Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(8) | - GetOperand(Op, 2).shl(16) | GetOperand(Op, 3).shl(24); - } else { - llvm_unreachable("Unsupported type"); - } + + APInt Value(32, 0); + for (unsigned I = 0, E = VT.getVectorNumElements(), S = 32 / E; I != E; ++I) + Value |= GetOperand(Op, I).shl(I * S); SDValue Const = DAG.getConstant(Value, DL, MVT::i32); return DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), Const); } @@ -2225,7 +2238,8 @@ SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, return Op; // Extract individual elements and select one of them. - assert(Isv2x16VT(VectorVT) && "Unexpected vector type."); + assert(isPackedVectorTy(VectorVT) && VectorVT.is32BitVector() && + "Unexpected vector type."); EVT EltVT = VectorVT.getVectorElementType(); SDLoc dl(Op.getNode()); @@ -3136,22 +3150,38 @@ SDValue NVPTXTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { MachinePointerInfo(SV)); } +static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, + SmallVectorImpl &Results, + const NVPTXSubtarget &STI); + SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { if (Op.getValueType() == MVT::i1) return LowerLOADi1(Op, DAG); - // v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on legalizer to handle - // unaligned loads and have to handle it here. EVT VT = Op.getValueType(); - if (Isv2x16VT(VT) || VT == MVT::v4i8) { - LoadSDNode *Load = cast(Op); - EVT MemVT = Load->getMemoryVT(); - if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), - MemVT, *Load->getMemOperand())) { - SDValue Ops[2]; - std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG); - return DAG.getMergeValues(Ops, SDLoc(Op)); - } + if (!isPackedVectorTy(VT)) + return SDValue(); + + // v2f32/v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on legalizer to + // handle unaligned loads and have to handle it here. 
+ LoadSDNode *Load = cast(Op); + EVT MemVT = Load->getMemoryVT(); + if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), + MemVT, *Load->getMemOperand())) { + SDValue Ops[2]; + std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG); + return DAG.getMergeValues(Ops, SDLoc(Op)); + } + + if (VT == MVT::v2f32) { + // If we have instructions accessing f32 elements, it's better to not pack + // them. Lower the load now as `f32,f32,ch = LoadV2` (ld.v2.f32), rather + // than waiting until ISel when it'll be lowered as ld.b64. + SmallVector Results; + ReplaceLoadVector(Op.getNode(), DAG, Results, STI); + if (!Results.empty()) + // if we succeeded, return it + return DAG.getMergeValues(Results, SDLoc(Op)); } return SDValue(); @@ -3187,17 +3217,19 @@ SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { if (VT == MVT::i1) return LowerSTOREi1(Op, DAG); - // v2f16 is legal, so we can't rely on legalizer to handle unaligned - // stores and have to handle it here. - if ((Isv2x16VT(VT) || VT == MVT::v4i8) && + // v2f32/v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on legalizer to + // handle unaligned stores and have to handle it here. + if (isPackedVectorTy(VT) && !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), VT, *Store->getMemOperand())) return expandUnalignedStore(Store, DAG); - // v2f16, v2bf16 and v2i16 don't need special handling. - if (Isv2x16VT(VT) || VT == MVT::v4i8) + // v2f16/v2bf16/v2i16 don't need special handling. + if (isPackedVectorTy(VT) && VT.is32BitVector()) return SDValue(); + // Lower store of any other vector type, including v2f32 as we want to break + // it apart since this is not a widely-supported type. 
 return LowerSTOREVector(Op, DAG); } @@ -3476,7 +3508,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( return EltVT; }(); - const EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts); + EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts); SDValue VecAddr = DAG.getObjectPtrOffset( dl, ArgSymbol, TypeSize::getFixed(Offsets[I])); @@ -3512,7 +3544,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( } else if (ExpactedVT.bitsLT(Elt.getValueType())) { Elt = DAG.getNode(ISD::TRUNCATE, dl, ExpactedVT, Elt); } else { - // v2f16 was loaded as an i32. Now we must bitcast it back. + // v2f16 was loaded as an i32. Now we must bitcast it back. Elt = DAG.getBitcast(EltVT, Elt); } InVals.push_back(Elt); @@ -5529,10 +5561,10 @@ static SDValue PerformEXTRACTCombine(SDNode *N, IsPTXVectorType(VectorVT.getSimpleVT())) return SDValue(); // Native vector loads already combine nicely w/ // extract_vector_elt. - // Don't mess with singletons or v2*16, v4i8 and v8i8 types, we already - // handle them OK. - if (VectorVT.getVectorNumElements() == 1 || Isv2x16VT(VectorVT) || - VectorVT == MVT::v4i8 || VectorVT == MVT::v8i8) + // Don't mess with singletons or packed types (v2f32, v2*16, v4i8 and v8i8), + // we already handle them OK. + if (VectorVT.getVectorNumElements() == 1 || isPackedVectorTy(VectorVT) || + VectorVT == MVT::v8i8) return SDValue(); // Don't mess with undef values as sra may be simplified to 0, not undef. 
@@ -5605,7 +5637,7 @@ static SDValue PerformVSELECTCombine(SDNode *N, static SDValue PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { auto VT = N->getValueType(0); - if (!DCI.isAfterLegalizeDAG() || !Isv2x16VT(VT)) + if (!DCI.isAfterLegalizeDAG() || !isPackedVectorTy(VT) || !VT.is32BitVector()) return SDValue(); auto Op0 = N->getOperand(0); diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 5dbdce52f0553..405fed29b66d3 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -151,6 +151,7 @@ def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">; def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">; def hasDotInstructions : Predicate<"Subtarget->hasDotInstructions()">; def hasTcgen05Instructions : Predicate<"Subtarget->hasTcgen05Instructions()">; +def hasF32x2Instructions : Predicate<"Subtarget->hasF32x2Instructions()">; def True : Predicate<"true">; @@ -185,6 +186,7 @@ class ValueToRegClass { !eq(name, "bf16"): Int16Regs, !eq(name, "v2bf16"): Int32Regs, !eq(name, "f32"): Float32Regs, + !eq(name, "v2f32"): Int64Regs, !eq(name, "f64"): Float64Regs, !eq(name, "ai32"): Int32ArgRegs, !eq(name, "ai64"): Int64ArgRegs, @@ -231,6 +233,7 @@ def BF16RT : RegTyInfo; def F16X2RT : RegTyInfo; def BF16X2RT : RegTyInfo; +def F32X2RT : RegTyInfo; // This class provides a basic wrapper around an NVPTXInst that abstracts the @@ -462,6 +465,18 @@ multiclass F3 { [(set f16:$dst, (op_pat f16:$a, f16:$b))]>, Requires<[useFP16Math]>; + def f32x2rr_ftz : + BasicNVPTXInst<(outs Int64Regs:$dst), + (ins Int64Regs:$a, Int64Regs:$b), + op_str # ".ftz.f32x2", + [(set v2f32:$dst, (op_pat v2f32:$a, v2f32:$b))]>, + Requires<[hasF32x2Instructions, doF32FTZ]>; + def f32x2rr : + BasicNVPTXInst<(outs Int64Regs:$dst), + (ins Int64Regs:$a, Int64Regs:$b), + op_str # ".f32x2", + [(set v2f32:$dst, (op_pat v2f32:$a, v2f32:$b))]>, + Requires<[hasF32x2Instructions]>; def f16x2rr_ftz : 
BasicNVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), @@ -840,6 +855,9 @@ def : Pat<(vt (select i1:$p, vt:$a, vt:$b)), (SELP_b32rr $a, $b, $p)>; } +def : Pat<(v2f32 (select i1:$p, v2f32:$a, v2f32:$b)), + (SELP_b64rr $a, $b, $p)>; + //----------------------------------- // Test Instructions //----------------------------------- @@ -1368,6 +1386,8 @@ defm BFMA16 : FMA<"fma.rn.bf16", BF16RT, [hasBF16Math]>; defm BFMA16x2 : FMA<"fma.rn.bf16x2", BF16X2RT, [hasBF16Math]>; defm FMA32_ftz : FMA<"fma.rn.ftz.f32", F32RT, [doF32FTZ]>; defm FMA32 : FMA<"fma.rn.f32", F32RT>; +defm FMA32x2_ftz : FMA<"fma.rn.ftz.f32x2", F32X2RT, [hasF32x2Instructions, doF32FTZ]>; +defm FMA32x2 : FMA<"fma.rn.f32x2", F32X2RT, [hasF32x2Instructions]>; defm FMA64 : FMA<"fma.rn.f64", F64RT>; // sin/cos @@ -2714,6 +2734,7 @@ def : Pat<(i32 (trunc (sra i64:$s, (i32 32)))), (I64toI32H $s)>; def: Pat<(i32 (sext (extractelt v2i16:$src, 0))), (CVT_INREG_s32_s16 $src)>; +// Handle extracting one element from the pair (32-bit types) foreach vt = [v2f16, v2bf16, v2i16] in { def : Pat<(extractelt vt:$src, 0), (I32toI16L_Sink $src)>, Requires<[hasPTX<71>]>; def : Pat<(extractelt vt:$src, 1), (I32toI16H_Sink $src)>, Requires<[hasPTX<71>]>; @@ -2725,10 +2746,21 @@ foreach vt = [v2f16, v2bf16, v2i16] in { (V2I16toI32 $a, $b)>; } +// Same thing for the 64-bit type v2f32. 
+foreach vt = [v2f32] in { + def : Pat<(extractelt vt:$src, 0), (I64toI32L_Sink $src)>, Requires<[hasPTX<71>]>; + def : Pat<(extractelt vt:$src, 1), (I64toI32H_Sink $src)>, Requires<[hasPTX<71>]>; + + def : Pat<(extractelt vt:$src, 0), (I64toI32L $src)>; + def : Pat<(extractelt vt:$src, 1), (I64toI32H $src)>; + + def : Pat<(vt (build_vector vt.ElementType:$a, vt.ElementType:$b)), + (V2I32toI64 $a, $b)>; +} + def: Pat<(v2i16 (scalar_to_vector i16:$a)), (CVT_u32_u16 $a, CvtNONE)>; - def nvptx_build_vector : SDNode<"NVPTXISD::BUILD_VECTOR", SDTypeProfile<1, 2, []>, []>; def : Pat<(i64 (nvptx_build_vector i32:$a, i32:$b)), diff --git a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td index 2eea9e9721cdf..0d364be29b9ec 100644 --- a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td @@ -60,7 +60,9 @@ def Int16Regs : NVPTXRegClass<[i16, f16, bf16], 16, (add (sequence "RS%u", 0, 4) def Int32Regs : NVPTXRegClass<[i32, v2f16, v2bf16, v2i16, v4i8, f32], 32, (add (sequence "R%u", 0, 4), VRFrame32, VRFrameLocal32)>; -def Int64Regs : NVPTXRegClass<[i64, f64], 64, (add (sequence "RL%u", 0, 4), VRFrame64, VRFrameLocal64)>; +def Int64Regs : NVPTXRegClass<[i64, v2f32, f64], 64, + (add (sequence "RL%u", 0, 4), + VRFrame64, VRFrameLocal64)>; // 128-bit regs are not defined as general regs in NVPTX. They are used for inlineASM only. 
def Int128Regs : NVPTXRegClass<[i128], 128, (add (sequence "RQ%u", 0, 4))>; diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index d2eae48826829..004a566d0de9a 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -116,6 +116,10 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo { return HasTcgen05 && PTXVersion >= 86; } + // f32x2 instructions in Blackwell family + bool hasF32x2Instructions() const { + return SmVersion >= 100 && PTXVersion >= 86; + } // TMA G2S copy with cta_group::1/2 support bool hasCpAsyncBulkTensorCTAGroupSupport() const { diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h index aa7850acbd64a..b901138a87170 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h @@ -129,8 +129,9 @@ class NVPTXTTIImpl final : public BasicTTIImplBase { Insert = false; } } - if (Insert && Isv2x16VT(VT)) { - // Can be built in a single mov + if (Insert && isPackedVectorTy(VT) && VT.is32BitVector()) { + // Can be built in a single 32-bit mov (64-bit regs are emulated in SASS + // with 2x 32-bit regs) Cost += 1; Insert = false; } diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h index e792e441e49e6..103e67061b806 100644 --- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h +++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h @@ -85,8 +85,14 @@ inline unsigned promoteScalarArgumentSize(unsigned size) { bool shouldEmitPTXNoReturn(const Value *V, const TargetMachine &TM); -inline bool Isv2x16VT(EVT VT) { - return (VT == MVT::v2f16 || VT == MVT::v2bf16 || VT == MVT::v2i16); +inline bool isPackedVectorTy(EVT VT) { + return (VT == MVT::v4i8 || VT == MVT::v2f16 || VT == MVT::v2bf16 || + VT == MVT::v2i16 || VT == MVT::v2f32); +} + +inline bool isPackedElementTy(EVT VT) { + return (VT == MVT::i8 || VT == MVT::f16 || VT == MVT::bf16 || + 
VT == MVT::i16 || VT == MVT::f32); } inline bool shouldPassAsArray(Type *Ty) { diff --git a/llvm/test/CodeGen/NVPTX/aggregate-return.ll b/llvm/test/CodeGen/NVPTX/aggregate-return.ll index 1c8f019922e37..f17cb5bf49693 100644 --- a/llvm/test/CodeGen/NVPTX/aggregate-return.ll +++ b/llvm/test/CodeGen/NVPTX/aggregate-return.ll @@ -10,7 +10,8 @@ define void @test_v2f32(<2 x float> %input, ptr %output) { ; CHECK-LABEL: @test_v2f32 %call = tail call <2 x float> @barv(<2 x float> %input) ; CHECK: .param .align 8 .b8 retval0[8]; -; CHECK: ld.param.v2.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [retval0]; +; CHECK: ld.param.b64 [[E0_1:%rd[0-9]+]], [retval0]; +; CHECK: mov.b64 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [[E0_1]] store <2 x float> %call, ptr %output, align 8 ; CHECK: st.v2.b32 [{{%rd[0-9]+}}], {[[E0]], [[E1]]} ret void @@ -27,9 +28,7 @@ define void @test_v3f32(<3 x float> %input, ptr %output) { ; CHECK-NOT: ld.param.b32 [[E3:%r[0-9]+]], [retval0+12]; store <3 x float> %call, ptr %output, align 8 ; CHECK-DAG: st.b32 [{{%rd[0-9]}}+8], -; -- This is suboptimal. We should do st.v2.f32 instead -; of combining 2xf32 info i64. 
-; CHECK-DAG: st.b64 [{{%rd[0-9]}}], +; CHECK-DAG: st.v2.b32 [{{%rd[0-9]}}], ; CHECK: ret; ret void } diff --git a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll index 32225ed04e2d9..78fc727c43767 100644 --- a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll @@ -707,108 +707,124 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 { ; SM70: { ; SM70-NEXT: .reg .b16 %rs<9>; ; SM70-NEXT: .reg .b32 %r<21>; -; SM70-NEXT: .reg .b64 %rd<2>; +; SM70-NEXT: .reg .b64 %rd<6>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0]; ; SM70-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; -; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r1; -; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2; -; SM70-NEXT: mov.b32 {%rs5, %rs6}, %r3; -; SM70-NEXT: mov.b32 {%rs7, %rs8}, %r4; -; SM70-NEXT: cvt.u32.u16 %r5, %rs8; +; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r4; +; SM70-NEXT: cvt.u32.u16 %r5, %rs2; ; SM70-NEXT: shl.b32 %r6, %r5, 16; -; SM70-NEXT: cvt.u32.u16 %r7, %rs7; +; SM70-NEXT: cvt.u32.u16 %r7, %rs1; ; SM70-NEXT: shl.b32 %r8, %r7, 16; -; SM70-NEXT: cvt.u32.u16 %r9, %rs6; +; SM70-NEXT: mov.b64 %rd2, {%r8, %r6}; +; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r3; +; SM70-NEXT: cvt.u32.u16 %r9, %rs4; ; SM70-NEXT: shl.b32 %r10, %r9, 16; -; SM70-NEXT: cvt.u32.u16 %r11, %rs5; +; SM70-NEXT: cvt.u32.u16 %r11, %rs3; ; SM70-NEXT: shl.b32 %r12, %r11, 16; -; SM70-NEXT: cvt.u32.u16 %r13, %rs4; +; SM70-NEXT: mov.b64 %rd3, {%r12, %r10}; +; SM70-NEXT: mov.b32 {%rs5, %rs6}, %r2; +; SM70-NEXT: cvt.u32.u16 %r13, %rs6; ; SM70-NEXT: shl.b32 %r14, %r13, 16; -; SM70-NEXT: cvt.u32.u16 %r15, %rs3; +; SM70-NEXT: cvt.u32.u16 %r15, %rs5; ; SM70-NEXT: shl.b32 %r16, %r15, 16; -; SM70-NEXT: cvt.u32.u16 %r17, %rs2; +; SM70-NEXT: mov.b64 %rd4, {%r16, %r14}; +; SM70-NEXT: mov.b32 {%rs7, %rs8}, %r1; +; SM70-NEXT: cvt.u32.u16 %r17, %rs8; ; SM70-NEXT: shl.b32 %r18, %r17, 16; -; SM70-NEXT: 
cvt.u32.u16 %r19, %rs1; +; SM70-NEXT: cvt.u32.u16 %r19, %rs7; ; SM70-NEXT: shl.b32 %r20, %r19, 16; -; SM70-NEXT: st.param.v4.b32 [func_retval0], {%r20, %r18, %r16, %r14}; -; SM70-NEXT: st.param.v4.b32 [func_retval0+16], {%r12, %r10, %r8, %r6}; +; SM70-NEXT: mov.b64 %rd5, {%r20, %r18}; +; SM70-NEXT: st.param.v2.b64 [func_retval0], {%rd5, %rd4}; +; SM70-NEXT: st.param.v2.b64 [func_retval0+16], {%rd3, %rd2}; ; SM70-NEXT: ret; ; ; SM80-LABEL: test_extload_bf16x8( ; SM80: { ; SM80-NEXT: .reg .b16 %rs<9>; ; SM80-NEXT: .reg .b32 %r<13>; -; SM80-NEXT: .reg .b64 %rd<2>; +; SM80-NEXT: .reg .b64 %rd<6>; ; SM80-EMPTY: ; SM80-NEXT: // %bb.0: ; SM80-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0]; ; SM80-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; -; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r1; -; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r2; -; SM80-NEXT: mov.b32 {%rs5, %rs6}, %r3; -; SM80-NEXT: mov.b32 {%rs7, %rs8}, %r4; -; SM80-NEXT: cvt.f32.bf16 %r5, %rs8; -; SM80-NEXT: cvt.f32.bf16 %r6, %rs7; -; SM80-NEXT: cvt.f32.bf16 %r7, %rs6; -; SM80-NEXT: cvt.f32.bf16 %r8, %rs5; -; SM80-NEXT: cvt.f32.bf16 %r9, %rs4; -; SM80-NEXT: cvt.f32.bf16 %r10, %rs3; -; SM80-NEXT: cvt.f32.bf16 %r11, %rs2; -; SM80-NEXT: cvt.f32.bf16 %r12, %rs1; -; SM80-NEXT: st.param.v4.b32 [func_retval0], {%r12, %r11, %r10, %r9}; -; SM80-NEXT: st.param.v4.b32 [func_retval0+16], {%r8, %r7, %r6, %r5}; +; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r4; +; SM80-NEXT: cvt.f32.bf16 %r5, %rs2; +; SM80-NEXT: cvt.f32.bf16 %r6, %rs1; +; SM80-NEXT: mov.b64 %rd2, {%r6, %r5}; +; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r3; +; SM80-NEXT: cvt.f32.bf16 %r7, %rs4; +; SM80-NEXT: cvt.f32.bf16 %r8, %rs3; +; SM80-NEXT: mov.b64 %rd3, {%r8, %r7}; +; SM80-NEXT: mov.b32 {%rs5, %rs6}, %r2; +; SM80-NEXT: cvt.f32.bf16 %r9, %rs6; +; SM80-NEXT: cvt.f32.bf16 %r10, %rs5; +; SM80-NEXT: mov.b64 %rd4, {%r10, %r9}; +; SM80-NEXT: mov.b32 {%rs7, %rs8}, %r1; +; SM80-NEXT: cvt.f32.bf16 %r11, %rs8; +; SM80-NEXT: cvt.f32.bf16 %r12, %rs7; +; SM80-NEXT: mov.b64 %rd5, {%r12, 
%r11}; +; SM80-NEXT: st.param.v2.b64 [func_retval0], {%rd5, %rd4}; +; SM80-NEXT: st.param.v2.b64 [func_retval0+16], {%rd3, %rd2}; ; SM80-NEXT: ret; ; ; SM80-FTZ-LABEL: test_extload_bf16x8( ; SM80-FTZ: { ; SM80-FTZ-NEXT: .reg .b16 %rs<9>; ; SM80-FTZ-NEXT: .reg .b32 %r<13>; -; SM80-FTZ-NEXT: .reg .b64 %rd<2>; +; SM80-FTZ-NEXT: .reg .b64 %rd<6>; ; SM80-FTZ-EMPTY: ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0]; ; SM80-FTZ-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; -; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r1; -; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r2; -; SM80-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r3; -; SM80-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r4; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs8; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r6, %rs7; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r7, %rs6; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r8, %rs5; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r9, %rs4; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r10, %rs3; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r11, %rs2; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r12, %rs1; -; SM80-FTZ-NEXT: st.param.v4.b32 [func_retval0], {%r12, %r11, %r10, %r9}; -; SM80-FTZ-NEXT: st.param.v4.b32 [func_retval0+16], {%r8, %r7, %r6, %r5}; +; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r4; +; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2; +; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r6, %rs1; +; SM80-FTZ-NEXT: mov.b64 %rd2, {%r6, %r5}; +; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r3; +; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r7, %rs4; +; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r8, %rs3; +; SM80-FTZ-NEXT: mov.b64 %rd3, {%r8, %r7}; +; SM80-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r2; +; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r9, %rs6; +; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r10, %rs5; +; SM80-FTZ-NEXT: mov.b64 %rd4, {%r10, %r9}; +; SM80-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r1; +; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r11, %rs8; +; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r12, %rs7; +; SM80-FTZ-NEXT: mov.b64 %rd5, {%r12, %r11}; +; SM80-FTZ-NEXT: st.param.v2.b64 
[func_retval0], {%rd5, %rd4}; +; SM80-FTZ-NEXT: st.param.v2.b64 [func_retval0+16], {%rd3, %rd2}; ; SM80-FTZ-NEXT: ret; ; ; SM90-LABEL: test_extload_bf16x8( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<9>; ; SM90-NEXT: .reg .b32 %r<13>; -; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %rd<6>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0]; ; SM90-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; -; SM90-NEXT: mov.b32 {%rs1, %rs2}, %r1; -; SM90-NEXT: mov.b32 {%rs3, %rs4}, %r2; -; SM90-NEXT: mov.b32 {%rs5, %rs6}, %r3; -; SM90-NEXT: mov.b32 {%rs7, %rs8}, %r4; -; SM90-NEXT: cvt.f32.bf16 %r5, %rs8; -; SM90-NEXT: cvt.f32.bf16 %r6, %rs7; -; SM90-NEXT: cvt.f32.bf16 %r7, %rs6; -; SM90-NEXT: cvt.f32.bf16 %r8, %rs5; -; SM90-NEXT: cvt.f32.bf16 %r9, %rs4; -; SM90-NEXT: cvt.f32.bf16 %r10, %rs3; -; SM90-NEXT: cvt.f32.bf16 %r11, %rs2; -; SM90-NEXT: cvt.f32.bf16 %r12, %rs1; -; SM90-NEXT: st.param.v4.b32 [func_retval0], {%r12, %r11, %r10, %r9}; -; SM90-NEXT: st.param.v4.b32 [func_retval0+16], {%r8, %r7, %r6, %r5}; +; SM90-NEXT: mov.b32 {%rs1, %rs2}, %r4; +; SM90-NEXT: cvt.f32.bf16 %r5, %rs2; +; SM90-NEXT: cvt.f32.bf16 %r6, %rs1; +; SM90-NEXT: mov.b64 %rd2, {%r6, %r5}; +; SM90-NEXT: mov.b32 {%rs3, %rs4}, %r3; +; SM90-NEXT: cvt.f32.bf16 %r7, %rs4; +; SM90-NEXT: cvt.f32.bf16 %r8, %rs3; +; SM90-NEXT: mov.b64 %rd3, {%r8, %r7}; +; SM90-NEXT: mov.b32 {%rs5, %rs6}, %r2; +; SM90-NEXT: cvt.f32.bf16 %r9, %rs6; +; SM90-NEXT: cvt.f32.bf16 %r10, %rs5; +; SM90-NEXT: mov.b64 %rd4, {%r10, %r9}; +; SM90-NEXT: mov.b32 {%rs7, %rs8}, %r1; +; SM90-NEXT: cvt.f32.bf16 %r11, %rs8; +; SM90-NEXT: cvt.f32.bf16 %r12, %rs7; +; SM90-NEXT: mov.b64 %rd5, {%r12, %r11}; +; SM90-NEXT: st.param.v2.b64 [func_retval0], {%rd5, %rd4}; +; SM90-NEXT: st.param.v2.b64 [func_retval0+16], {%rd3, %rd2}; ; SM90-NEXT: ret; %load = load <8 x bfloat>, ptr addrspace(3) %arg, align 16 %res = fpext <8 x bfloat> %load to <8 x float> diff --git 
a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll index ec993aa15a85a..9f8ba7ea3939d 100644 --- a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll @@ -310,39 +310,43 @@ define <2 x float> @test_select_cc_f32_bf16(<2 x float> %a, <2 x float> %b, ; SM80-NEXT: .reg .pred %p<3>; ; SM80-NEXT: .reg .b16 %rs<5>; ; SM80-NEXT: .reg .b32 %r<13>; +; SM80-NEXT: .reg .b64 %rd<2>; ; SM80-EMPTY: ; SM80-NEXT: // %bb.0: -; SM80-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_f32_bf16_param_0]; -; SM80-NEXT: ld.param.b32 %r3, [test_select_cc_f32_bf16_param_2]; -; SM80-NEXT: ld.param.b32 %r4, [test_select_cc_f32_bf16_param_3]; -; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r4; -; SM80-NEXT: cvt.f32.bf16 %r5, %rs1; -; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r3; -; SM80-NEXT: cvt.f32.bf16 %r6, %rs3; -; SM80-NEXT: setp.neu.f32 %p1, %r6, %r5; -; SM80-NEXT: cvt.f32.bf16 %r7, %rs2; -; SM80-NEXT: cvt.f32.bf16 %r8, %rs4; -; SM80-NEXT: setp.neu.f32 %p2, %r8, %r7; -; SM80-NEXT: ld.param.v2.b32 {%r9, %r10}, [test_select_cc_f32_bf16_param_1]; -; SM80-NEXT: selp.f32 %r11, %r2, %r10, %p2; -; SM80-NEXT: selp.f32 %r12, %r1, %r9, %p1; -; SM80-NEXT: st.param.v2.b32 [func_retval0], {%r12, %r11}; +; SM80-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_f32_bf16_param_1]; +; SM80-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f32_bf16_param_0]; +; SM80-NEXT: ld.param.b32 %r5, [test_select_cc_f32_bf16_param_2]; +; SM80-NEXT: ld.param.b32 %r6, [test_select_cc_f32_bf16_param_3]; +; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r6; +; SM80-NEXT: cvt.f32.bf16 %r7, %rs1; +; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r5; +; SM80-NEXT: cvt.f32.bf16 %r8, %rs3; +; SM80-NEXT: setp.neu.f32 %p1, %r8, %r7; +; SM80-NEXT: cvt.f32.bf16 %r9, %rs2; +; SM80-NEXT: cvt.f32.bf16 %r10, %rs4; +; SM80-NEXT: setp.neu.f32 %p2, %r10, %r9; +; SM80-NEXT: selp.f32 %r11, %r4, %r2, %p2; +; SM80-NEXT: selp.f32 %r12, %r3, %r1, %p1; +; SM80-NEXT: mov.b64 %rd1, {%r12, %r11}; 
+; SM80-NEXT: st.param.b64 [func_retval0], %rd1; ; SM80-NEXT: ret; ; ; SM90-LABEL: test_select_cc_f32_bf16( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b32 %r<9>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_f32_bf16_param_0]; -; SM90-NEXT: ld.param.b32 %r3, [test_select_cc_f32_bf16_param_3]; -; SM90-NEXT: ld.param.b32 %r4, [test_select_cc_f32_bf16_param_2]; -; SM90-NEXT: setp.neu.bf16x2 %p1|%p2, %r4, %r3; -; SM90-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_select_cc_f32_bf16_param_1]; -; SM90-NEXT: selp.f32 %r7, %r2, %r6, %p2; -; SM90-NEXT: selp.f32 %r8, %r1, %r5, %p1; -; SM90-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r7}; +; SM90-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_f32_bf16_param_1]; +; SM90-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f32_bf16_param_0]; +; SM90-NEXT: ld.param.b32 %r5, [test_select_cc_f32_bf16_param_3]; +; SM90-NEXT: ld.param.b32 %r6, [test_select_cc_f32_bf16_param_2]; +; SM90-NEXT: setp.neu.bf16x2 %p1|%p2, %r6, %r5; +; SM90-NEXT: selp.f32 %r7, %r4, %r2, %p2; +; SM90-NEXT: selp.f32 %r8, %r3, %r1, %p1; +; SM90-NEXT: mov.b64 %rd1, {%r8, %r7}; +; SM90-NEXT: st.param.b64 [func_retval0], %rd1; ; SM90-NEXT: ret; <2 x bfloat> %c, <2 x bfloat> %d) #0 { %cc = fcmp une <2 x bfloat> %c, %d @@ -360,10 +364,10 @@ define <2 x bfloat> @test_select_cc_bf16_f32(<2 x bfloat> %a, <2 x bfloat> %b, ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b32 %r1, [test_select_cc_bf16_f32_param_0]; ; CHECK-NEXT: ld.param.b32 %r2, [test_select_cc_bf16_f32_param_1]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_bf16_f32_param_2]; -; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_select_cc_bf16_f32_param_3]; -; CHECK-NEXT: setp.neu.f32 %p1, %r3, %r5; -; CHECK-NEXT: setp.neu.f32 %p2, %r4, %r6; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_bf16_f32_param_3]; +; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_select_cc_bf16_f32_param_2]; +; 
CHECK-NEXT: setp.neu.f32 %p1, %r5, %r3; +; CHECK-NEXT: setp.neu.f32 %p2, %r6, %r4; ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2; ; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1; ; CHECK-NEXT: selp.b16 %rs5, %rs4, %rs2, %p2; @@ -396,13 +400,15 @@ define <2 x float> @test_fpext_2xfloat(<2 x bfloat> %a) #0 { ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<3>; ; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b32 %r1, [test_fpext_2xfloat_param_0]; ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: cvt.f32.bf16 %r2, %rs2; ; CHECK-NEXT: cvt.f32.bf16 %r3, %rs1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r3, %r2}; +; CHECK-NEXT: mov.b64 %rd1, {%r3, %r2}; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd1; ; CHECK-NEXT: ret; %r = fpext <2 x bfloat> %a to <2 x float> ret <2 x float> %r diff --git a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll index 8c89f82dbf9c1..28652ce9d3c98 100644 --- a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll @@ -614,16 +614,20 @@ define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b, ; CHECK-F16: { ; CHECK-F16-NEXT: .reg .pred %p<3>; ; CHECK-F16-NEXT: .reg .b32 %r<9>; +; CHECK-F16-NEXT: .reg .b64 %rd<4>; ; CHECK-F16-EMPTY: ; CHECK-F16-NEXT: // %bb.0: ; CHECK-F16-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f32_f16_param_1]; -; CHECK-F16-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_f32_f16_param_0]; -; CHECK-F16-NEXT: ld.param.b32 %r6, [test_select_cc_f32_f16_param_3]; -; CHECK-F16-NEXT: ld.param.b32 %r5, [test_select_cc_f32_f16_param_2]; -; CHECK-F16-NEXT: setp.neu.f16x2 %p1|%p2, %r5, %r6; -; CHECK-F16-NEXT: selp.f32 %r7, %r2, %r4, %p2; -; CHECK-F16-NEXT: selp.f32 %r8, %r1, %r3, %p1; -; CHECK-F16-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r7}; +; CHECK-F16-NEXT: mov.b64 %rd2, {%r3, %r4}; +; CHECK-F16-NEXT: ld.param.v2.b32 {%r5, %r6}, 
[test_select_cc_f32_f16_param_0]; +; CHECK-F16-NEXT: mov.b64 %rd1, {%r5, %r6}; +; CHECK-F16-NEXT: ld.param.b32 %r2, [test_select_cc_f32_f16_param_3]; +; CHECK-F16-NEXT: ld.param.b32 %r1, [test_select_cc_f32_f16_param_2]; +; CHECK-F16-NEXT: setp.neu.f16x2 %p1|%p2, %r1, %r2; +; CHECK-F16-NEXT: selp.f32 %r7, %r6, %r4, %p2; +; CHECK-F16-NEXT: selp.f32 %r8, %r5, %r3, %p1; +; CHECK-F16-NEXT: mov.b64 %rd3, {%r8, %r7}; +; CHECK-F16-NEXT: st.param.b64 [func_retval0], %rd3; ; CHECK-F16-NEXT: ret; ; ; CHECK-NOF16-LABEL: test_select_cc_f32_f16( @@ -631,23 +635,27 @@ define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b, ; CHECK-NOF16-NEXT: .reg .pred %p<3>; ; CHECK-NOF16-NEXT: .reg .b16 %rs<5>; ; CHECK-NOF16-NEXT: .reg .b32 %r<13>; +; CHECK-NOF16-NEXT: .reg .b64 %rd<4>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: ; CHECK-NOF16-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f32_f16_param_1]; -; CHECK-NOF16-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_f32_f16_param_0]; -; CHECK-NOF16-NEXT: ld.param.b32 %r6, [test_select_cc_f32_f16_param_3]; -; CHECK-NOF16-NEXT: ld.param.b32 %r5, [test_select_cc_f32_f16_param_2]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r6; +; CHECK-NOF16-NEXT: mov.b64 %rd2, {%r3, %r4}; +; CHECK-NOF16-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_select_cc_f32_f16_param_0]; +; CHECK-NOF16-NEXT: mov.b64 %rd1, {%r5, %r6}; +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_select_cc_f32_f16_param_3]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_select_cc_f32_f16_param_2]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; ; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs1; -; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r5; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; ; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs3; ; CHECK-NOF16-NEXT: setp.neu.f32 %p1, %r8, %r7; ; CHECK-NOF16-NEXT: cvt.f32.f16 %r9, %rs2; ; CHECK-NOF16-NEXT: cvt.f32.f16 %r10, %rs4; ; CHECK-NOF16-NEXT: setp.neu.f32 %p2, %r10, %r9; -; CHECK-NOF16-NEXT: selp.f32 %r11, %r2, %r4, %p2; -; 
CHECK-NOF16-NEXT: selp.f32 %r12, %r1, %r3, %p1; -; CHECK-NOF16-NEXT: st.param.v2.b32 [func_retval0], {%r12, %r11}; +; CHECK-NOF16-NEXT: selp.f32 %r11, %r6, %r4, %p2; +; CHECK-NOF16-NEXT: selp.f32 %r12, %r5, %r3, %p1; +; CHECK-NOF16-NEXT: mov.b64 %rd3, {%r12, %r11}; +; CHECK-NOF16-NEXT: st.param.b64 [func_retval0], %rd3; ; CHECK-NOF16-NEXT: ret; <2 x half> %c, <2 x half> %d) #0 { %cc = fcmp une <2 x half> %c, %d @@ -661,14 +669,17 @@ define <2 x half> @test_select_cc_f16_f32(<2 x half> %a, <2 x half> %b, ; CHECK-NEXT: .reg .pred %p<3>; ; CHECK-NEXT: .reg .b16 %rs<7>; ; CHECK-NEXT: .reg .b32 %r<8>; +; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_select_cc_f16_f32_param_3]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f16_f32_param_2]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f16_f32_param_3]; +; CHECK-NEXT: mov.b64 %rd2, {%r3, %r4}; +; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_select_cc_f16_f32_param_2]; +; CHECK-NEXT: mov.b64 %rd1, {%r5, %r6}; ; CHECK-NEXT: ld.param.b32 %r2, [test_select_cc_f16_f32_param_1]; ; CHECK-NEXT: ld.param.b32 %r1, [test_select_cc_f16_f32_param_0]; -; CHECK-NEXT: setp.neu.f32 %p1, %r3, %r5; -; CHECK-NEXT: setp.neu.f32 %p2, %r4, %r6; +; CHECK-NEXT: setp.neu.f32 %p1, %r5, %r3; +; CHECK-NEXT: setp.neu.f32 %p2, %r6, %r4; ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2; ; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1; ; CHECK-NEXT: selp.b16 %rs5, %rs4, %rs2, %p2; @@ -1517,9 +1528,11 @@ define <2 x half> @test_fptrunc_2xfloat(<2 x float> %a) #0 { ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<3>; ; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptrunc_2xfloat_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r1, %r2}; ; CHECK-NEXT: cvt.rn.f16.f32 %rs1, %r2; ; CHECK-NEXT: cvt.rn.f16.f32 %rs2, %r1; ; CHECK-NEXT: mov.b32 %r3, {%rs2, %rs1}; @@ -1552,13 +1565,15 @@ define <2 x 
float> @test_fpext_2xfloat(<2 x half> %a) #0 { ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<3>; ; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b32 %r1, [test_fpext_2xfloat_param_0]; ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: cvt.f32.f16 %r2, %rs2; ; CHECK-NEXT: cvt.f32.f16 %r3, %rs1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r3, %r2}; +; CHECK-NEXT: mov.b64 %rd1, {%r3, %r2}; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd1; ; CHECK-NEXT: ret; %r = fpext <2 x half> %a to <2 x float> ret <2 x float> %r @@ -1943,9 +1958,11 @@ define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 { ; CHECK-F16: { ; CHECK-F16-NEXT: .reg .b16 %rs<3>; ; CHECK-F16-NEXT: .reg .b32 %r<8>; +; CHECK-F16-NEXT: .reg .b64 %rd<2>; ; CHECK-F16-EMPTY: ; CHECK-F16-NEXT: // %bb.0: ; CHECK-F16-NEXT: ld.param.v2.b32 {%r2, %r3}, [test_copysign_f32_param_1]; +; CHECK-F16-NEXT: mov.b64 %rd1, {%r2, %r3}; ; CHECK-F16-NEXT: ld.param.b32 %r1, [test_copysign_f32_param_0]; ; CHECK-F16-NEXT: cvt.rn.f16.f32 %rs1, %r3; ; CHECK-F16-NEXT: cvt.rn.f16.f32 %rs2, %r2; @@ -1960,9 +1977,11 @@ define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 { ; CHECK-NOF16: { ; CHECK-NOF16-NEXT: .reg .b16 %rs<9>; ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; +; CHECK-NOF16-NEXT: .reg .b64 %rd<2>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: ; CHECK-NOF16-NEXT: ld.param.v2.b32 {%r2, %r3}, [test_copysign_f32_param_1]; +; CHECK-NOF16-NEXT: mov.b64 %rd1, {%r2, %r3}; ; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_copysign_f32_param_0]; ; CHECK-NOF16-NEXT: and.b32 %r4, %r3, -2147483648; ; CHECK-NOF16-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r4; } @@ -2033,6 +2052,7 @@ define <2 x float> @test_copysign_extended(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-F16: { ; CHECK-F16-NEXT: .reg .b16 %rs<3>; ; CHECK-F16-NEXT: .reg .b32 %r<8>; +; CHECK-F16-NEXT: .reg .b64 %rd<2>; ; CHECK-F16-EMPTY: ; CHECK-F16-NEXT: // %bb.0: ; 
CHECK-F16-NEXT: ld.param.b32 %r2, [test_copysign_extended_param_1]; @@ -2043,13 +2063,15 @@ define <2 x float> @test_copysign_extended(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-F16-NEXT: mov.b32 {%rs1, %rs2}, %r5; ; CHECK-F16-NEXT: cvt.f32.f16 %r6, %rs2; ; CHECK-F16-NEXT: cvt.f32.f16 %r7, %rs1; -; CHECK-F16-NEXT: st.param.v2.b32 [func_retval0], {%r7, %r6}; +; CHECK-F16-NEXT: mov.b64 %rd1, {%r7, %r6}; +; CHECK-F16-NEXT: st.param.b64 [func_retval0], %rd1; ; CHECK-F16-NEXT: ret; ; ; CHECK-NOF16-LABEL: test_copysign_extended( ; CHECK-NOF16: { ; CHECK-NOF16-NEXT: .reg .b16 %rs<11>; ; CHECK-NOF16-NEXT: .reg .b32 %r<5>; +; CHECK-NOF16-NEXT: .reg .b64 %rd<2>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: ; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_copysign_extended_param_1]; @@ -2064,7 +2086,8 @@ define <2 x float> @test_copysign_extended(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: or.b16 %rs10, %rs9, %rs8; ; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs10; ; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs7; -; CHECK-NOF16-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NOF16-NEXT: mov.b64 %rd1, {%r4, %r3}; +; CHECK-NOF16-NEXT: st.param.b64 [func_retval0], %rd1; ; CHECK-NOF16-NEXT: ret; %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %b) %xr = fpext <2 x half> %r to <2 x float> diff --git a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll new file mode 100644 index 0000000000000..3309467928d7b --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll @@ -0,0 +1,1876 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; ## Full FP32x2 support enabled by default. 
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_100 \ +; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \ +; RUN: | FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas %{ \ +; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_100 \ +; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \ +; RUN: | %ptxas-verify -arch=sm_100 \ +; RUN: %} + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "nvptx64-nvidia-cuda" + +define <2 x float> @test_ret_const() #0 { +; CHECK-LABEL: test_ret_const( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: mov.b32 %r1, 0f40000000; +; CHECK-NEXT: mov.b32 %r2, 0f3F800000; +; CHECK-NEXT: mov.b64 %rd1, {%r2, %r1}; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd1; +; CHECK-NEXT: ret; + ret <2 x float> +} + +define float @test_extract_0(<2 x float> %a) #0 { +; CHECK-LABEL: test_extract_0( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_extract_0_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r1, %r2}; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %e = extractelement <2 x float> %a, i32 0 + ret float %e +} + +define float @test_extract_1(<2 x float> %a) #0 { +; CHECK-LABEL: test_extract_1( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_extract_1_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r1, %r2}; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; + %e = extractelement <2 x float> %a, i32 1 + ret float %e +} + +; NOTE: disabled as -O3 miscompiles this into pointer arithmetic on +; test_extract_i_param_0 where the symbol's address is not taken first (that +; is, moved to a temporary) +; define float @test_extract_i(<2 x float> 
%a, i64 %idx) #0 { +; %e = extractelement <2 x float> %a, i64 %idx +; ret float %e +; } + +define <2 x float> @test_fadd(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_fadd( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fadd_param_1]; +; CHECK-NEXT: mov.b64 %rd2, {%r1, %r2}; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fadd_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r3, %r4}; +; CHECK-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; + %r = fadd <2 x float> %a, %b + ret <2 x float> %r +} + +define <2 x float> @test_fadd_imm_0(<2 x float> %a) #0 { +; CHECK-LABEL: test_fadd_imm_0( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fadd_imm_0_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r1, %r2}; +; CHECK-NEXT: mov.b32 %r3, 0f40000000; +; CHECK-NEXT: mov.b32 %r4, 0f3F800000; +; CHECK-NEXT: mov.b64 %rd2, {%r4, %r3}; +; CHECK-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; + %r = fadd <2 x float> , %a + ret <2 x float> %r +} + +define <2 x float> @test_fadd_imm_1(<2 x float> %a) #0 { +; CHECK-LABEL: test_fadd_imm_1( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fadd_imm_1_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r1, %r2}; +; CHECK-NEXT: mov.b32 %r3, 0f40000000; +; CHECK-NEXT: mov.b32 %r4, 0f3F800000; +; CHECK-NEXT: mov.b64 %rd2, {%r4, %r3}; +; CHECK-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; + %r = fadd <2 x float> %a, + ret <2 x float> %r +} + +define <4 x float> @test_fadd_v4(<4 x float> %a, <4 x float> %b) #0 { +; CHECK-LABEL: 
test_fadd_v4( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_fadd_v4_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_v4_param_0]; +; CHECK-NEXT: add.rn.f32x2 %rd5, %rd2, %rd4; +; CHECK-NEXT: add.rn.f32x2 %rd6, %rd1, %rd3; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd5}; +; CHECK-NEXT: ret; + %r = fadd <4 x float> %a, %b + ret <4 x float> %r +} + +define <4 x float> @test_fadd_imm_0_v4(<4 x float> %a) #0 { +; CHECK-LABEL: test_fadd_imm_0_v4( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_0_v4_param_0]; +; CHECK-NEXT: mov.b32 %r1, 0f40800000; +; CHECK-NEXT: mov.b32 %r2, 0f40400000; +; CHECK-NEXT: mov.b64 %rd3, {%r2, %r1}; +; CHECK-NEXT: add.rn.f32x2 %rd4, %rd2, %rd3; +; CHECK-NEXT: mov.b32 %r3, 0f40000000; +; CHECK-NEXT: mov.b32 %r4, 0f3F800000; +; CHECK-NEXT: mov.b64 %rd5, {%r4, %r3}; +; CHECK-NEXT: add.rn.f32x2 %rd6, %rd1, %rd5; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd4}; +; CHECK-NEXT: ret; + %r = fadd <4 x float> , %a + ret <4 x float> %r +} + +define <4 x float> @test_fadd_imm_1_v4(<4 x float> %a) #0 { +; CHECK-LABEL: test_fadd_imm_1_v4( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_1_v4_param_0]; +; CHECK-NEXT: mov.b32 %r1, 0f40800000; +; CHECK-NEXT: mov.b32 %r2, 0f40400000; +; CHECK-NEXT: mov.b64 %rd3, {%r2, %r1}; +; CHECK-NEXT: add.rn.f32x2 %rd4, %rd2, %rd3; +; CHECK-NEXT: mov.b32 %r3, 0f40000000; +; CHECK-NEXT: mov.b32 %r4, 0f3F800000; +; CHECK-NEXT: mov.b64 %rd5, {%r4, %r3}; +; CHECK-NEXT: add.rn.f32x2 %rd6, %rd1, %rd5; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd4}; +; CHECK-NEXT: ret; + %r = fadd <4 x float> %a, + ret <4 x float> %r +} + +define <2 x 
float> @test_fsub(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_fsub( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fsub_param_1]; +; CHECK-NEXT: mov.b64 %rd2, {%r1, %r2}; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fsub_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r3, %r4}; +; CHECK-NEXT: sub.rn.f32x2 %rd3, %rd1, %rd2; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; + %r = fsub <2 x float> %a, %b + ret <2 x float> %r +} + +define <2 x float> @test_fneg(<2 x float> %a) #0 { +; CHECK-LABEL: test_fneg( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fneg_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r1, %r2}; +; CHECK-NEXT: neg.f32 %r3, %r2; +; CHECK-NEXT: neg.f32 %r4, %r1; +; CHECK-NEXT: mov.b64 %rd2, {%r4, %r3}; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd2; +; CHECK-NEXT: ret; + %r = fneg <2 x float> %a + ret <2 x float> %r +} + +define <2 x float> @test_fmul(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_fmul( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fmul_param_1]; +; CHECK-NEXT: mov.b64 %rd2, {%r1, %r2}; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fmul_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r3, %r4}; +; CHECK-NEXT: mul.rn.f32x2 %rd3, %rd1, %rd2; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; + %r = fmul <2 x float> %a, %b + ret <2 x float> %r +} + +define <2 x float> @test_fdiv(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_fdiv( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fdiv_param_1]; +; 
CHECK-NEXT: mov.b64 %rd2, {%r1, %r2}; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fdiv_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r3, %r4}; +; CHECK-NEXT: div.rn.f32 %r5, %r4, %r2; +; CHECK-NEXT: div.rn.f32 %r6, %r3, %r1; +; CHECK-NEXT: mov.b64 %rd3, {%r6, %r5}; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; + %r = fdiv <2 x float> %a, %b + ret <2 x float> %r +} + +define <2 x float> @test_frem(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_frem( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b32 %r<15>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_frem_param_1]; +; CHECK-NEXT: mov.b64 %rd2, {%r1, %r2}; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_frem_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r3, %r4}; +; CHECK-NEXT: div.rn.f32 %r5, %r4, %r2; +; CHECK-NEXT: cvt.rzi.f32.f32 %r6, %r5; +; CHECK-NEXT: neg.f32 %r7, %r6; +; CHECK-NEXT: fma.rn.f32 %r8, %r7, %r2, %r4; +; CHECK-NEXT: testp.infinite.f32 %p1, %r2; +; CHECK-NEXT: selp.f32 %r9, %r4, %r8, %p1; +; CHECK-NEXT: div.rn.f32 %r10, %r3, %r1; +; CHECK-NEXT: cvt.rzi.f32.f32 %r11, %r10; +; CHECK-NEXT: neg.f32 %r12, %r11; +; CHECK-NEXT: fma.rn.f32 %r13, %r12, %r1, %r3; +; CHECK-NEXT: testp.infinite.f32 %p2, %r1; +; CHECK-NEXT: selp.f32 %r14, %r3, %r13, %p2; +; CHECK-NEXT: mov.b64 %rd3, {%r14, %r9}; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; + %r = frem <2 x float> %a, %b + ret <2 x float> %r +} + +define <2 x float> @test_fadd_ftz(<2 x float> %a, <2 x float> %b) #2 { +; CHECK-LABEL: test_fadd_ftz( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fadd_ftz_param_1]; +; CHECK-NEXT: mov.b64 %rd2, {%r1, %r2}; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fadd_ftz_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r3, %r4}; +; CHECK-NEXT: add.rn.ftz.f32x2 
%rd3, %rd1, %rd2; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; + %r = fadd <2 x float> %a, %b + ret <2 x float> %r +} + +define <2 x float> @test_fadd_imm_0_ftz(<2 x float> %a) #2 { +; CHECK-LABEL: test_fadd_imm_0_ftz( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fadd_imm_0_ftz_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r1, %r2}; +; CHECK-NEXT: mov.b32 %r3, 0f40000000; +; CHECK-NEXT: mov.b32 %r4, 0f3F800000; +; CHECK-NEXT: mov.b64 %rd2, {%r4, %r3}; +; CHECK-NEXT: add.rn.ftz.f32x2 %rd3, %rd1, %rd2; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; + %r = fadd <2 x float> , %a + ret <2 x float> %r +} + +define <2 x float> @test_fadd_imm_1_ftz(<2 x float> %a) #2 { +; CHECK-LABEL: test_fadd_imm_1_ftz( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fadd_imm_1_ftz_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r1, %r2}; +; CHECK-NEXT: mov.b32 %r3, 0f40000000; +; CHECK-NEXT: mov.b32 %r4, 0f3F800000; +; CHECK-NEXT: mov.b64 %rd2, {%r4, %r3}; +; CHECK-NEXT: add.rn.ftz.f32x2 %rd3, %rd1, %rd2; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; + %r = fadd <2 x float> %a, + ret <2 x float> %r +} + +define <4 x float> @test_fadd_v4_ftz(<4 x float> %a, <4 x float> %b) #2 { +; CHECK-LABEL: test_fadd_v4_ftz( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_fadd_v4_ftz_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_v4_ftz_param_0]; +; CHECK-NEXT: add.rn.ftz.f32x2 %rd5, %rd2, %rd4; +; CHECK-NEXT: add.rn.ftz.f32x2 %rd6, %rd1, %rd3; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd5}; +; CHECK-NEXT: ret; + %r = fadd <4 x float> %a, %b + ret <4 x float> %r +} + +define <4 x float> 
@test_fadd_imm_0_v4_ftz(<4 x float> %a) #2 { +; CHECK-LABEL: test_fadd_imm_0_v4_ftz( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_0_v4_ftz_param_0]; +; CHECK-NEXT: mov.b32 %r1, 0f40800000; +; CHECK-NEXT: mov.b32 %r2, 0f40400000; +; CHECK-NEXT: mov.b64 %rd3, {%r2, %r1}; +; CHECK-NEXT: add.rn.ftz.f32x2 %rd4, %rd2, %rd3; +; CHECK-NEXT: mov.b32 %r3, 0f40000000; +; CHECK-NEXT: mov.b32 %r4, 0f3F800000; +; CHECK-NEXT: mov.b64 %rd5, {%r4, %r3}; +; CHECK-NEXT: add.rn.ftz.f32x2 %rd6, %rd1, %rd5; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd4}; +; CHECK-NEXT: ret; + %r = fadd <4 x float> , %a + ret <4 x float> %r +} + +define <4 x float> @test_fadd_imm_1_v4_ftz(<4 x float> %a) #2 { +; CHECK-LABEL: test_fadd_imm_1_v4_ftz( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_1_v4_ftz_param_0]; +; CHECK-NEXT: mov.b32 %r1, 0f40800000; +; CHECK-NEXT: mov.b32 %r2, 0f40400000; +; CHECK-NEXT: mov.b64 %rd3, {%r2, %r1}; +; CHECK-NEXT: add.rn.ftz.f32x2 %rd4, %rd2, %rd3; +; CHECK-NEXT: mov.b32 %r3, 0f40000000; +; CHECK-NEXT: mov.b32 %r4, 0f3F800000; +; CHECK-NEXT: mov.b64 %rd5, {%r4, %r3}; +; CHECK-NEXT: add.rn.ftz.f32x2 %rd6, %rd1, %rd5; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd4}; +; CHECK-NEXT: ret; + %r = fadd <4 x float> %a, + ret <4 x float> %r +} + +define <2 x float> @test_fsub_ftz(<2 x float> %a, <2 x float> %b) #2 { +; CHECK-LABEL: test_fsub_ftz( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fsub_ftz_param_1]; +; CHECK-NEXT: mov.b64 %rd2, {%r1, %r2}; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fsub_ftz_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r3, %r4}; +; CHECK-NEXT: 
sub.rn.ftz.f32x2 %rd3, %rd1, %rd2; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; + %r = fsub <2 x float> %a, %b + ret <2 x float> %r +} + +define <2 x float> @test_fneg_ftz(<2 x float> %a) #2 { +; CHECK-LABEL: test_fneg_ftz( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fneg_ftz_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r1, %r2}; +; CHECK-NEXT: neg.ftz.f32 %r3, %r2; +; CHECK-NEXT: neg.ftz.f32 %r4, %r1; +; CHECK-NEXT: mov.b64 %rd2, {%r4, %r3}; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd2; +; CHECK-NEXT: ret; + %r = fneg <2 x float> %a + ret <2 x float> %r +} + +define <2 x float> @test_fmul_ftz(<2 x float> %a, <2 x float> %b) #2 { +; CHECK-LABEL: test_fmul_ftz( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fmul_ftz_param_1]; +; CHECK-NEXT: mov.b64 %rd2, {%r1, %r2}; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fmul_ftz_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r3, %r4}; +; CHECK-NEXT: mul.rn.ftz.f32x2 %rd3, %rd1, %rd2; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; + %r = fmul <2 x float> %a, %b + ret <2 x float> %r +} + +define <2 x float> @test_fma_ftz(<2 x float> %a, <2 x float> %b, <2 x float> %c) #2 { +; CHECK-LABEL: test_fma_ftz( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fma_ftz_param_2]; +; CHECK-NEXT: mov.b64 %rd3, {%r1, %r2}; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fma_ftz_param_1]; +; CHECK-NEXT: mov.b64 %rd2, {%r3, %r4}; +; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_fma_ftz_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r5, %r6}; +; CHECK-NEXT: fma.rn.ftz.f32x2 %rd4, %rd1, %rd2, %rd3; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd4; +; 
CHECK-NEXT: ret; + %r = call <2 x float> @llvm.fma(<2 x float> %a, <2 x float> %b, <2 x float> %c) + ret <2 x float> %r +} + +define <2 x float> @test_fdiv_ftz(<2 x float> %a, <2 x float> %b) #2 { +; CHECK-LABEL: test_fdiv_ftz( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fdiv_ftz_param_1]; +; CHECK-NEXT: mov.b64 %rd2, {%r1, %r2}; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fdiv_ftz_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r3, %r4}; +; CHECK-NEXT: div.rn.ftz.f32 %r5, %r4, %r2; +; CHECK-NEXT: div.rn.ftz.f32 %r6, %r3, %r1; +; CHECK-NEXT: mov.b64 %rd3, {%r6, %r5}; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; + %r = fdiv <2 x float> %a, %b + ret <2 x float> %r +} + +define <2 x float> @test_frem_ftz(<2 x float> %a, <2 x float> %b) #2 { +; CHECK-LABEL: test_frem_ftz( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b32 %r<15>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_frem_ftz_param_1]; +; CHECK-NEXT: mov.b64 %rd2, {%r1, %r2}; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_frem_ftz_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r3, %r4}; +; CHECK-NEXT: div.rn.ftz.f32 %r5, %r4, %r2; +; CHECK-NEXT: cvt.rzi.ftz.f32.f32 %r6, %r5; +; CHECK-NEXT: neg.ftz.f32 %r7, %r6; +; CHECK-NEXT: fma.rn.ftz.f32 %r8, %r7, %r2, %r4; +; CHECK-NEXT: testp.infinite.f32 %p1, %r2; +; CHECK-NEXT: selp.f32 %r9, %r4, %r8, %p1; +; CHECK-NEXT: div.rn.ftz.f32 %r10, %r3, %r1; +; CHECK-NEXT: cvt.rzi.ftz.f32.f32 %r11, %r10; +; CHECK-NEXT: neg.ftz.f32 %r12, %r11; +; CHECK-NEXT: fma.rn.ftz.f32 %r13, %r12, %r1, %r3; +; CHECK-NEXT: testp.infinite.f32 %p2, %r1; +; CHECK-NEXT: selp.f32 %r14, %r3, %r13, %p2; +; CHECK-NEXT: mov.b64 %rd3, {%r14, %r9}; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; + %r = frem <2 x float> %a, %b + ret <2 x float> %r 
+} + +define void @test_ldst_v2f32(ptr %a, ptr %b) #0 { +; CHECK-LABEL: test_ldst_v2f32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v2f32_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v2f32_param_0]; +; CHECK-NEXT: ld.v2.b32 {%r1, %r2}, [%rd1]; +; CHECK-NEXT: st.v2.b32 [%rd2], {%r1, %r2}; +; CHECK-NEXT: ret; + %t1 = load <2 x float>, ptr %a + store <2 x float> %t1, ptr %b, align 32 + ret void +} + +define void @test_ldst_v3f32(ptr %a, ptr %b) #0 { +; CHECK-LABEL: test_ldst_v3f32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v3f32_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v3f32_param_0]; +; CHECK-NEXT: ld.b64 %rd3, [%rd1]; +; CHECK-NEXT: ld.b32 %r1, [%rd1+8]; +; CHECK-NEXT: st.b32 [%rd2+8], %r1; +; CHECK-NEXT: st.b64 [%rd2], %rd3; +; CHECK-NEXT: ret; + %t1 = load <3 x float>, ptr %a + store <3 x float> %t1, ptr %b, align 32 + ret void +} + +define void @test_ldst_v4f32(ptr %a, ptr %b) #0 { +; CHECK-LABEL: test_ldst_v4f32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v4f32_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v4f32_param_0]; +; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ret; + %t1 = load <4 x float>, ptr %a + store <4 x float> %t1, ptr %b, align 32 + ret void +} + +define void @test_ldst_v8f32(ptr %a, ptr %b) #0 { +; CHECK-LABEL: test_ldst_v8f32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v8f32_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v8f32_param_0]; +; CHECK-NEXT: 
ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: ld.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; CHECK-NEXT: st.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; CHECK-NEXT: st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ret; + %t1 = load <8 x float>, ptr %a + store <8 x float> %t1, ptr %b, align 32 + ret void +} + +declare <2 x float> @test_callee(<2 x float> %a, <2 x float> %b) #0 + +define <2 x float> @test_call(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_call( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_call_param_1]; +; CHECK-NEXT: mov.b64 %rd2, {%r1, %r2}; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_call_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r3, %r4}; +; CHECK-NEXT: { // callseq 0, 0 +; CHECK-NEXT: .param .align 8 .b8 param0[8]; +; CHECK-NEXT: st.param.b64 [param0], %rd1; +; CHECK-NEXT: .param .align 8 .b8 param1[8]; +; CHECK-NEXT: st.param.b64 [param1], %rd2; +; CHECK-NEXT: .param .align 8 .b8 retval0[8]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_callee, +; CHECK-NEXT: ( +; CHECK-NEXT: param0, +; CHECK-NEXT: param1 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.b64 %rd3, [retval0]; +; CHECK-NEXT: } // callseq 0 +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; + %r = call <2 x float> @test_callee(<2 x float> %a, <2 x float> %b) + ret <2 x float> %r +} + +define <2 x float> @test_call_flipped(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_call_flipped( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_call_flipped_param_1]; +; CHECK-NEXT: mov.b64 %rd2, {%r1, %r2}; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_call_flipped_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r3, %r4}; +; CHECK-NEXT: { // callseq 1, 0 +; CHECK-NEXT: .param .align 8 .b8 param0[8]; +; 
CHECK-NEXT: st.param.b64 [param0], %rd2; +; CHECK-NEXT: .param .align 8 .b8 param1[8]; +; CHECK-NEXT: st.param.b64 [param1], %rd1; +; CHECK-NEXT: .param .align 8 .b8 retval0[8]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_callee, +; CHECK-NEXT: ( +; CHECK-NEXT: param0, +; CHECK-NEXT: param1 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.b64 %rd3, [retval0]; +; CHECK-NEXT: } // callseq 1 +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; + %r = call <2 x float> @test_callee(<2 x float> %b, <2 x float> %a) + ret <2 x float> %r +} + +define <2 x float> @test_tailcall_flipped(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_tailcall_flipped( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_tailcall_flipped_param_1]; +; CHECK-NEXT: mov.b64 %rd2, {%r1, %r2}; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_tailcall_flipped_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r3, %r4}; +; CHECK-NEXT: { // callseq 2, 0 +; CHECK-NEXT: .param .align 8 .b8 param0[8]; +; CHECK-NEXT: st.param.b64 [param0], %rd2; +; CHECK-NEXT: .param .align 8 .b8 param1[8]; +; CHECK-NEXT: st.param.b64 [param1], %rd1; +; CHECK-NEXT: .param .align 8 .b8 retval0[8]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_callee, +; CHECK-NEXT: ( +; CHECK-NEXT: param0, +; CHECK-NEXT: param1 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.b64 %rd3, [retval0]; +; CHECK-NEXT: } // callseq 2 +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; + %r = tail call <2 x float> @test_callee(<2 x float> %b, <2 x float> %a) + ret <2 x float> %r +} + +define <2 x float> @test_select(<2 x float> %a, <2 x float> %b, i1 zeroext %c) #0 { +; CHECK-LABEL: test_select( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: 
ld.param.v2.b32 {%r1, %r2}, [test_select_param_1]; +; CHECK-NEXT: mov.b64 %rd2, {%r1, %r2}; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r3, %r4}; +; CHECK-NEXT: ld.param.b8 %rs1, [test_select_param_2]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: selp.b64 %rd3, %rd1, %rd2, %p1; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; + %r = select i1 %c, <2 x float> %a, <2 x float> %b + ret <2 x float> %r +} + +define <2 x float> @test_select_cc(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x float> %d) #0 { +; CHECK-LABEL: test_select_cc( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b32 %r<11>; +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_param_3]; +; CHECK-NEXT: mov.b64 %rd4, {%r1, %r2}; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_param_2]; +; CHECK-NEXT: mov.b64 %rd3, {%r3, %r4}; +; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_select_cc_param_1]; +; CHECK-NEXT: mov.b64 %rd2, {%r5, %r6}; +; CHECK-NEXT: ld.param.v2.b32 {%r7, %r8}, [test_select_cc_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r7, %r8}; +; CHECK-NEXT: setp.neu.f32 %p1, %r3, %r1; +; CHECK-NEXT: setp.neu.f32 %p2, %r4, %r2; +; CHECK-NEXT: selp.f32 %r9, %r8, %r6, %p2; +; CHECK-NEXT: selp.f32 %r10, %r7, %r5, %p1; +; CHECK-NEXT: mov.b64 %rd5, {%r10, %r9}; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd5; +; CHECK-NEXT: ret; + %cc = fcmp une <2 x float> %c, %d + %r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b + ret <2 x float> %r +} + +define <2 x double> @test_select_cc_f64_f32(<2 x double> %a, <2 x double> %b, <2 x float> %c, <2 x float> %d) #0 { +; CHECK-LABEL: test_select_cc_f64_f32( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<9>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: 
ld.param.v2.b32 {%r1, %r2}, [test_select_cc_f64_f32_param_3]; +; CHECK-NEXT: mov.b64 %rd6, {%r1, %r2}; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f64_f32_param_2]; +; CHECK-NEXT: mov.b64 %rd5, {%r3, %r4}; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_select_cc_f64_f32_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_select_cc_f64_f32_param_0]; +; CHECK-NEXT: setp.neu.f32 %p1, %r3, %r1; +; CHECK-NEXT: setp.neu.f32 %p2, %r4, %r2; +; CHECK-NEXT: selp.f64 %rd7, %rd2, %rd4, %p2; +; CHECK-NEXT: selp.f64 %rd8, %rd1, %rd3, %p1; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd7}; +; CHECK-NEXT: ret; + %cc = fcmp une <2 x float> %c, %d + %r = select <2 x i1> %cc, <2 x double> %a, <2 x double> %b + ret <2 x double> %r +} + +define <2 x float> @test_select_cc_f32_f64(<2 x float> %a, <2 x float> %b, <2 x double> %c, <2 x double> %d) #0 { +; CHECK-LABEL: test_select_cc_f32_f64( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_f32_f64_param_1]; +; CHECK-NEXT: mov.b64 %rd2, {%r1, %r2}; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f32_f64_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r3, %r4}; +; CHECK-NEXT: ld.param.v2.b64 {%rd5, %rd6}, [test_select_cc_f32_f64_param_3]; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_select_cc_f32_f64_param_2]; +; CHECK-NEXT: setp.neu.f64 %p1, %rd3, %rd5; +; CHECK-NEXT: setp.neu.f64 %p2, %rd4, %rd6; +; CHECK-NEXT: selp.f32 %r5, %r4, %r2, %p2; +; CHECK-NEXT: selp.f32 %r6, %r3, %r1, %p1; +; CHECK-NEXT: mov.b64 %rd7, {%r6, %r5}; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd7; +; CHECK-NEXT: ret; + %cc = fcmp une <2 x double> %c, %d + %r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b + ret <2 x float> %r +} + +define <2 x i1> @test_fcmp_une(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_fcmp_une( +; CHECK: { +; 
CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_une_param_1]; +; CHECK-NEXT: mov.b64 %rd2, {%r1, %r2}; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_une_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r3, %r4}; +; CHECK-NEXT: setp.neu.f32 %p1, %r4, %r2; +; CHECK-NEXT: setp.neu.f32 %p2, %r3, %r1; +; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NEXT: ret; + %r = fcmp une <2 x float> %a, %b + ret <2 x i1> %r +} + +define <2 x i1> @test_fcmp_ueq(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_fcmp_ueq( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ueq_param_1]; +; CHECK-NEXT: mov.b64 %rd2, {%r1, %r2}; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ueq_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r3, %r4}; +; CHECK-NEXT: setp.equ.f32 %p1, %r4, %r2; +; CHECK-NEXT: setp.equ.f32 %p2, %r3, %r1; +; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NEXT: ret; + %r = fcmp ueq <2 x float> %a, %b + ret <2 x i1> %r +} + +define <2 x i1> @test_fcmp_ugt(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_fcmp_ugt( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ugt_param_1]; +; CHECK-NEXT: mov.b64 %rd2, {%r1, %r2}; +; CHECK-NEXT: 
ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ugt_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r3, %r4}; +; CHECK-NEXT: setp.gtu.f32 %p1, %r4, %r2; +; CHECK-NEXT: setp.gtu.f32 %p2, %r3, %r1; +; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NEXT: ret; + %r = fcmp ugt <2 x float> %a, %b + ret <2 x i1> %r +} + +define <2 x i1> @test_fcmp_uge(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_fcmp_uge( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_uge_param_1]; +; CHECK-NEXT: mov.b64 %rd2, {%r1, %r2}; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_uge_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r3, %r4}; +; CHECK-NEXT: setp.geu.f32 %p1, %r4, %r2; +; CHECK-NEXT: setp.geu.f32 %p2, %r3, %r1; +; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NEXT: ret; + %r = fcmp uge <2 x float> %a, %b + ret <2 x i1> %r +} + +define <2 x i1> @test_fcmp_ult(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_fcmp_ult( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ult_param_1]; +; CHECK-NEXT: mov.b64 %rd2, {%r1, %r2}; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ult_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r3, %r4}; +; CHECK-NEXT: setp.ltu.f32 %p1, %r4, %r2; +; CHECK-NEXT: setp.ltu.f32 %p2, %r3, %r1; +; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NEXT: selp.b16 
%rs2, -1, 0, %p1; +; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NEXT: ret; + %r = fcmp ult <2 x float> %a, %b + ret <2 x i1> %r +} + +define <2 x i1> @test_fcmp_ule(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_fcmp_ule( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ule_param_1]; +; CHECK-NEXT: mov.b64 %rd2, {%r1, %r2}; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ule_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r3, %r4}; +; CHECK-NEXT: setp.leu.f32 %p1, %r4, %r2; +; CHECK-NEXT: setp.leu.f32 %p2, %r3, %r1; +; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NEXT: ret; + %r = fcmp ule <2 x float> %a, %b + ret <2 x i1> %r +} + +define <2 x i1> @test_fcmp_uno(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_fcmp_uno( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_uno_param_1]; +; CHECK-NEXT: mov.b64 %rd2, {%r1, %r2}; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_uno_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r3, %r4}; +; CHECK-NEXT: setp.nan.f32 %p1, %r4, %r2; +; CHECK-NEXT: setp.nan.f32 %p2, %r3, %r1; +; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NEXT: ret; + %r = fcmp uno <2 x float> %a, %b + ret <2 x i1> %r +} + +define <2 x i1> @test_fcmp_one(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_fcmp_one( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; 
CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_one_param_1]; +; CHECK-NEXT: mov.b64 %rd2, {%r1, %r2}; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_one_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r3, %r4}; +; CHECK-NEXT: setp.ne.f32 %p1, %r4, %r2; +; CHECK-NEXT: setp.ne.f32 %p2, %r3, %r1; +; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NEXT: ret; + %r = fcmp one <2 x float> %a, %b + ret <2 x i1> %r +} + +define <2 x i1> @test_fcmp_oeq(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_fcmp_oeq( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_oeq_param_1]; +; CHECK-NEXT: mov.b64 %rd2, {%r1, %r2}; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_oeq_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r3, %r4}; +; CHECK-NEXT: setp.eq.f32 %p1, %r4, %r2; +; CHECK-NEXT: setp.eq.f32 %p2, %r3, %r1; +; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NEXT: ret; + %r = fcmp oeq <2 x float> %a, %b + ret <2 x i1> %r +} + +define <2 x i1> @test_fcmp_ogt(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_fcmp_ogt( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ogt_param_1]; +; CHECK-NEXT: mov.b64 %rd2, {%r1, %r2}; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, 
[test_fcmp_ogt_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r3, %r4}; +; CHECK-NEXT: setp.gt.f32 %p1, %r4, %r2; +; CHECK-NEXT: setp.gt.f32 %p2, %r3, %r1; +; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NEXT: ret; + %r = fcmp ogt <2 x float> %a, %b + ret <2 x i1> %r +} + +define <2 x i1> @test_fcmp_oge(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_fcmp_oge( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_oge_param_1]; +; CHECK-NEXT: mov.b64 %rd2, {%r1, %r2}; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_oge_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r3, %r4}; +; CHECK-NEXT: setp.ge.f32 %p1, %r4, %r2; +; CHECK-NEXT: setp.ge.f32 %p2, %r3, %r1; +; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NEXT: ret; + %r = fcmp oge <2 x float> %a, %b + ret <2 x i1> %r +} + +define <2 x i1> @test_fcmp_olt(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_fcmp_olt( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_olt_param_1]; +; CHECK-NEXT: mov.b64 %rd2, {%r1, %r2}; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_olt_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r3, %r4}; +; CHECK-NEXT: setp.lt.f32 %p1, %r4, %r2; +; CHECK-NEXT: setp.lt.f32 %p2, %r3, %r1; +; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NEXT: 
st.param.b8 [func_retval0+1], %rs2; +; CHECK-NEXT: ret; + %r = fcmp olt <2 x float> %a, %b + ret <2 x i1> %r +} + +define <2 x i1> @test_fcmp_ole(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_fcmp_ole( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ole_param_1]; +; CHECK-NEXT: mov.b64 %rd2, {%r1, %r2}; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ole_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r3, %r4}; +; CHECK-NEXT: setp.le.f32 %p1, %r4, %r2; +; CHECK-NEXT: setp.le.f32 %p2, %r3, %r1; +; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NEXT: ret; + %r = fcmp ole <2 x float> %a, %b + ret <2 x i1> %r +} + +define <2 x i1> @test_fcmp_ord(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_fcmp_ord( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ord_param_1]; +; CHECK-NEXT: mov.b64 %rd2, {%r1, %r2}; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ord_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r3, %r4}; +; CHECK-NEXT: setp.num.f32 %p1, %r4, %r2; +; CHECK-NEXT: setp.num.f32 %p2, %r3, %r1; +; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NEXT: ret; + %r = fcmp ord <2 x float> %a, %b + ret <2 x i1> %r +} + +define <2 x i32> @test_fptosi_i32(<2 x float> %a) #0 { +; CHECK-LABEL: test_fptosi_i32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; 
CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptosi_i32_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r1, %r2}; +; CHECK-NEXT: cvt.rzi.s32.f32 %r3, %r2; +; CHECK-NEXT: cvt.rzi.s32.f32 %r4, %r1; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NEXT: ret; + %r = fptosi <2 x float> %a to <2 x i32> + ret <2 x i32> %r +} + +define <2 x i64> @test_fptosi_i64(<2 x float> %a) #0 { +; CHECK-LABEL: test_fptosi_i64( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptosi_i64_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r1, %r2}; +; CHECK-NEXT: cvt.rzi.s64.f32 %rd2, %r2; +; CHECK-NEXT: cvt.rzi.s64.f32 %rd3, %r1; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2}; +; CHECK-NEXT: ret; + %r = fptosi <2 x float> %a to <2 x i64> + ret <2 x i64> %r +} + +define <2 x i32> @test_fptoui_2xi32(<2 x float> %a) #0 { +; CHECK-LABEL: test_fptoui_2xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptoui_2xi32_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r1, %r2}; +; CHECK-NEXT: cvt.rzi.u32.f32 %r3, %r2; +; CHECK-NEXT: cvt.rzi.u32.f32 %r4, %r1; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NEXT: ret; + %r = fptoui <2 x float> %a to <2 x i32> + ret <2 x i32> %r +} + +define <2 x i64> @test_fptoui_2xi64(<2 x float> %a) #0 { +; CHECK-LABEL: test_fptoui_2xi64( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptoui_2xi64_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r1, %r2}; +; CHECK-NEXT: cvt.rzi.u64.f32 %rd2, %r2; +; CHECK-NEXT: cvt.rzi.u64.f32 %rd3, %r1; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2}; +; CHECK-NEXT: ret; + %r = fptoui <2 x float> %a to <2 x i64> + ret <2 
x i64> %r +} + +define <2 x float> @test_uitofp_2xi32(<2 x i32> %a) #0 { +; CHECK-LABEL: test_uitofp_2xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_param_0]; +; CHECK-NEXT: cvt.rn.f32.u32 %r3, %r2; +; CHECK-NEXT: cvt.rn.f32.u32 %r4, %r1; +; CHECK-NEXT: mov.b64 %rd1, {%r4, %r3}; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd1; +; CHECK-NEXT: ret; + %r = uitofp <2 x i32> %a to <2 x float> + ret <2 x float> %r +} + +define <2 x float> @test_uitofp_2xi64(<2 x i64> %a) #0 { +; CHECK-LABEL: test_uitofp_2xi64( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_uitofp_2xi64_param_0]; +; CHECK-NEXT: cvt.rn.f32.u64 %r1, %rd2; +; CHECK-NEXT: cvt.rn.f32.u64 %r2, %rd1; +; CHECK-NEXT: mov.b64 %rd3, {%r2, %r1}; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; + %r = uitofp <2 x i64> %a to <2 x float> + ret <2 x float> %r +} + +define <2 x float> @test_sitofp_2xi32(<2 x i32> %a) #0 { +; CHECK-LABEL: test_sitofp_2xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sitofp_2xi32_param_0]; +; CHECK-NEXT: cvt.rn.f32.s32 %r3, %r2; +; CHECK-NEXT: cvt.rn.f32.s32 %r4, %r1; +; CHECK-NEXT: mov.b64 %rd1, {%r4, %r3}; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd1; +; CHECK-NEXT: ret; + %r = sitofp <2 x i32> %a to <2 x float> + ret <2 x float> %r +} + +define <2 x float> @test_sitofp_2xi64(<2 x i64> %a) #0 { +; CHECK-LABEL: test_sitofp_2xi64( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_sitofp_2xi64_param_0]; +; CHECK-NEXT: cvt.rn.f32.s64 %r1, %rd2; +; CHECK-NEXT: 
cvt.rn.f32.s64 %r2, %rd1; +; CHECK-NEXT: mov.b64 %rd3, {%r2, %r1}; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; + %r = sitofp <2 x i64> %a to <2 x float> + ret <2 x float> %r +} + +define <2 x float> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_uitofp_2xi32_fadd( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_uitofp_2xi32_fadd_param_1]; +; CHECK-NEXT: mov.b64 %rd1, {%r3, %r4}; +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_fadd_param_0]; +; CHECK-NEXT: cvt.rn.f32.u32 %r5, %r2; +; CHECK-NEXT: cvt.rn.f32.u32 %r6, %r1; +; CHECK-NEXT: mov.b64 %rd2, {%r6, %r5}; +; CHECK-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; + %c = uitofp <2 x i32> %a to <2 x float> + %r = fadd <2 x float> %b, %c + ret <2 x float> %r +} + +define <2 x float> @test_fptrunc_2xdouble(<2 x double> %a) #0 { +; CHECK-LABEL: test_fptrunc_2xdouble( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fptrunc_2xdouble_param_0]; +; CHECK-NEXT: cvt.rn.f32.f64 %r1, %rd2; +; CHECK-NEXT: cvt.rn.f32.f64 %r2, %rd1; +; CHECK-NEXT: mov.b64 %rd3, {%r2, %r1}; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; + %r = fptrunc <2 x double> %a to <2 x float> + ret <2 x float> %r +} + +define <2 x double> @test_fpext_2xdouble(<2 x float> %a) #0 { +; CHECK-LABEL: test_fpext_2xdouble( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fpext_2xdouble_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r1, %r2}; +; CHECK-NEXT: cvt.f64.f32 %rd2, %r2; +; CHECK-NEXT: cvt.f64.f32 %rd3, %r1; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], 
{%rd3, %rd2}; +; CHECK-NEXT: ret; + %r = fpext <2 x float> %a to <2 x double> + ret <2 x double> %r +} + +define <2 x i32> @test_bitcast_2xfloat_to_2xi32(<2 x float> %a) #0 { +; CHECK-LABEL: test_bitcast_2xfloat_to_2xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_bitcast_2xfloat_to_2xi32_param_0]; +; CHECK-NEXT: mov.b64 {_, %r1}, %rd1; +; CHECK-NEXT: cvt.u32.u64 %r2, %rd1; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1}; +; CHECK-NEXT: ret; + %r = bitcast <2 x float> %a to <2 x i32> + ret <2 x i32> %r +} + +define <2 x float> @test_bitcast_2xi32_to_2xfloat(<2 x i32> %a) #0 { +; CHECK-LABEL: test_bitcast_2xi32_to_2xfloat( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_bitcast_2xi32_to_2xfloat_param_0]; +; CHECK-NEXT: cvt.u64.u32 %rd1, %r1; +; CHECK-NEXT: cvt.u64.u32 %rd2, %r2; +; CHECK-NEXT: shl.b64 %rd3, %rd2, 32; +; CHECK-NEXT: or.b64 %rd4, %rd1, %rd3; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd4; +; CHECK-NEXT: ret; + %r = bitcast <2 x i32> %a to <2 x float> + ret <2 x float> %r +} + +define <2 x float> @test_bitcast_double_to_2xfloat(double %a) #0 { +; CHECK-LABEL: test_bitcast_double_to_2xfloat( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_bitcast_double_to_2xfloat_param_0]; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd1; +; CHECK-NEXT: ret; + %r = bitcast double %a to <2 x float> + ret <2 x float> %r +} + +define double @test_bitcast_2xfloat_to_double(<2 x float> %a) #0 { +; CHECK-LABEL: test_bitcast_2xfloat_to_double( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_bitcast_2xfloat_to_double_param_0]; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd1; +; 
CHECK-NEXT: ret; + %r = bitcast <2 x float> %a to double + ret double %r +} + +define <2 x float> @test_sqrt(<2 x float> %a) #0 { +; CHECK-LABEL: test_sqrt( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sqrt_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r1, %r2}; +; CHECK-NEXT: sqrt.rn.f32 %r3, %r2; +; CHECK-NEXT: sqrt.rn.f32 %r4, %r1; +; CHECK-NEXT: mov.b64 %rd2, {%r4, %r3}; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd2; +; CHECK-NEXT: ret; + %r = call <2 x float> @llvm.sqrt(<2 x float> %a) + ret <2 x float> %r +} + +;;; Can't do this yet: requires libcall. +; XCHECK-LABEL: test_powi( +;define <2 x float> @test_powi(<2 x float> %a, <2 x i32> %b) #0 { +; %r = call <2 x float> @llvm.powi.i32(<2 x float> %a, <2 x i32> %b) +; ret <2 x float> %r +;} + +define <2 x float> @test_sin(<2 x float> %a) #0 #1 { +; CHECK-LABEL: test_sin( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sin_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r1, %r2}; +; CHECK-NEXT: sin.approx.f32 %r3, %r2; +; CHECK-NEXT: sin.approx.f32 %r4, %r1; +; CHECK-NEXT: mov.b64 %rd2, {%r4, %r3}; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd2; +; CHECK-NEXT: ret; + %r = call <2 x float> @llvm.sin(<2 x float> %a) + ret <2 x float> %r +} + +define <2 x float> @test_cos(<2 x float> %a) #0 #1 { +; CHECK-LABEL: test_cos( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_cos_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r1, %r2}; +; CHECK-NEXT: cos.approx.f32 %r3, %r2; +; CHECK-NEXT: cos.approx.f32 %r4, %r1; +; CHECK-NEXT: mov.b64 %rd2, {%r4, %r3}; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd2; +; CHECK-NEXT: ret; + %r = call <2 x float> @llvm.cos(<2 x float> %a) + ret <2 x 
float> %r +} + +;;; Can't do this yet: requires libcall. +; XCHECK-LABEL: test_pow( +;define <2 x float> @test_pow(<2 x float> %a, <2 x float> %b) #0 { +; %r = call <2 x float> @llvm.pow(<2 x float> %a, <2 x float> %b) +; ret <2 x float> %r +;} + +;;; Can't do this yet: requires libcall. +; XCHECK-LABEL: test_exp( +;define <2 x float> @test_exp(<2 x float> %a) #0 { +; %r = call <2 x float> @llvm.exp(<2 x float> %a) +; ret <2 x float> %r +;} + +;;; Can't do this yet: requires libcall. +; XCHECK-LABEL: test_exp2( +;define <2 x float> @test_exp2(<2 x float> %a) #0 { +; %r = call <2 x float> @llvm.exp2(<2 x float> %a) +; ret <2 x float> %r +;} + +;;; Can't do this yet: requires libcall. +; XCHECK-LABEL: test_log( +;define <2 x float> @test_log(<2 x float> %a) #0 { +; %r = call <2 x float> @llvm.log(<2 x float> %a) +; ret <2 x float> %r +;} + +;;; Can't do this yet: requires libcall. +; XCHECK-LABEL: test_log10( +;define <2 x float> @test_log10(<2 x float> %a) #0 { +; %r = call <2 x float> @llvm.log10(<2 x float> %a) +; ret <2 x float> %r +;} + +;;; Can't do this yet: requires libcall. 
+; XCHECK-LABEL: test_log2( +;define <2 x float> @test_log2(<2 x float> %a) #0 { +; %r = call <2 x float> @llvm.log2(<2 x float> %a) +; ret <2 x float> %r +;} + + +define <2 x float> @test_fma(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 { +; CHECK-LABEL: test_fma( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fma_param_2]; +; CHECK-NEXT: mov.b64 %rd3, {%r1, %r2}; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fma_param_1]; +; CHECK-NEXT: mov.b64 %rd2, {%r3, %r4}; +; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_fma_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r5, %r6}; +; CHECK-NEXT: fma.rn.f32x2 %rd4, %rd1, %rd2, %rd3; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd4; +; CHECK-NEXT: ret; + %r = call <2 x float> @llvm.fma(<2 x float> %a, <2 x float> %b, <2 x float> %c) + ret <2 x float> %r +} + +define <2 x float> @test_fabs(<2 x float> %a) #0 { +; CHECK-LABEL: test_fabs( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fabs_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r1, %r2}; +; CHECK-NEXT: abs.f32 %r3, %r2; +; CHECK-NEXT: abs.f32 %r4, %r1; +; CHECK-NEXT: mov.b64 %rd2, {%r4, %r3}; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd2; +; CHECK-NEXT: ret; + %r = call <2 x float> @llvm.fabs(<2 x float> %a) + ret <2 x float> %r +} + +define <2 x float> @test_minnum(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_minnum( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_minnum_param_1]; +; CHECK-NEXT: mov.b64 %rd2, {%r1, %r2}; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_minnum_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r3, %r4}; +; CHECK-NEXT: min.f32 %r5, %r4, %r2; +; CHECK-NEXT: min.f32 %r6, %r3, %r1; +; 
CHECK-NEXT: mov.b64 %rd3, {%r6, %r5}; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; + %r = call <2 x float> @llvm.minnum(<2 x float> %a, <2 x float> %b) + ret <2 x float> %r +} + +define <2 x float> @test_maxnum(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_maxnum( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_maxnum_param_1]; +; CHECK-NEXT: mov.b64 %rd2, {%r1, %r2}; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_maxnum_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r3, %r4}; +; CHECK-NEXT: max.f32 %r5, %r4, %r2; +; CHECK-NEXT: max.f32 %r6, %r3, %r1; +; CHECK-NEXT: mov.b64 %rd3, {%r6, %r5}; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; + %r = call <2 x float> @llvm.maxnum(<2 x float> %a, <2 x float> %b) + ret <2 x float> %r +} + +define <2 x float> @test_copysign(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_copysign( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_copysign_param_1]; +; CHECK-NEXT: mov.b64 %rd2, {%r1, %r2}; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_copysign_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r3, %r4}; +; CHECK-NEXT: copysign.f32 %r5, %r2, %r4; +; CHECK-NEXT: copysign.f32 %r6, %r1, %r3; +; CHECK-NEXT: mov.b64 %rd3, {%r6, %r5}; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; + %r = call <2 x float> @llvm.copysign(<2 x float> %a, <2 x float> %b) + ret <2 x float> %r +} + +define <2 x float> @test_copysign_f64(<2 x float> %a, <2 x double> %b) #0 { +; CHECK-LABEL: test_copysign_f64( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<9>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_copysign_f64_param_0]; +; CHECK-NEXT: 
mov.b64 %rd1, {%r1, %r2}; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_copysign_f64_param_1]; +; CHECK-NEXT: abs.f32 %r3, %r2; +; CHECK-NEXT: neg.f32 %r4, %r3; +; CHECK-NEXT: shr.u64 %rd4, %rd3, 63; +; CHECK-NEXT: and.b64 %rd5, %rd4, 1; +; CHECK-NEXT: setp.ne.b64 %p1, %rd5, 0; +; CHECK-NEXT: selp.f32 %r5, %r4, %r3, %p1; +; CHECK-NEXT: abs.f32 %r6, %r1; +; CHECK-NEXT: neg.f32 %r7, %r6; +; CHECK-NEXT: shr.u64 %rd6, %rd2, 63; +; CHECK-NEXT: and.b64 %rd7, %rd6, 1; +; CHECK-NEXT: setp.ne.b64 %p2, %rd7, 0; +; CHECK-NEXT: selp.f32 %r8, %r7, %r6, %p2; +; CHECK-NEXT: mov.b64 %rd8, {%r8, %r5}; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd8; +; CHECK-NEXT: ret; + %tb = fptrunc <2 x double> %b to <2 x float> + %r = call <2 x float> @llvm.copysign(<2 x float> %a, <2 x float> %tb) + ret <2 x float> %r +} + +define <2 x double> @test_copysign_extended(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_copysign_extended( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_copysign_extended_param_1]; +; CHECK-NEXT: mov.b64 %rd2, {%r1, %r2}; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_copysign_extended_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r3, %r4}; +; CHECK-NEXT: copysign.f32 %r5, %r1, %r3; +; CHECK-NEXT: copysign.f32 %r6, %r2, %r4; +; CHECK-NEXT: cvt.f64.f32 %rd3, %r6; +; CHECK-NEXT: cvt.f64.f32 %rd4, %r5; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd3}; +; CHECK-NEXT: ret; + %r = call <2 x float> @llvm.copysign(<2 x float> %a, <2 x float> %b) + %xr = fpext <2 x float> %r to <2 x double> + ret <2 x double> %xr +} + +define <2 x float> @test_floor(<2 x float> %a) #0 { +; CHECK-LABEL: test_floor( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_floor_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r1, %r2}; +; CHECK-NEXT: 
cvt.rmi.f32.f32 %r3, %r2; +; CHECK-NEXT: cvt.rmi.f32.f32 %r4, %r1; +; CHECK-NEXT: mov.b64 %rd2, {%r4, %r3}; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd2; +; CHECK-NEXT: ret; + %r = call <2 x float> @llvm.floor(<2 x float> %a) + ret <2 x float> %r +} + +define <2 x float> @test_ceil(<2 x float> %a) #0 { +; CHECK-LABEL: test_ceil( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_ceil_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r1, %r2}; +; CHECK-NEXT: cvt.rpi.f32.f32 %r3, %r2; +; CHECK-NEXT: cvt.rpi.f32.f32 %r4, %r1; +; CHECK-NEXT: mov.b64 %rd2, {%r4, %r3}; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd2; +; CHECK-NEXT: ret; + %r = call <2 x float> @llvm.ceil(<2 x float> %a) + ret <2 x float> %r +} + +define <2 x float> @test_trunc(<2 x float> %a) #0 { +; CHECK-LABEL: test_trunc( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_trunc_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r1, %r2}; +; CHECK-NEXT: cvt.rzi.f32.f32 %r3, %r2; +; CHECK-NEXT: cvt.rzi.f32.f32 %r4, %r1; +; CHECK-NEXT: mov.b64 %rd2, {%r4, %r3}; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd2; +; CHECK-NEXT: ret; + %r = call <2 x float> @llvm.trunc(<2 x float> %a) + ret <2 x float> %r +} + +define <2 x float> @test_rint(<2 x float> %a) #0 { +; CHECK-LABEL: test_rint( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_rint_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r1, %r2}; +; CHECK-NEXT: cvt.rni.f32.f32 %r3, %r2; +; CHECK-NEXT: cvt.rni.f32.f32 %r4, %r1; +; CHECK-NEXT: mov.b64 %rd2, {%r4, %r3}; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd2; +; CHECK-NEXT: ret; + %r = call <2 x float> @llvm.rint(<2 x float> %a) + ret <2 x float> %r +} + +define <2 x float> 
@test_nearbyint(<2 x float> %a) #0 { +; CHECK-LABEL: test_nearbyint( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_nearbyint_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r1, %r2}; +; CHECK-NEXT: cvt.rni.f32.f32 %r3, %r2; +; CHECK-NEXT: cvt.rni.f32.f32 %r4, %r1; +; CHECK-NEXT: mov.b64 %rd2, {%r4, %r3}; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd2; +; CHECK-NEXT: ret; + %r = call <2 x float> @llvm.nearbyint(<2 x float> %a) + ret <2 x float> %r +} + +define <2 x float> @test_roundeven(<2 x float> %a) #0 { +; CHECK-LABEL: test_roundeven( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_roundeven_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r1, %r2}; +; CHECK-NEXT: cvt.rni.f32.f32 %r3, %r2; +; CHECK-NEXT: cvt.rni.f32.f32 %r4, %r1; +; CHECK-NEXT: mov.b64 %rd2, {%r4, %r3}; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd2; +; CHECK-NEXT: ret; + %r = call <2 x float> @llvm.roundeven(<2 x float> %a) + ret <2 x float> %r +} + +; check the use of sign mask and 0.5 to implement round +define <2 x float> @test_round(<2 x float> %a) #0 { +; CHECK-LABEL: test_round( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<5>; +; CHECK-NEXT: .reg .b32 %r<19>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_round_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r1, %r2}; +; CHECK-NEXT: and.b32 %r3, %r2, -2147483648; +; CHECK-NEXT: or.b32 %r4, %r3, 1056964608; +; CHECK-NEXT: add.rn.f32 %r5, %r2, %r4; +; CHECK-NEXT: cvt.rzi.f32.f32 %r6, %r5; +; CHECK-NEXT: abs.f32 %r7, %r2; +; CHECK-NEXT: setp.gt.f32 %p1, %r7, 0f4B000000; +; CHECK-NEXT: selp.f32 %r8, %r2, %r6, %p1; +; CHECK-NEXT: cvt.rzi.f32.f32 %r9, %r2; +; CHECK-NEXT: setp.lt.f32 %p2, %r7, 0f3F000000; +; CHECK-NEXT: selp.f32 %r10, %r9, %r8, %p2; +; 
CHECK-NEXT: and.b32 %r11, %r1, -2147483648; +; CHECK-NEXT: or.b32 %r12, %r11, 1056964608; +; CHECK-NEXT: add.rn.f32 %r13, %r1, %r12; +; CHECK-NEXT: cvt.rzi.f32.f32 %r14, %r13; +; CHECK-NEXT: abs.f32 %r15, %r1; +; CHECK-NEXT: setp.gt.f32 %p3, %r15, 0f4B000000; +; CHECK-NEXT: selp.f32 %r16, %r1, %r14, %p3; +; CHECK-NEXT: cvt.rzi.f32.f32 %r17, %r1; +; CHECK-NEXT: setp.lt.f32 %p4, %r15, 0f3F000000; +; CHECK-NEXT: selp.f32 %r18, %r17, %r16, %p4; +; CHECK-NEXT: mov.b64 %rd2, {%r18, %r10}; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd2; +; CHECK-NEXT: ret; + %r = call <2 x float> @llvm.round(<2 x float> %a) + ret <2 x float> %r +} + +define <2 x float> @test_fmuladd(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 { +; CHECK-LABEL: test_fmuladd( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fmuladd_param_2]; +; CHECK-NEXT: mov.b64 %rd3, {%r1, %r2}; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fmuladd_param_1]; +; CHECK-NEXT: mov.b64 %rd2, {%r3, %r4}; +; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_fmuladd_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r5, %r6}; +; CHECK-NEXT: fma.rn.f32x2 %rd4, %rd1, %rd2, %rd3; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd4; +; CHECK-NEXT: ret; + %r = call <2 x float> @llvm.fmuladd(<2 x float> %a, <2 x float> %b, <2 x float> %c) + ret <2 x float> %r +} + +define <2 x float> @test_shufflevector(<2 x float> %a) #0 { +; CHECK-LABEL: test_shufflevector( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_shufflevector_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r1, %r2}; +; CHECK-NEXT: mov.b64 %rd2, {%r2, %r1}; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd2; +; CHECK-NEXT: ret; + %s = shufflevector <2 x float> %a, <2 x float> poison, <2 x i32> + ret <2 x float> %s +} + +define <2 x float> 
@test_insertelement(<2 x float> %a, float %x) #0 { +; CHECK-LABEL: test_insertelement( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r2, %r3}, [test_insertelement_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r2, %r3}; +; CHECK-NEXT: ld.param.b32 %r1, [test_insertelement_param_1]; +; CHECK-NEXT: mov.b64 %rd2, {%r2, %r1}; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd2; +; CHECK-NEXT: ret; + %i = insertelement <2 x float> %a, float %x, i64 1 + ret <2 x float> %i +} + +define <2 x float> @test_sitofp_2xi32_to_2xfloat(<2 x i32> %a) #0 { +; CHECK-LABEL: test_sitofp_2xi32_to_2xfloat( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sitofp_2xi32_to_2xfloat_param_0]; +; CHECK-NEXT: cvt.rn.f32.s32 %r3, %r2; +; CHECK-NEXT: cvt.rn.f32.s32 %r4, %r1; +; CHECK-NEXT: mov.b64 %rd1, {%r4, %r3}; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd1; +; CHECK-NEXT: ret; + %r = sitofp <2 x i32> %a to <2 x float> + ret <2 x float> %r +} + +define <2 x float> @test_uitofp_2xi32_to_2xfloat(<2 x i32> %a) #0 { +; CHECK-LABEL: test_uitofp_2xi32_to_2xfloat( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_to_2xfloat_param_0]; +; CHECK-NEXT: cvt.rn.f32.u32 %r3, %r2; +; CHECK-NEXT: cvt.rn.f32.u32 %r4, %r1; +; CHECK-NEXT: mov.b64 %rd1, {%r4, %r3}; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd1; +; CHECK-NEXT: ret; + %r = uitofp <2 x i32> %a to <2 x float> + ret <2 x float> %r +} + +attributes #0 = { nounwind } +attributes #1 = { "unsafe-fp-math" = "true" } +attributes #2 = { "denormal-fp-math"="preserve-sign" } diff --git a/llvm/test/CodeGen/NVPTX/fexp2.ll b/llvm/test/CodeGen/NVPTX/fexp2.ll index ef2a788bb8267..0a99d6a36961c 100644 --- 
a/llvm/test/CodeGen/NVPTX/fexp2.ll +++ b/llvm/test/CodeGen/NVPTX/fexp2.ll @@ -86,34 +86,40 @@ define <2 x float> @exp2_test_v(<2 x float> %in) { ; CHECK-LABEL: exp2_test_v( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [exp2_test_v_param_0]; ; CHECK-NEXT: ex2.approx.f32 %r3, %r2; ; CHECK-NEXT: ex2.approx.f32 %r4, %r1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NEXT: mov.b64 %rd1, {%r4, %r3}; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd1; ; CHECK-NEXT: ret; ; ; CHECK-FP16-LABEL: exp2_test_v( ; CHECK-FP16: { ; CHECK-FP16-NEXT: .reg .b32 %r<5>; +; CHECK-FP16-NEXT: .reg .b64 %rd<2>; ; CHECK-FP16-EMPTY: ; CHECK-FP16-NEXT: // %bb.0: // %entry ; CHECK-FP16-NEXT: ld.param.v2.b32 {%r1, %r2}, [exp2_test_v_param_0]; ; CHECK-FP16-NEXT: ex2.approx.f32 %r3, %r2; ; CHECK-FP16-NEXT: ex2.approx.f32 %r4, %r1; -; CHECK-FP16-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-FP16-NEXT: mov.b64 %rd1, {%r4, %r3}; +; CHECK-FP16-NEXT: st.param.b64 [func_retval0], %rd1; ; CHECK-FP16-NEXT: ret; ; ; CHECK-BF16-LABEL: exp2_test_v( ; CHECK-BF16: { ; CHECK-BF16-NEXT: .reg .b32 %r<5>; +; CHECK-BF16-NEXT: .reg .b64 %rd<2>; ; CHECK-BF16-EMPTY: ; CHECK-BF16-NEXT: // %bb.0: // %entry ; CHECK-BF16-NEXT: ld.param.v2.b32 {%r1, %r2}, [exp2_test_v_param_0]; ; CHECK-BF16-NEXT: ex2.approx.f32 %r3, %r2; ; CHECK-BF16-NEXT: ex2.approx.f32 %r4, %r1; -; CHECK-BF16-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-BF16-NEXT: mov.b64 %rd1, {%r4, %r3}; +; CHECK-BF16-NEXT: st.param.b64 [func_retval0], %rd1; ; CHECK-BF16-NEXT: ret; entry: %exp2 = call <2 x float> @llvm.exp2.v2f32(<2 x float> %in) diff --git a/llvm/test/CodeGen/NVPTX/flog2.ll b/llvm/test/CodeGen/NVPTX/flog2.ll index 7a5b1bb0ddef6..8fe9adcda7001 100644 --- a/llvm/test/CodeGen/NVPTX/flog2.ll +++ b/llvm/test/CodeGen/NVPTX/flog2.ll @@ -40,12 +40,14 @@ define <2 x float> @log2_test_v(<2 x 
float> %in) { ; CHECK-LABEL: log2_test_v( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [log2_test_v_param_0]; ; CHECK-NEXT: lg2.approx.f32 %r3, %r2; ; CHECK-NEXT: lg2.approx.f32 %r4, %r1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NEXT: mov.b64 %rd1, {%r4, %r3}; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd1; ; CHECK-NEXT: ret; entry: %log2 = call <2 x float> @llvm.log2.v2f32(<2 x float> %in) diff --git a/llvm/test/CodeGen/NVPTX/fp-contract-f32x2.ll b/llvm/test/CodeGen/NVPTX/fp-contract-f32x2.ll new file mode 100644 index 0000000000000..9c0632add6fd4 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/fp-contract-f32x2.ll @@ -0,0 +1,133 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -fp-contract=fast | FileCheck %s --check-prefixes=CHECK,FAST +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 | FileCheck %s --check-prefixes=CHECK,DEFAULT +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -fp-contract=fast | %ptxas-verify -arch sm_100 %} +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 | %ptxas-verify -arch sm_100 %} + +target triple = "nvptx64-unknown-cuda" + +;; FAST-LABEL: @t0 +;; DEFAULT-LABEL: @t0 +define <2 x float> @t0(<2 x float> %a, <2 x float> %b, <2 x float> %c) { +; FAST-LABEL: t0( +; FAST: { +; FAST-NEXT: .reg .b32 %r<7>; +; FAST-NEXT: .reg .b64 %rd<5>; +; FAST-EMPTY: +; FAST-NEXT: // %bb.0: +; FAST-NEXT: ld.param.v2.b32 {%r1, %r2}, [t0_param_2]; +; FAST-NEXT: mov.b64 %rd1, {%r1, %r2}; +; FAST-NEXT: ld.param.v2.b32 {%r3, %r4}, [t0_param_1]; +; FAST-NEXT: mov.b64 %rd2, {%r3, %r4}; +; FAST-NEXT: ld.param.v2.b32 {%r5, %r6}, [t0_param_0]; +; FAST-NEXT: mov.b64 %rd3, {%r5, %r6}; +; FAST-NEXT: fma.rn.f32x2 %rd4, %rd3, %rd2, %rd1; +; FAST-NEXT: st.param.b64 [func_retval0], %rd4; +; FAST-NEXT: 
ret; +; +; DEFAULT-LABEL: t0( +; DEFAULT: { +; DEFAULT-NEXT: .reg .b32 %r<7>; +; DEFAULT-NEXT: .reg .b64 %rd<6>; +; DEFAULT-EMPTY: +; DEFAULT-NEXT: // %bb.0: +; DEFAULT-NEXT: ld.param.v2.b32 {%r1, %r2}, [t0_param_2]; +; DEFAULT-NEXT: mov.b64 %rd1, {%r1, %r2}; +; DEFAULT-NEXT: ld.param.v2.b32 {%r3, %r4}, [t0_param_1]; +; DEFAULT-NEXT: mov.b64 %rd2, {%r3, %r4}; +; DEFAULT-NEXT: ld.param.v2.b32 {%r5, %r6}, [t0_param_0]; +; DEFAULT-NEXT: mov.b64 %rd3, {%r5, %r6}; +; DEFAULT-NEXT: mul.rn.f32x2 %rd4, %rd3, %rd2; +; DEFAULT-NEXT: add.rn.f32x2 %rd5, %rd4, %rd1; +; DEFAULT-NEXT: st.param.b64 [func_retval0], %rd5; +; DEFAULT-NEXT: ret; + %v0 = fmul <2 x float> %a, %b + %v1 = fadd <2 x float> %v0, %c + ret <2 x float> %v1 +} + +;; We cannot form an fma here, but make sure we explicitly emit add.rn.f32x2 +;; to prevent ptxas from fusing this with anything else. +define <2 x float> @t1(<2 x float> %a, <2 x float> %b) { +; FAST-LABEL: t1( +; FAST: { +; FAST-NEXT: .reg .b32 %r<5>; +; FAST-NEXT: .reg .b64 %rd<6>; +; FAST-EMPTY: +; FAST-NEXT: // %bb.0: +; FAST-NEXT: ld.param.v2.b32 {%r1, %r2}, [t1_param_1]; +; FAST-NEXT: mov.b64 %rd1, {%r1, %r2}; +; FAST-NEXT: ld.param.v2.b32 {%r3, %r4}, [t1_param_0]; +; FAST-NEXT: mov.b64 %rd2, {%r3, %r4}; +; FAST-NEXT: add.f32x2 %rd3, %rd2, %rd1; +; FAST-NEXT: sub.f32x2 %rd4, %rd2, %rd1; +; FAST-NEXT: mul.f32x2 %rd5, %rd3, %rd4; +; FAST-NEXT: st.param.b64 [func_retval0], %rd5; +; FAST-NEXT: ret; +; +; DEFAULT-LABEL: t1( +; DEFAULT: { +; DEFAULT-NEXT: .reg .b32 %r<5>; +; DEFAULT-NEXT: .reg .b64 %rd<6>; +; DEFAULT-EMPTY: +; DEFAULT-NEXT: // %bb.0: +; DEFAULT-NEXT: ld.param.v2.b32 {%r1, %r2}, [t1_param_1]; +; DEFAULT-NEXT: mov.b64 %rd1, {%r1, %r2}; +; DEFAULT-NEXT: ld.param.v2.b32 {%r3, %r4}, [t1_param_0]; +; DEFAULT-NEXT: mov.b64 %rd2, {%r3, %r4}; +; DEFAULT-NEXT: add.rn.f32x2 %rd3, %rd2, %rd1; +; DEFAULT-NEXT: sub.rn.f32x2 %rd4, %rd2, %rd1; +; DEFAULT-NEXT: mul.rn.f32x2 %rd5, %rd3, %rd4; +; DEFAULT-NEXT: st.param.b64 [func_retval0], %rd5; +; 
DEFAULT-NEXT: ret; + %v1 = fadd <2 x float> %a, %b + %v2 = fsub <2 x float> %a, %b + %v3 = fmul <2 x float> %v1, %v2 + ret <2 x float> %v3 +} + +;; Make sure we generate the non ".rn" version when the "contract" flag is +;; present on the instructions +define <2 x float> @t2(<2 x float> %a, <2 x float> %b) { +; CHECK-LABEL: t2( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [t2_param_1]; +; CHECK-NEXT: mov.b64 %rd1, {%r1, %r2}; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [t2_param_0]; +; CHECK-NEXT: mov.b64 %rd2, {%r3, %r4}; +; CHECK-NEXT: add.f32x2 %rd3, %rd2, %rd1; +; CHECK-NEXT: sub.f32x2 %rd4, %rd2, %rd1; +; CHECK-NEXT: mul.f32x2 %rd5, %rd3, %rd4; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd5; +; CHECK-NEXT: ret; + %v1 = fadd contract <2 x float> %a, %b + %v2 = fsub contract <2 x float> %a, %b + %v3 = fmul contract <2 x float> %v1, %v2 + ret <2 x float> %v3 +} + +;; Make sure we always fold to fma when the "contract" flag is present +define <2 x float> @t3(<2 x float> %a, <2 x float> %b, <2 x float> %c) { +; CHECK-LABEL: t3( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [t3_param_2]; +; CHECK-NEXT: mov.b64 %rd1, {%r1, %r2}; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [t3_param_1]; +; CHECK-NEXT: mov.b64 %rd2, {%r3, %r4}; +; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [t3_param_0]; +; CHECK-NEXT: mov.b64 %rd3, {%r5, %r6}; +; CHECK-NEXT: fma.rn.f32x2 %rd4, %rd3, %rd2, %rd1; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd4; +; CHECK-NEXT: ret; + %v0 = fmul contract <2 x float> %a, %b + %v1 = fadd contract <2 x float> %v0, %c + ret <2 x float> %v1 +} diff --git a/llvm/test/CodeGen/NVPTX/ldparam-v4.ll b/llvm/test/CodeGen/NVPTX/ldparam-v4.ll index 419c780f7d82a..282da30af8c94 100644 --- a/llvm/test/CodeGen/NVPTX/ldparam-v4.ll +++ 
b/llvm/test/CodeGen/NVPTX/ldparam-v4.ll @@ -7,8 +7,8 @@ declare <4 x float> @bar() define void @foo(ptr %ptr) { ; CHECK-LABEL: foo( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<9>; -; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<6>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [foo_param_0]; @@ -18,9 +18,11 @@ define void @foo(ptr %ptr) { ; CHECK-NEXT: bar, ; CHECK-NEXT: ( ; CHECK-NEXT: ); -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [retval0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [retval0]; ; CHECK-NEXT: } // callseq 0 -; CHECK-NEXT: st.v4.b32 [%rd1], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd3; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-NEXT: st.v4.b32 [%rd1], {%r3, %r4, %r1, %r2}; ; CHECK-NEXT: ret; %val = tail call <4 x float> @bar() store <4 x float> %val, ptr %ptr diff --git a/llvm/test/CodeGen/NVPTX/param-load-store.ll b/llvm/test/CodeGen/NVPTX/param-load-store.ll index ce6707c4564bf..85f7a85aa60b0 100644 --- a/llvm/test/CodeGen/NVPTX/param-load-store.ll +++ b/llvm/test/CodeGen/NVPTX/param-load-store.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; Verifies correctness of load/store of parameters and return values. ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_35 -O0 -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap %s ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_35 -O0 -verify-machineinstrs | %ptxas-verify %} @@ -21,1241 +22,1899 @@ ; All scalar parameters must be at least 32 bits in size. ; i1 is loaded/stored as i8. 
-; CHECK: .func (.param .b32 func_retval0) -; CHECK-LABEL: test_i1( -; CHECK-NEXT: .param .b32 test_i1_param_0 -; CHECK: ld.param.b8 [[A8:%rs[0-9]+]], [test_i1_param_0]; -; CHECK: and.b16 [[A:%rs[0-9]+]], [[A8]], 1; -; CHECK: setp.ne.b16 %p1, [[A]], 0 -; CHECK: cvt.u32.u16 [[B:%r[0-9]+]], [[A8]] -; CHECK: and.b32 [[C:%r[0-9]+]], [[B]], 1; -; CHECK: .param .b32 param0; -; CHECK: st.param.b32 [param0], [[C]] -; CHECK: .param .b32 retval0; -; CHECK: call.uni -; CHECK-NEXT: test_i1, -; CHECK: ld.param.b32 [[R8:%r[0-9]+]], [retval0]; -; CHECK: and.b32 [[R:%r[0-9]+]], [[R8]], 1; -; CHECK: st.param.b32 [func_retval0], [[R]]; -; CHECK: ret; define i1 @test_i1(i1 %a) { +; CHECK-LABEL: test_i1( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [test_i1_param_0]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; +; CHECK-NEXT: and.b32 %r2, %r1, 1; +; CHECK-NEXT: { // callseq 0, 0 +; CHECK-NEXT: .param .b32 param0; +; CHECK-NEXT: st.param.b32 [param0], %r2; +; CHECK-NEXT: .param .b32 retval0; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_i1, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.b32 %r3, [retval0]; +; CHECK-NEXT: } // callseq 0 +; CHECK-NEXT: and.b32 %r5, %r3, 1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r5; +; CHECK-NEXT: ret; %r = tail call i1 @test_i1(i1 %a); ret i1 %r; } ; Signed i1 is a somewhat special case. We only care about one bit and ; then us neg.s32 to convert it to 32-bit -1 if it's set. 
-; CHECK: .func (.param .b32 func_retval0) -; CHECK-LABEL: test_i1s( -; CHECK-NEXT: .param .b32 test_i1s_param_0 -; CHECK: ld.param.b8 [[A8:%rs[0-9]+]], [test_i1s_param_0]; -; CHECK: cvt.u32.u16 [[A32:%r[0-9]+]], [[A8]]; -; CHECK: and.b32 [[A1:%r[0-9]+]], [[A32]], 1; -; CHECK: neg.s32 [[A:%r[0-9]+]], [[A1]]; -; CHECK: .param .b32 param0; -; CHECK: st.param.b32 [param0], [[A]]; -; CHECK: .param .b32 retval0; -; CHECK: call.uni -; CHECK: ld.param.b32 [[R8:%r[0-9]+]], [retval0]; -; CHECK: and.b32 [[R1:%r[0-9]+]], [[R8]], 1; -; CHECK: neg.s32 [[R:%r[0-9]+]], [[R1]]; -; CHECK: st.param.b32 [func_retval0], [[R]]; -; CHECK-NEXT: ret; define signext i1 @test_i1s(i1 signext %a) { +; CHECK-LABEL: test_i1s( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [test_i1s_param_0]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; +; CHECK-NEXT: and.b32 %r2, %r1, 1; +; CHECK-NEXT: neg.s32 %r3, %r2; +; CHECK-NEXT: { // callseq 1, 0 +; CHECK-NEXT: .param .b32 param0; +; CHECK-NEXT: st.param.b32 [param0], %r3; +; CHECK-NEXT: .param .b32 retval0; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_i1s, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.b32 %r4, [retval0]; +; CHECK-NEXT: } // callseq 1 +; CHECK-NEXT: and.b32 %r6, %r4, 1; +; CHECK-NEXT: neg.s32 %r7, %r6; +; CHECK-NEXT: st.param.b32 [func_retval0], %r7; +; CHECK-NEXT: ret; %r = tail call signext i1 @test_i1s(i1 signext %a); ret i1 %r; } ; Make sure that i1 loads are vectorized as i8 loads, respecting each element alignment. 
-; CHECK: .func (.param .align 1 .b8 func_retval0[1]) -; CHECK-LABEL: test_v3i1( -; CHECK-NEXT: .param .align 1 .b8 test_v3i1_param_0[1] -; CHECK-DAG: ld.param.b8 [[E2:%rs[0-9]+]], [test_v3i1_param_0+2]; -; CHECK-DAG: ld.param.b8 [[E0:%rs[0-9]+]], [test_v3i1_param_0] -; CHECK: .param .align 1 .b8 param0[1]; -; CHECK-DAG: st.param.b8 [param0], [[E0]]; -; CHECK-DAG: st.param.b8 [param0+2], [[E2]]; -; CHECK: .param .align 1 .b8 retval0[1]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v3i1, -; CHECK-DAG: ld.param.b8 [[RE0:%rs[0-9]+]], [retval0]; -; CHECK-DAG: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+2]; -; CHECK-DAG: st.param.b8 [func_retval0], [[RE0]] -; CHECK-DAG: st.param.b8 [func_retval0+2], [[RE2]]; -; CHECK-NEXT: ret; define <3 x i1> @test_v3i1(<3 x i1> %a) { +; CHECK-LABEL: test_v3i1( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<4>; +; CHECK-NEXT: .reg .b16 %rs<13>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [test_v3i1_param_0+2]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p3, %rs2, 0; +; CHECK-NEXT: ld.param.b8 %rs3, [test_v3i1_param_0+1]; +; CHECK-NEXT: and.b16 %rs4, %rs3, 1; +; CHECK-NEXT: setp.ne.b16 %p2, %rs4, 0; +; CHECK-NEXT: ld.param.b8 %rs5, [test_v3i1_param_0]; +; CHECK-NEXT: and.b16 %rs6, %rs5, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs6, 0; +; CHECK-NEXT: { // callseq 2, 0 +; CHECK-NEXT: .param .align 1 .b8 param0[1]; +; CHECK-NEXT: st.param.b8 [param0], %rs5; +; CHECK-NEXT: st.param.b8 [param0+1], %rs3; +; CHECK-NEXT: st.param.b8 [param0+2], %rs1; +; CHECK-NEXT: .param .align 1 .b8 retval0[1]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_v3i1, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.b8 %rs7, [retval0]; +; CHECK-NEXT: ld.param.b8 %rs8, [retval0+1]; +; CHECK-NEXT: ld.param.b8 %rs9, [retval0+2]; +; CHECK-NEXT: } // callseq 2 +; CHECK-NEXT: st.param.b8 [func_retval0], %rs7; +; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs8; +; CHECK-NEXT: st.param.b8 
[func_retval0+2], %rs9; +; CHECK-NEXT: ret; %r = tail call <3 x i1> @test_v3i1(<3 x i1> %a); ret <3 x i1> %r; } -; CHECK: .func (.param .align 1 .b8 func_retval0[1]) -; CHECK-LABEL: test_v4i1( -; CHECK-NEXT: .param .align 1 .b8 test_v4i1_param_0[1] -; CHECK: ld.param.b8 [[E0:%rs[0-9]+]], [test_v4i1_param_0] -; CHECK: .param .align 1 .b8 param0[1]; -; CHECK: st.param.b8 [param0], [[E0]]; -; CHECK: .param .align 1 .b8 retval0[1]; -; CHECK: call.uni (retval0), -; CHECK: test_v4i1, -; CHECK: ld.param.b8 [[RE0:%rs[0-9]+]], [retval0]; -; CHECK: ld.param.b8 [[RE1:%rs[0-9]+]], [retval0+1]; -; CHECK: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+2]; -; CHECK: ld.param.b8 [[RE3:%rs[0-9]+]], [retval0+3]; -; CHECK: st.param.b8 [func_retval0], [[RE0]]; -; CHECK: st.param.b8 [func_retval0+1], [[RE1]]; -; CHECK: st.param.b8 [func_retval0+2], [[RE2]]; -; CHECK: st.param.b8 [func_retval0+3], [[RE3]]; -; CHECK-NEXT: ret; define <4 x i1> @test_v4i1(<4 x i1> %a) { +; CHECK-LABEL: test_v4i1( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<5>; +; CHECK-NEXT: .reg .b16 %rs<17>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [test_v4i1_param_0+3]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p4, %rs2, 0; +; CHECK-NEXT: ld.param.b8 %rs3, [test_v4i1_param_0+2]; +; CHECK-NEXT: and.b16 %rs4, %rs3, 1; +; CHECK-NEXT: setp.ne.b16 %p3, %rs4, 0; +; CHECK-NEXT: ld.param.b8 %rs5, [test_v4i1_param_0+1]; +; CHECK-NEXT: and.b16 %rs6, %rs5, 1; +; CHECK-NEXT: setp.ne.b16 %p2, %rs6, 0; +; CHECK-NEXT: ld.param.b8 %rs7, [test_v4i1_param_0]; +; CHECK-NEXT: and.b16 %rs8, %rs7, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs8, 0; +; CHECK-NEXT: { // callseq 3, 0 +; CHECK-NEXT: .param .align 1 .b8 param0[1]; +; CHECK-NEXT: st.param.b8 [param0], %rs7; +; CHECK-NEXT: st.param.b8 [param0+1], %rs5; +; CHECK-NEXT: st.param.b8 [param0+2], %rs3; +; CHECK-NEXT: st.param.b8 [param0+3], %rs1; +; CHECK-NEXT: .param .align 1 .b8 retval0[1]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: 
test_v4i1, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.b8 %rs9, [retval0]; +; CHECK-NEXT: ld.param.b8 %rs10, [retval0+1]; +; CHECK-NEXT: ld.param.b8 %rs11, [retval0+2]; +; CHECK-NEXT: ld.param.b8 %rs12, [retval0+3]; +; CHECK-NEXT: } // callseq 3 +; CHECK-NEXT: st.param.b8 [func_retval0], %rs9; +; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs10; +; CHECK-NEXT: st.param.b8 [func_retval0+2], %rs11; +; CHECK-NEXT: st.param.b8 [func_retval0+3], %rs12; +; CHECK-NEXT: ret; %r = tail call <4 x i1> @test_v4i1(<4 x i1> %a); ret <4 x i1> %r; } -; CHECK: .func (.param .align 1 .b8 func_retval0[1]) -; CHECK-LABEL: test_v5i1( -; CHECK-NEXT: .param .align 1 .b8 test_v5i1_param_0[1] -; CHECK-DAG: ld.param.b8 [[E4:%rs[0-9]+]], [test_v5i1_param_0+4]; -; CHECK-DAG: ld.param.b8 [[E0:%rs[0-9]+]], [test_v5i1_param_0] -; CHECK: .param .align 1 .b8 param0[1]; -; CHECK-DAG: st.param.b8 [param0], [[E0]]; -; CHECK-DAG: st.param.b8 [param0+4], [[E4]]; -; CHECK: .param .align 1 .b8 retval0[1]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v5i1, -; CHECK-DAG: ld.param.b8 [[RE0:%rs[0-9]+]], [retval0]; -; CHECK-DAG: ld.param.b8 [[RE4:%rs[0-9]+]], [retval0+4]; -; CHECK-DAG: st.param.b8 [func_retval0], [[RE0]] -; CHECK-DAG: st.param.b8 [func_retval0+4], [[RE4]]; -; CHECK-NEXT: ret; define <5 x i1> @test_v5i1(<5 x i1> %a) { +; CHECK-LABEL: test_v5i1( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<6>; +; CHECK-NEXT: .reg .b16 %rs<21>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [test_v5i1_param_0+4]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p5, %rs2, 0; +; CHECK-NEXT: ld.param.b8 %rs3, [test_v5i1_param_0+3]; +; CHECK-NEXT: and.b16 %rs4, %rs3, 1; +; CHECK-NEXT: setp.ne.b16 %p4, %rs4, 0; +; CHECK-NEXT: ld.param.b8 %rs5, [test_v5i1_param_0+2]; +; CHECK-NEXT: and.b16 %rs6, %rs5, 1; +; CHECK-NEXT: setp.ne.b16 %p3, %rs6, 0; +; CHECK-NEXT: ld.param.b8 %rs7, [test_v5i1_param_0+1]; +; CHECK-NEXT: and.b16 %rs8, %rs7, 1; 
+; CHECK-NEXT: setp.ne.b16 %p2, %rs8, 0; +; CHECK-NEXT: ld.param.b8 %rs9, [test_v5i1_param_0]; +; CHECK-NEXT: and.b16 %rs10, %rs9, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs10, 0; +; CHECK-NEXT: { // callseq 4, 0 +; CHECK-NEXT: .param .align 1 .b8 param0[1]; +; CHECK-NEXT: st.param.b8 [param0], %rs9; +; CHECK-NEXT: st.param.b8 [param0+1], %rs7; +; CHECK-NEXT: st.param.b8 [param0+2], %rs5; +; CHECK-NEXT: st.param.b8 [param0+3], %rs3; +; CHECK-NEXT: st.param.b8 [param0+4], %rs1; +; CHECK-NEXT: .param .align 1 .b8 retval0[1]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_v5i1, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.b8 %rs11, [retval0]; +; CHECK-NEXT: ld.param.b8 %rs12, [retval0+1]; +; CHECK-NEXT: ld.param.b8 %rs13, [retval0+2]; +; CHECK-NEXT: ld.param.b8 %rs14, [retval0+3]; +; CHECK-NEXT: ld.param.b8 %rs15, [retval0+4]; +; CHECK-NEXT: } // callseq 4 +; CHECK-NEXT: st.param.b8 [func_retval0], %rs11; +; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs12; +; CHECK-NEXT: st.param.b8 [func_retval0+2], %rs13; +; CHECK-NEXT: st.param.b8 [func_retval0+3], %rs14; +; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs15; +; CHECK-NEXT: ret; %r = tail call <5 x i1> @test_v5i1(<5 x i1> %a); ret <5 x i1> %r; } -; CHECK: .func (.param .b32 func_retval0) -; CHECK-LABEL: test_i2( -; CHECK-NEXT: .param .b32 test_i2_param_0 -; CHECK: ld.param.b8 {{%rs[0-9]+}}, [test_i2_param_0]; -; CHECK: .param .b32 param0; -; CHECK: st.param.b32 [param0], {{%r[0-9]+}}; -; CHECK: .param .b32 retval0; -; CHECK: call.uni (retval0), -; CHECK: test_i2, -; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0]; -; CHECK: st.param.b32 [func_retval0], {{%r[0-9]+}}; -; CHECK-NEXT: ret; define i2 @test_i2(i2 %a) { +; CHECK-LABEL: test_i2( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [test_i2_param_0]; +; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; +; CHECK-NEXT: and.b32 %r2, %r1, 3; +; 
CHECK-NEXT: { // callseq 5, 0 +; CHECK-NEXT: .param .b32 param0; +; CHECK-NEXT: st.param.b32 [param0], %r2; +; CHECK-NEXT: .param .b32 retval0; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_i2, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.b32 %r3, [retval0]; +; CHECK-NEXT: } // callseq 5 +; CHECK-NEXT: and.b32 %r5, %r3, 3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r5; +; CHECK-NEXT: ret; %r = tail call i2 @test_i2(i2 %a); ret i2 %r; } -; CHECK: .func (.param .b32 func_retval0) -; CHECK-LABEL: test_i3( -; CHECK-NEXT: .param .b32 test_i3_param_0 -; CHECK: ld.param.b8 {{%rs[0-9]+}}, [test_i3_param_0]; -; CHECK: .param .b32 param0; -; CHECK: st.param.b32 [param0], {{%r[0-9]+}}; -; CHECK: .param .b32 retval0; -; CHECK: call.uni (retval0), -; CHECK: test_i3, -; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0]; -; CHECK: st.param.b32 [func_retval0], {{%r[0-9]+}}; -; CHECK-NEXT: ret; define i3 @test_i3(i3 %a) { +; CHECK-LABEL: test_i3( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [test_i3_param_0]; +; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; +; CHECK-NEXT: and.b32 %r2, %r1, 7; +; CHECK-NEXT: { // callseq 6, 0 +; CHECK-NEXT: .param .b32 param0; +; CHECK-NEXT: st.param.b32 [param0], %r2; +; CHECK-NEXT: .param .b32 retval0; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_i3, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.b32 %r3, [retval0]; +; CHECK-NEXT: } // callseq 6 +; CHECK-NEXT: and.b32 %r5, %r3, 7; +; CHECK-NEXT: st.param.b32 [func_retval0], %r5; +; CHECK-NEXT: ret; %r = tail call i3 @test_i3(i3 %a); ret i3 %r; } ; Unsigned i8 is loaded directly into 32-bit register. 
-; CHECK: .func (.param .b32 func_retval0) -; CHECK-LABEL: test_i8( -; CHECK-NEXT: .param .b32 test_i8_param_0 -; CHECK: ld.param.b8 [[A8:%rs[0-9]+]], [test_i8_param_0]; -; CHECK: cvt.u32.u16 [[A32:%r[0-9]+]], [[A8]]; -; CHECK: and.b32 [[A:%r[0-9]+]], [[A32]], 255; -; CHECK: .param .b32 param0; -; CHECK: st.param.b32 [param0], [[A]]; -; CHECK: .param .b32 retval0; -; CHECK: call.uni (retval0), -; CHECK: test_i8, -; CHECK: ld.param.b32 [[R32:%r[0-9]+]], [retval0]; -; CHECK: and.b32 [[R:%r[0-9]+]], [[R32]], 255; -; CHECK: st.param.b32 [func_retval0], [[R]]; -; CHECK-NEXT: ret; define i8 @test_i8(i8 %a) { +; CHECK-LABEL: test_i8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [test_i8_param_0]; +; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; +; CHECK-NEXT: and.b32 %r2, %r1, 255; +; CHECK-NEXT: { // callseq 7, 0 +; CHECK-NEXT: .param .b32 param0; +; CHECK-NEXT: st.param.b32 [param0], %r2; +; CHECK-NEXT: .param .b32 retval0; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_i8, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.b32 %r3, [retval0]; +; CHECK-NEXT: } // callseq 7 +; CHECK-NEXT: and.b32 %r5, %r3, 255; +; CHECK-NEXT: st.param.b32 [func_retval0], %r5; +; CHECK-NEXT: ret; %r = tail call i8 @test_i8(i8 %a); ret i8 %r; } ; signed i8 is loaded into 16-bit register which is then sign-extended to i32. -; CHECK: .func (.param .b32 func_retval0) -; CHECK-LABEL: test_i8s( -; CHECK-NEXT: .param .b32 test_i8s_param_0 -; CHECK: ld.param.s8 [[A8:%rs[0-9]+]], [test_i8s_param_0]; -; CHECK: cvt.s32.s16 [[A:%r[0-9]+]], [[A8]]; -; CHECK: .param .b32 param0; -; CHECK: st.param.b32 [param0], [[A]]; -; CHECK: .param .b32 retval0; -; CHECK: call.uni (retval0), -; CHECK: test_i8s, -; CHECK: ld.param.b32 [[R32:%r[0-9]+]], [retval0]; ; -- This is suspicious (though correct) -- why not cvt.u8.u32, cvt.s8.s32 ? 
-; CHECK: cvt.u16.u32 [[R16:%rs[0-9]+]], [[R32]]; -; CHECK: cvt.s32.s16 [[R:%r[0-9]+]], [[R16]]; -; CHECK: st.param.b32 [func_retval0], [[R]]; -; CHECK-NEXT: ret; define signext i8 @test_i8s(i8 signext %a) { +; CHECK-LABEL: test_i8s( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.s8 %rs1, [test_i8s_param_0]; +; CHECK-NEXT: cvt.s32.s16 %r1, %rs1; +; CHECK-NEXT: { // callseq 8, 0 +; CHECK-NEXT: .param .b32 param0; +; CHECK-NEXT: st.param.b32 [param0], %r1; +; CHECK-NEXT: .param .b32 retval0; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_i8s, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.b32 %r2, [retval0]; +; CHECK-NEXT: } // callseq 8 +; CHECK-NEXT: cvt.u16.u32 %rs2, %r2; +; CHECK-NEXT: cvt.s32.s16 %r4, %rs2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r4; +; CHECK-NEXT: ret; %r = tail call signext i8 @test_i8s(i8 signext %a); ret i8 %r; } -; CHECK: .func (.param .align 4 .b8 func_retval0[4]) -; CHECK-LABEL: test_v3i8( -; CHECK-NEXT: .param .align 4 .b8 test_v3i8_param_0[4] -; CHECK: ld.param.b32 [[R:%r[0-9]+]], [test_v3i8_param_0]; -; CHECK: .param .align 4 .b8 param0[4]; -; CHECK: st.param.b32 [param0], [[R]] -; CHECK: .param .align 4 .b8 retval0[4]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v3i8, -; CHECK: ld.param.b32 [[RE:%r[0-9]+]], [retval0]; ; v4i8/i32->{v3i8 elements}->v4i8/i32 conversion is messy and not very ; interesting here, so it's skipped. 
-; CHECK: st.param.b32 [func_retval0], -; CHECK-NEXT: ret; define <3 x i8> @test_v3i8(<3 x i8> %a) { +; CHECK-LABEL: test_v3i8( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_v3i8_param_0]; +; CHECK-NEXT: { // callseq 9, 0 +; CHECK-NEXT: .param .align 4 .b8 param0[4]; +; CHECK-NEXT: st.param.b32 [param0], %r1; +; CHECK-NEXT: .param .align 4 .b8 retval0[4]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_v3i8, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.b32 %r2, [retval0]; +; CHECK-NEXT: } // callseq 9 +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %r = tail call <3 x i8> @test_v3i8(<3 x i8> %a); ret <3 x i8> %r; } -; CHECK: .func (.param .align 4 .b8 func_retval0[4]) -; CHECK-LABEL: test_v4i8( -; CHECK-NEXT: .param .align 4 .b8 test_v4i8_param_0[4] -; CHECK: ld.param.b32 [[R:%r[0-9]+]], [test_v4i8_param_0] -; CHECK: .param .align 4 .b8 param0[4]; -; CHECK: st.param.b32 [param0], [[R]]; -; CHECK: .param .align 4 .b8 retval0[4]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v4i8, -; CHECK: ld.param.b32 [[RET:%r[0-9]+]], [retval0]; -; CHECK: st.param.b32 [func_retval0], [[RET]]; -; CHECK-NEXT: ret; define <4 x i8> @test_v4i8(<4 x i8> %a) { +; CHECK-LABEL: test_v4i8( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_v4i8_param_0]; +; CHECK-NEXT: { // callseq 10, 0 +; CHECK-NEXT: .param .align 4 .b8 param0[4]; +; CHECK-NEXT: st.param.b32 [param0], %r1; +; CHECK-NEXT: .param .align 4 .b8 retval0[4]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_v4i8, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.b32 %r2, [retval0]; +; CHECK-NEXT: } // callseq 10 +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %r = tail call <4 x i8> @test_v4i8(<4 x i8> %a); ret <4 x i8> %r; } -; CHECK: .func (.param .align 
8 .b8 func_retval0[8]) -; CHECK-LABEL: test_v5i8( -; CHECK-NEXT: .param .align 8 .b8 test_v5i8_param_0[8] -; CHECK-DAG: ld.param.b32 [[E0:%r[0-9]+]], [test_v5i8_param_0] -; CHECK-DAG: ld.param.b8 [[E4:%rs[0-9]+]], [test_v5i8_param_0+4]; -; CHECK: .param .align 8 .b8 param0[8]; -; CHECK-DAG: st.param.v4.b8 [param0], -; CHECK-DAG: st.param.b8 [param0+4], [[E4]]; -; CHECK: .param .align 8 .b8 retval0[8]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v5i8, -; CHECK-DAG: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0]; -; CHECK-DAG: ld.param.b8 [[RE4:%rs[0-9]+]], [retval0+4]; -; CHECK-DAG: st.param.v4.b8 [func_retval0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]} -; CHECK-DAG: st.param.b8 [func_retval0+4], [[RE4]]; -; CHECK-NEXT: ret; define <5 x i8> @test_v5i8(<5 x i8> %a) { +; CHECK-LABEL: test_v5i8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<16>; +; CHECK-NEXT: .reg .b32 %r<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_v5i8_param_0]; +; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs4, %r2; +; CHECK-NEXT: bfe.u32 %r3, %r1, 16, 8; +; CHECK-NEXT: cvt.u16.u32 %rs3, %r3; +; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8; +; CHECK-NEXT: cvt.u16.u32 %rs2, %r4; +; CHECK-NEXT: bfe.u32 %r5, %r1, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs1, %r5; +; CHECK-NEXT: ld.param.b8 %rs5, [test_v5i8_param_0+4]; +; CHECK-NEXT: { // callseq 11, 0 +; CHECK-NEXT: .param .align 8 .b8 param0[8]; +; CHECK-NEXT: st.param.v4.b8 [param0], {%rs1, %rs2, %rs3, %rs4}; +; CHECK-NEXT: st.param.b8 [param0+4], %rs5; +; CHECK-NEXT: .param .align 8 .b8 retval0[8]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_v5i8, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.v4.b8 {%rs6, %rs7, %rs8, %rs9}, [retval0]; +; CHECK-NEXT: ld.param.b8 %rs10, [retval0+4]; +; CHECK-NEXT: } // callseq 11 +; CHECK-NEXT: st.param.v4.b8 [func_retval0], {%rs6, %rs7, %rs8, %rs9}; +; CHECK-NEXT: st.param.b8 
[func_retval0+4], %rs10; +; CHECK-NEXT: ret; %r = tail call <5 x i8> @test_v5i8(<5 x i8> %a); ret <5 x i8> %r; } -; CHECK: .func (.param .b32 func_retval0) -; CHECK-LABEL: test_i11( -; CHECK-NEXT: .param .b32 test_i11_param_0 -; CHECK: ld.param.b16 {{%rs[0-9]+}}, [test_i11_param_0]; -; CHECK: st.param.b32 [param0], {{%r[0-9]+}}; -; CHECK: .param .b32 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_i11, -; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0]; -; CHECK: st.param.b32 [func_retval0], {{%r[0-9]+}}; -; CHECK-NEXT: ret; define i11 @test_i11(i11 %a) { +; CHECK-LABEL: test_i11( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b16 %rs1, [test_i11_param_0]; +; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; +; CHECK-NEXT: { // callseq 12, 0 +; CHECK-NEXT: .param .b32 param0; +; CHECK-NEXT: st.param.b32 [param0], %r1; +; CHECK-NEXT: .param .b32 retval0; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_i11, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.b32 %r2, [retval0]; +; CHECK-NEXT: } // callseq 12 +; CHECK-NEXT: and.b32 %r4, %r2, 2047; +; CHECK-NEXT: st.param.b32 [func_retval0], %r4; +; CHECK-NEXT: ret; %r = tail call i11 @test_i11(i11 %a); ret i11 %r; } -; CHECK: .func (.param .b32 func_retval0) -; CHECK-LABEL: test_i16( -; CHECK-NEXT: .param .b32 test_i16_param_0 -; CHECK: ld.param.b16 [[E16:%rs[0-9]+]], [test_i16_param_0]; -; CHECK: cvt.u32.u16 [[E32:%r[0-9]+]], [[E16]]; -; CHECK: .param .b32 param0; -; CHECK: st.param.b32 [param0], [[E32]]; -; CHECK: .param .b32 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_i16, -; CHECK: ld.param.b32 [[RE32:%r[0-9]+]], [retval0]; -; CHECK: and.b32 [[R:%r[0-9]+]], [[RE32]], 65535; -; CHECK: st.param.b32 [func_retval0], [[R]]; -; CHECK-NEXT: ret; define i16 @test_i16(i16 %a) { +; CHECK-LABEL: test_i16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<5>; +; 
CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b16 %rs1, [test_i16_param_0]; +; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; +; CHECK-NEXT: { // callseq 13, 0 +; CHECK-NEXT: .param .b32 param0; +; CHECK-NEXT: st.param.b32 [param0], %r1; +; CHECK-NEXT: .param .b32 retval0; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_i16, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.b32 %r2, [retval0]; +; CHECK-NEXT: } // callseq 13 +; CHECK-NEXT: and.b32 %r4, %r2, 65535; +; CHECK-NEXT: st.param.b32 [func_retval0], %r4; +; CHECK-NEXT: ret; %r = tail call i16 @test_i16(i16 %a); ret i16 %r; } -; CHECK: .func (.param .b32 func_retval0) -; CHECK-LABEL: test_i16s( -; CHECK-NEXT: .param .b32 test_i16s_param_0 -; CHECK: ld.param.b16 [[E16:%rs[0-9]+]], [test_i16s_param_0]; -; CHECK: cvt.s32.s16 [[E32:%r[0-9]+]], [[E16]]; -; CHECK: .param .b32 param0; -; CHECK: st.param.b32 [param0], [[E32]]; -; CHECK: .param .b32 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_i16s, -; CHECK: ld.param.b32 [[RE32:%r[0-9]+]], [retval0]; -; CHECK: cvt.s32.s16 [[R:%r[0-9]+]], [[RE32]]; -; CHECK: st.param.b32 [func_retval0], [[R]]; -; CHECK-NEXT: ret; define signext i16 @test_i16s(i16 signext %a) { +; CHECK-LABEL: test_i16s( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b16 %rs1, [test_i16s_param_0]; +; CHECK-NEXT: cvt.s32.s16 %r1, %rs1; +; CHECK-NEXT: { // callseq 14, 0 +; CHECK-NEXT: .param .b32 param0; +; CHECK-NEXT: st.param.b32 [param0], %r1; +; CHECK-NEXT: .param .b32 retval0; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_i16s, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.b32 %r2, [retval0]; +; CHECK-NEXT: } // callseq 14 +; CHECK-NEXT: cvt.s32.s16 %r4, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r4; +; CHECK-NEXT: ret; %r = tail call signext i16 @test_i16s(i16 signext %a); ret i16 %r; } -; CHECK: 
.func (.param .align 8 .b8 func_retval0[8]) -; CHECK-LABEL: test_v3i16( -; CHECK-NEXT: .param .align 8 .b8 test_v3i16_param_0[8] -; CHECK-DAG: ld.param.b16 [[E2:%rs[0-9]+]], [test_v3i16_param_0+4]; -; CHECK-DAG: ld.param.b32 [[R:%r[0-9]+]], [test_v3i16_param_0]; -; CHECK-DAG: mov.b32 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [[R]]; -; CHECK: .param .align 8 .b8 param0[8]; -; CHECK: st.param.v2.b16 [param0], {[[E0]], [[E1]]}; -; CHECK: st.param.b16 [param0+4], [[E2]]; -; CHECK: .param .align 8 .b8 retval0[8]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v3i16, -; CHECK: ld.param.v2.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0]; -; CHECK: ld.param.b16 [[RE2:%rs[0-9]+]], [retval0+4]; -; CHECK-DAG: st.param.v2.b16 [func_retval0], {[[RE0]], [[RE1]]}; -; CHECK-DAG: st.param.b16 [func_retval0+4], [[RE2]]; -; CHECK-NEXT: ret; define <3 x i16> @test_v3i16(<3 x i16> %a) { +; CHECK-LABEL: test_v3i16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<10>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_v3i16_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: ld.param.b16 %rs3, [test_v3i16_param_0+4]; +; CHECK-NEXT: { // callseq 15, 0 +; CHECK-NEXT: .param .align 8 .b8 param0[8]; +; CHECK-NEXT: st.param.v2.b16 [param0], {%rs1, %rs2}; +; CHECK-NEXT: st.param.b16 [param0+4], %rs3; +; CHECK-NEXT: .param .align 8 .b8 retval0[8]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_v3i16, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.v2.b16 {%rs4, %rs5}, [retval0]; +; CHECK-NEXT: ld.param.b16 %rs6, [retval0+4]; +; CHECK-NEXT: } // callseq 15 +; CHECK-NEXT: st.param.v2.b16 [func_retval0], {%rs4, %rs5}; +; CHECK-NEXT: st.param.b16 [func_retval0+4], %rs6; +; CHECK-NEXT: ret; %r = tail call <3 x i16> @test_v3i16(<3 x i16> %a); ret <3 x i16> %r; } -; CHECK: .func (.param .align 8 .b8 func_retval0[8]) -; CHECK-LABEL: test_v4i16( -; CHECK-NEXT: .param .align 8 .b8 
test_v4i16_param_0[8] -; CHECK: ld.param.v2.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_v4i16_param_0] -; CHECK: .param .align 8 .b8 param0[8]; -; CHECK: st.param.v2.b32 [param0], {[[E0]], [[E1]]}; -; CHECK: .param .align 8 .b8 retval0[8]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v4i16, -; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0]; -; CHECK: st.param.v2.b32 [func_retval0], {[[RE0]], [[RE1]]} -; CHECK-NEXT: ret; define <4 x i16> @test_v4i16(<4 x i16> %a) { +; CHECK-LABEL: test_v4i16( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_v4i16_param_0]; +; CHECK-NEXT: { // callseq 16, 0 +; CHECK-NEXT: .param .align 8 .b8 param0[8]; +; CHECK-NEXT: st.param.v2.b32 [param0], {%r1, %r2}; +; CHECK-NEXT: .param .align 8 .b8 retval0[8]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_v4i16, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [retval0]; +; CHECK-NEXT: } // callseq 16 +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r3, %r4}; +; CHECK-NEXT: ret; %r = tail call <4 x i16> @test_v4i16(<4 x i16> %a); ret <4 x i16> %r; } -; CHECK: .func (.param .align 16 .b8 func_retval0[16]) -; CHECK-LABEL: test_v5i16( -; CHECK-NEXT: .param .align 16 .b8 test_v5i16_param_0[16] -; CHECK-DAG: ld.param.b16 [[E4:%rs[0-9]+]], [test_v5i16_param_0+8]; -; CHECK-DAG: ld.param.v4.b16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i16_param_0] -; CHECK: .param .align 16 .b8 param0[16]; -; CHECK-DAG: st.param.v4.b16 [param0], {[[E0]], [[E1]], [[E2]], [[E3]]}; -; CHECK-DAG: st.param.b16 [param0+8], [[E4]]; -; CHECK: .param .align 16 .b8 retval0[16]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v5i16, -; CHECK-DAG: ld.param.v4.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0]; -; CHECK-DAG: ld.param.b16 [[RE4:%rs[0-9]+]], [retval0+8]; -; 
CHECK-DAG: st.param.v4.b16 [func_retval0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]} -; CHECK-DAG: st.param.b16 [func_retval0+8], [[RE4]]; -; CHECK-NEXT: ret; define <5 x i16> @test_v5i16(<5 x i16> %a) { +; CHECK-LABEL: test_v5i16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<16>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b16 %rs5, [test_v5i16_param_0+8]; +; CHECK-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [test_v5i16_param_0]; +; CHECK-NEXT: { // callseq 17, 0 +; CHECK-NEXT: .param .align 16 .b8 param0[16]; +; CHECK-NEXT: st.param.v4.b16 [param0], {%rs1, %rs2, %rs3, %rs4}; +; CHECK-NEXT: st.param.b16 [param0+8], %rs5; +; CHECK-NEXT: .param .align 16 .b8 retval0[16]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_v5i16, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.v4.b16 {%rs6, %rs7, %rs8, %rs9}, [retval0]; +; CHECK-NEXT: ld.param.b16 %rs10, [retval0+8]; +; CHECK-NEXT: } // callseq 17 +; CHECK-NEXT: st.param.v4.b16 [func_retval0], {%rs6, %rs7, %rs8, %rs9}; +; CHECK-NEXT: st.param.b16 [func_retval0+8], %rs10; +; CHECK-NEXT: ret; %r = tail call <5 x i16> @test_v5i16(<5 x i16> %a); ret <5 x i16> %r; } -; CHECK: .func (.param .align 2 .b8 func_retval0[2]) -; CHECK-LABEL: test_f16( -; CHECK-NEXT: .param .align 2 .b8 test_f16_param_0[2] -; CHECK: ld.param.b16 [[E:%rs[0-9]+]], [test_f16_param_0]; -; CHECK: .param .align 2 .b8 param0[2]; -; CHECK: st.param.b16 [param0], [[E]]; -; CHECK: .param .align 2 .b8 retval0[2]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_f16, -; CHECK: ld.param.b16 [[R:%rs[0-9]+]], [retval0]; -; CHECK: st.param.b16 [func_retval0], [[R]] -; CHECK-NEXT: ret; define half @test_f16(half %a) { +; CHECK-LABEL: test_f16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b16 %rs1, [test_f16_param_0]; +; CHECK-NEXT: { // callseq 18, 0 +; CHECK-NEXT: .param .align 2 .b8 param0[2]; +; CHECK-NEXT: st.param.b16 [param0], %rs1; +; 
CHECK-NEXT: .param .align 2 .b8 retval0[2]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_f16, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.b16 %rs2, [retval0]; +; CHECK-NEXT: } // callseq 18 +; CHECK-NEXT: st.param.b16 [func_retval0], %rs2; +; CHECK-NEXT: ret; %r = tail call half @test_f16(half %a); ret half %r; } -; CHECK: .func (.param .align 4 .b8 func_retval0[4]) -; CHECK-LABEL: test_v2f16( -; CHECK-NEXT: .param .align 4 .b8 test_v2f16_param_0[4] -; CHECK: ld.param.b32 [[E:%r[0-9]+]], [test_v2f16_param_0]; -; CHECK: .param .align 4 .b8 param0[4]; -; CHECK: st.param.b32 [param0], [[E]]; -; CHECK: .param .align 4 .b8 retval0[4]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v2f16, -; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0]; -; CHECK: st.param.b32 [func_retval0], [[R]] -; CHECK-NEXT: ret; define <2 x half> @test_v2f16(<2 x half> %a) { +; CHECK-LABEL: test_v2f16( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_v2f16_param_0]; +; CHECK-NEXT: { // callseq 19, 0 +; CHECK-NEXT: .param .align 4 .b8 param0[4]; +; CHECK-NEXT: st.param.b32 [param0], %r1; +; CHECK-NEXT: .param .align 4 .b8 retval0[4]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_v2f16, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.b32 %r2, [retval0]; +; CHECK-NEXT: } // callseq 19 +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %r = tail call <2 x half> @test_v2f16(<2 x half> %a); ret <2 x half> %r; } -; CHECK: .func (.param .align 2 .b8 func_retval0[2]) -; CHECK-LABEL: test_bf16( -; CHECK-NEXT: .param .align 2 .b8 test_bf16_param_0[2] -; CHECK: ld.param.b16 [[E:%rs[0-9]+]], [test_bf16_param_0]; -; CHECK: .param .align 2 .b8 param0[2]; -; CHECK: st.param.b16 [param0], [[E]]; -; CHECK: .param .align 2 .b8 retval0[2]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_bf16, -; CHECK: ld.param.b16 [[R:%rs[0-9]+]], 
[retval0]; -; CHECK: st.param.b16 [func_retval0], [[R]] -; CHECK-NEXT: ret; define bfloat @test_bf16(bfloat %a) { +; CHECK-LABEL: test_bf16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b16 %rs1, [test_bf16_param_0]; +; CHECK-NEXT: { // callseq 20, 0 +; CHECK-NEXT: .param .align 2 .b8 param0[2]; +; CHECK-NEXT: st.param.b16 [param0], %rs1; +; CHECK-NEXT: .param .align 2 .b8 retval0[2]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_bf16, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.b16 %rs2, [retval0]; +; CHECK-NEXT: } // callseq 20 +; CHECK-NEXT: st.param.b16 [func_retval0], %rs2; +; CHECK-NEXT: ret; %r = tail call bfloat @test_bf16(bfloat %a); ret bfloat %r; } -; CHECK: .func (.param .align 4 .b8 func_retval0[4]) -; CHECK-LABEL: test_v2bf16( -; CHECK-NEXT: .param .align 4 .b8 test_v2bf16_param_0[4] -; CHECK: ld.param.b32 [[E:%r[0-9]+]], [test_v2bf16_param_0]; -; CHECK: .param .align 4 .b8 param0[4]; -; CHECK: st.param.b32 [param0], [[E]]; -; CHECK: .param .align 4 .b8 retval0[4]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v2bf16, -; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0]; -; CHECK: st.param.b32 [func_retval0], [[R]] -; CHECK-NEXT: ret; define <2 x bfloat> @test_v2bf16(<2 x bfloat> %a) { +; CHECK-LABEL: test_v2bf16( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_v2bf16_param_0]; +; CHECK-NEXT: { // callseq 21, 0 +; CHECK-NEXT: .param .align 4 .b8 param0[4]; +; CHECK-NEXT: st.param.b32 [param0], %r1; +; CHECK-NEXT: .param .align 4 .b8 retval0[4]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_v2bf16, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.b32 %r2, [retval0]; +; CHECK-NEXT: } // callseq 21 +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %r = tail call <2 x bfloat> @test_v2bf16(<2 x bfloat> %a); ret <2 
x bfloat> %r; } -; CHECK:.func (.param .align 8 .b8 func_retval0[8]) -; CHECK-LABEL: test_v3f16( -; CHECK: .param .align 8 .b8 test_v3f16_param_0[8] -; CHECK-DAG: ld.param.b32 [[HH01:%r[0-9]+]], [test_v3f16_param_0]; -; CHECK-DAG: mov.b32 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [[HH01]]; -; CHECK-DAG: ld.param.b16 [[E2:%rs[0-9]+]], [test_v3f16_param_0+4]; -; CHECK: .param .align 8 .b8 param0[8]; -; CHECK-DAG: st.param.v2.b16 [param0], {[[E0]], [[E1]]}; -; CHECK-DAG: st.param.b16 [param0+4], [[E2]]; -; CHECK: .param .align 8 .b8 retval0[8]; -; CHECK: call.uni (retval0), -; CHECK: test_v3f16, -; CHECK-DAG: ld.param.v2.b16 {[[R0:%rs[0-9]+]], [[R1:%rs[0-9]+]]}, [retval0]; -; CHECK-DAG: ld.param.b16 [[R2:%rs[0-9]+]], [retval0+4]; -; CHECK-DAG: st.param.v2.b16 [func_retval0], {[[R0]], [[R1]]}; -; CHECK-DAG: st.param.b16 [func_retval0+4], [[R2]]; -; CHECK: ret; define <3 x half> @test_v3f16(<3 x half> %a) { +; CHECK-LABEL: test_v3f16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<10>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_v3f16_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: ld.param.b16 %rs3, [test_v3f16_param_0+4]; +; CHECK-NEXT: { // callseq 22, 0 +; CHECK-NEXT: .param .align 8 .b8 param0[8]; +; CHECK-NEXT: st.param.v2.b16 [param0], {%rs1, %rs2}; +; CHECK-NEXT: st.param.b16 [param0+4], %rs3; +; CHECK-NEXT: .param .align 8 .b8 retval0[8]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_v3f16, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.v2.b16 {%rs4, %rs5}, [retval0]; +; CHECK-NEXT: ld.param.b16 %rs6, [retval0+4]; +; CHECK-NEXT: } // callseq 22 +; CHECK-NEXT: st.param.v2.b16 [func_retval0], {%rs4, %rs5}; +; CHECK-NEXT: st.param.b16 [func_retval0+4], %rs6; +; CHECK-NEXT: ret; %r = tail call <3 x half> @test_v3f16(<3 x half> %a); ret <3 x half> %r; } -; CHECK:.func (.param .align 8 .b8 func_retval0[8]) -; CHECK-LABEL: test_v4f16( -; CHECK: 
.param .align 8 .b8 test_v4f16_param_0[8] -; CHECK: ld.param.v2.b32 {[[R01:%r[0-9]+]], [[R23:%r[0-9]+]]}, [test_v4f16_param_0]; -; CHECK: .param .align 8 .b8 param0[8]; -; CHECK: st.param.v2.b32 [param0], {[[R01]], [[R23]]}; -; CHECK: .param .align 8 .b8 retval0[8]; -; CHECK: call.uni (retval0), -; CHECK: test_v4f16, -; CHECK: ld.param.v2.b32 {[[RH01:%r[0-9]+]], [[RH23:%r[0-9]+]]}, [retval0]; -; CHECK: st.param.v2.b32 [func_retval0], {[[RH01]], [[RH23]]}; -; CHECK: ret; define <4 x half> @test_v4f16(<4 x half> %a) { +; CHECK-LABEL: test_v4f16( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_v4f16_param_0]; +; CHECK-NEXT: { // callseq 23, 0 +; CHECK-NEXT: .param .align 8 .b8 param0[8]; +; CHECK-NEXT: st.param.v2.b32 [param0], {%r1, %r2}; +; CHECK-NEXT: .param .align 8 .b8 retval0[8]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_v4f16, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [retval0]; +; CHECK-NEXT: } // callseq 23 +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r3, %r4}; +; CHECK-NEXT: ret; %r = tail call <4 x half> @test_v4f16(<4 x half> %a); ret <4 x half> %r; } -; CHECK:.func (.param .align 16 .b8 func_retval0[16]) -; CHECK-LABEL: test_v5f16( -; CHECK: .param .align 16 .b8 test_v5f16_param_0[16] -; CHECK-DAG: ld.param.v4.b16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5f16_param_0]; -; CHECK-DAG: ld.param.b16 [[E4:%rs[0-9]+]], [test_v5f16_param_0+8]; -; CHECK: .param .align 16 .b8 param0[16]; -; CHECK-DAG: st.param.v4.b16 [param0], -; CHECK-DAG: st.param.b16 [param0+8], [[E4]]; -; CHECK: .param .align 16 .b8 retval0[16]; -; CHECK: call.uni (retval0), -; CHECK: test_v5f16, -; CHECK-DAG: ld.param.v4.b16 {[[R0:%rs[0-9]+]], [[R1:%rs[0-9]+]], [[R2:%rs[0-9]+]], [[R3:%rs[0-9]+]]}, [retval0]; -; CHECK-DAG: ld.param.b16 [[R4:%rs[0-9]+]], [retval0+8]; -; CHECK-DAG: st.param.v4.b16 
[func_retval0], {[[R0]], [[R1]], [[R2]], [[R3]]}; -; CHECK-DAG: st.param.b16 [func_retval0+8], [[R4]]; -; CHECK: ret; define <5 x half> @test_v5f16(<5 x half> %a) { +; CHECK-LABEL: test_v5f16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<16>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b16 %rs5, [test_v5f16_param_0+8]; +; CHECK-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [test_v5f16_param_0]; +; CHECK-NEXT: { // callseq 24, 0 +; CHECK-NEXT: .param .align 16 .b8 param0[16]; +; CHECK-NEXT: st.param.v4.b16 [param0], {%rs1, %rs2, %rs3, %rs4}; +; CHECK-NEXT: st.param.b16 [param0+8], %rs5; +; CHECK-NEXT: .param .align 16 .b8 retval0[16]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_v5f16, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.v4.b16 {%rs6, %rs7, %rs8, %rs9}, [retval0]; +; CHECK-NEXT: ld.param.b16 %rs10, [retval0+8]; +; CHECK-NEXT: } // callseq 24 +; CHECK-NEXT: st.param.v4.b16 [func_retval0], {%rs6, %rs7, %rs8, %rs9}; +; CHECK-NEXT: st.param.b16 [func_retval0+8], %rs10; +; CHECK-NEXT: ret; %r = tail call <5 x half> @test_v5f16(<5 x half> %a); ret <5 x half> %r; } -; CHECK:.func (.param .align 16 .b8 func_retval0[16]) -; CHECK-LABEL: test_v8f16( -; CHECK: .param .align 16 .b8 test_v8f16_param_0[16] -; CHECK: ld.param.v4.b32 {[[R01:%r[0-9]+]], [[R23:%r[0-9]+]], [[R45:%r[0-9]+]], [[R67:%r[0-9]+]]}, [test_v8f16_param_0]; -; CHECK: .param .align 16 .b8 param0[16]; -; CHECK: st.param.v4.b32 [param0], {[[R01]], [[R23]], [[R45]], [[R67]]}; -; CHECK: .param .align 16 .b8 retval0[16]; -; CHECK: call.uni (retval0), -; CHECK: test_v8f16, -; CHECK: ld.param.v4.b32 {[[RH01:%r[0-9]+]], [[RH23:%r[0-9]+]], [[RH45:%r[0-9]+]], [[RH67:%r[0-9]+]]}, [retval0]; -; CHECK: st.param.v4.b32 [func_retval0], {[[RH01]], [[RH23]], [[RH45]], [[RH67]]}; -; CHECK: ret; define <8 x half> @test_v8f16(<8 x half> %a) { +; CHECK-LABEL: test_v8f16( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<13>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; 
CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_v8f16_param_0]; +; CHECK-NEXT: { // callseq 25, 0 +; CHECK-NEXT: .param .align 16 .b8 param0[16]; +; CHECK-NEXT: st.param.v4.b32 [param0], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: .param .align 16 .b8 retval0[16]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_v8f16, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [retval0]; +; CHECK-NEXT: } // callseq 25 +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r5, %r6, %r7, %r8}; +; CHECK-NEXT: ret; %r = tail call <8 x half> @test_v8f16(<8 x half> %a); ret <8 x half> %r; } -; CHECK:.func (.param .align 32 .b8 func_retval0[32]) -; CHECK-LABEL: test_v9f16( -; CHECK: .param .align 32 .b8 test_v9f16_param_0[32] -; CHECK-DAG: ld.param.v4.b16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v9f16_param_0]; -; CHECK-DAG: ld.param.v4.b16 {[[E4:%rs[0-9]+]], [[E5:%rs[0-9]+]], [[E6:%rs[0-9]+]], [[E7:%rs[0-9]+]]}, [test_v9f16_param_0+8]; -; CHECK-DAG: ld.param.b16 [[E8:%rs[0-9]+]], [test_v9f16_param_0+16]; -; CHECK: .param .align 32 .b8 param0[32]; -; CHECK-DAG: st.param.v4.b16 [param0], -; CHECK-DAG: st.param.v4.b16 [param0+8], -; CHECK-DAG: st.param.b16 [param0+16], [[E8]]; -; CHECK: .param .align 32 .b8 retval0[32]; -; CHECK: call.uni (retval0), -; CHECK: test_v9f16, -; CHECK-DAG: ld.param.v4.b16 {[[R0:%rs[0-9]+]], [[R1:%rs[0-9]+]], [[R2:%rs[0-9]+]], [[R3:%rs[0-9]+]]}, [retval0]; -; CHECK-DAG: ld.param.v4.b16 {[[R4:%rs[0-9]+]], [[R5:%rs[0-9]+]], [[R6:%rs[0-9]+]], [[R7:%rs[0-9]+]]}, [retval0+8]; -; CHECK-DAG: ld.param.b16 [[R8:%rs[0-9]+]], [retval0+16]; -; CHECK-DAG: st.param.v4.b16 [func_retval0], {[[R0]], [[R1]], [[R2]], [[R3]]}; -; CHECK-DAG: st.param.v4.b16 [func_retval0+8], {[[R4]], [[R5]], [[R6]], [[R7]]}; -; CHECK-DAG: st.param.b16 [func_retval0+16], [[R8]]; -; CHECK: ret; define <9 x half> @test_v9f16(<9 x half> %a) { +; CHECK-LABEL: test_v9f16( +; CHECK: { +; CHECK-NEXT: 
.reg .b16 %rs<28>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b16 %rs9, [test_v9f16_param_0+16]; +; CHECK-NEXT: ld.param.v4.b16 {%rs5, %rs6, %rs7, %rs8}, [test_v9f16_param_0+8]; +; CHECK-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [test_v9f16_param_0]; +; CHECK-NEXT: { // callseq 26, 0 +; CHECK-NEXT: .param .align 32 .b8 param0[32]; +; CHECK-NEXT: st.param.v4.b16 [param0], {%rs1, %rs2, %rs3, %rs4}; +; CHECK-NEXT: st.param.v4.b16 [param0+8], {%rs5, %rs6, %rs7, %rs8}; +; CHECK-NEXT: st.param.b16 [param0+16], %rs9; +; CHECK-NEXT: .param .align 32 .b8 retval0[32]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_v9f16, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.v4.b16 {%rs10, %rs11, %rs12, %rs13}, [retval0]; +; CHECK-NEXT: ld.param.v4.b16 {%rs14, %rs15, %rs16, %rs17}, [retval0+8]; +; CHECK-NEXT: ld.param.b16 %rs18, [retval0+16]; +; CHECK-NEXT: } // callseq 26 +; CHECK-NEXT: st.param.v4.b16 [func_retval0], {%rs10, %rs11, %rs12, %rs13}; +; CHECK-NEXT: st.param.v4.b16 [func_retval0+8], {%rs14, %rs15, %rs16, %rs17}; +; CHECK-NEXT: st.param.b16 [func_retval0+16], %rs18; +; CHECK-NEXT: ret; %r = tail call <9 x half> @test_v9f16(<9 x half> %a); ret <9 x half> %r; } -; CHECK: .func (.param .b32 func_retval0) -; CHECK-LABEL: test_i19( -; CHECK-NEXT: .param .b32 test_i19_param_0 -; CHECK-DAG: ld.param.b16 {{%r[0-9]+}}, [test_i19_param_0]; -; CHECK-DAG: ld.param.b8 {{%r[0-9]+}}, [test_i19_param_0+2]; -; CHECK: .param .b32 param0; -; CHECK: st.param.b32 [param0], {{%r[0-9]+}}; -; CHECK: .param .b32 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_i19, -; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0]; -; CHECK: st.param.b32 [func_retval0], {{%r[0-9]+}}; -; CHECK-NEXT: ret; define i19 @test_i19(i19 %a) { +; CHECK-LABEL: test_i19( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %r2, [test_i19_param_0+2]; +; CHECK-NEXT: shl.b32 %r3, %r2, 16; +; 
CHECK-NEXT: ld.param.b16 %r4, [test_i19_param_0]; +; CHECK-NEXT: or.b32 %r1, %r4, %r3; +; CHECK-NEXT: { // callseq 27, 0 +; CHECK-NEXT: .param .b32 param0; +; CHECK-NEXT: st.param.b32 [param0], %r1; +; CHECK-NEXT: .param .b32 retval0; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_i19, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.b32 %r5, [retval0]; +; CHECK-NEXT: } // callseq 27 +; CHECK-NEXT: st.param.b32 [func_retval0], %r5; +; CHECK-NEXT: ret; %r = tail call i19 @test_i19(i19 %a); ret i19 %r; } -; CHECK: .func (.param .b32 func_retval0) -; CHECK-LABEL: test_i23( -; CHECK-NEXT: .param .b32 test_i23_param_0 -; CHECK-DAG: ld.param.b16 {{%r[0-9]+}}, [test_i23_param_0]; -; CHECK-DAG: ld.param.b8 {{%r[0-9]+}}, [test_i23_param_0+2]; -; CHECK: .param .b32 param0; -; CHECK: st.param.b32 [param0], {{%r[0-9]+}}; -; CHECK: .param .b32 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_i23, -; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0]; -; CHECK: st.param.b32 [func_retval0], {{%r[0-9]+}}; -; CHECK-NEXT: ret; define i23 @test_i23(i23 %a) { +; CHECK-LABEL: test_i23( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %r2, [test_i23_param_0+2]; +; CHECK-NEXT: shl.b32 %r3, %r2, 16; +; CHECK-NEXT: ld.param.b16 %r4, [test_i23_param_0]; +; CHECK-NEXT: or.b32 %r1, %r4, %r3; +; CHECK-NEXT: { // callseq 28, 0 +; CHECK-NEXT: .param .b32 param0; +; CHECK-NEXT: st.param.b32 [param0], %r1; +; CHECK-NEXT: .param .b32 retval0; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_i23, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.b32 %r5, [retval0]; +; CHECK-NEXT: } // callseq 28 +; CHECK-NEXT: st.param.b32 [func_retval0], %r5; +; CHECK-NEXT: ret; %r = tail call i23 @test_i23(i23 %a); ret i23 %r; } -; CHECK: .func (.param .b32 func_retval0) -; CHECK-LABEL: test_i24( -; CHECK-NEXT: .param .b32 test_i24_param_0 -; CHECK-DAG: ld.param.b8 
{{%r[0-9]+}}, [test_i24_param_0+2]; -; CHECK-DAG: ld.param.b16 {{%r[0-9]+}}, [test_i24_param_0]; -; CHECK: .param .b32 param0; -; CHECK: st.param.b32 [param0], {{%r[0-9]+}}; -; CHECK: .param .b32 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_i24, -; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0]; -; CHECK: st.param.b32 [func_retval0], {{%r[0-9]+}}; -; CHECK-NEXT: ret; define i24 @test_i24(i24 %a) { +; CHECK-LABEL: test_i24( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %r2, [test_i24_param_0+2]; +; CHECK-NEXT: shl.b32 %r3, %r2, 16; +; CHECK-NEXT: ld.param.b16 %r4, [test_i24_param_0]; +; CHECK-NEXT: or.b32 %r1, %r4, %r3; +; CHECK-NEXT: { // callseq 29, 0 +; CHECK-NEXT: .param .b32 param0; +; CHECK-NEXT: st.param.b32 [param0], %r1; +; CHECK-NEXT: .param .b32 retval0; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_i24, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.b32 %r5, [retval0]; +; CHECK-NEXT: } // callseq 29 +; CHECK-NEXT: st.param.b32 [func_retval0], %r5; +; CHECK-NEXT: ret; %r = tail call i24 @test_i24(i24 %a); ret i24 %r; } -; CHECK: .func (.param .b32 func_retval0) -; CHECK-LABEL: test_i29( -; CHECK-NEXT: .param .b32 test_i29_param_0 -; CHECK: ld.param.b32 {{%r[0-9]+}}, [test_i29_param_0]; -; CHECK: .param .b32 param0; -; CHECK: st.param.b32 [param0], {{%r[0-9]+}}; -; CHECK: .param .b32 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_i29, -; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0]; -; CHECK: st.param.b32 [func_retval0], {{%r[0-9]+}}; -; CHECK-NEXT: ret; define i29 @test_i29(i29 %a) { +; CHECK-LABEL: test_i29( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_i29_param_0]; +; CHECK-NEXT: { // callseq 30, 0 +; CHECK-NEXT: .param .b32 param0; +; CHECK-NEXT: st.param.b32 [param0], %r1; +; CHECK-NEXT: .param .b32 retval0; +; CHECK-NEXT: call.uni (retval0), +; 
CHECK-NEXT: test_i29, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.b32 %r2, [retval0]; +; CHECK-NEXT: } // callseq 30 +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %r = tail call i29 @test_i29(i29 %a); ret i29 %r; } -; CHECK: .func (.param .b32 func_retval0) -; CHECK-LABEL: test_i32( -; CHECK-NEXT: .param .b32 test_i32_param_0 -; CHECK: ld.param.b32 [[E:%r[0-9]+]], [test_i32_param_0]; -; CHECK: .param .b32 param0; -; CHECK: st.param.b32 [param0], [[E]]; -; CHECK: .param .b32 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_i32, -; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0]; -; CHECK: st.param.b32 [func_retval0], [[R]]; -; CHECK-NEXT: ret; define i32 @test_i32(i32 %a) { +; CHECK-LABEL: test_i32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_i32_param_0]; +; CHECK-NEXT: { // callseq 31, 0 +; CHECK-NEXT: .param .b32 param0; +; CHECK-NEXT: st.param.b32 [param0], %r1; +; CHECK-NEXT: .param .b32 retval0; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_i32, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.b32 %r2, [retval0]; +; CHECK-NEXT: } // callseq 31 +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %r = tail call i32 @test_i32(i32 %a); ret i32 %r; } -; CHECK: .func (.param .align 16 .b8 func_retval0[16]) -; CHECK-LABEL: test_v3i32( -; CHECK-NEXT: .param .align 16 .b8 test_v3i32_param_0[16] -; CHECK-DAG: ld.param.b32 [[E2:%r[0-9]+]], [test_v3i32_param_0+8]; -; CHECK-DAG: ld.param.v2.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_v3i32_param_0]; -; CHECK: .param .align 16 .b8 param0[16]; -; CHECK: st.param.v2.b32 [param0], {[[E0]], [[E1]]}; -; CHECK: st.param.b32 [param0+8], [[E2]]; -; CHECK: .param .align 16 .b8 retval0[16]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v3i32, -; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0]; -; CHECK: 
ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8]; -; CHECK-DAG: st.param.v2.b32 [func_retval0], {[[RE0]], [[RE1]]}; -; CHECK-DAG: st.param.b32 [func_retval0+8], [[RE2]]; -; CHECK-NEXT: ret; define <3 x i32> @test_v3i32(<3 x i32> %a) { +; CHECK-LABEL: test_v3i32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<10>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r3, [test_v3i32_param_0+8]; +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_v3i32_param_0]; +; CHECK-NEXT: { // callseq 32, 0 +; CHECK-NEXT: .param .align 16 .b8 param0[16]; +; CHECK-NEXT: st.param.v2.b32 [param0], {%r1, %r2}; +; CHECK-NEXT: st.param.b32 [param0+8], %r3; +; CHECK-NEXT: .param .align 16 .b8 retval0[16]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_v3i32, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.v2.b32 {%r4, %r5}, [retval0]; +; CHECK-NEXT: ld.param.b32 %r6, [retval0+8]; +; CHECK-NEXT: } // callseq 32 +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r5}; +; CHECK-NEXT: st.param.b32 [func_retval0+8], %r6; +; CHECK-NEXT: ret; %r = tail call <3 x i32> @test_v3i32(<3 x i32> %a); ret <3 x i32> %r; } -; CHECK: .func (.param .align 16 .b8 func_retval0[16]) -; CHECK-LABEL: test_v4i32( -; CHECK-NEXT: .param .align 16 .b8 test_v4i32_param_0[16] -; CHECK: ld.param.v4.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_v4i32_param_0] -; CHECK: .param .align 16 .b8 param0[16]; -; CHECK: st.param.v4.b32 [param0], {[[E0]], [[E1]], [[E2]], [[E3]]}; -; CHECK: .param .align 16 .b8 retval0[16]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v4i32, -; CHECK: ld.param.v4.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]], [[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0]; -; CHECK: st.param.v4.b32 [func_retval0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]} -; CHECK-NEXT: ret; define <4 x i32> @test_v4i32(<4 x i32> %a) { +; CHECK-LABEL: test_v4i32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<13>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; 
CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_v4i32_param_0]; +; CHECK-NEXT: { // callseq 33, 0 +; CHECK-NEXT: .param .align 16 .b8 param0[16]; +; CHECK-NEXT: st.param.v4.b32 [param0], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: .param .align 16 .b8 retval0[16]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_v4i32, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [retval0]; +; CHECK-NEXT: } // callseq 33 +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r5, %r6, %r7, %r8}; +; CHECK-NEXT: ret; %r = tail call <4 x i32> @test_v4i32(<4 x i32> %a); ret <4 x i32> %r; } -; CHECK: .func (.param .align 32 .b8 func_retval0[32]) -; CHECK-LABEL: test_v5i32( -; CHECK-NEXT: .param .align 32 .b8 test_v5i32_param_0[32] -; CHECK-DAG: ld.param.b32 [[E4:%r[0-9]+]], [test_v5i32_param_0+16]; -; CHECK-DAG: ld.param.v4.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_v5i32_param_0] -; CHECK: .param .align 32 .b8 param0[32]; -; CHECK-DAG: st.param.v4.b32 [param0], {[[E0]], [[E1]], [[E2]], [[E3]]}; -; CHECK-DAG: st.param.b32 [param0+16], [[E4]]; -; CHECK: .param .align 32 .b8 retval0[32]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v5i32, -; CHECK-DAG: ld.param.v4.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]], [[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0]; -; CHECK-DAG: ld.param.b32 [[RE4:%r[0-9]+]], [retval0+16]; -; CHECK-DAG: st.param.v4.b32 [func_retval0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]} -; CHECK-DAG: st.param.b32 [func_retval0+16], [[RE4]]; -; CHECK-NEXT: ret; define <5 x i32> @test_v5i32(<5 x i32> %a) { +; CHECK-LABEL: test_v5i32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<16>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r5, [test_v5i32_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_v5i32_param_0]; +; CHECK-NEXT: { // callseq 34, 0 +; CHECK-NEXT: .param .align 32 .b8 param0[32]; +; CHECK-NEXT: st.param.v4.b32 [param0], {%r1, %r2, 
%r3, %r4}; +; CHECK-NEXT: st.param.b32 [param0+16], %r5; +; CHECK-NEXT: .param .align 32 .b8 retval0[32]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_v5i32, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.v4.b32 {%r6, %r7, %r8, %r9}, [retval0]; +; CHECK-NEXT: ld.param.b32 %r10, [retval0+16]; +; CHECK-NEXT: } // callseq 34 +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r6, %r7, %r8, %r9}; +; CHECK-NEXT: st.param.b32 [func_retval0+16], %r10; +; CHECK-NEXT: ret; %r = tail call <5 x i32> @test_v5i32(<5 x i32> %a); ret <5 x i32> %r; } -; CHECK: .func (.param .b32 func_retval0) -; CHECK-LABEL: test_f32( -; CHECK-NEXT: .param .b32 test_f32_param_0 -; CHECK: ld.param.b32 [[E:%r[0-9]+]], [test_f32_param_0]; -; CHECK: .param .b32 param0; -; CHECK: st.param.b32 [param0], [[E]]; -; CHECK: .param .b32 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_f32, -; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0]; -; CHECK: st.param.b32 [func_retval0], [[R]]; -; CHECK-NEXT: ret; define float @test_f32(float %a) { +; CHECK-LABEL: test_f32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_f32_param_0]; +; CHECK-NEXT: { // callseq 35, 0 +; CHECK-NEXT: .param .b32 param0; +; CHECK-NEXT: st.param.b32 [param0], %r1; +; CHECK-NEXT: .param .b32 retval0; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_f32, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.b32 %r2, [retval0]; +; CHECK-NEXT: } // callseq 35 +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %r = tail call float @test_f32(float %a); ret float %r; } -; CHECK: .func (.param .b64 func_retval0) -; CHECK-LABEL: test_i40( -; CHECK-NEXT: .param .b64 test_i40_param_0 -; CHECK-DAG: ld.param.b8 {{%rd[0-9]+}}, [test_i40_param_0+4]; -; CHECK-DAG: ld.param.b32 {{%rd[0-9]+}}, [test_i40_param_0]; -; CHECK: .param .b64 param0; -; CHECK: st.param.b64 [param0], 
{{%rd[0-9]+}}; -; CHECK: .param .b64 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_i40, -; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0]; -; CHECK: st.param.b64 [func_retval0], {{%rd[0-9]+}}; -; CHECK-NEXT: ret; define i40 @test_i40(i40 %a) { +; CHECK-LABEL: test_i40( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rd2, [test_i40_param_0+4]; +; CHECK-NEXT: shl.b64 %rd3, %rd2, 32; +; CHECK-NEXT: ld.param.b32 %rd4, [test_i40_param_0]; +; CHECK-NEXT: or.b64 %rd1, %rd4, %rd3; +; CHECK-NEXT: { // callseq 36, 0 +; CHECK-NEXT: .param .b64 param0; +; CHECK-NEXT: st.param.b64 [param0], %rd1; +; CHECK-NEXT: .param .b64 retval0; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_i40, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.b64 %rd5, [retval0]; +; CHECK-NEXT: } // callseq 36 +; CHECK-NEXT: st.param.b64 [func_retval0], %rd5; +; CHECK-NEXT: ret; %r = tail call i40 @test_i40(i40 %a); ret i40 %r; } -; CHECK: .func (.param .b64 func_retval0) -; CHECK-LABEL: test_i47( -; CHECK-NEXT: .param .b64 test_i47_param_0 -; CHECK-DAG: ld.param.b16 {{%rd[0-9]+}}, [test_i47_param_0+4]; -; CHECK-DAG: ld.param.b32 {{%rd[0-9]+}}, [test_i47_param_0]; -; CHECK: .param .b64 param0; -; CHECK: st.param.b64 [param0], {{%rd[0-9]+}}; -; CHECK: .param .b64 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_i47, -; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0]; -; CHECK: st.param.b64 [func_retval0], {{%rd[0-9]+}}; -; CHECK-NEXT: ret; define i47 @test_i47(i47 %a) { +; CHECK-LABEL: test_i47( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b16 %rd2, [test_i47_param_0+4]; +; CHECK-NEXT: shl.b64 %rd3, %rd2, 32; +; CHECK-NEXT: ld.param.b32 %rd4, [test_i47_param_0]; +; CHECK-NEXT: or.b64 %rd1, %rd4, %rd3; +; CHECK-NEXT: { // callseq 37, 0 +; CHECK-NEXT: .param .b64 param0; +; CHECK-NEXT: st.param.b64 [param0], %rd1; +; 
CHECK-NEXT: .param .b64 retval0; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_i47, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.b64 %rd5, [retval0]; +; CHECK-NEXT: } // callseq 37 +; CHECK-NEXT: st.param.b64 [func_retval0], %rd5; +; CHECK-NEXT: ret; %r = tail call i47 @test_i47(i47 %a); ret i47 %r; } -; CHECK: .func (.param .b64 func_retval0) -; CHECK-LABEL: test_i48( -; CHECK-NEXT: .param .b64 test_i48_param_0 -; CHECK-DAG: ld.param.b16 {{%rd[0-9]+}}, [test_i48_param_0+4]; -; CHECK-DAG: ld.param.b32 {{%rd[0-9]+}}, [test_i48_param_0]; -; CHECK: .param .b64 param0; -; CHECK: st.param.b64 [param0], {{%rd[0-9]+}}; -; CHECK: .param .b64 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_i48, -; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0]; -; CHECK: st.param.b64 [func_retval0], {{%rd[0-9]+}}; -; CHECK-NEXT: ret; define i48 @test_i48(i48 %a) { +; CHECK-LABEL: test_i48( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b16 %rd2, [test_i48_param_0+4]; +; CHECK-NEXT: shl.b64 %rd3, %rd2, 32; +; CHECK-NEXT: ld.param.b32 %rd4, [test_i48_param_0]; +; CHECK-NEXT: or.b64 %rd1, %rd4, %rd3; +; CHECK-NEXT: { // callseq 38, 0 +; CHECK-NEXT: .param .b64 param0; +; CHECK-NEXT: st.param.b64 [param0], %rd1; +; CHECK-NEXT: .param .b64 retval0; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_i48, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.b64 %rd5, [retval0]; +; CHECK-NEXT: } // callseq 38 +; CHECK-NEXT: st.param.b64 [func_retval0], %rd5; +; CHECK-NEXT: ret; %r = tail call i48 @test_i48(i48 %a); ret i48 %r; } -; CHECK: .func (.param .b64 func_retval0) -; CHECK-LABEL: test_i51( -; CHECK-NEXT: .param .b64 test_i51_param_0 -; CHECK-DAG: ld.param.b8 {{%rd[0-9]+}}, [test_i51_param_0+6]; -; CHECK-DAG: ld.param.b16 {{%rd[0-9]+}}, [test_i51_param_0+4]; -; CHECK-DAG: ld.param.b32 {{%rd[0-9]+}}, [test_i51_param_0]; -; CHECK: .param .b64 
param0; -; CHECK: st.param.b64 [param0], {{%rd[0-9]+}}; -; CHECK: .param .b64 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_i51, -; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0]; -; CHECK: st.param.b64 [func_retval0], {{%rd[0-9]+}}; -; CHECK-NEXT: ret; define i51 @test_i51(i51 %a) { +; CHECK-LABEL: test_i51( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<10>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b16 %rd2, [test_i51_param_0+4]; +; CHECK-NEXT: shl.b64 %rd3, %rd2, 32; +; CHECK-NEXT: ld.param.b8 %rd4, [test_i51_param_0+6]; +; CHECK-NEXT: shl.b64 %rd5, %rd4, 48; +; CHECK-NEXT: or.b64 %rd6, %rd5, %rd3; +; CHECK-NEXT: ld.param.b32 %rd7, [test_i51_param_0]; +; CHECK-NEXT: or.b64 %rd1, %rd7, %rd6; +; CHECK-NEXT: { // callseq 39, 0 +; CHECK-NEXT: .param .b64 param0; +; CHECK-NEXT: st.param.b64 [param0], %rd1; +; CHECK-NEXT: .param .b64 retval0; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_i51, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.b64 %rd8, [retval0]; +; CHECK-NEXT: } // callseq 39 +; CHECK-NEXT: st.param.b64 [func_retval0], %rd8; +; CHECK-NEXT: ret; %r = tail call i51 @test_i51(i51 %a); ret i51 %r; } -; CHECK: .func (.param .b64 func_retval0) -; CHECK-LABEL: test_i56( -; CHECK-NEXT: .param .b64 test_i56_param_0 -; CHECK-DAG: ld.param.b8 {{%rd[0-9]+}}, [test_i56_param_0+6]; -; CHECK-DAG: ld.param.b16 {{%rd[0-9]+}}, [test_i56_param_0+4]; -; CHECK-DAG: ld.param.b32 {{%rd[0-9]+}}, [test_i56_param_0]; -; CHECK: .param .b64 param0; -; CHECK: st.param.b64 [param0], {{%rd[0-9]+}}; -; CHECK: .param .b64 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_i56, -; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0]; -; CHECK: st.param.b64 [func_retval0], {{%rd[0-9]+}}; -; CHECK-NEXT: ret; define i56 @test_i56(i56 %a) { +; CHECK-LABEL: test_i56( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<10>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b16 %rd2, [test_i56_param_0+4]; +; CHECK-NEXT: 
shl.b64 %rd3, %rd2, 32; +; CHECK-NEXT: ld.param.b8 %rd4, [test_i56_param_0+6]; +; CHECK-NEXT: shl.b64 %rd5, %rd4, 48; +; CHECK-NEXT: or.b64 %rd6, %rd5, %rd3; +; CHECK-NEXT: ld.param.b32 %rd7, [test_i56_param_0]; +; CHECK-NEXT: or.b64 %rd1, %rd7, %rd6; +; CHECK-NEXT: { // callseq 40, 0 +; CHECK-NEXT: .param .b64 param0; +; CHECK-NEXT: st.param.b64 [param0], %rd1; +; CHECK-NEXT: .param .b64 retval0; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_i56, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.b64 %rd8, [retval0]; +; CHECK-NEXT: } // callseq 40 +; CHECK-NEXT: st.param.b64 [func_retval0], %rd8; +; CHECK-NEXT: ret; %r = tail call i56 @test_i56(i56 %a); ret i56 %r; } -; CHECK: .func (.param .b64 func_retval0) -; CHECK-LABEL: test_i57( -; CHECK-NEXT: .param .b64 test_i57_param_0 -; CHECK: ld.param.b64 {{%rd[0-9]+}}, [test_i57_param_0]; -; CHECK: .param .b64 param0; -; CHECK: st.param.b64 [param0], {{%rd[0-9]+}}; -; CHECK: .param .b64 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_i57, -; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0]; -; CHECK: st.param.b64 [func_retval0], {{%rd[0-9]+}}; -; CHECK-NEXT: ret; define i57 @test_i57(i57 %a) { +; CHECK-LABEL: test_i57( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_i57_param_0]; +; CHECK-NEXT: { // callseq 41, 0 +; CHECK-NEXT: .param .b64 param0; +; CHECK-NEXT: st.param.b64 [param0], %rd1; +; CHECK-NEXT: .param .b64 retval0; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_i57, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.b64 %rd2, [retval0]; +; CHECK-NEXT: } // callseq 41 +; CHECK-NEXT: st.param.b64 [func_retval0], %rd2; +; CHECK-NEXT: ret; %r = tail call i57 @test_i57(i57 %a); ret i57 %r; } -; CHECK: .func (.param .b64 func_retval0) -; CHECK-LABEL: test_i64( -; CHECK-NEXT: .param .b64 test_i64_param_0 -; CHECK: ld.param.b64 [[E:%rd[0-9]+]], 
[test_i64_param_0]; -; CHECK: .param .b64 param0; -; CHECK: st.param.b64 [param0], [[E]]; -; CHECK: .param .b64 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_i64, -; CHECK: ld.param.b64 [[R:%rd[0-9]+]], [retval0]; -; CHECK: st.param.b64 [func_retval0], [[R]]; -; CHECK-NEXT: ret; define i64 @test_i64(i64 %a) { +; CHECK-LABEL: test_i64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_i64_param_0]; +; CHECK-NEXT: { // callseq 42, 0 +; CHECK-NEXT: .param .b64 param0; +; CHECK-NEXT: st.param.b64 [param0], %rd1; +; CHECK-NEXT: .param .b64 retval0; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_i64, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.b64 %rd2, [retval0]; +; CHECK-NEXT: } // callseq 42 +; CHECK-NEXT: st.param.b64 [func_retval0], %rd2; +; CHECK-NEXT: ret; %r = tail call i64 @test_i64(i64 %a); ret i64 %r; } -; CHECK: .func (.param .align 32 .b8 func_retval0[32]) -; CHECK-LABEL: test_v3i64( -; CHECK-NEXT: .param .align 32 .b8 test_v3i64_param_0[32] -; CHECK-DAG: ld.param.b64 [[E2:%rd[0-9]+]], [test_v3i64_param_0+16]; -; CHECK-DAG: ld.param.v2.b64 {[[E0:%rd[0-9]+]], [[E1:%rd[0-9]+]]}, [test_v3i64_param_0]; -; CHECK: .param .align 32 .b8 param0[32]; -; CHECK: st.param.v2.b64 [param0], {[[E0]], [[E1]]}; -; CHECK: st.param.b64 [param0+16], [[E2]]; -; CHECK: .param .align 32 .b8 retval0[32]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v3i64, -; CHECK: ld.param.v2.b64 {[[RE0:%rd[0-9]+]], [[RE1:%rd[0-9]+]]}, [retval0]; -; CHECK: ld.param.b64 [[RE2:%rd[0-9]+]], [retval0+16]; -; CHECK-DAG: st.param.v2.b64 [func_retval0], {[[RE0]], [[RE1]]}; -; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE2]]; -; CHECK-DAG: st.param.v2.b64 [func_retval0], {[[RE0]], [[RE1]]}; -; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE2]]; -; CHECK-NEXT: ret; define <3 x i64> @test_v3i64(<3 x i64> %a) { +; CHECK-LABEL: test_v3i64( +; CHECK: { +; CHECK-NEXT: .reg 
.b64 %rd<10>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd3, [test_v3i64_param_0+16]; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_v3i64_param_0]; +; CHECK-NEXT: { // callseq 43, 0 +; CHECK-NEXT: .param .align 32 .b8 param0[32]; +; CHECK-NEXT: st.param.v2.b64 [param0], {%rd1, %rd2}; +; CHECK-NEXT: st.param.b64 [param0+16], %rd3; +; CHECK-NEXT: .param .align 32 .b8 retval0[32]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_v3i64, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [retval0]; +; CHECK-NEXT: ld.param.b64 %rd6, [retval0+16]; +; CHECK-NEXT: } // callseq 43 +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd5}; +; CHECK-NEXT: st.param.b64 [func_retval0+16], %rd6; +; CHECK-NEXT: ret; %r = tail call <3 x i64> @test_v3i64(<3 x i64> %a); ret <3 x i64> %r; } ; For i64 vector loads are limited by PTX to 2 elements. -; CHECK: .func (.param .align 32 .b8 func_retval0[32]) -; CHECK-LABEL: test_v4i64( -; CHECK-NEXT: .param .align 32 .b8 test_v4i64_param_0[32] -; CHECK-DAG: ld.param.v2.b64 {[[E2:%rd[0-9]+]], [[E3:%rd[0-9]+]]}, [test_v4i64_param_0+16]; -; CHECK-DAG: ld.param.v2.b64 {[[E0:%rd[0-9]+]], [[E1:%rd[0-9]+]]}, [test_v4i64_param_0]; -; CHECK: .param .align 32 .b8 param0[32]; -; CHECK: st.param.v2.b64 [param0], {[[E0]], [[E1]]}; -; CHECK: st.param.v2.b64 [param0+16], {[[E2]], [[E3]]}; -; CHECK: .param .align 32 .b8 retval0[32]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v4i64, -; CHECK: ld.param.v2.b64 {[[RE0:%rd[0-9]+]], [[RE1:%rd[0-9]+]]}, [retval0]; -; CHECK: ld.param.v2.b64 {[[RE2:%rd[0-9]+]], [[RE3:%rd[0-9]+]]}, [retval0+16]; -; CHECK-DAG: st.param.v2.b64 [func_retval0+16], {[[RE2]], [[RE3]]}; -; CHECK-DAG: st.param.v2.b64 [func_retval0], {[[RE0]], [[RE1]]}; -; CHECK-NEXT: ret; define <4 x i64> @test_v4i64(<4 x i64> %a) { +; CHECK-LABEL: test_v4i64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<13>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: 
ld.param.v2.b64 {%rd3, %rd4}, [test_v4i64_param_0+16]; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_v4i64_param_0]; +; CHECK-NEXT: { // callseq 44, 0 +; CHECK-NEXT: .param .align 32 .b8 param0[32]; +; CHECK-NEXT: st.param.v2.b64 [param0], {%rd1, %rd2}; +; CHECK-NEXT: st.param.v2.b64 [param0+16], {%rd3, %rd4}; +; CHECK-NEXT: .param .align 32 .b8 retval0[32]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_v4i64, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.v2.b64 {%rd5, %rd6}, [retval0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd7, %rd8}, [retval0+16]; +; CHECK-NEXT: } // callseq 44 +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd5, %rd6}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0+16], {%rd7, %rd8}; +; CHECK-NEXT: ret; %r = tail call <4 x i64> @test_v4i64(<4 x i64> %a); ret <4 x i64> %r; } ; Aggregates, on the other hand, do not get extended. -; CHECK: .func (.param .align 1 .b8 func_retval0[1]) -; CHECK-LABEL: test_s_i1( -; CHECK-NEXT: .align 1 .b8 test_s_i1_param_0[1] -; CHECK: ld.param.b8 [[A:%rs[0-9]+]], [test_s_i1_param_0]; -; CHECK: .param .align 1 .b8 param0[1]; -; CHECK: st.param.b8 [param0], [[A]] -; CHECK: .param .align 1 .b8 retval0[1]; -; CHECK: call.uni -; CHECK-NEXT: test_s_i1, -; CHECK: ld.param.b8 [[R:%rs[0-9]+]], [retval0]; -; CHECK: st.param.b8 [func_retval0], [[R]]; -; CHECK-NEXT: ret; define %s_i1 @test_s_i1(%s_i1 %a) { +; CHECK-LABEL: test_s_i1( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [test_s_i1_param_0]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: { // callseq 45, 0 +; CHECK-NEXT: .param .align 1 .b8 param0[1]; +; CHECK-NEXT: st.param.b8 [param0], %rs1; +; CHECK-NEXT: .param .align 1 .b8 retval0[1]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_s_i1, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: 
ld.param.b8 %rs3, [retval0]; +; CHECK-NEXT: } // callseq 45 +; CHECK-NEXT: st.param.b8 [func_retval0], %rs3; +; CHECK-NEXT: ret; %r = tail call %s_i1 @test_s_i1(%s_i1 %a); ret %s_i1 %r; } -; CHECK: .func (.param .align 1 .b8 func_retval0[1]) -; CHECK-LABEL: test_s_i8( -; CHECK-NEXT: .param .align 1 .b8 test_s_i8_param_0[1] -; CHECK: ld.param.b8 [[A:%rs[0-9]+]], [test_s_i8_param_0]; -; CHECK: .param .align 1 .b8 param0[1]; -; CHECK: st.param.b8 [param0], [[A]] -; CHECK: .param .align 1 .b8 retval0[1]; -; CHECK: call.uni -; CHECK-NEXT: test_s_i8, -; CHECK: ld.param.b8 [[R:%rs[0-9]+]], [retval0]; -; CHECK: st.param.b8 [func_retval0], [[R]]; -; CHECK-NEXT: ret; define %s_i8 @test_s_i8(%s_i8 %a) { +; CHECK-LABEL: test_s_i8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [test_s_i8_param_0]; +; CHECK-NEXT: { // callseq 46, 0 +; CHECK-NEXT: .param .align 1 .b8 param0[1]; +; CHECK-NEXT: st.param.b8 [param0], %rs1; +; CHECK-NEXT: .param .align 1 .b8 retval0[1]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_s_i8, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.b8 %rs2, [retval0]; +; CHECK-NEXT: } // callseq 46 +; CHECK-NEXT: st.param.b8 [func_retval0], %rs2; +; CHECK-NEXT: ret; %r = tail call %s_i8 @test_s_i8(%s_i8 %a); ret %s_i8 %r; } -; CHECK: .func (.param .align 2 .b8 func_retval0[2]) -; CHECK-LABEL: test_s_i16( -; CHECK-NEXT: .param .align 2 .b8 test_s_i16_param_0[2] -; CHECK: ld.param.b16 [[A:%rs[0-9]+]], [test_s_i16_param_0]; -; CHECK: .param .align 2 .b8 param0[2]; -; CHECK: st.param.b16 [param0], [[A]] -; CHECK: .param .align 2 .b8 retval0[2]; -; CHECK: call.uni -; CHECK-NEXT: test_s_i16, -; CHECK: ld.param.b16 [[R:%rs[0-9]+]], [retval0]; -; CHECK: st.param.b16 [func_retval0], [[R]]; -; CHECK-NEXT: ret; define %s_i16 @test_s_i16(%s_i16 %a) { +; CHECK-LABEL: test_s_i16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: 
+; CHECK-NEXT: ld.param.b16 %rs1, [test_s_i16_param_0]; +; CHECK-NEXT: { // callseq 47, 0 +; CHECK-NEXT: .param .align 2 .b8 param0[2]; +; CHECK-NEXT: st.param.b16 [param0], %rs1; +; CHECK-NEXT: .param .align 2 .b8 retval0[2]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_s_i16, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.b16 %rs2, [retval0]; +; CHECK-NEXT: } // callseq 47 +; CHECK-NEXT: st.param.b16 [func_retval0], %rs2; +; CHECK-NEXT: ret; %r = tail call %s_i16 @test_s_i16(%s_i16 %a); ret %s_i16 %r; } -; CHECK: .func (.param .align 2 .b8 func_retval0[2]) -; CHECK-LABEL: test_s_f16( -; CHECK-NEXT: .param .align 2 .b8 test_s_f16_param_0[2] -; CHECK: ld.param.b16 [[A:%rs[0-9]+]], [test_s_f16_param_0]; -; CHECK: .param .align 2 .b8 param0[2]; -; CHECK: st.param.b16 [param0], [[A]] -; CHECK: .param .align 2 .b8 retval0[2]; -; CHECK: call.uni -; CHECK-NEXT: test_s_f16, -; CHECK: ld.param.b16 [[R:%rs[0-9]+]], [retval0]; -; CHECK: st.param.b16 [func_retval0], [[R]]; -; CHECK-NEXT: ret; define %s_f16 @test_s_f16(%s_f16 %a) { +; CHECK-LABEL: test_s_f16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b16 %rs1, [test_s_f16_param_0]; +; CHECK-NEXT: { // callseq 48, 0 +; CHECK-NEXT: .param .align 2 .b8 param0[2]; +; CHECK-NEXT: st.param.b16 [param0], %rs1; +; CHECK-NEXT: .param .align 2 .b8 retval0[2]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_s_f16, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.b16 %rs2, [retval0]; +; CHECK-NEXT: } // callseq 48 +; CHECK-NEXT: st.param.b16 [func_retval0], %rs2; +; CHECK-NEXT: ret; %r = tail call %s_f16 @test_s_f16(%s_f16 %a); ret %s_f16 %r; } -; CHECK: .func (.param .align 4 .b8 func_retval0[4]) -; CHECK-LABEL: test_s_i32( -; CHECK-NEXT: .param .align 4 .b8 test_s_i32_param_0[4] -; CHECK: ld.param.b32 [[E:%r[0-9]+]], [test_s_i32_param_0]; -; CHECK: .param .align 4 .b8 param0[4] -; CHECK: 
st.param.b32 [param0], [[E]]; -; CHECK: .param .align 4 .b8 retval0[4]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_s_i32, -; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0]; -; CHECK: st.param.b32 [func_retval0], [[R]]; -; CHECK-NEXT: ret; define %s_i32 @test_s_i32(%s_i32 %a) { +; CHECK-LABEL: test_s_i32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_s_i32_param_0]; +; CHECK-NEXT: { // callseq 49, 0 +; CHECK-NEXT: .param .align 4 .b8 param0[4]; +; CHECK-NEXT: st.param.b32 [param0], %r1; +; CHECK-NEXT: .param .align 4 .b8 retval0[4]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_s_i32, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.b32 %r2, [retval0]; +; CHECK-NEXT: } // callseq 49 +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %r = tail call %s_i32 @test_s_i32(%s_i32 %a); ret %s_i32 %r; } -; CHECK: .func (.param .align 4 .b8 func_retval0[4]) -; CHECK-LABEL: test_s_f32( -; CHECK-NEXT: .param .align 4 .b8 test_s_f32_param_0[4] -; CHECK: ld.param.b32 [[E:%r[0-9]+]], [test_s_f32_param_0]; -; CHECK: .param .align 4 .b8 param0[4] -; CHECK: st.param.b32 [param0], [[E]]; -; CHECK: .param .align 4 .b8 retval0[4]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_s_f32, -; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0]; -; CHECK: st.param.b32 [func_retval0], [[R]]; -; CHECK-NEXT: ret; define %s_f32 @test_s_f32(%s_f32 %a) { +; CHECK-LABEL: test_s_f32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_s_f32_param_0]; +; CHECK-NEXT: { // callseq 50, 0 +; CHECK-NEXT: .param .align 4 .b8 param0[4]; +; CHECK-NEXT: st.param.b32 [param0], %r1; +; CHECK-NEXT: .param .align 4 .b8 retval0[4]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_s_f32, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.b32 %r2, [retval0]; +; CHECK-NEXT: } 
// callseq 50 +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %r = tail call %s_f32 @test_s_f32(%s_f32 %a); ret %s_f32 %r; } -; CHECK: .func (.param .align 8 .b8 func_retval0[8]) -; CHECK-LABEL: test_s_i64( -; CHECK-NEXT: .param .align 8 .b8 test_s_i64_param_0[8] -; CHECK: ld.param.b64 [[E:%rd[0-9]+]], [test_s_i64_param_0]; -; CHECK: .param .align 8 .b8 param0[8]; -; CHECK: st.param.b64 [param0], [[E]]; -; CHECK: .param .align 8 .b8 retval0[8]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_s_i64, -; CHECK: ld.param.b64 [[R:%rd[0-9]+]], [retval0]; -; CHECK: st.param.b64 [func_retval0], [[R]]; -; CHECK-NEXT: ret; define %s_i64 @test_s_i64(%s_i64 %a) { +; CHECK-LABEL: test_s_i64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_s_i64_param_0]; +; CHECK-NEXT: { // callseq 51, 0 +; CHECK-NEXT: .param .align 8 .b8 param0[8]; +; CHECK-NEXT: st.param.b64 [param0], %rd1; +; CHECK-NEXT: .param .align 8 .b8 retval0[8]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_s_i64, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.b64 %rd2, [retval0]; +; CHECK-NEXT: } // callseq 51 +; CHECK-NEXT: st.param.b64 [func_retval0], %rd2; +; CHECK-NEXT: ret; %r = tail call %s_i64 @test_s_i64(%s_i64 %a); ret %s_i64 %r; } ; Fields that have different types, but identical sizes are not vectorized. 
-; CHECK: .func (.param .align 8 .b8 func_retval0[24]) -; CHECK-LABEL: test_s_i32f32( -; CHECK: .param .align 8 .b8 test_s_i32f32_param_0[24] -; CHECK-DAG: ld.param.b64 [[E4:%rd[0-9]+]], [test_s_i32f32_param_0+16]; -; CHECK-DAG: ld.param.b32 [[E3:%r[0-9]+]], [test_s_i32f32_param_0+12]; -; CHECK-DAG: ld.param.b32 [[E2:%r[0-9]+]], [test_s_i32f32_param_0+8]; -; CHECK-DAG: ld.param.b32 [[E1:%r[0-9]+]], [test_s_i32f32_param_0+4]; -; CHECK-DAG: ld.param.b32 [[E0:%r[0-9]+]], [test_s_i32f32_param_0]; -; CHECK: .param .align 8 .b8 param0[24]; -; CHECK-DAG: st.param.b32 [param0], [[E0]]; -; CHECK-DAG: st.param.b32 [param0+4], [[E1]]; -; CHECK-DAG: st.param.b32 [param0+8], [[E2]]; -; CHECK-DAG: st.param.b32 [param0+12], [[E3]]; -; CHECK-DAG: st.param.b64 [param0+16], [[E4]]; -; CHECK: .param .align 8 .b8 retval0[24]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_s_i32f32, -; CHECK-DAG: ld.param.b32 [[RE0:%r[0-9]+]], [retval0]; -; CHECK-DAG: ld.param.b32 [[RE1:%r[0-9]+]], [retval0+4]; -; CHECK-DAG: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8]; -; CHECK-DAG: ld.param.b32 [[RE3:%r[0-9]+]], [retval0+12]; -; CHECK-DAG: ld.param.b64 [[RE4:%rd[0-9]+]], [retval0+16]; -; CHECK-DAG: st.param.b32 [func_retval0], [[RE0]]; -; CHECK-DAG: st.param.b32 [func_retval0+4], [[RE1]]; -; CHECK-DAG: st.param.b32 [func_retval0+8], [[RE2]]; -; CHECK-DAG: st.param.b32 [func_retval0+12], [[RE3]]; -; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE4]]; -; CHECK: ret; define %s_i32f32 @test_s_i32f32(%s_i32f32 %a) { +; CHECK-LABEL: test_s_i32f32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<13>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_s_i32f32_param_0+16]; +; CHECK-NEXT: ld.param.b32 %r4, [test_s_i32f32_param_0+12]; +; CHECK-NEXT: ld.param.b32 %r3, [test_s_i32f32_param_0+8]; +; CHECK-NEXT: ld.param.b32 %r2, [test_s_i32f32_param_0+4]; +; CHECK-NEXT: ld.param.b32 %r1, [test_s_i32f32_param_0]; +; CHECK-NEXT: { // callseq 52, 0 +; 
CHECK-NEXT: .param .align 8 .b8 param0[24]; +; CHECK-NEXT: st.param.b32 [param0], %r1; +; CHECK-NEXT: st.param.b32 [param0+4], %r2; +; CHECK-NEXT: st.param.b32 [param0+8], %r3; +; CHECK-NEXT: st.param.b32 [param0+12], %r4; +; CHECK-NEXT: st.param.b64 [param0+16], %rd1; +; CHECK-NEXT: .param .align 8 .b8 retval0[24]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_s_i32f32, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.b32 %r5, [retval0]; +; CHECK-NEXT: ld.param.b32 %r6, [retval0+4]; +; CHECK-NEXT: ld.param.b32 %r7, [retval0+8]; +; CHECK-NEXT: ld.param.b32 %r8, [retval0+12]; +; CHECK-NEXT: ld.param.b64 %rd2, [retval0+16]; +; CHECK-NEXT: } // callseq 52 +; CHECK-NEXT: st.param.b32 [func_retval0], %r5; +; CHECK-NEXT: st.param.b32 [func_retval0+4], %r6; +; CHECK-NEXT: st.param.b32 [func_retval0+8], %r7; +; CHECK-NEXT: st.param.b32 [func_retval0+12], %r8; +; CHECK-NEXT: st.param.b64 [func_retval0+16], %rd2; +; CHECK-NEXT: ret; %r = tail call %s_i32f32 @test_s_i32f32(%s_i32f32 %a); ret %s_i32f32 %r; } ; We do vectorize consecutive fields with matching types. 
-; CHECK:.visible .func (.param .align 8 .b8 func_retval0[24]) -; CHECK-LABEL: test_s_i32x4( -; CHECK: .param .align 8 .b8 test_s_i32x4_param_0[24] -; CHECK-DAG: ld.param.b64 [[RD1:%rd[0-9]+]], [test_s_i32x4_param_0+16]; -; CHECK-DAG: ld.param.v2.b32 {[[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_s_i32x4_param_0+8]; -; CHECK-DAG: ld.param.v2.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_i32x4_param_0]; -; CHECK: .param .align 8 .b8 param0[24]; -; CHECK: st.param.v2.b32 [param0], {[[E0]], [[E1]]}; -; CHECK: st.param.v2.b32 [param0+8], {[[E2]], [[E3]]}; -; CHECK: st.param.b64 [param0+16], [[E4]]; -; CHECK: .param .align 8 .b8 retval0[24]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_s_i32x4, -; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0]; -; CHECK: ld.param.v2.b32 {[[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+8]; -; CHECK: ld.param.b64 [[RE4:%rd[0-9]+]], [retval0+16]; -; CHECK-DAG: st.param.v2.b32 [func_retval0], {[[RE0]], [[RE1]]}; -; CHECK-DAG: st.param.v2.b32 [func_retval0+8], {[[RE2]], [[RE3]]}; -; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE4]]; -; CHECK: ret; define %s_i32x4 @test_s_i32x4(%s_i32x4 %a) { +; CHECK-LABEL: test_s_i32x4( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<13>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_s_i32x4_param_0+16]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_s_i32x4_param_0+8]; +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_s_i32x4_param_0]; +; CHECK-NEXT: { // callseq 53, 0 +; CHECK-NEXT: .param .align 8 .b8 param0[24]; +; CHECK-NEXT: st.param.v2.b32 [param0], {%r1, %r2}; +; CHECK-NEXT: st.param.v2.b32 [param0+8], {%r3, %r4}; +; CHECK-NEXT: st.param.b64 [param0+16], %rd1; +; CHECK-NEXT: .param .align 8 .b8 retval0[24]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_s_i32x4, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [retval0]; +; CHECK-NEXT: ld.param.v2.b32 {%r7, 
%r8}, [retval0+8]; +; CHECK-NEXT: ld.param.b64 %rd2, [retval0+16]; +; CHECK-NEXT: } // callseq 53 +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r5, %r6}; +; CHECK-NEXT: st.param.v2.b32 [func_retval0+8], {%r7, %r8}; +; CHECK-NEXT: st.param.b64 [func_retval0+16], %rd2; +; CHECK-NEXT: ret; %r = tail call %s_i32x4 @test_s_i32x4(%s_i32x4 %a); ret %s_i32x4 %r; } -; CHECK:.visible .func (.param .align 8 .b8 func_retval0[32]) -; CHECK-LABEL: test_s_i1i32x4( -; CHECK: .param .align 8 .b8 test_s_i1i32x4_param_0[32] -; CHECK: ld.param.b64 [[E5:%rd[0-9]+]], [test_s_i1i32x4_param_0+24]; -; CHECK: ld.param.b32 [[E4:%r[0-9]+]], [test_s_i1i32x4_param_0+16]; -; CHECK: ld.param.b32 [[E3:%r[0-9]+]], [test_s_i1i32x4_param_0+12]; -; CHECK: ld.param.b8 [[E2:%rs[0-9]+]], [test_s_i1i32x4_param_0+8]; -; CHECK: ld.param.v2.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_i1i32x4_param_0]; -; CHECK: .param .align 8 .b8 param0[32]; -; CHECK: st.param.v2.b32 [param0], {[[E0]], [[E1]]}; -; CHECK: st.param.b8 [param0+8], [[E2]]; -; CHECK: st.param.b32 [param0+12], [[E3]]; -; CHECK: st.param.b32 [param0+16], [[E4]]; -; CHECK: st.param.b64 [param0+24], [[E5]]; -; CHECK: .param .align 8 .b8 retval0[32]; -; CHECK: call.uni (retval0), -; CHECK: test_s_i1i32x4, -; CHECK: ( -; CHECK: param0 -; CHECK: ); -; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0]; -; CHECK: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+8]; -; CHECK: ld.param.b32 [[RE3:%r[0-9]+]], [retval0+12]; -; CHECK: ld.param.b32 [[RE4:%r[0-9]+]], [retval0+16]; -; CHECK: ld.param.b64 [[RE5:%rd[0-9]+]], [retval0+24]; -; CHECK: st.param.v2.b32 [func_retval0], {[[RE0]], [[RE1]]}; -; CHECK: st.param.b8 [func_retval0+8], [[RE2]]; -; CHECK: st.param.b32 [func_retval0+12], [[RE3]]; -; CHECK: st.param.b32 [func_retval0+16], [[RE4]]; -; CHECK: st.param.b64 [func_retval0+24], [[RE5]]; -; CHECK: ret; define %s_i8i32x4 @test_s_i1i32x4(%s_i8i32x4 %a) { +; CHECK-LABEL: test_s_i1i32x4( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<4>; +; 
CHECK-NEXT: .reg .b32 %r<13>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_s_i1i32x4_param_0+24]; +; CHECK-NEXT: ld.param.b32 %r4, [test_s_i1i32x4_param_0+16]; +; CHECK-NEXT: ld.param.b32 %r3, [test_s_i1i32x4_param_0+12]; +; CHECK-NEXT: ld.param.b8 %rs1, [test_s_i1i32x4_param_0+8]; +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_s_i1i32x4_param_0]; +; CHECK-NEXT: { // callseq 54, 0 +; CHECK-NEXT: .param .align 8 .b8 param0[32]; +; CHECK-NEXT: st.param.v2.b32 [param0], {%r1, %r2}; +; CHECK-NEXT: st.param.b8 [param0+8], %rs1; +; CHECK-NEXT: st.param.b32 [param0+12], %r3; +; CHECK-NEXT: st.param.b32 [param0+16], %r4; +; CHECK-NEXT: st.param.b64 [param0+24], %rd1; +; CHECK-NEXT: .param .align 8 .b8 retval0[32]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_s_i1i32x4, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [retval0]; +; CHECK-NEXT: ld.param.b8 %rs2, [retval0+8]; +; CHECK-NEXT: ld.param.b32 %r7, [retval0+12]; +; CHECK-NEXT: ld.param.b32 %r8, [retval0+16]; +; CHECK-NEXT: ld.param.b64 %rd2, [retval0+24]; +; CHECK-NEXT: } // callseq 54 +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r5, %r6}; +; CHECK-NEXT: st.param.b8 [func_retval0+8], %rs2; +; CHECK-NEXT: st.param.b32 [func_retval0+12], %r7; +; CHECK-NEXT: st.param.b32 [func_retval0+16], %r8; +; CHECK-NEXT: st.param.b64 [func_retval0+24], %rd2; +; CHECK-NEXT: ret; %r = tail call %s_i8i32x4 @test_s_i1i32x4(%s_i8i32x4 %a); ret %s_i8i32x4 %r; } ; -- All loads/stores from parameters aligned by one must be done one ; -- byte at a time. 
-; CHECK:.visible .func (.param .align 1 .b8 func_retval0[25]) -; CHECK-LABEL: test_s_i1i32x4p( -; CHECK-DAG: .param .align 1 .b8 test_s_i1i32x4p_param_0[25] -; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0+24]; -; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0+23]; -; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0+22]; -; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0+21]; -; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0+20]; -; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0+19]; -; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0+18]; -; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0+17]; -; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0+16]; -; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0+15]; -; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0+14]; -; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0+13]; -; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0+12]; -; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0+11]; -; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0+10]; -; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0+9]; -; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0+8]; -; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0+7]; -; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0+6]; -; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0+5]; -; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0+4]; -; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0+3]; -; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0+2]; -; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0+1]; -; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0]; -; CHECK: .param .align 1 .b8 param0[25]; -; CHECK-DAG: st.param.b8 [param0], -; CHECK-DAG: st.param.b8 [param0+1], -; CHECK-DAG: st.param.b8 [param0+2], -; CHECK-DAG: st.param.b8 [param0+3], -; 
CHECK-DAG: st.param.b8 [param0+4], -; CHECK-DAG: st.param.b8 [param0+5], -; CHECK-DAG: st.param.b8 [param0+6], -; CHECK-DAG: st.param.b8 [param0+7], -; CHECK-DAG: st.param.b8 [param0+8], -; CHECK-DAG: st.param.b8 [param0+9], -; CHECK-DAG: st.param.b8 [param0+10], -; CHECK-DAG: st.param.b8 [param0+11], -; CHECK-DAG: st.param.b8 [param0+12], -; CHECK-DAG: st.param.b8 [param0+13], -; CHECK-DAG: st.param.b8 [param0+14], -; CHECK-DAG: st.param.b8 [param0+15], -; CHECK-DAG: st.param.b8 [param0+16], -; CHECK-DAG: st.param.b8 [param0+17], -; CHECK-DAG: st.param.b8 [param0+18], -; CHECK-DAG: st.param.b8 [param0+19], -; CHECK-DAG: st.param.b8 [param0+20], -; CHECK-DAG: st.param.b8 [param0+21], -; CHECK-DAG: st.param.b8 [param0+22], -; CHECK-DAG: st.param.b8 [param0+23], -; CHECK-DAG: st.param.b8 [param0+24], -; CHECK: .param .align 1 .b8 retval0[25]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_s_i1i32x4p, -; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0]; -; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+1]; -; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+2]; -; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+3]; -; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+4]; -; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+5]; -; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+6]; -; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+7]; -; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+8]; -; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+9]; -; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+10]; -; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+11]; -; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+12]; -; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+13]; -; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+14]; -; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+15]; -; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+16]; -; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+17]; -; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+18]; -; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, 
[retval0+19]; -; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+20]; -; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+21]; -; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+22]; -; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+23]; -; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+24]; -; CHECK: } // callseq -; CHECK-DAG: st.param.b8 [func_retval0], -; CHECK-DAG: st.param.b8 [func_retval0+1], -; CHECK-DAG: st.param.b8 [func_retval0+2], -; CHECK-DAG: st.param.b8 [func_retval0+3], -; CHECK-DAG: st.param.b8 [func_retval0+4], -; CHECK-DAG: st.param.b8 [func_retval0+5], -; CHECK-DAG: st.param.b8 [func_retval0+6], -; CHECK-DAG: st.param.b8 [func_retval0+7], -; CHECK-DAG: st.param.b8 [func_retval0+8], -; CHECK-DAG: st.param.b8 [func_retval0+9], -; CHECK-DAG: st.param.b8 [func_retval0+10], -; CHECK-DAG: st.param.b8 [func_retval0+11], -; CHECK-DAG: st.param.b8 [func_retval0+12], -; CHECK-DAG: st.param.b8 [func_retval0+13], -; CHECK-DAG: st.param.b8 [func_retval0+14], -; CHECK-DAG: st.param.b8 [func_retval0+15], -; CHECK-DAG: st.param.b8 [func_retval0+16], -; CHECK-DAG: st.param.b8 [func_retval0+17], -; CHECK-DAG: st.param.b8 [func_retval0+18], -; CHECK-DAG: st.param.b8 [func_retval0+19], -; CHECK-DAG: st.param.b8 [func_retval0+20], -; CHECK-DAG: st.param.b8 [func_retval0+21], -; CHECK-DAG: st.param.b8 [func_retval0+22], -; CHECK-DAG: st.param.b8 [func_retval0+23], -; CHECK-DAG: st.param.b8 [func_retval0+24], define %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a) { +; CHECK-LABEL: test_s_i1i32x4p( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<52>; +; CHECK-NEXT: .reg .b32 %r<113>; +; CHECK-NEXT: .reg .b64 %rd<65>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rd2, [test_s_i1i32x4p_param_0+18]; +; CHECK-NEXT: shl.b64 %rd3, %rd2, 8; +; CHECK-NEXT: ld.param.b8 %rd4, [test_s_i1i32x4p_param_0+17]; +; CHECK-NEXT: or.b64 %rd5, %rd3, %rd4; +; CHECK-NEXT: ld.param.b8 %rd6, [test_s_i1i32x4p_param_0+19]; +; CHECK-NEXT: shl.b64 %rd7, %rd6, 16; +; CHECK-NEXT: ld.param.b8 
%rd8, [test_s_i1i32x4p_param_0+20]; +; CHECK-NEXT: shl.b64 %rd9, %rd8, 24; +; CHECK-NEXT: or.b64 %rd10, %rd9, %rd7; +; CHECK-NEXT: or.b64 %rd11, %rd10, %rd5; +; CHECK-NEXT: ld.param.b8 %rd12, [test_s_i1i32x4p_param_0+22]; +; CHECK-NEXT: shl.b64 %rd13, %rd12, 8; +; CHECK-NEXT: ld.param.b8 %rd14, [test_s_i1i32x4p_param_0+21]; +; CHECK-NEXT: or.b64 %rd15, %rd13, %rd14; +; CHECK-NEXT: ld.param.b8 %rd16, [test_s_i1i32x4p_param_0+23]; +; CHECK-NEXT: shl.b64 %rd17, %rd16, 16; +; CHECK-NEXT: ld.param.b8 %rd18, [test_s_i1i32x4p_param_0+24]; +; CHECK-NEXT: shl.b64 %rd19, %rd18, 24; +; CHECK-NEXT: or.b64 %rd20, %rd19, %rd17; +; CHECK-NEXT: or.b64 %rd21, %rd20, %rd15; +; CHECK-NEXT: shl.b64 %rd22, %rd21, 32; +; CHECK-NEXT: or.b64 %rd1, %rd22, %rd11; +; CHECK-NEXT: ld.param.b8 %r5, [test_s_i1i32x4p_param_0+14]; +; CHECK-NEXT: shl.b32 %r6, %r5, 8; +; CHECK-NEXT: ld.param.b8 %r7, [test_s_i1i32x4p_param_0+13]; +; CHECK-NEXT: or.b32 %r8, %r6, %r7; +; CHECK-NEXT: ld.param.b8 %r9, [test_s_i1i32x4p_param_0+15]; +; CHECK-NEXT: shl.b32 %r10, %r9, 16; +; CHECK-NEXT: ld.param.b8 %r11, [test_s_i1i32x4p_param_0+16]; +; CHECK-NEXT: shl.b32 %r12, %r11, 24; +; CHECK-NEXT: or.b32 %r13, %r12, %r10; +; CHECK-NEXT: or.b32 %r4, %r13, %r8; +; CHECK-NEXT: ld.param.b8 %r14, [test_s_i1i32x4p_param_0+10]; +; CHECK-NEXT: shl.b32 %r15, %r14, 8; +; CHECK-NEXT: ld.param.b8 %r16, [test_s_i1i32x4p_param_0+9]; +; CHECK-NEXT: or.b32 %r17, %r15, %r16; +; CHECK-NEXT: ld.param.b8 %r18, [test_s_i1i32x4p_param_0+11]; +; CHECK-NEXT: shl.b32 %r19, %r18, 16; +; CHECK-NEXT: ld.param.b8 %r20, [test_s_i1i32x4p_param_0+12]; +; CHECK-NEXT: shl.b32 %r21, %r20, 24; +; CHECK-NEXT: or.b32 %r22, %r21, %r19; +; CHECK-NEXT: or.b32 %r3, %r22, %r17; +; CHECK-NEXT: ld.param.b8 %r23, [test_s_i1i32x4p_param_0+5]; +; CHECK-NEXT: shl.b32 %r24, %r23, 8; +; CHECK-NEXT: ld.param.b8 %r25, [test_s_i1i32x4p_param_0+4]; +; CHECK-NEXT: or.b32 %r26, %r24, %r25; +; CHECK-NEXT: ld.param.b8 %r27, [test_s_i1i32x4p_param_0+6]; +; CHECK-NEXT: shl.b32 
%r28, %r27, 16; +; CHECK-NEXT: ld.param.b8 %r29, [test_s_i1i32x4p_param_0+7]; +; CHECK-NEXT: shl.b32 %r30, %r29, 24; +; CHECK-NEXT: or.b32 %r31, %r30, %r28; +; CHECK-NEXT: or.b32 %r2, %r31, %r26; +; CHECK-NEXT: ld.param.b8 %r32, [test_s_i1i32x4p_param_0+1]; +; CHECK-NEXT: shl.b32 %r33, %r32, 8; +; CHECK-NEXT: ld.param.b8 %r34, [test_s_i1i32x4p_param_0]; +; CHECK-NEXT: or.b32 %r35, %r33, %r34; +; CHECK-NEXT: ld.param.b8 %r36, [test_s_i1i32x4p_param_0+2]; +; CHECK-NEXT: shl.b32 %r37, %r36, 16; +; CHECK-NEXT: ld.param.b8 %r38, [test_s_i1i32x4p_param_0+3]; +; CHECK-NEXT: shl.b32 %r39, %r38, 24; +; CHECK-NEXT: or.b32 %r40, %r39, %r37; +; CHECK-NEXT: or.b32 %r1, %r40, %r35; +; CHECK-NEXT: ld.param.b8 %rs1, [test_s_i1i32x4p_param_0+8]; +; CHECK-NEXT: shr.u64 %rd23, %rd1, 8; +; CHECK-NEXT: shr.u64 %rd24, %rd1, 16; +; CHECK-NEXT: shr.u64 %rd25, %rd1, 24; +; CHECK-NEXT: shr.u32 %r41, %r1, 8; +; CHECK-NEXT: shr.u32 %r42, %r2, 8; +; CHECK-NEXT: shr.u32 %r43, %r3, 8; +; CHECK-NEXT: shr.u32 %r44, %r4, 8; +; CHECK-NEXT: bfe.u64 %rd26, %rd21, 8, 24; +; CHECK-NEXT: bfe.u64 %rd27, %rd21, 16, 16; +; CHECK-NEXT: bfe.u64 %rd28, %rd21, 24, 8; +; CHECK-NEXT: shr.u32 %r45, %r40, 16; +; CHECK-NEXT: shr.u32 %r46, %r31, 16; +; CHECK-NEXT: shr.u32 %r47, %r22, 16; +; CHECK-NEXT: shr.u32 %r48, %r13, 16; +; CHECK-NEXT: { // callseq 55, 0 +; CHECK-NEXT: .param .align 1 .b8 param0[25]; +; CHECK-NEXT: st.param.b8 [param0], %r1; +; CHECK-NEXT: st.param.b8 [param0+1], %r41; +; CHECK-NEXT: st.param.b8 [param0+2], %r45; +; CHECK-NEXT: st.param.b8 [param0+3], %r38; +; CHECK-NEXT: st.param.b8 [param0+4], %r2; +; CHECK-NEXT: st.param.b8 [param0+5], %r42; +; CHECK-NEXT: st.param.b8 [param0+6], %r46; +; CHECK-NEXT: st.param.b8 [param0+7], %r29; +; CHECK-NEXT: st.param.b8 [param0+8], %rs1; +; CHECK-NEXT: st.param.b8 [param0+9], %r3; +; CHECK-NEXT: st.param.b8 [param0+10], %r43; +; CHECK-NEXT: st.param.b8 [param0+11], %r47; +; CHECK-NEXT: st.param.b8 [param0+12], %r20; +; CHECK-NEXT: st.param.b8 [param0+13], 
%r4; +; CHECK-NEXT: st.param.b8 [param0+14], %r44; +; CHECK-NEXT: st.param.b8 [param0+15], %r48; +; CHECK-NEXT: st.param.b8 [param0+16], %r11; +; CHECK-NEXT: st.param.b8 [param0+17], %rd1; +; CHECK-NEXT: st.param.b8 [param0+18], %rd23; +; CHECK-NEXT: st.param.b8 [param0+19], %rd24; +; CHECK-NEXT: st.param.b8 [param0+20], %rd25; +; CHECK-NEXT: st.param.b8 [param0+21], %rd21; +; CHECK-NEXT: st.param.b8 [param0+22], %rd26; +; CHECK-NEXT: st.param.b8 [param0+23], %rd27; +; CHECK-NEXT: st.param.b8 [param0+24], %rd28; +; CHECK-NEXT: .param .align 1 .b8 retval0[25]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_s_i1i32x4p, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.b8 %rs2, [retval0]; +; CHECK-NEXT: ld.param.b8 %rs3, [retval0+1]; +; CHECK-NEXT: ld.param.b8 %rs4, [retval0+2]; +; CHECK-NEXT: ld.param.b8 %rs5, [retval0+3]; +; CHECK-NEXT: ld.param.b8 %rs6, [retval0+4]; +; CHECK-NEXT: ld.param.b8 %rs7, [retval0+5]; +; CHECK-NEXT: ld.param.b8 %rs8, [retval0+6]; +; CHECK-NEXT: ld.param.b8 %rs9, [retval0+7]; +; CHECK-NEXT: ld.param.b8 %rs10, [retval0+8]; +; CHECK-NEXT: ld.param.b8 %rs11, [retval0+9]; +; CHECK-NEXT: ld.param.b8 %rs12, [retval0+10]; +; CHECK-NEXT: ld.param.b8 %rs13, [retval0+11]; +; CHECK-NEXT: ld.param.b8 %rs14, [retval0+12]; +; CHECK-NEXT: ld.param.b8 %rs15, [retval0+13]; +; CHECK-NEXT: ld.param.b8 %rs16, [retval0+14]; +; CHECK-NEXT: ld.param.b8 %rs17, [retval0+15]; +; CHECK-NEXT: ld.param.b8 %rs18, [retval0+16]; +; CHECK-NEXT: ld.param.b8 %rs19, [retval0+17]; +; CHECK-NEXT: ld.param.b8 %rs20, [retval0+18]; +; CHECK-NEXT: ld.param.b8 %rs21, [retval0+19]; +; CHECK-NEXT: ld.param.b8 %rs22, [retval0+20]; +; CHECK-NEXT: ld.param.b8 %rs23, [retval0+21]; +; CHECK-NEXT: ld.param.b8 %rs24, [retval0+22]; +; CHECK-NEXT: ld.param.b8 %rs25, [retval0+23]; +; CHECK-NEXT: ld.param.b8 %rs26, [retval0+24]; +; CHECK-NEXT: } // callseq 55 +; CHECK-NEXT: cvt.u64.u16 %rd29, %rs19; +; CHECK-NEXT: and.b64 %rd30, %rd29, 255; +; CHECK-NEXT: 
cvt.u64.u16 %rd31, %rs20; +; CHECK-NEXT: and.b64 %rd32, %rd31, 255; +; CHECK-NEXT: shl.b64 %rd33, %rd32, 8; +; CHECK-NEXT: or.b64 %rd34, %rd30, %rd33; +; CHECK-NEXT: cvt.u64.u16 %rd35, %rs21; +; CHECK-NEXT: and.b64 %rd36, %rd35, 255; +; CHECK-NEXT: shl.b64 %rd37, %rd36, 16; +; CHECK-NEXT: or.b64 %rd38, %rd34, %rd37; +; CHECK-NEXT: cvt.u64.u16 %rd39, %rs22; +; CHECK-NEXT: and.b64 %rd40, %rd39, 255; +; CHECK-NEXT: shl.b64 %rd41, %rd40, 24; +; CHECK-NEXT: or.b64 %rd42, %rd38, %rd41; +; CHECK-NEXT: cvt.u64.u16 %rd43, %rs23; +; CHECK-NEXT: and.b64 %rd44, %rd43, 255; +; CHECK-NEXT: shl.b64 %rd45, %rd44, 32; +; CHECK-NEXT: or.b64 %rd46, %rd42, %rd45; +; CHECK-NEXT: cvt.u64.u16 %rd47, %rs24; +; CHECK-NEXT: and.b64 %rd48, %rd47, 255; +; CHECK-NEXT: shl.b64 %rd49, %rd48, 40; +; CHECK-NEXT: or.b64 %rd50, %rd46, %rd49; +; CHECK-NEXT: cvt.u64.u16 %rd51, %rs25; +; CHECK-NEXT: and.b64 %rd52, %rd51, 255; +; CHECK-NEXT: shl.b64 %rd53, %rd52, 48; +; CHECK-NEXT: or.b64 %rd54, %rd50, %rd53; +; CHECK-NEXT: cvt.u64.u16 %rd55, %rs26; +; CHECK-NEXT: shl.b64 %rd56, %rd55, 56; +; CHECK-NEXT: or.b64 %rd57, %rd54, %rd56; +; CHECK-NEXT: cvt.u32.u16 %r49, %rs15; +; CHECK-NEXT: and.b32 %r50, %r49, 255; +; CHECK-NEXT: cvt.u32.u16 %r51, %rs16; +; CHECK-NEXT: and.b32 %r52, %r51, 255; +; CHECK-NEXT: shl.b32 %r53, %r52, 8; +; CHECK-NEXT: or.b32 %r54, %r50, %r53; +; CHECK-NEXT: cvt.u32.u16 %r55, %rs17; +; CHECK-NEXT: and.b32 %r56, %r55, 255; +; CHECK-NEXT: shl.b32 %r57, %r56, 16; +; CHECK-NEXT: or.b32 %r58, %r54, %r57; +; CHECK-NEXT: cvt.u32.u16 %r59, %rs18; +; CHECK-NEXT: shl.b32 %r60, %r59, 24; +; CHECK-NEXT: or.b32 %r61, %r58, %r60; +; CHECK-NEXT: cvt.u32.u16 %r62, %rs11; +; CHECK-NEXT: and.b32 %r63, %r62, 255; +; CHECK-NEXT: cvt.u32.u16 %r64, %rs12; +; CHECK-NEXT: and.b32 %r65, %r64, 255; +; CHECK-NEXT: shl.b32 %r66, %r65, 8; +; CHECK-NEXT: or.b32 %r67, %r63, %r66; +; CHECK-NEXT: cvt.u32.u16 %r68, %rs13; +; CHECK-NEXT: and.b32 %r69, %r68, 255; +; CHECK-NEXT: shl.b32 %r70, %r69, 16; +; CHECK-NEXT: 
or.b32 %r71, %r67, %r70; +; CHECK-NEXT: cvt.u32.u16 %r72, %rs14; +; CHECK-NEXT: shl.b32 %r73, %r72, 24; +; CHECK-NEXT: or.b32 %r74, %r71, %r73; +; CHECK-NEXT: cvt.u32.u16 %r75, %rs6; +; CHECK-NEXT: and.b32 %r76, %r75, 255; +; CHECK-NEXT: cvt.u32.u16 %r77, %rs7; +; CHECK-NEXT: and.b32 %r78, %r77, 255; +; CHECK-NEXT: shl.b32 %r79, %r78, 8; +; CHECK-NEXT: or.b32 %r80, %r76, %r79; +; CHECK-NEXT: cvt.u32.u16 %r81, %rs8; +; CHECK-NEXT: and.b32 %r82, %r81, 255; +; CHECK-NEXT: shl.b32 %r83, %r82, 16; +; CHECK-NEXT: or.b32 %r84, %r80, %r83; +; CHECK-NEXT: cvt.u32.u16 %r85, %rs9; +; CHECK-NEXT: shl.b32 %r86, %r85, 24; +; CHECK-NEXT: or.b32 %r87, %r84, %r86; +; CHECK-NEXT: cvt.u32.u16 %r88, %rs2; +; CHECK-NEXT: and.b32 %r89, %r88, 255; +; CHECK-NEXT: cvt.u32.u16 %r90, %rs3; +; CHECK-NEXT: and.b32 %r91, %r90, 255; +; CHECK-NEXT: shl.b32 %r92, %r91, 8; +; CHECK-NEXT: or.b32 %r93, %r89, %r92; +; CHECK-NEXT: cvt.u32.u16 %r94, %rs4; +; CHECK-NEXT: and.b32 %r95, %r94, 255; +; CHECK-NEXT: shl.b32 %r96, %r95, 16; +; CHECK-NEXT: or.b32 %r97, %r93, %r96; +; CHECK-NEXT: cvt.u32.u16 %r98, %rs5; +; CHECK-NEXT: shl.b32 %r99, %r98, 24; +; CHECK-NEXT: or.b32 %r100, %r97, %r99; +; CHECK-NEXT: st.param.b8 [func_retval0], %r100; +; CHECK-NEXT: shr.u32 %r101, %r100, 8; +; CHECK-NEXT: st.param.b8 [func_retval0+1], %r101; +; CHECK-NEXT: shr.u32 %r102, %r100, 16; +; CHECK-NEXT: st.param.b8 [func_retval0+2], %r102; +; CHECK-NEXT: and.b32 %r103, %r98, 255; +; CHECK-NEXT: st.param.b8 [func_retval0+3], %r103; +; CHECK-NEXT: st.param.b8 [func_retval0+4], %r87; +; CHECK-NEXT: shr.u32 %r104, %r87, 8; +; CHECK-NEXT: st.param.b8 [func_retval0+5], %r104; +; CHECK-NEXT: shr.u32 %r105, %r87, 16; +; CHECK-NEXT: st.param.b8 [func_retval0+6], %r105; +; CHECK-NEXT: and.b32 %r106, %r85, 255; +; CHECK-NEXT: st.param.b8 [func_retval0+7], %r106; +; CHECK-NEXT: st.param.b8 [func_retval0+8], %rs10; +; CHECK-NEXT: st.param.b8 [func_retval0+9], %r74; +; CHECK-NEXT: shr.u32 %r107, %r74, 8; +; CHECK-NEXT: st.param.b8 
[func_retval0+10], %r107; +; CHECK-NEXT: shr.u32 %r108, %r74, 16; +; CHECK-NEXT: st.param.b8 [func_retval0+11], %r108; +; CHECK-NEXT: and.b32 %r109, %r72, 255; +; CHECK-NEXT: st.param.b8 [func_retval0+12], %r109; +; CHECK-NEXT: st.param.b8 [func_retval0+13], %r61; +; CHECK-NEXT: shr.u32 %r110, %r61, 8; +; CHECK-NEXT: st.param.b8 [func_retval0+14], %r110; +; CHECK-NEXT: shr.u32 %r111, %r61, 16; +; CHECK-NEXT: st.param.b8 [func_retval0+15], %r111; +; CHECK-NEXT: and.b32 %r112, %r59, 255; +; CHECK-NEXT: st.param.b8 [func_retval0+16], %r112; +; CHECK-NEXT: st.param.b8 [func_retval0+17], %rd57; +; CHECK-NEXT: shr.u64 %rd58, %rd57, 8; +; CHECK-NEXT: st.param.b8 [func_retval0+18], %rd58; +; CHECK-NEXT: shr.u64 %rd59, %rd57, 16; +; CHECK-NEXT: st.param.b8 [func_retval0+19], %rd59; +; CHECK-NEXT: shr.u64 %rd60, %rd57, 24; +; CHECK-NEXT: st.param.b8 [func_retval0+20], %rd60; +; CHECK-NEXT: shr.u64 %rd61, %rd57, 32; +; CHECK-NEXT: st.param.b8 [func_retval0+21], %rd61; +; CHECK-NEXT: shr.u64 %rd62, %rd57, 40; +; CHECK-NEXT: st.param.b8 [func_retval0+22], %rd62; +; CHECK-NEXT: shr.u64 %rd63, %rd57, 48; +; CHECK-NEXT: st.param.b8 [func_retval0+23], %rd63; +; CHECK-NEXT: shr.u64 %rd64, %rd57, 56; +; CHECK-NEXT: st.param.b8 [func_retval0+24], %rd64; +; CHECK-NEXT: ret; %r = tail call %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a); ret %s_i8i32x4p %r; } ; Check that we can vectorize loads that span multiple aggregate fields. 
-; CHECK:.visible .func (.param .align 16 .b8 func_retval0[80]) -; CHECK-LABEL: test_s_crossfield( -; CHECK: .param .align 16 .b8 test_s_crossfield_param_0[80] -; CHECK: ld.param.b32 [[E15:%r[0-9]+]], [test_s_crossfield_param_0+64]; -; CHECK: ld.param.v4.b32 {[[E11:%r[0-9]+]], [[E12:%r[0-9]+]], [[E13:%r[0-9]+]], [[E14:%r[0-9]+]]}, [test_s_crossfield_param_0+48]; -; CHECK: ld.param.v4.b32 {[[E7:%r[0-9]+]], [[E8:%r[0-9]+]], [[E9:%r[0-9]+]], [[E10:%r[0-9]+]]}, [test_s_crossfield_param_0+32]; -; CHECK: ld.param.v4.b32 {[[E3:%r[0-9]+]], [[E4:%r[0-9]+]], [[E5:%r[0-9]+]], [[E6:%r[0-9]+]]}, [test_s_crossfield_param_0+16]; -; CHECK: ld.param.b32 [[E2:%r[0-9]+]], [test_s_crossfield_param_0+8]; -; CHECK: ld.param.v2.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_crossfield_param_0]; -; CHECK: .param .align 16 .b8 param0[80]; -; CHECK: st.param.v2.b32 [param0], {[[E0]], [[E1]]}; -; CHECK: st.param.b32 [param0+8], [[E2]]; -; CHECK: st.param.v4.b32 [param0+16], {[[E3]], [[E4]], [[E5]], [[E6]]}; -; CHECK: st.param.v4.b32 [param0+32], {[[E7]], [[E8]], [[E9]], [[E10]]}; -; CHECK: st.param.v4.b32 [param0+48], {[[E11]], [[E12]], [[E13]], [[E14]]}; -; CHECK: st.param.b32 [param0+64], [[E15]]; -; CHECK: .param .align 16 .b8 retval0[80]; -; CHECK: call.uni (retval0), -; CHECK: test_s_crossfield, -; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0]; -; CHECK: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8]; -; CHECK: ld.param.v4.b32 {[[RE3:%r[0-9]+]], [[RE4:%r[0-9]+]], [[RE5:%r[0-9]+]], [[RE6:%r[0-9]+]]}, [retval0+16]; -; CHECK: ld.param.v4.b32 {[[RE7:%r[0-9]+]], [[RE8:%r[0-9]+]], [[RE9:%r[0-9]+]], [[RE10:%r[0-9]+]]}, [retval0+32]; -; CHECK: ld.param.v4.b32 {[[RE11:%r[0-9]+]], [[RE12:%r[0-9]+]], [[RE13:%r[0-9]+]], [[RE14:%r[0-9]+]]}, [retval0+48]; -; CHECK: ld.param.b32 [[RE15:%r[0-9]+]], [retval0+64]; -; CHECK: st.param.v2.b32 [func_retval0], {[[RE0]], [[RE1]]}; -; CHECK: st.param.b32 [func_retval0+8], [[RE2]]; -; CHECK: st.param.v4.b32 [func_retval0+16], {[[RE3]], 
[[RE4]], [[RE5]], [[RE6]]}; -; CHECK: st.param.v4.b32 [func_retval0+32], {[[RE7]], [[RE8]], [[RE9]], [[RE10]]}; -; CHECK: st.param.v4.b32 [func_retval0+48], {[[RE11]], [[RE12]], [[RE13]], [[RE14]]}; -; CHECK: st.param.b32 [func_retval0+64], [[RE15]]; -; CHECK: ret; define %s_crossfield @test_s_crossfield(%s_crossfield %a) { +; CHECK-LABEL: test_s_crossfield( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<49>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r16, [test_s_crossfield_param_0+64]; +; CHECK-NEXT: ld.param.v4.b32 {%r12, %r13, %r14, %r15}, [test_s_crossfield_param_0+48]; +; CHECK-NEXT: ld.param.v4.b32 {%r8, %r9, %r10, %r11}, [test_s_crossfield_param_0+32]; +; CHECK-NEXT: ld.param.v4.b32 {%r4, %r5, %r6, %r7}, [test_s_crossfield_param_0+16]; +; CHECK-NEXT: ld.param.b32 %r3, [test_s_crossfield_param_0+8]; +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_s_crossfield_param_0]; +; CHECK-NEXT: { // callseq 56, 0 +; CHECK-NEXT: .param .align 16 .b8 param0[80]; +; CHECK-NEXT: st.param.v2.b32 [param0], {%r1, %r2}; +; CHECK-NEXT: st.param.b32 [param0+8], %r3; +; CHECK-NEXT: st.param.v4.b32 [param0+16], {%r4, %r5, %r6, %r7}; +; CHECK-NEXT: st.param.v4.b32 [param0+32], {%r8, %r9, %r10, %r11}; +; CHECK-NEXT: st.param.v4.b32 [param0+48], {%r12, %r13, %r14, %r15}; +; CHECK-NEXT: st.param.b32 [param0+64], %r16; +; CHECK-NEXT: .param .align 16 .b8 retval0[80]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_s_crossfield, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.v2.b32 {%r17, %r18}, [retval0]; +; CHECK-NEXT: ld.param.b32 %r19, [retval0+8]; +; CHECK-NEXT: ld.param.v4.b32 {%r20, %r21, %r22, %r23}, [retval0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r24, %r25, %r26, %r27}, [retval0+32]; +; CHECK-NEXT: ld.param.v4.b32 {%r28, %r29, %r30, %r31}, [retval0+48]; +; CHECK-NEXT: ld.param.b32 %r32, [retval0+64]; +; CHECK-NEXT: } // callseq 56 +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r17, %r18}; +; CHECK-NEXT: st.param.b32 
[func_retval0+8], %r19; +; CHECK-NEXT: st.param.v4.b32 [func_retval0+16], {%r20, %r21, %r22, %r23}; +; CHECK-NEXT: st.param.v4.b32 [func_retval0+32], {%r24, %r25, %r26, %r27}; +; CHECK-NEXT: st.param.v4.b32 [func_retval0+48], {%r28, %r29, %r30, %r31}; +; CHECK-NEXT: st.param.b32 [func_retval0+64], %r32; +; CHECK-NEXT: ret; %r = tail call %s_crossfield @test_s_crossfield(%s_crossfield %a); ret %s_crossfield %r; } diff --git a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll index d5b451dad7bc3..6113e13c6f3f7 100644 --- a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll +++ b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll @@ -116,18 +116,23 @@ define float @reduce_fadd_float(<8 x float> %in) { ; CHECK-LABEL: reduce_fadd_float( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<17>; +; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fadd_float_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_float_param_0]; -; CHECK-NEXT: add.rn.f32 %r9, %r1, 0f00000000; -; CHECK-NEXT: add.rn.f32 %r10, %r9, %r2; -; CHECK-NEXT: add.rn.f32 %r11, %r10, %r3; -; CHECK-NEXT: add.rn.f32 %r12, %r11, %r4; -; CHECK-NEXT: add.rn.f32 %r13, %r12, %r5; -; CHECK-NEXT: add.rn.f32 %r14, %r13, %r6; -; CHECK-NEXT: add.rn.f32 %r15, %r14, %r7; -; CHECK-NEXT: add.rn.f32 %r16, %r15, %r8; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fadd_float_param_0+16]; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fadd_float_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-NEXT: add.rn.f32 %r3, %r1, 0f00000000; +; CHECK-NEXT: add.rn.f32 %r4, %r3, %r2; +; CHECK-NEXT: mov.b64 {%r5, %r6}, %rd2; +; CHECK-NEXT: add.rn.f32 %r7, %r4, %r5; +; CHECK-NEXT: add.rn.f32 %r8, %r7, %r6; +; CHECK-NEXT: mov.b64 {%r9, %r10}, %rd3; +; CHECK-NEXT: add.rn.f32 %r11, %r8, %r9; +; CHECK-NEXT: add.rn.f32 %r12, %r11, %r10; +; CHECK-NEXT: mov.b64 {%r13, %r14}, %rd4; +; 
CHECK-NEXT: add.rn.f32 %r15, %r12, %r13; +; CHECK-NEXT: add.rn.f32 %r16, %r15, %r14; ; CHECK-NEXT: st.param.b32 [func_retval0], %r16; ; CHECK-NEXT: ret; %res = call float @llvm.vector.reduce.fadd(float 0.0, <8 x float> %in) @@ -135,45 +140,93 @@ define float @reduce_fadd_float(<8 x float> %in) { } define float @reduce_fadd_float_reassoc(<8 x float> %in) { -; CHECK-LABEL: reduce_fadd_float_reassoc( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<17>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fadd_float_reassoc_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_float_reassoc_param_0]; -; CHECK-NEXT: add.rn.f32 %r9, %r3, %r7; -; CHECK-NEXT: add.rn.f32 %r10, %r1, %r5; -; CHECK-NEXT: add.rn.f32 %r11, %r4, %r8; -; CHECK-NEXT: add.rn.f32 %r12, %r2, %r6; -; CHECK-NEXT: add.rn.f32 %r13, %r12, %r11; -; CHECK-NEXT: add.rn.f32 %r14, %r10, %r9; -; CHECK-NEXT: add.rn.f32 %r15, %r14, %r13; -; CHECK-NEXT: add.rn.f32 %r16, %r15, 0f00000000; -; CHECK-NEXT: st.param.b32 [func_retval0], %r16; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_fadd_float_reassoc( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<17>; +; CHECK-SM80-NEXT: .reg .b64 %rd<5>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fadd_float_reassoc_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fadd_float_reassoc_param_0]; +; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-SM80-NEXT: add.rn.f32 %r5, %r3, %r1; +; CHECK-SM80-NEXT: mov.b64 {%r6, %r7}, %rd3; +; CHECK-SM80-NEXT: mov.b64 {%r8, %r9}, %rd1; +; CHECK-SM80-NEXT: add.rn.f32 %r10, %r8, %r6; +; CHECK-SM80-NEXT: add.rn.f32 %r11, %r4, %r2; +; CHECK-SM80-NEXT: add.rn.f32 %r12, %r9, %r7; +; CHECK-SM80-NEXT: add.rn.f32 %r13, %r12, %r11; +; CHECK-SM80-NEXT: add.rn.f32 %r14, %r10, %r5; +; CHECK-SM80-NEXT: add.rn.f32 %r15, %r14, %r13; +; CHECK-SM80-NEXT: 
add.rn.f32 %r16, %r15, 0f00000000; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r16; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_fadd_float_reassoc( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<5>; +; CHECK-SM100-NEXT: .reg .b64 %rd<10>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fadd_float_reassoc_param_0+16]; +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fadd_float_reassoc_param_0]; +; CHECK-SM100-NEXT: add.rn.f32x2 %rd5, %rd2, %rd4; +; CHECK-SM100-NEXT: add.rn.f32x2 %rd6, %rd1, %rd3; +; CHECK-SM100-NEXT: add.rn.f32x2 %rd7, %rd6, %rd5; +; CHECK-SM100-NEXT: mov.b64 {_, %r1}, %rd7; +; CHECK-SM100-NEXT: // implicit-def: %r2 +; CHECK-SM100-NEXT: mov.b64 %rd8, {%r1, %r2}; +; CHECK-SM100-NEXT: add.rn.f32x2 %rd9, %rd7, %rd8; +; CHECK-SM100-NEXT: mov.b64 {%r3, _}, %rd9; +; CHECK-SM100-NEXT: add.rn.f32 %r4, %r3, 0f00000000; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r4; +; CHECK-SM100-NEXT: ret; %res = call reassoc float @llvm.vector.reduce.fadd(float 0.0, <8 x float> %in) ret float %res } define float @reduce_fadd_float_reassoc_nonpow2(<7 x float> %in) { -; CHECK-LABEL: reduce_fadd_float_reassoc_nonpow2( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<15>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r7, [reduce_fadd_float_reassoc_nonpow2_param_0+24]; -; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fadd_float_reassoc_nonpow2_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_float_reassoc_nonpow2_param_0]; -; CHECK-NEXT: add.rn.f32 %r8, %r3, %r7; -; CHECK-NEXT: add.rn.f32 %r9, %r1, %r5; -; CHECK-NEXT: add.rn.f32 %r10, %r9, %r8; -; CHECK-NEXT: add.rn.f32 %r11, %r2, %r6; -; CHECK-NEXT: add.rn.f32 %r12, %r11, %r4; -; CHECK-NEXT: add.rn.f32 %r13, %r10, %r12; -; CHECK-NEXT: add.rn.f32 %r14, %r13, 0f00000000; -; CHECK-NEXT: st.param.b32 [func_retval0], %r14; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: 
reduce_fadd_float_reassoc_nonpow2( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<15>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.b32 %r7, [reduce_fadd_float_reassoc_nonpow2_param_0+24]; +; CHECK-SM80-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fadd_float_reassoc_nonpow2_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_float_reassoc_nonpow2_param_0]; +; CHECK-SM80-NEXT: add.rn.f32 %r8, %r3, %r7; +; CHECK-SM80-NEXT: add.rn.f32 %r9, %r1, %r5; +; CHECK-SM80-NEXT: add.rn.f32 %r10, %r9, %r8; +; CHECK-SM80-NEXT: add.rn.f32 %r11, %r2, %r6; +; CHECK-SM80-NEXT: add.rn.f32 %r12, %r11, %r4; +; CHECK-SM80-NEXT: add.rn.f32 %r13, %r10, %r12; +; CHECK-SM80-NEXT: add.rn.f32 %r14, %r13, 0f00000000; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r14; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_fadd_float_reassoc_nonpow2( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<13>; +; CHECK-SM100-NEXT: .reg .b64 %rd<8>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fadd_float_reassoc_nonpow2_param_0+16]; +; CHECK-SM100-NEXT: mov.b64 %rd1, {%r5, %r6}; +; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_float_reassoc_nonpow2_param_0]; +; CHECK-SM100-NEXT: mov.b64 %rd2, {%r1, %r2}; +; CHECK-SM100-NEXT: mov.b64 %rd3, {%r3, %r4}; +; CHECK-SM100-NEXT: ld.param.b32 %r7, [reduce_fadd_float_reassoc_nonpow2_param_0+24]; +; CHECK-SM100-NEXT: mov.b32 %r8, 0f80000000; +; CHECK-SM100-NEXT: mov.b64 %rd4, {%r7, %r8}; +; CHECK-SM100-NEXT: add.rn.f32x2 %rd5, %rd3, %rd4; +; CHECK-SM100-NEXT: add.rn.f32x2 %rd6, %rd2, %rd1; +; CHECK-SM100-NEXT: add.rn.f32x2 %rd7, %rd6, %rd5; +; CHECK-SM100-NEXT: mov.b64 {%r9, %r10}, %rd7; +; CHECK-SM100-NEXT: add.rn.f32 %r11, %r9, %r10; +; CHECK-SM100-NEXT: add.rn.f32 %r12, %r11, 0f00000000; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r12; +; CHECK-SM100-NEXT: ret; %res = call reassoc float 
@llvm.vector.reduce.fadd(float 0.0, <7 x float> %in) ret float %res } @@ -275,17 +328,22 @@ define float @reduce_fmul_float(<8 x float> %in) { ; CHECK-LABEL: reduce_fmul_float( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<16>; +; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmul_float_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_float_param_0]; -; CHECK-NEXT: mul.rn.f32 %r9, %r1, %r2; -; CHECK-NEXT: mul.rn.f32 %r10, %r9, %r3; -; CHECK-NEXT: mul.rn.f32 %r11, %r10, %r4; -; CHECK-NEXT: mul.rn.f32 %r12, %r11, %r5; -; CHECK-NEXT: mul.rn.f32 %r13, %r12, %r6; -; CHECK-NEXT: mul.rn.f32 %r14, %r13, %r7; -; CHECK-NEXT: mul.rn.f32 %r15, %r14, %r8; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmul_float_param_0+16]; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmul_float_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-NEXT: mul.rn.f32 %r3, %r1, %r2; +; CHECK-NEXT: mov.b64 {%r4, %r5}, %rd2; +; CHECK-NEXT: mul.rn.f32 %r6, %r3, %r4; +; CHECK-NEXT: mul.rn.f32 %r7, %r6, %r5; +; CHECK-NEXT: mov.b64 {%r8, %r9}, %rd3; +; CHECK-NEXT: mul.rn.f32 %r10, %r7, %r8; +; CHECK-NEXT: mul.rn.f32 %r11, %r10, %r9; +; CHECK-NEXT: mov.b64 {%r12, %r13}, %rd4; +; CHECK-NEXT: mul.rn.f32 %r14, %r11, %r12; +; CHECK-NEXT: mul.rn.f32 %r15, %r14, %r13; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; ; CHECK-NEXT: ret; %res = call float @llvm.vector.reduce.fmul(float 1.0, <8 x float> %in) @@ -293,43 +351,89 @@ define float @reduce_fmul_float(<8 x float> %in) { } define float @reduce_fmul_float_reassoc(<8 x float> %in) { -; CHECK-LABEL: reduce_fmul_float_reassoc( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<16>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmul_float_reassoc_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_float_reassoc_param_0]; -; CHECK-NEXT: mul.rn.f32 %r9, %r3, %r7; -; 
CHECK-NEXT: mul.rn.f32 %r10, %r1, %r5; -; CHECK-NEXT: mul.rn.f32 %r11, %r4, %r8; -; CHECK-NEXT: mul.rn.f32 %r12, %r2, %r6; -; CHECK-NEXT: mul.rn.f32 %r13, %r12, %r11; -; CHECK-NEXT: mul.rn.f32 %r14, %r10, %r9; -; CHECK-NEXT: mul.rn.f32 %r15, %r14, %r13; -; CHECK-NEXT: st.param.b32 [func_retval0], %r15; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_fmul_float_reassoc( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<16>; +; CHECK-SM80-NEXT: .reg .b64 %rd<5>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmul_float_reassoc_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmul_float_reassoc_param_0]; +; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-SM80-NEXT: mul.rn.f32 %r5, %r3, %r1; +; CHECK-SM80-NEXT: mov.b64 {%r6, %r7}, %rd3; +; CHECK-SM80-NEXT: mov.b64 {%r8, %r9}, %rd1; +; CHECK-SM80-NEXT: mul.rn.f32 %r10, %r8, %r6; +; CHECK-SM80-NEXT: mul.rn.f32 %r11, %r4, %r2; +; CHECK-SM80-NEXT: mul.rn.f32 %r12, %r9, %r7; +; CHECK-SM80-NEXT: mul.rn.f32 %r13, %r12, %r11; +; CHECK-SM80-NEXT: mul.rn.f32 %r14, %r10, %r5; +; CHECK-SM80-NEXT: mul.rn.f32 %r15, %r14, %r13; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_fmul_float_reassoc( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<4>; +; CHECK-SM100-NEXT: .reg .b64 %rd<10>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmul_float_reassoc_param_0+16]; +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmul_float_reassoc_param_0]; +; CHECK-SM100-NEXT: mul.rn.f32x2 %rd5, %rd2, %rd4; +; CHECK-SM100-NEXT: mul.rn.f32x2 %rd6, %rd1, %rd3; +; CHECK-SM100-NEXT: mul.rn.f32x2 %rd7, %rd6, %rd5; +; CHECK-SM100-NEXT: mov.b64 {_, %r1}, %rd7; +; CHECK-SM100-NEXT: // implicit-def: %r2 +; CHECK-SM100-NEXT: mov.b64 %rd8, {%r1, %r2}; +; CHECK-SM100-NEXT: mul.rn.f32x2 %rd9, 
%rd7, %rd8; +; CHECK-SM100-NEXT: mov.b64 {%r3, _}, %rd9; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-SM100-NEXT: ret; %res = call reassoc float @llvm.vector.reduce.fmul(float 1.0, <8 x float> %in) ret float %res } define float @reduce_fmul_float_reassoc_nonpow2(<7 x float> %in) { -; CHECK-LABEL: reduce_fmul_float_reassoc_nonpow2( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<14>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r7, [reduce_fmul_float_reassoc_nonpow2_param_0+24]; -; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmul_float_reassoc_nonpow2_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_float_reassoc_nonpow2_param_0]; -; CHECK-NEXT: mul.rn.f32 %r8, %r3, %r7; -; CHECK-NEXT: mul.rn.f32 %r9, %r1, %r5; -; CHECK-NEXT: mul.rn.f32 %r10, %r9, %r8; -; CHECK-NEXT: mul.rn.f32 %r11, %r2, %r6; -; CHECK-NEXT: mul.rn.f32 %r12, %r11, %r4; -; CHECK-NEXT: mul.rn.f32 %r13, %r10, %r12; -; CHECK-NEXT: st.param.b32 [func_retval0], %r13; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_fmul_float_reassoc_nonpow2( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<14>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.b32 %r7, [reduce_fmul_float_reassoc_nonpow2_param_0+24]; +; CHECK-SM80-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmul_float_reassoc_nonpow2_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_float_reassoc_nonpow2_param_0]; +; CHECK-SM80-NEXT: mul.rn.f32 %r8, %r3, %r7; +; CHECK-SM80-NEXT: mul.rn.f32 %r9, %r1, %r5; +; CHECK-SM80-NEXT: mul.rn.f32 %r10, %r9, %r8; +; CHECK-SM80-NEXT: mul.rn.f32 %r11, %r2, %r6; +; CHECK-SM80-NEXT: mul.rn.f32 %r12, %r11, %r4; +; CHECK-SM80-NEXT: mul.rn.f32 %r13, %r10, %r12; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r13; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_fmul_float_reassoc_nonpow2( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<12>; +; CHECK-SM100-NEXT: .reg .b64 %rd<8>; +; 
CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmul_float_reassoc_nonpow2_param_0+16]; +; CHECK-SM100-NEXT: mov.b64 %rd1, {%r5, %r6}; +; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_float_reassoc_nonpow2_param_0]; +; CHECK-SM100-NEXT: mov.b64 %rd2, {%r1, %r2}; +; CHECK-SM100-NEXT: mov.b64 %rd3, {%r3, %r4}; +; CHECK-SM100-NEXT: ld.param.b32 %r7, [reduce_fmul_float_reassoc_nonpow2_param_0+24]; +; CHECK-SM100-NEXT: mov.b32 %r8, 0f3F800000; +; CHECK-SM100-NEXT: mov.b64 %rd4, {%r7, %r8}; +; CHECK-SM100-NEXT: mul.rn.f32x2 %rd5, %rd3, %rd4; +; CHECK-SM100-NEXT: mul.rn.f32x2 %rd6, %rd2, %rd1; +; CHECK-SM100-NEXT: mul.rn.f32x2 %rd7, %rd6, %rd5; +; CHECK-SM100-NEXT: mov.b64 {%r9, %r10}, %rd7; +; CHECK-SM100-NEXT: mul.rn.f32 %r11, %r9, %r10; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r11; +; CHECK-SM100-NEXT: ret; %res = call reassoc float @llvm.vector.reduce.fmul(float 1.0, <7 x float> %in) ret float %res } @@ -405,15 +509,20 @@ define float @reduce_fmax_float(<8 x float> %in) { ; CHECK-LABEL: reduce_fmax_float( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<16>; +; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmax_float_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_param_0]; -; CHECK-NEXT: max.f32 %r9, %r4, %r8; -; CHECK-NEXT: max.f32 %r10, %r2, %r6; -; CHECK-NEXT: max.f32 %r11, %r10, %r9; -; CHECK-NEXT: max.f32 %r12, %r3, %r7; -; CHECK-NEXT: max.f32 %r13, %r1, %r5; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmax_float_param_0+16]; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmax_float_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-NEXT: max.f32 %r5, %r4, %r2; +; CHECK-NEXT: mov.b64 {%r6, %r7}, %rd3; +; CHECK-NEXT: mov.b64 {%r8, %r9}, %rd1; +; CHECK-NEXT: max.f32 %r10, %r9, %r7; +; CHECK-NEXT: 
max.f32 %r11, %r10, %r5; +; CHECK-NEXT: max.f32 %r12, %r3, %r1; +; CHECK-NEXT: max.f32 %r13, %r8, %r6; ; CHECK-NEXT: max.f32 %r14, %r13, %r12; ; CHECK-NEXT: max.f32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; @@ -427,15 +536,20 @@ define float @reduce_fmax_float_reassoc(<8 x float> %in) { ; CHECK-LABEL: reduce_fmax_float_reassoc( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<16>; +; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmax_float_reassoc_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_reassoc_param_0]; -; CHECK-NEXT: max.f32 %r9, %r4, %r8; -; CHECK-NEXT: max.f32 %r10, %r2, %r6; -; CHECK-NEXT: max.f32 %r11, %r10, %r9; -; CHECK-NEXT: max.f32 %r12, %r3, %r7; -; CHECK-NEXT: max.f32 %r13, %r1, %r5; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmax_float_reassoc_param_0+16]; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmax_float_reassoc_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-NEXT: max.f32 %r5, %r4, %r2; +; CHECK-NEXT: mov.b64 {%r6, %r7}, %rd3; +; CHECK-NEXT: mov.b64 {%r8, %r9}, %rd1; +; CHECK-NEXT: max.f32 %r10, %r9, %r7; +; CHECK-NEXT: max.f32 %r11, %r10, %r5; +; CHECK-NEXT: max.f32 %r12, %r3, %r1; +; CHECK-NEXT: max.f32 %r13, %r8, %r6; ; CHECK-NEXT: max.f32 %r14, %r13, %r12; ; CHECK-NEXT: max.f32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; @@ -537,15 +651,20 @@ define float @reduce_fmin_float(<8 x float> %in) { ; CHECK-LABEL: reduce_fmin_float( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<16>; +; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmin_float_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_param_0]; -; CHECK-NEXT: min.f32 %r9, %r4, %r8; -; CHECK-NEXT: min.f32 %r10, %r2, %r6; -; CHECK-NEXT: min.f32 %r11, 
%r10, %r9; -; CHECK-NEXT: min.f32 %r12, %r3, %r7; -; CHECK-NEXT: min.f32 %r13, %r1, %r5; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmin_float_param_0+16]; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmin_float_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-NEXT: min.f32 %r5, %r4, %r2; +; CHECK-NEXT: mov.b64 {%r6, %r7}, %rd3; +; CHECK-NEXT: mov.b64 {%r8, %r9}, %rd1; +; CHECK-NEXT: min.f32 %r10, %r9, %r7; +; CHECK-NEXT: min.f32 %r11, %r10, %r5; +; CHECK-NEXT: min.f32 %r12, %r3, %r1; +; CHECK-NEXT: min.f32 %r13, %r8, %r6; ; CHECK-NEXT: min.f32 %r14, %r13, %r12; ; CHECK-NEXT: min.f32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; @@ -559,15 +678,20 @@ define float @reduce_fmin_float_reassoc(<8 x float> %in) { ; CHECK-LABEL: reduce_fmin_float_reassoc( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<16>; +; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmin_float_reassoc_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_reassoc_param_0]; -; CHECK-NEXT: min.f32 %r9, %r4, %r8; -; CHECK-NEXT: min.f32 %r10, %r2, %r6; -; CHECK-NEXT: min.f32 %r11, %r10, %r9; -; CHECK-NEXT: min.f32 %r12, %r3, %r7; -; CHECK-NEXT: min.f32 %r13, %r1, %r5; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmin_float_reassoc_param_0+16]; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmin_float_reassoc_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-NEXT: min.f32 %r5, %r4, %r2; +; CHECK-NEXT: mov.b64 {%r6, %r7}, %rd3; +; CHECK-NEXT: mov.b64 {%r8, %r9}, %rd1; +; CHECK-NEXT: min.f32 %r10, %r9, %r7; +; CHECK-NEXT: min.f32 %r11, %r10, %r5; +; CHECK-NEXT: min.f32 %r12, %r3, %r1; +; CHECK-NEXT: min.f32 %r13, %r8, %r6; ; CHECK-NEXT: min.f32 %r14, %r13, %r12; ; CHECK-NEXT: min.f32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; 
@@ -669,15 +793,20 @@ define float @reduce_fmaximum_float(<8 x float> %in) { ; CHECK-LABEL: reduce_fmaximum_float( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<16>; +; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmaximum_float_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_param_0]; -; CHECK-NEXT: max.NaN.f32 %r9, %r4, %r8; -; CHECK-NEXT: max.NaN.f32 %r10, %r2, %r6; -; CHECK-NEXT: max.NaN.f32 %r11, %r10, %r9; -; CHECK-NEXT: max.NaN.f32 %r12, %r3, %r7; -; CHECK-NEXT: max.NaN.f32 %r13, %r1, %r5; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmaximum_float_param_0+16]; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmaximum_float_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-NEXT: max.NaN.f32 %r5, %r4, %r2; +; CHECK-NEXT: mov.b64 {%r6, %r7}, %rd3; +; CHECK-NEXT: mov.b64 {%r8, %r9}, %rd1; +; CHECK-NEXT: max.NaN.f32 %r10, %r9, %r7; +; CHECK-NEXT: max.NaN.f32 %r11, %r10, %r5; +; CHECK-NEXT: max.NaN.f32 %r12, %r3, %r1; +; CHECK-NEXT: max.NaN.f32 %r13, %r8, %r6; ; CHECK-NEXT: max.NaN.f32 %r14, %r13, %r12; ; CHECK-NEXT: max.NaN.f32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; @@ -691,15 +820,20 @@ define float @reduce_fmaximum_float_reassoc(<8 x float> %in) { ; CHECK-LABEL: reduce_fmaximum_float_reassoc( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<16>; +; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmaximum_float_reassoc_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_reassoc_param_0]; -; CHECK-NEXT: max.NaN.f32 %r9, %r4, %r8; -; CHECK-NEXT: max.NaN.f32 %r10, %r2, %r6; -; CHECK-NEXT: max.NaN.f32 %r11, %r10, %r9; -; CHECK-NEXT: max.NaN.f32 %r12, %r3, %r7; -; CHECK-NEXT: max.NaN.f32 %r13, %r1, %r5; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, 
[reduce_fmaximum_float_reassoc_param_0+16]; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmaximum_float_reassoc_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-NEXT: max.NaN.f32 %r5, %r4, %r2; +; CHECK-NEXT: mov.b64 {%r6, %r7}, %rd3; +; CHECK-NEXT: mov.b64 {%r8, %r9}, %rd1; +; CHECK-NEXT: max.NaN.f32 %r10, %r9, %r7; +; CHECK-NEXT: max.NaN.f32 %r11, %r10, %r5; +; CHECK-NEXT: max.NaN.f32 %r12, %r3, %r1; +; CHECK-NEXT: max.NaN.f32 %r13, %r8, %r6; ; CHECK-NEXT: max.NaN.f32 %r14, %r13, %r12; ; CHECK-NEXT: max.NaN.f32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; @@ -801,15 +935,20 @@ define float @reduce_fminimum_float(<8 x float> %in) { ; CHECK-LABEL: reduce_fminimum_float( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<16>; +; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fminimum_float_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_param_0]; -; CHECK-NEXT: min.NaN.f32 %r9, %r4, %r8; -; CHECK-NEXT: min.NaN.f32 %r10, %r2, %r6; -; CHECK-NEXT: min.NaN.f32 %r11, %r10, %r9; -; CHECK-NEXT: min.NaN.f32 %r12, %r3, %r7; -; CHECK-NEXT: min.NaN.f32 %r13, %r1, %r5; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fminimum_float_param_0+16]; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fminimum_float_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-NEXT: min.NaN.f32 %r5, %r4, %r2; +; CHECK-NEXT: mov.b64 {%r6, %r7}, %rd3; +; CHECK-NEXT: mov.b64 {%r8, %r9}, %rd1; +; CHECK-NEXT: min.NaN.f32 %r10, %r9, %r7; +; CHECK-NEXT: min.NaN.f32 %r11, %r10, %r5; +; CHECK-NEXT: min.NaN.f32 %r12, %r3, %r1; +; CHECK-NEXT: min.NaN.f32 %r13, %r8, %r6; ; CHECK-NEXT: min.NaN.f32 %r14, %r13, %r12; ; CHECK-NEXT: min.NaN.f32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; @@ -823,15 +962,20 @@ define float 
@reduce_fminimum_float_reassoc(<8 x float> %in) { ; CHECK-LABEL: reduce_fminimum_float_reassoc( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<16>; +; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fminimum_float_reassoc_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_reassoc_param_0]; -; CHECK-NEXT: min.NaN.f32 %r9, %r4, %r8; -; CHECK-NEXT: min.NaN.f32 %r10, %r2, %r6; -; CHECK-NEXT: min.NaN.f32 %r11, %r10, %r9; -; CHECK-NEXT: min.NaN.f32 %r12, %r3, %r7; -; CHECK-NEXT: min.NaN.f32 %r13, %r1, %r5; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fminimum_float_reassoc_param_0+16]; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fminimum_float_reassoc_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-NEXT: min.NaN.f32 %r5, %r4, %r2; +; CHECK-NEXT: mov.b64 {%r6, %r7}, %rd3; +; CHECK-NEXT: mov.b64 {%r8, %r9}, %rd1; +; CHECK-NEXT: min.NaN.f32 %r10, %r9, %r7; +; CHECK-NEXT: min.NaN.f32 %r11, %r10, %r5; +; CHECK-NEXT: min.NaN.f32 %r12, %r3, %r1; +; CHECK-NEXT: min.NaN.f32 %r13, %r8, %r6; ; CHECK-NEXT: min.NaN.f32 %r14, %r13, %r12; ; CHECK-NEXT: min.NaN.f32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; diff --git a/llvm/test/CodeGen/NVPTX/vec-param-load.ll b/llvm/test/CodeGen/NVPTX/vec-param-load.ll index 765e50554c8d2..4320c0d312622 100644 --- a/llvm/test/CodeGen/NVPTX/vec-param-load.ll +++ b/llvm/test/CodeGen/NVPTX/vec-param-load.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} @@ -5,75 +6,105 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 define <16 x float> @test_v16f32(<16 x float> %a) { ; CHECK-LABEL: test_v16f32( -; CHECK-DAG: 
ld.param.v4.b32 {[[V_12_15:(%r[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+48]; -; CHECK-DAG: ld.param.v4.b32 {[[V_8_11:(%r[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+32]; -; CHECK-DAG: ld.param.v4.b32 {[[V_4_7:(%r[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+16]; -; CHECK-DAG: ld.param.v4.b32 {[[V_0_3:(%r[0-9]+[, ]*){4}]]}, [test_v16f32_param_0]; -; CHECK-DAG: st.param.v4.b32 [func_retval0], {[[V_0_3]]} -; CHECK-DAG: st.param.v4.b32 [func_retval0+16], {[[V_4_7]]} -; CHECK-DAG: st.param.v4.b32 [func_retval0+32], {[[V_8_11]]} -; CHECK-DAG: st.param.v4.b32 [func_retval0+48], {[[V_12_15]]} -; CHECK: ret; +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<9>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_v16f32_param_0+48]; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_v16f32_param_0+32]; +; CHECK-NEXT: ld.param.v2.b64 {%rd5, %rd6}, [test_v16f32_param_0+16]; +; CHECK-NEXT: ld.param.v2.b64 {%rd7, %rd8}, [test_v16f32_param_0]; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd7, %rd8}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0+16], {%rd5, %rd6}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0+32], {%rd3, %rd4}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0+48], {%rd1, %rd2}; +; CHECK-NEXT: ret; ret <16 x float> %a } define <8 x float> @test_v8f32(<8 x float> %a) { ; CHECK-LABEL: test_v8f32( -; CHECK-DAG: ld.param.v4.b32 {[[V_4_7:(%r[0-9]+[, ]*){4}]]}, [test_v8f32_param_0+16]; -; CHECK-DAG: ld.param.v4.b32 {[[V_0_3:(%r[0-9]+[, ]*){4}]]}, [test_v8f32_param_0]; -; CHECK-DAG: st.param.v4.b32 [func_retval0], {[[V_0_3]]} -; CHECK-DAG: st.param.v4.b32 [func_retval0+16], {[[V_4_7]]} -; CHECK: ret; +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_v8f32_param_0+16]; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_v8f32_param_0]; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd4}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0+16], {%rd1, %rd2}; +; 
CHECK-NEXT: ret; ret <8 x float> %a } define <4 x float> @test_v4f32(<4 x float> %a) { ; CHECK-LABEL: test_v4f32( -; CHECK-DAG: ld.param.v4.b32 {[[V_0_3:(%r[0-9]+[, ]*){4}]]}, [test_v4f32_param_0]; -; CHECK-DAG: st.param.v4.b32 [func_retval0], {[[V_0_3]]} -; CHECK: ret; +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_v4f32_param_0]; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: ret; ret <4 x float> %a } define <2 x float> @test_v2f32(<2 x float> %a) { ; CHECK-LABEL: test_v2f32( -; CHECK-DAG: ld.param.v2.b32 {[[V_0_3:(%r[0-9]+[, ]*){2}]]}, [test_v2f32_param_0]; -; CHECK-DAG: st.param.v2.b32 [func_retval0], {[[V_0_3]]} -; CHECK: ret; +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_v2f32_param_0]; +; CHECK-NEXT: mov.b64 %rd1, {%r1, %r2}; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd1; +; CHECK-NEXT: ret; ret <2 x float> %a } ; Oddly shaped vectors should not load any extra elements. 
define <3 x float> @test_v3f32(<3 x float> %a) { ; CHECK-LABEL: test_v3f32( -; CHECK-DAG: ld.param.b32 [[V_2:%r[0-9]+]], [test_v3f32_param_0+8]; -; CHECK-DAG: ld.param.v2.b32 {[[V_0_1:(%r[0-9]+[, ]*){2}]]}, [test_v3f32_param_0]; -; CHECK-DAG: st.param.v2.b32 [func_retval0], {[[V_0_1]]} -; CHECK-DAG: st.param.b32 [func_retval0+8], [[V_2]] -; CHECK: ret; +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_v3f32_param_0+8]; +; CHECK-NEXT: ld.param.v2.b32 {%r2, %r3}, [test_v3f32_param_0]; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r3}; +; CHECK-NEXT: st.param.b32 [func_retval0+8], %r1; +; CHECK-NEXT: ret; ret <3 x float> %a } define <8 x i64> @test_v8i64(<8 x i64> %a) { ; CHECK-LABEL: test_v8i64( -; CHECK-DAG: ld.param.v2.b64 {[[V_6_7:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0+48]; -; CHECK-DAG: ld.param.v2.b64 {[[V_4_5:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0+32]; -; CHECK-DAG: ld.param.v2.b64 {[[V_2_3:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0+16]; -; CHECK-DAG: ld.param.v2.b64 {[[V_0_1:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0]; -; CHECK-DAG: st.param.v2.b64 [func_retval0], {[[V_0_1]]} -; CHECK-DAG: st.param.v2.b64 [func_retval0+16], {[[V_2_3]]} -; CHECK-DAG: st.param.v2.b64 [func_retval0+32], {[[V_4_5]]} -; CHECK-DAG: st.param.v2.b64 [func_retval0+48], {[[V_6_7]]} -; CHECK: ret; +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<9>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_v8i64_param_0+48]; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_v8i64_param_0+32]; +; CHECK-NEXT: ld.param.v2.b64 {%rd5, %rd6}, [test_v8i64_param_0+16]; +; CHECK-NEXT: ld.param.v2.b64 {%rd7, %rd8}, [test_v8i64_param_0]; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd7, %rd8}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0+16], {%rd5, %rd6}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0+32], {%rd3, %rd4}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0+48], {%rd1, 
%rd2}; +; CHECK-NEXT: ret; ret <8 x i64> %a } define <16 x i16> @test_v16i16(<16 x i16> %a) { ; CHECK-LABEL: test_v16i16( -; CHECK-DAG: ld.param.v4.b32 {[[V_8_15:(%r[0-9]+[, ]*){4}]]}, [test_v16i16_param_0+16]; -; CHECK-DAG: ld.param.v4.b32 {[[V_0_7:(%r[0-9]+[, ]*){4}]]}, [test_v16i16_param_0]; -; CHECK-DAG: st.param.v4.b32 [func_retval0], {[[V_0_7]]} -; CHECK-DAG: st.param.v4.b32 [func_retval0+16], {[[V_8_15]]} -; CHECK: ret; +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_v16i16_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [test_v16i16_param_0]; +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r5, %r6, %r7, %r8}; +; CHECK-NEXT: st.param.v4.b32 [func_retval0+16], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ret; ret <16 x i16> %a } diff --git a/llvm/test/CodeGen/NVPTX/vector-args.ll b/llvm/test/CodeGen/NVPTX/vector-args.ll index b08c19206a0b8..577a45288a104 100644 --- a/llvm/test/CodeGen/NVPTX/vector-args.ll +++ b/llvm/test/CodeGen/NVPTX/vector-args.ll @@ -1,10 +1,19 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} define float @foo(<2 x float> %a) { -; CHECK: .func (.param .b32 func_retval0) foo -; CHECK: .param .align 8 .b8 foo_param_0[8] -; CHECK: ld.param.v2.b32 {%r{{[0-9]+}}, %r{{[0-9]+}}} +; CHECK-LABEL: foo( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [foo_param_0]; +; CHECK-NEXT: mul.rn.f32 %r3, %r2, %r2; +; CHECK-NEXT: mul.rn.f32 %r4, %r1, %r1; +; CHECK-NEXT: add.rn.f32 %r5, %r4, %r3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r5; +; CHECK-NEXT: ret; %t1 = fmul <2 x float> %a, %a %t2 = extractelement <2 x float> %t1, i32 0 %t3 = extractelement <2 x float> %t1, i32 1 @@ -14,9 
+23,19 @@ define float @foo(<2 x float> %a) { define float @bar(<4 x float> %a) { -; CHECK: .func (.param .b32 func_retval0) bar -; CHECK: .param .align 16 .b8 bar_param_0[16] -; CHECK: ld.param.v4.b32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} +; CHECK-LABEL: bar( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<6>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [bar_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-NEXT: mul.rn.f32 %r3, %r2, %r2; +; CHECK-NEXT: mul.rn.f32 %r4, %r1, %r1; +; CHECK-NEXT: add.rn.f32 %r5, %r4, %r3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r5; +; CHECK-NEXT: ret; %t1 = fmul <4 x float> %a, %a %t2 = extractelement <4 x float> %t1, i32 0 %t3 = extractelement <4 x float> %t1, i32 1 @@ -26,10 +45,23 @@ define float @bar(<4 x float> %a) { define <4 x float> @baz(<4 x float> %a) { -; CHECK: .func (.param .align 16 .b8 func_retval0[16]) baz -; CHECK: .param .align 16 .b8 baz_param_0[16] -; CHECK: ld.param.v4.b32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} -; CHECK: st.param.v4.b32 [func_retval0], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} +; CHECK-LABEL: baz( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [baz_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NEXT: mul.rn.f32 %r3, %r2, %r2; +; CHECK-NEXT: mul.rn.f32 %r4, %r1, %r1; +; CHECK-NEXT: mov.b64 %rd3, {%r4, %r3}; +; CHECK-NEXT: mov.b64 {%r5, %r6}, %rd1; +; CHECK-NEXT: mul.rn.f32 %r7, %r6, %r6; +; CHECK-NEXT: mul.rn.f32 %r8, %r5, %r5; +; CHECK-NEXT: mov.b64 %rd4, {%r8, %r7}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd3}; +; CHECK-NEXT: ret; %t1 = fmul <4 x float> %a, %a ret <4 x float> %t1 } diff --git a/llvm/test/CodeGen/NVPTX/vector-loads.ll b/llvm/test/CodeGen/NVPTX/vector-loads.ll index 88ff59407a143..e04bf8ca34544 100644 --- 
a/llvm/test/CodeGen/NVPTX/vector-loads.ll +++ b/llvm/test/CodeGen/NVPTX/vector-loads.ll @@ -101,18 +101,18 @@ define void @foo_complex(ptr nocapture readonly align 16 dereferenceable(1342177 define void @extv8f16_global_a16(ptr addrspace(1) noalias readonly align 16 %dst, ptr addrspace(1) noalias readonly align 16 %src) #0 { ; CHECK: ld.global.v4.b32 {%r %v = load <8 x half>, ptr addrspace(1) %src, align 16 -; CHECK: mov.b32 {%rs -; CHECK: mov.b32 {%rs -; CHECK: mov.b32 {%rs -; CHECK: mov.b32 {%rs -; CHECK: cvt.f32.f16 %r{{.*}}, %rs -; CHECK: cvt.f32.f16 %r{{.*}}, %rs -; CHECK: cvt.f32.f16 %r{{.*}}, %rs -; CHECK: cvt.f32.f16 %r{{.*}}, %rs -; CHECK: cvt.f32.f16 %r{{.*}}, %rs -; CHECK: cvt.f32.f16 %r{{.*}}, %rs -; CHECK: cvt.f32.f16 %r{{.*}}, %rs -; CHECK: cvt.f32.f16 %r{{.*}}, %rs +; CHECK-DAG: mov.b32 {%rs +; CHECK-DAG: mov.b32 {%rs +; CHECK-DAG: mov.b32 {%rs +; CHECK-DAG: mov.b32 {%rs +; CHECK-DAG: cvt.f32.f16 %r{{.*}}, %rs +; CHECK-DAG: cvt.f32.f16 %r{{.*}}, %rs +; CHECK-DAG: cvt.f32.f16 %r{{.*}}, %rs +; CHECK-DAG: cvt.f32.f16 %r{{.*}}, %rs +; CHECK-DAG: cvt.f32.f16 %r{{.*}}, %rs +; CHECK-DAG: cvt.f32.f16 %r{{.*}}, %rs +; CHECK-DAG: cvt.f32.f16 %r{{.*}}, %rs +; CHECK-DAG: cvt.f32.f16 %r{{.*}}, %rs %ext = fpext <8 x half> %v to <8 x float> ; CHECK: st.global.v4.b32 ; CHECK: st.global.v4.b32 @@ -151,18 +151,18 @@ define void @extv8f16_global_a4(ptr addrspace(1) noalias readonly align 16 %dst, define void @extv8f16_generic_a16(ptr noalias readonly align 16 %dst, ptr noalias readonly align 16 %src) #0 { ; CHECK: ld.v4.b32 {%r %v = load <8 x half>, ptr %src, align 16 -; CHECK: mov.b32 {%rs -; CHECK: mov.b32 {%rs -; CHECK: mov.b32 {%rs -; CHECK: mov.b32 {%rs -; CHECK: cvt.f32.f16 %r{{.*}}, %rs -; CHECK: cvt.f32.f16 %r{{.*}}, %rs -; CHECK: cvt.f32.f16 %r{{.*}}, %rs -; CHECK: cvt.f32.f16 %r{{.*}}, %rs -; CHECK: cvt.f32.f16 %r{{.*}}, %rs -; CHECK: cvt.f32.f16 %r{{.*}}, %rs -; CHECK: cvt.f32.f16 %r{{.*}}, %rs -; CHECK: cvt.f32.f16 %r{{.*}}, %rs +; CHECK-DAG: mov.b32 {%rs 
+; CHECK-DAG: mov.b32 {%rs +; CHECK-DAG: mov.b32 {%rs +; CHECK-DAG: mov.b32 {%rs +; CHECK-DAG: cvt.f32.f16 %r{{.*}}, %rs +; CHECK-DAG: cvt.f32.f16 %r{{.*}}, %rs +; CHECK-DAG: cvt.f32.f16 %r{{.*}}, %rs +; CHECK-DAG: cvt.f32.f16 %r{{.*}}, %rs +; CHECK-DAG: cvt.f32.f16 %r{{.*}}, %rs +; CHECK-DAG: cvt.f32.f16 %r{{.*}}, %rs +; CHECK-DAG: cvt.f32.f16 %r{{.*}}, %rs +; CHECK-DAG: cvt.f32.f16 %r{{.*}}, %rs %ext = fpext <8 x half> %v to <8 x float> ; CHECK: st.v4.b32 ; CHECK: st.v4.b32 diff --git a/llvm/test/CodeGen/NVPTX/vector-stores.ll b/llvm/test/CodeGen/NVPTX/vector-stores.ll index f3b1015070085..7f3dfb164d8d6 100644 --- a/llvm/test/CodeGen/NVPTX/vector-stores.ll +++ b/llvm/test/CodeGen/NVPTX/vector-stores.ll @@ -1,38 +1,80 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} -; CHECK-LABEL: .visible .func foo1 -; CHECK: st.v2.b32 define void @foo1(<2 x float> %val, ptr %ptr) { +; CHECK-LABEL: foo1( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [foo1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [foo1_param_1]; +; CHECK-NEXT: st.b64 [%rd2], %rd1; +; CHECK-NEXT: ret; store <2 x float> %val, ptr %ptr ret void } -; CHECK-LABEL: .visible .func foo2 -; CHECK: st.v4.b32 define void @foo2(<4 x float> %val, ptr %ptr) { +; CHECK-LABEL: foo2( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [foo2_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NEXT: ld.param.b64 %rd3, [foo2_param_1]; +; CHECK-NEXT: st.v4.b32 [%rd3], {%r3, %r4, %r1, %r2}; +; CHECK-NEXT: ret; store <4 x float> %val, ptr %ptr ret void } -; CHECK-LABEL: .visible .func foo3 -; CHECK: st.v2.b32 
define void @foo3(<2 x i32> %val, ptr %ptr) { +; CHECK-LABEL: foo3( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [foo3_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [foo3_param_1]; +; CHECK-NEXT: st.v2.b32 [%rd1], {%r1, %r2}; +; CHECK-NEXT: ret; store <2 x i32> %val, ptr %ptr ret void } -; CHECK-LABEL: .visible .func foo4 -; CHECK: st.v4.b32 define void @foo4(<4 x i32> %val, ptr %ptr) { +; CHECK-LABEL: foo4( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [foo4_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [foo4_param_1]; +; CHECK-NEXT: st.v4.b32 [%rd1], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ret; store <4 x i32> %val, ptr %ptr ret void } -; CHECK-LABEL: .visible .func v16i8 define void @v16i8(ptr %a, ptr %b) { -; CHECK: ld.v4.b32 -; CHECK: st.v4.b32 +; CHECK-LABEL: v16i8( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [v16i8_param_0]; +; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd2, [v16i8_param_1]; +; CHECK-NEXT: st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ret; %v = load <16 x i8>, ptr %a store <16 x i8> %v, ptr %b ret void