//===-- VEISelLowering.cpp - VE DAG Lowering Implementation ---------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file implements the interfaces that VE uses to lower LLVM code into a // selection DAG. // //===----------------------------------------------------------------------===// #include "VEISelLowering.h" #include "MCTargetDesc/VEMCExpr.h" #include "VEInstrBuilder.h" #include "VEMachineFunctionInfo.h" #include "VERegisterInfo.h" #include "VETargetMachine.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/Module.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" using namespace llvm; #define DEBUG_TYPE "ve-lower" //===----------------------------------------------------------------------===// // Calling Convention Implementation //===----------------------------------------------------------------------===// #include "VEGenCallingConv.inc" CCAssignFn *getReturnCC(CallingConv::ID CallConv) { switch (CallConv) { default: return RetCC_VE_C; case CallingConv::Fast: return RetCC_VE_Fast; } } CCAssignFn *getParamCC(CallingConv::ID CallConv, bool IsVarArg) { if (IsVarArg) return CC_VE2; switch (CallConv) { default: return CC_VE_C; case CallingConv::Fast: return CC_VE_Fast; } } bool VETargetLowering::CanLowerReturn( CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg, const SmallVectorImpl &Outs, LLVMContext &Context) const { CCAssignFn *RetCC = getReturnCC(CallConv); SmallVector RVLocs; CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context); return CCInfo.CheckReturn(Outs, RetCC); } static const MVT AllVectorVTs[] = {MVT::v256i32, MVT::v512i32, MVT::v256i64, MVT::v256f32, MVT::v512f32, MVT::v256f64}; static const MVT AllPackedVTs[] = {MVT::v512i32, MVT::v512f32}; void VETargetLowering::initRegisterClasses() { // Set up the register classes. addRegisterClass(MVT::i32, &VE::I32RegClass); addRegisterClass(MVT::i64, &VE::I64RegClass); addRegisterClass(MVT::f32, &VE::F32RegClass); addRegisterClass(MVT::f64, &VE::I64RegClass); addRegisterClass(MVT::f128, &VE::F128RegClass); if (Subtarget->enableVPU()) { for (MVT VecVT : AllVectorVTs) addRegisterClass(VecVT, &VE::V64RegClass); addRegisterClass(MVT::v256i1, &VE::VMRegClass); addRegisterClass(MVT::v512i1, &VE::VM512RegClass); } } void VETargetLowering::initSPUActions() { const auto &TM = getTargetMachine(); /// Load & Store { // VE doesn't have i1 sign extending load. for (MVT VT : MVT::integer_valuetypes()) { setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); setTruncStoreAction(VT, MVT::i1, Expand); } // VE doesn't have floating point extload/truncstore, so expand them. for (MVT FPVT : MVT::fp_valuetypes()) { for (MVT OtherFPVT : MVT::fp_valuetypes()) { setLoadExtAction(ISD::EXTLOAD, FPVT, OtherFPVT, Expand); setTruncStoreAction(FPVT, OtherFPVT, Expand); } } // VE doesn't have fp128 load/store, so expand them in custom lower. setOperationAction(ISD::LOAD, MVT::f128, Custom); setOperationAction(ISD::STORE, MVT::f128, Custom); /// } Load & Store // Custom legalize address nodes into LO/HI parts. MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0)); setOperationAction(ISD::BlockAddress, PtrVT, Custom); setOperationAction(ISD::GlobalAddress, PtrVT, Custom); setOperationAction(ISD::GlobalTLSAddress, PtrVT, Custom); setOperationAction(ISD::ConstantPool, PtrVT, Custom); setOperationAction(ISD::JumpTable, PtrVT, Custom); /// VAARG handling { setOperationAction(ISD::VASTART, MVT::Other, Custom); // VAARG needs to be lowered to access with 8 bytes alignment. setOperationAction(ISD::VAARG, MVT::Other, Custom); // Use the default implementation. setOperationAction(ISD::VACOPY, MVT::Other, Expand); setOperationAction(ISD::VAEND, MVT::Other, Expand); /// } VAARG handling /// Stack { setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom); // Use the default implementation. setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); /// } Stack /// Branch { // VE doesn't have BRCOND setOperationAction(ISD::BRCOND, MVT::Other, Expand); // BR_JT is not implemented yet. setOperationAction(ISD::BR_JT, MVT::Other, Expand); /// } Branch /// Int Ops { for (MVT IntVT : {MVT::i32, MVT::i64}) { // VE has no REM or DIVREM operations. setOperationAction(ISD::UREM, IntVT, Expand); setOperationAction(ISD::SREM, IntVT, Expand); setOperationAction(ISD::SDIVREM, IntVT, Expand); setOperationAction(ISD::UDIVREM, IntVT, Expand); // VE has no SHL_PARTS/SRA_PARTS/SRL_PARTS operations. setOperationAction(ISD::SHL_PARTS, IntVT, Expand); setOperationAction(ISD::SRA_PARTS, IntVT, Expand); setOperationAction(ISD::SRL_PARTS, IntVT, Expand); // VE has no MULHU/S or U/SMUL_LOHI operations. // TODO: Use MPD instruction to implement SMUL_LOHI for i32 type. setOperationAction(ISD::MULHU, IntVT, Expand); setOperationAction(ISD::MULHS, IntVT, Expand); setOperationAction(ISD::UMUL_LOHI, IntVT, Expand); setOperationAction(ISD::SMUL_LOHI, IntVT, Expand); // VE has no CTTZ, ROTL, ROTR operations. setOperationAction(ISD::CTTZ, IntVT, Expand); setOperationAction(ISD::ROTL, IntVT, Expand); setOperationAction(ISD::ROTR, IntVT, Expand); // VE has 64 bits instruction which works as i64 BSWAP operation. This // instruction works fine as i32 BSWAP operation with an additional // parameter. Use isel patterns to lower BSWAP. setOperationAction(ISD::BSWAP, IntVT, Legal); // VE has only 64 bits instructions which work as i64 BITREVERSE/CTLZ/CTPOP // operations. Use isel patterns for i64, promote for i32. LegalizeAction Act = (IntVT == MVT::i32) ? Promote : Legal; setOperationAction(ISD::BITREVERSE, IntVT, Act); setOperationAction(ISD::CTLZ, IntVT, Act); setOperationAction(ISD::CTLZ_ZERO_UNDEF, IntVT, Act); setOperationAction(ISD::CTPOP, IntVT, Act); // VE has only 64 bits instructions which work as i64 AND/OR/XOR operations. // Use isel patterns for i64, promote for i32. setOperationAction(ISD::AND, IntVT, Act); setOperationAction(ISD::OR, IntVT, Act); setOperationAction(ISD::XOR, IntVT, Act); } /// } Int Ops /// Conversion { // VE doesn't have instructions for fp<->uint, so expand them by llvm setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote); // use i64 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote); // use i64 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand); setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand); // fp16 not supported for (MVT FPVT : MVT::fp_valuetypes()) { setOperationAction(ISD::FP16_TO_FP, FPVT, Expand); setOperationAction(ISD::FP_TO_FP16, FPVT, Expand); } /// } Conversion /// Floating-point Ops { /// Note: Floating-point operations are fneg, fadd, fsub, fmul, fdiv, frem, /// and fcmp. // VE doesn't have following floating point operations. for (MVT VT : MVT::fp_valuetypes()) { setOperationAction(ISD::FNEG, VT, Expand); setOperationAction(ISD::FREM, VT, Expand); } // VE doesn't have fdiv of f128. setOperationAction(ISD::FDIV, MVT::f128, Expand); for (MVT FPVT : {MVT::f32, MVT::f64}) { // f32 and f64 uses ConstantFP. f128 uses ConstantPool. setOperationAction(ISD::ConstantFP, FPVT, Legal); } /// } Floating-point Ops /// Floating-point math functions { // VE doesn't have following floating point math functions. for (MVT VT : MVT::fp_valuetypes()) { setOperationAction(ISD::FABS, VT, Expand); setOperationAction(ISD::FCOPYSIGN, VT, Expand); setOperationAction(ISD::FCOS, VT, Expand); setOperationAction(ISD::FSIN, VT, Expand); setOperationAction(ISD::FSQRT, VT, Expand); } /// } Floating-point math functions /// Atomic instructions { setMaxAtomicSizeInBitsSupported(64); setMinCmpXchgSizeInBits(32); setSupportsUnalignedAtomics(false); // Use custom inserter for ATOMIC_FENCE. setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom); // Other atomic instructions. for (MVT VT : MVT::integer_valuetypes()) { // Support i8/i16 atomic swap. setOperationAction(ISD::ATOMIC_SWAP, VT, Custom); // FIXME: Support "atmam" instructions. setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Expand); setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Expand); setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Expand); setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Expand); // VE doesn't have follwing instructions. setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Expand); setOperationAction(ISD::ATOMIC_LOAD_CLR, VT, Expand); setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Expand); setOperationAction(ISD::ATOMIC_LOAD_NAND, VT, Expand); setOperationAction(ISD::ATOMIC_LOAD_MIN, VT, Expand); setOperationAction(ISD::ATOMIC_LOAD_MAX, VT, Expand); setOperationAction(ISD::ATOMIC_LOAD_UMIN, VT, Expand); setOperationAction(ISD::ATOMIC_LOAD_UMAX, VT, Expand); } /// } Atomic instructions /// SJLJ instructions { setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom); if (TM.Options.ExceptionModel == ExceptionHandling::SjLj) setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume"); /// } SJLJ instructions // Intrinsic instructions setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); } void VETargetLowering::initVPUActions() { for (MVT LegalVecVT : AllVectorVTs) { setOperationAction(ISD::BUILD_VECTOR, LegalVecVT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, LegalVecVT, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT, LegalVecVT, Legal); // Translate all vector instructions with legal element types to VVP_* // nodes. // TODO We will custom-widen into VVP_* nodes in the future. While we are // buildling the infrastructure for this, we only do this for legal vector // VTs. #define HANDLE_VP_TO_VVP(VP_OPC, VVP_NAME) \ setOperationAction(ISD::VP_OPC, LegalVecVT, Custom); #define ADD_VVP_OP(VVP_NAME, ISD_NAME) \ setOperationAction(ISD::ISD_NAME, LegalVecVT, Custom); #include "VVPNodes.def" } for (MVT LegalPackedVT : AllPackedVTs) { setOperationAction(ISD::INSERT_VECTOR_ELT, LegalPackedVT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, LegalPackedVT, Custom); } } SDValue VETargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SDLoc &DL, SelectionDAG &DAG) const { // CCValAssign - represent the assignment of the return value to locations. SmallVector RVLocs; // CCState - Info about the registers and stack slot. CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); // Analyze return values. CCInfo.AnalyzeReturn(Outs, getReturnCC(CallConv)); SDValue Flag; SmallVector RetOps(1, Chain); // Copy the result values into the output registers. for (unsigned i = 0; i != RVLocs.size(); ++i) { CCValAssign &VA = RVLocs[i]; assert(VA.isRegLoc() && "Can only return in registers!"); assert(!VA.needsCustom() && "Unexpected custom lowering"); SDValue OutVal = OutVals[i]; // Integer return values must be sign or zero extended by the callee. switch (VA.getLocInfo()) { case CCValAssign::Full: break; case CCValAssign::SExt: OutVal = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), OutVal); break; case CCValAssign::ZExt: OutVal = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), OutVal); break; case CCValAssign::AExt: OutVal = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), OutVal); break; case CCValAssign::BCvt: { // Convert a float return value to i64 with padding. // 63 31 0 // +------+------+ // | float| 0 | // +------+------+ assert(VA.getLocVT() == MVT::i64); assert(VA.getValVT() == MVT::f32); SDValue Undef = SDValue( DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i64), 0); SDValue Sub_f32 = DAG.getTargetConstant(VE::sub_f32, DL, MVT::i32); OutVal = SDValue(DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::i64, Undef, OutVal, Sub_f32), 0); break; } default: llvm_unreachable("Unknown loc info!"); } Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), OutVal, Flag); // Guarantee that all emitted copies are stuck together with flags. Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); } RetOps[0] = Chain; // Update chain. // Add the flag if we have it. if (Flag.getNode()) RetOps.push_back(Flag); return DAG.getNode(VEISD::RET_FLAG, DL, MVT::Other, RetOps); } SDValue VETargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); // Get the base offset of the incoming arguments stack space. unsigned ArgsBaseOffset = Subtarget->getRsaSize(); // Get the size of the preserved arguments area unsigned ArgsPreserved = 64; // Analyze arguments according to CC_VE. SmallVector ArgLocs; CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs, *DAG.getContext()); // Allocate the preserved area first. CCInfo.AllocateStack(ArgsPreserved, Align(8)); // We already allocated the preserved area, so the stack offset computed // by CC_VE would be correct now. CCInfo.AnalyzeFormalArguments(Ins, getParamCC(CallConv, false)); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; assert(!VA.needsCustom() && "Unexpected custom lowering"); if (VA.isRegLoc()) { // This argument is passed in a register. // All integer register arguments are promoted by the caller to i64. // Create a virtual register for the promoted live-in value. unsigned VReg = MF.addLiveIn(VA.getLocReg(), getRegClassFor(VA.getLocVT())); SDValue Arg = DAG.getCopyFromReg(Chain, DL, VReg, VA.getLocVT()); // The caller promoted the argument, so insert an Assert?ext SDNode so we // won't promote the value again in this function. switch (VA.getLocInfo()) { case CCValAssign::SExt: Arg = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Arg, DAG.getValueType(VA.getValVT())); break; case CCValAssign::ZExt: Arg = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Arg, DAG.getValueType(VA.getValVT())); break; case CCValAssign::BCvt: { // Extract a float argument from i64 with padding. // 63 31 0 // +------+------+ // | float| 0 | // +------+------+ assert(VA.getLocVT() == MVT::i64); assert(VA.getValVT() == MVT::f32); SDValue Sub_f32 = DAG.getTargetConstant(VE::sub_f32, DL, MVT::i32); Arg = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::f32, Arg, Sub_f32), 0); break; } default: break; } // Truncate the register down to the argument type. if (VA.isExtInLoc()) Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg); InVals.push_back(Arg); continue; } // The registers are exhausted. This argument was passed on the stack. assert(VA.isMemLoc()); // The CC_VE_Full/Half functions compute stack offsets relative to the // beginning of the arguments area at %fp + the size of reserved area. unsigned Offset = VA.getLocMemOffset() + ArgsBaseOffset; unsigned ValSize = VA.getValVT().getSizeInBits() / 8; // Adjust offset for a float argument by adding 4 since the argument is // stored in 8 bytes buffer with offset like below. LLVM generates // 4 bytes load instruction, so need to adjust offset here. This // adjustment is required in only LowerFormalArguments. In LowerCall, // a float argument is converted to i64 first, and stored as 8 bytes // data, which is required by ABI, so no need for adjustment. // 0 4 // +------+------+ // | empty| float| // +------+------+ if (VA.getValVT() == MVT::f32) Offset += 4; int FI = MF.getFrameInfo().CreateFixedObject(ValSize, Offset, true); InVals.push_back( DAG.getLoad(VA.getValVT(), DL, Chain, DAG.getFrameIndex(FI, getPointerTy(MF.getDataLayout())), MachinePointerInfo::getFixedStack(MF, FI))); } if (!IsVarArg) return Chain; // This function takes variable arguments, some of which may have been passed // in registers %s0-%s8. // // The va_start intrinsic needs to know the offset to the first variable // argument. // TODO: need to calculate offset correctly once we support f128. unsigned ArgOffset = ArgLocs.size() * 8; VEMachineFunctionInfo *FuncInfo = MF.getInfo(); // Skip the reserved area at the top of stack. FuncInfo->setVarArgsFrameOffset(ArgOffset + ArgsBaseOffset); return Chain; } // FIXME? Maybe this could be a TableGen attribute on some registers and // this table could be generated automatically from RegInfo. Register VETargetLowering::getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const { Register Reg = StringSwitch(RegName) .Case("sp", VE::SX11) // Stack pointer .Case("fp", VE::SX9) // Frame pointer .Case("sl", VE::SX8) // Stack limit .Case("lr", VE::SX10) // Link register .Case("tp", VE::SX14) // Thread pointer .Case("outer", VE::SX12) // Outer regiser .Case("info", VE::SX17) // Info area register .Case("got", VE::SX15) // Global offset table register .Case("plt", VE::SX16) // Procedure linkage table register .Default(0); if (Reg) return Reg; report_fatal_error("Invalid register name global variable"); } //===----------------------------------------------------------------------===// // TargetLowering Implementation //===----------------------------------------------------------------------===// SDValue VETargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl &InVals) const { SelectionDAG &DAG = CLI.DAG; SDLoc DL = CLI.DL; SDValue Chain = CLI.Chain; auto PtrVT = getPointerTy(DAG.getDataLayout()); // VE target does not yet support tail call optimization. CLI.IsTailCall = false; // Get the base offset of the outgoing arguments stack space. unsigned ArgsBaseOffset = Subtarget->getRsaSize(); // Get the size of the preserved arguments area unsigned ArgsPreserved = 8 * 8u; // Analyze operands of the call, assigning locations to each operand. SmallVector ArgLocs; CCState CCInfo(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(), ArgLocs, *DAG.getContext()); // Allocate the preserved area first. CCInfo.AllocateStack(ArgsPreserved, Align(8)); // We already allocated the preserved area, so the stack offset computed // by CC_VE would be correct now. CCInfo.AnalyzeCallOperands(CLI.Outs, getParamCC(CLI.CallConv, false)); // VE requires to use both register and stack for varargs or no-prototyped // functions. bool UseBoth = CLI.IsVarArg; // Analyze operands again if it is required to store BOTH. SmallVector ArgLocs2; CCState CCInfo2(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(), ArgLocs2, *DAG.getContext()); if (UseBoth) CCInfo2.AnalyzeCallOperands(CLI.Outs, getParamCC(CLI.CallConv, true)); // Get the size of the outgoing arguments stack space requirement. unsigned ArgsSize = CCInfo.getNextStackOffset(); // Keep stack frames 16-byte aligned. ArgsSize = alignTo(ArgsSize, 16); // Adjust the stack pointer to make room for the arguments. // FIXME: Use hasReservedCallFrame to avoid %sp adjustments around all calls // with more than 6 arguments. Chain = DAG.getCALLSEQ_START(Chain, ArgsSize, 0, DL); // Collect the set of registers to pass to the function and their values. // This will be emitted as a sequence of CopyToReg nodes glued to the call // instruction. SmallVector, 8> RegsToPass; // Collect chains from all the memory opeations that copy arguments to the // stack. They must follow the stack pointer adjustment above and precede the // call instruction itself. SmallVector MemOpChains; // VE needs to get address of callee function in a register // So, prepare to copy it to SX12 here. // If the callee is a GlobalAddress node (quite common, every direct call is) // turn it into a TargetGlobalAddress node so that legalize doesn't hack it. // Likewise ExternalSymbol -> TargetExternalSymbol. SDValue Callee = CLI.Callee; bool IsPICCall = isPositionIndependent(); // PC-relative references to external symbols should go through $stub. // If so, we need to prepare GlobalBaseReg first. const TargetMachine &TM = DAG.getTarget(); const Module *Mod = DAG.getMachineFunction().getFunction().getParent(); const GlobalValue *GV = nullptr; auto *CalleeG = dyn_cast(Callee); if (CalleeG) GV = CalleeG->getGlobal(); bool Local = TM.shouldAssumeDSOLocal(*Mod, GV); bool UsePlt = !Local; MachineFunction &MF = DAG.getMachineFunction(); // Turn GlobalAddress/ExternalSymbol node into a value node // containing the address of them here. if (CalleeG) { if (IsPICCall) { if (UsePlt) Subtarget->getInstrInfo()->getGlobalBaseReg(&MF); Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0); Callee = DAG.getNode(VEISD::GETFUNPLT, DL, PtrVT, Callee); } else { Callee = makeHiLoPair(Callee, VEMCExpr::VK_VE_HI32, VEMCExpr::VK_VE_LO32, DAG); } } else if (ExternalSymbolSDNode *E = dyn_cast(Callee)) { if (IsPICCall) { if (UsePlt) Subtarget->getInstrInfo()->getGlobalBaseReg(&MF); Callee = DAG.getTargetExternalSymbol(E->getSymbol(), PtrVT, 0); Callee = DAG.getNode(VEISD::GETFUNPLT, DL, PtrVT, Callee); } else { Callee = makeHiLoPair(Callee, VEMCExpr::VK_VE_HI32, VEMCExpr::VK_VE_LO32, DAG); } } RegsToPass.push_back(std::make_pair(VE::SX12, Callee)); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; SDValue Arg = CLI.OutVals[i]; // Promote the value if needed. switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown location info!"); case CCValAssign::Full: break; case CCValAssign::SExt: Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg); break; case CCValAssign::ZExt: Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); break; case CCValAssign::AExt: Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); break; case CCValAssign::BCvt: { // Convert a float argument to i64 with padding. // 63 31 0 // +------+------+ // | float| 0 | // +------+------+ assert(VA.getLocVT() == MVT::i64); assert(VA.getValVT() == MVT::f32); SDValue Undef = SDValue( DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i64), 0); SDValue Sub_f32 = DAG.getTargetConstant(VE::sub_f32, DL, MVT::i32); Arg = SDValue(DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::i64, Undef, Arg, Sub_f32), 0); break; } } if (VA.isRegLoc()) { RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); if (!UseBoth) continue; VA = ArgLocs2[i]; } assert(VA.isMemLoc()); // Create a store off the stack pointer for this argument. SDValue StackPtr = DAG.getRegister(VE::SX11, PtrVT); // The argument area starts at %fp/%sp + the size of reserved area. SDValue PtrOff = DAG.getIntPtrConstant(VA.getLocMemOffset() + ArgsBaseOffset, DL); PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff); MemOpChains.push_back( DAG.getStore(Chain, DL, Arg, PtrOff, MachinePointerInfo())); } // Emit all stores, make sure they occur before the call. if (!MemOpChains.empty()) Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains); // Build a sequence of CopyToReg nodes glued together with token chain and // glue operands which copy the outgoing args into registers. The InGlue is // necessary since all emitted instructions must be stuck together in order // to pass the live physical registers. SDValue InGlue; for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[i].first, RegsToPass[i].second, InGlue); InGlue = Chain.getValue(1); } // Build the operands for the call instruction itself. SmallVector Ops; Ops.push_back(Chain); for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) Ops.push_back(DAG.getRegister(RegsToPass[i].first, RegsToPass[i].second.getValueType())); // Add a register mask operand representing the call-preserved registers. const VERegisterInfo *TRI = Subtarget->getRegisterInfo(); const uint32_t *Mask = TRI->getCallPreservedMask(DAG.getMachineFunction(), CLI.CallConv); assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); // Make sure the CopyToReg nodes are glued to the call instruction which // consumes the registers. if (InGlue.getNode()) Ops.push_back(InGlue); // Now the call itself. SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); Chain = DAG.getNode(VEISD::CALL, DL, NodeTys, Ops); InGlue = Chain.getValue(1); // Revert the stack pointer immediately after the call. Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(ArgsSize, DL, true), DAG.getIntPtrConstant(0, DL, true), InGlue, DL); InGlue = Chain.getValue(1); // Now extract the return values. This is more or less the same as // LowerFormalArguments. // Assign locations to each value returned by this call. SmallVector RVLocs; CCState RVInfo(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); // Set inreg flag manually for codegen generated library calls that // return float. if (CLI.Ins.size() == 1 && CLI.Ins[0].VT == MVT::f32 && !CLI.CB) CLI.Ins[0].Flags.setInReg(); RVInfo.AnalyzeCallResult(CLI.Ins, getReturnCC(CLI.CallConv)); // Copy all of the result registers out of their specified physreg. for (unsigned i = 0; i != RVLocs.size(); ++i) { CCValAssign &VA = RVLocs[i]; assert(!VA.needsCustom() && "Unexpected custom lowering"); unsigned Reg = VA.getLocReg(); // When returning 'inreg {i32, i32 }', two consecutive i32 arguments can // reside in the same register in the high and low bits. Reuse the // CopyFromReg previous node to avoid duplicate copies. SDValue RV; if (RegisterSDNode *SrcReg = dyn_cast(Chain.getOperand(1))) if (SrcReg->getReg() == Reg && Chain->getOpcode() == ISD::CopyFromReg) RV = Chain.getValue(0); // But usually we'll create a new CopyFromReg for a different register. if (!RV.getNode()) { RV = DAG.getCopyFromReg(Chain, DL, Reg, RVLocs[i].getLocVT(), InGlue); Chain = RV.getValue(1); InGlue = Chain.getValue(2); } // The callee promoted the return value, so insert an Assert?ext SDNode so // we won't promote the value again in this function. switch (VA.getLocInfo()) { case CCValAssign::SExt: RV = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), RV, DAG.getValueType(VA.getValVT())); break; case CCValAssign::ZExt: RV = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), RV, DAG.getValueType(VA.getValVT())); break; case CCValAssign::BCvt: { // Extract a float return value from i64 with padding. // 63 31 0 // +------+------+ // | float| 0 | // +------+------+ assert(VA.getLocVT() == MVT::i64); assert(VA.getValVT() == MVT::f32); SDValue Sub_f32 = DAG.getTargetConstant(VE::sub_f32, DL, MVT::i32); RV = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::f32, RV, Sub_f32), 0); break; } default: break; } // Truncate the register down to the return value type. if (VA.isExtInLoc()) RV = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), RV); InVals.push_back(RV); } return Chain; } bool VETargetLowering::isOffsetFoldingLegal( const GlobalAddressSDNode *GA) const { // VE uses 64 bit addressing, so we need multiple instructions to generate // an address. Folding address with offset increases the number of // instructions, so that we disable it here. Offsets will be folded in // the DAG combine later if it worth to do so. return false; } /// isFPImmLegal - Returns true if the target can instruction select the /// specified FP immediate natively. If false, the legalizer will /// materialize the FP immediate as a load from a constant pool. bool VETargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const { return VT == MVT::f32 || VT == MVT::f64; } /// Determine if the target supports unaligned memory accesses. /// /// This function returns true if the target allows unaligned memory accesses /// of the specified type in the given address space. If true, it also returns /// whether the unaligned memory access is "fast" in the last argument by /// reference. This is used, for example, in situations where an array /// copy/move/set is converted to a sequence of store operations. Its use /// helps to ensure that such replacements don't generate code that causes an /// alignment error (trap) on the target machine. bool VETargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags, bool *Fast) const { if (Fast) { // It's fast anytime on VE *Fast = true; } return true; } VETargetLowering::VETargetLowering(const TargetMachine &TM, const VESubtarget &STI) : TargetLowering(TM), Subtarget(&STI) { // Instructions which use registers as conditionals examine all the // bits (as does the pseudo SELECT_CC expansion). I don't think it // matters much whether it's ZeroOrOneBooleanContent, or // ZeroOrNegativeOneBooleanContent, so, arbitrarily choose the // former. setBooleanContents(ZeroOrOneBooleanContent); setBooleanVectorContents(ZeroOrOneBooleanContent); initRegisterClasses(); initSPUActions(); initVPUActions(); setStackPointerRegisterToSaveRestore(VE::SX11); // We have target-specific dag combine patterns for the following nodes: setTargetDAGCombine(ISD::TRUNCATE); // Set function alignment to 16 bytes setMinFunctionAlignment(Align(16)); // VE stores all argument by 8 bytes alignment setMinStackArgumentAlignment(Align(8)); computeRegisterProperties(Subtarget->getRegisterInfo()); } const char *VETargetLowering::getTargetNodeName(unsigned Opcode) const { #define TARGET_NODE_CASE(NAME) \ case VEISD::NAME: \ return "VEISD::" #NAME; switch ((VEISD::NodeType)Opcode) { case VEISD::FIRST_NUMBER: break; TARGET_NODE_CASE(CALL) TARGET_NODE_CASE(EH_SJLJ_LONGJMP) TARGET_NODE_CASE(EH_SJLJ_SETJMP) TARGET_NODE_CASE(EH_SJLJ_SETUP_DISPATCH) TARGET_NODE_CASE(GETFUNPLT) TARGET_NODE_CASE(GETSTACKTOP) TARGET_NODE_CASE(GETTLSADDR) TARGET_NODE_CASE(GLOBAL_BASE_REG) TARGET_NODE_CASE(Hi) TARGET_NODE_CASE(Lo) TARGET_NODE_CASE(MEMBARRIER) TARGET_NODE_CASE(RET_FLAG) TARGET_NODE_CASE(TS1AM) TARGET_NODE_CASE(VEC_BROADCAST) // Register the VVP_* SDNodes. #define ADD_VVP_OP(VVP_NAME, ...) TARGET_NODE_CASE(VVP_NAME) #include "VVPNodes.def" } #undef TARGET_NODE_CASE return nullptr; } EVT VETargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &, EVT VT) const { return MVT::i32; } // Convert to a target node and set target flags. SDValue VETargetLowering::withTargetFlags(SDValue Op, unsigned TF, SelectionDAG &DAG) const { if (const GlobalAddressSDNode *GA = dyn_cast(Op)) return DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(GA), GA->getValueType(0), GA->getOffset(), TF); if (const BlockAddressSDNode *BA = dyn_cast(Op)) return DAG.getTargetBlockAddress(BA->getBlockAddress(), Op.getValueType(), 0, TF); if (const ConstantPoolSDNode *CP = dyn_cast(Op)) return DAG.getTargetConstantPool(CP->getConstVal(), CP->getValueType(0), CP->getAlign(), CP->getOffset(), TF); if (const ExternalSymbolSDNode *ES = dyn_cast(Op)) return DAG.getTargetExternalSymbol(ES->getSymbol(), ES->getValueType(0), TF); if (const JumpTableSDNode *JT = dyn_cast(Op)) return DAG.getTargetJumpTable(JT->getIndex(), JT->getValueType(0), TF); llvm_unreachable("Unhandled address SDNode"); } // Split Op into high and low parts according to HiTF and LoTF. // Return an ADD node combining the parts. SDValue VETargetLowering::makeHiLoPair(SDValue Op, unsigned HiTF, unsigned LoTF, SelectionDAG &DAG) const { SDLoc DL(Op); EVT VT = Op.getValueType(); SDValue Hi = DAG.getNode(VEISD::Hi, DL, VT, withTargetFlags(Op, HiTF, DAG)); SDValue Lo = DAG.getNode(VEISD::Lo, DL, VT, withTargetFlags(Op, LoTF, DAG)); return DAG.getNode(ISD::ADD, DL, VT, Hi, Lo); } // Build SDNodes for producing an address from a GlobalAddress, ConstantPool, // or ExternalSymbol SDNode. SDValue VETargetLowering::makeAddress(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); EVT PtrVT = Op.getValueType(); // Handle PIC mode first. VE needs a got load for every variable! if (isPositionIndependent()) { auto GlobalN = dyn_cast(Op); if (isa(Op) || isa(Op) || (GlobalN && GlobalN->getGlobal()->hasLocalLinkage())) { // Create following instructions for local linkage PIC code. // lea %reg, label@gotoff_lo // and %reg, %reg, (32)0 // lea.sl %reg, label@gotoff_hi(%reg, %got) SDValue HiLo = makeHiLoPair(Op, VEMCExpr::VK_VE_GOTOFF_HI32, VEMCExpr::VK_VE_GOTOFF_LO32, DAG); SDValue GlobalBase = DAG.getNode(VEISD::GLOBAL_BASE_REG, DL, PtrVT); return DAG.getNode(ISD::ADD, DL, PtrVT, GlobalBase, HiLo); } // Create following instructions for not local linkage PIC code. // lea %reg, label@got_lo // and %reg, %reg, (32)0 // lea.sl %reg, label@got_hi(%reg) // ld %reg, (%reg, %got) SDValue HiLo = makeHiLoPair(Op, VEMCExpr::VK_VE_GOT_HI32, VEMCExpr::VK_VE_GOT_LO32, DAG); SDValue GlobalBase = DAG.getNode(VEISD::GLOBAL_BASE_REG, DL, PtrVT); SDValue AbsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, GlobalBase, HiLo); return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), AbsAddr, MachinePointerInfo::getGOT(DAG.getMachineFunction())); } // This is one of the absolute code models. switch (getTargetMachine().getCodeModel()) { default: llvm_unreachable("Unsupported absolute code model"); case CodeModel::Small: case CodeModel::Medium: case CodeModel::Large: // abs64. return makeHiLoPair(Op, VEMCExpr::VK_VE_HI32, VEMCExpr::VK_VE_LO32, DAG); } } /// Custom Lower { // The mappings for emitLeading/TrailingFence for VE is designed by following // http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html Instruction *VETargetLowering::emitLeadingFence(IRBuilder<> &Builder, Instruction *Inst, AtomicOrdering Ord) const { switch (Ord) { case AtomicOrdering::NotAtomic: case AtomicOrdering::Unordered: llvm_unreachable("Invalid fence: unordered/non-atomic"); case AtomicOrdering::Monotonic: case AtomicOrdering::Acquire: return nullptr; // Nothing to do case AtomicOrdering::Release: case AtomicOrdering::AcquireRelease: return Builder.CreateFence(AtomicOrdering::Release); case AtomicOrdering::SequentiallyConsistent: if (!Inst->hasAtomicStore()) return nullptr; // Nothing to do return Builder.CreateFence(AtomicOrdering::SequentiallyConsistent); } llvm_unreachable("Unknown fence ordering in emitLeadingFence"); } Instruction *VETargetLowering::emitTrailingFence(IRBuilder<> &Builder, Instruction *Inst, AtomicOrdering Ord) const { switch (Ord) { case AtomicOrdering::NotAtomic: case AtomicOrdering::Unordered: llvm_unreachable("Invalid fence: unordered/not-atomic"); case AtomicOrdering::Monotonic: case AtomicOrdering::Release: return nullptr; // Nothing to do case AtomicOrdering::Acquire: case AtomicOrdering::AcquireRelease: return Builder.CreateFence(AtomicOrdering::Acquire); case AtomicOrdering::SequentiallyConsistent: return Builder.CreateFence(AtomicOrdering::SequentiallyConsistent); } llvm_unreachable("Unknown fence ordering in emitTrailingFence"); } SDValue VETargetLowering::lowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); AtomicOrdering FenceOrdering = static_cast( cast(Op.getOperand(1))->getZExtValue()); SyncScope::ID FenceSSID = static_cast( cast(Op.getOperand(2))->getZExtValue()); // VE uses Release consistency, so need a fence instruction if it is a // cross-thread fence. if (FenceSSID == SyncScope::System) { switch (FenceOrdering) { case AtomicOrdering::NotAtomic: case AtomicOrdering::Unordered: case AtomicOrdering::Monotonic: // No need to generate fencem instruction here. break; case AtomicOrdering::Acquire: // Generate "fencem 2" as acquire fence. return SDValue(DAG.getMachineNode(VE::FENCEM, DL, MVT::Other, DAG.getTargetConstant(2, DL, MVT::i32), Op.getOperand(0)), 0); case AtomicOrdering::Release: // Generate "fencem 1" as release fence. return SDValue(DAG.getMachineNode(VE::FENCEM, DL, MVT::Other, DAG.getTargetConstant(1, DL, MVT::i32), Op.getOperand(0)), 0); case AtomicOrdering::AcquireRelease: case AtomicOrdering::SequentiallyConsistent: // Generate "fencem 3" as acq_rel and seq_cst fence. // FIXME: "fencem 3" doesn't wait for for PCIe deveices accesses, // so seq_cst may require more instruction for them. return SDValue(DAG.getMachineNode(VE::FENCEM, DL, MVT::Other, DAG.getTargetConstant(3, DL, MVT::i32), Op.getOperand(0)), 0); } } // MEMBARRIER is a compiler barrier; it codegens to a no-op. return DAG.getNode(VEISD::MEMBARRIER, DL, MVT::Other, Op.getOperand(0)); } TargetLowering::AtomicExpansionKind VETargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { // We have TS1AM implementation for i8/i16/i32/i64, so use it. if (AI->getOperation() == AtomicRMWInst::Xchg) { return AtomicExpansionKind::None; } // FIXME: Support "ATMAM" instruction for LOAD_ADD/SUB/AND/OR. // Otherwise, expand it using compare and exchange instruction to not call // __sync_fetch_and_* functions. return AtomicExpansionKind::CmpXChg; } static SDValue prepareTS1AM(SDValue Op, SelectionDAG &DAG, SDValue &Flag, SDValue &Bits) { SDLoc DL(Op); AtomicSDNode *N = cast(Op); SDValue Ptr = N->getOperand(1); SDValue Val = N->getOperand(2); EVT PtrVT = Ptr.getValueType(); bool Byte = N->getMemoryVT() == MVT::i8; // Remainder = AND Ptr, 3 // Flag = 1 << Remainder ; If Byte is true (1 byte swap flag) // Flag = 3 << Remainder ; If Byte is false (2 bytes swap flag) // Bits = Remainder << 3 // NewVal = Val << Bits SDValue Const3 = DAG.getConstant(3, DL, PtrVT); SDValue Remainder = DAG.getNode(ISD::AND, DL, PtrVT, {Ptr, Const3}); SDValue Mask = Byte ? DAG.getConstant(1, DL, MVT::i32) : DAG.getConstant(3, DL, MVT::i32); Flag = DAG.getNode(ISD::SHL, DL, MVT::i32, {Mask, Remainder}); Bits = DAG.getNode(ISD::SHL, DL, PtrVT, {Remainder, Const3}); return DAG.getNode(ISD::SHL, DL, Val.getValueType(), {Val, Bits}); } static SDValue finalizeTS1AM(SDValue Op, SelectionDAG &DAG, SDValue Data, SDValue Bits) { SDLoc DL(Op); EVT VT = Data.getValueType(); bool Byte = cast(Op)->getMemoryVT() == MVT::i8; // NewData = Data >> Bits // Result = NewData & 0xff ; If Byte is true (1 byte) // Result = NewData & 0xffff ; If Byte is false (2 bytes) SDValue NewData = DAG.getNode(ISD::SRL, DL, VT, Data, Bits); return DAG.getNode(ISD::AND, DL, VT, {NewData, DAG.getConstant(Byte ? 0xff : 0xffff, DL, VT)}); } SDValue VETargetLowering::lowerATOMIC_SWAP(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); AtomicSDNode *N = cast(Op); if (N->getMemoryVT() == MVT::i8) { // For i8, use "ts1am" // Input: // ATOMIC_SWAP Ptr, Val, Order // // Output: // Remainder = AND Ptr, 3 // Flag = 1 << Remainder ; 1 byte swap flag for TS1AM inst. // Bits = Remainder << 3 // NewVal = Val << Bits // // Aligned = AND Ptr, -4 // Data = TS1AM Aligned, Flag, NewVal // // NewData = Data >> Bits // Result = NewData & 0xff ; 1 byte result SDValue Flag; SDValue Bits; SDValue NewVal = prepareTS1AM(Op, DAG, Flag, Bits); SDValue Ptr = N->getOperand(1); SDValue Aligned = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), {Ptr, DAG.getConstant(-4, DL, MVT::i64)}); SDValue TS1AM = DAG.getAtomic(VEISD::TS1AM, DL, N->getMemoryVT(), DAG.getVTList(Op.getNode()->getValueType(0), Op.getNode()->getValueType(1)), {N->getChain(), Aligned, Flag, NewVal}, N->getMemOperand()); SDValue Result = finalizeTS1AM(Op, DAG, TS1AM, Bits); SDValue Chain = TS1AM.getValue(1); return DAG.getMergeValues({Result, Chain}, DL); } if (N->getMemoryVT() == MVT::i16) { // For i16, use "ts1am" SDValue Flag; SDValue Bits; SDValue NewVal = prepareTS1AM(Op, DAG, Flag, Bits); SDValue Ptr = N->getOperand(1); SDValue Aligned = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), {Ptr, DAG.getConstant(-4, DL, MVT::i64)}); SDValue TS1AM = DAG.getAtomic(VEISD::TS1AM, DL, N->getMemoryVT(), DAG.getVTList(Op.getNode()->getValueType(0), Op.getNode()->getValueType(1)), {N->getChain(), Aligned, Flag, NewVal}, N->getMemOperand()); SDValue Result = finalizeTS1AM(Op, DAG, TS1AM, Bits); SDValue Chain = TS1AM.getValue(1); return DAG.getMergeValues({Result, Chain}, DL); } // Otherwise, let llvm legalize it. return Op; } SDValue VETargetLowering::lowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { return makeAddress(Op, DAG); } SDValue VETargetLowering::lowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { return makeAddress(Op, DAG); } SDValue VETargetLowering::lowerConstantPool(SDValue Op, SelectionDAG &DAG) const { return makeAddress(Op, DAG); } SDValue VETargetLowering::lowerToTLSGeneralDynamicModel(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); // Generate the following code: // t1: ch,glue = callseq_start t0, 0, 0 // t2: i64,ch,glue = VEISD::GETTLSADDR t1, label, t1:1 // t3: ch,glue = callseq_end t2, 0, 0, t2:2 // t4: i64,ch,glue = CopyFromReg t3, Register:i64 $sx0, t3:1 SDValue Label = withTargetFlags(Op, 0, DAG); EVT PtrVT = Op.getValueType(); // Lowering the machine isd will make sure everything is in the right // location. SDValue Chain = DAG.getEntryNode(); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); const uint32_t *Mask = Subtarget->getRegisterInfo()->getCallPreservedMask( DAG.getMachineFunction(), CallingConv::C); Chain = DAG.getCALLSEQ_START(Chain, 64, 0, DL); SDValue Args[] = {Chain, Label, DAG.getRegisterMask(Mask), Chain.getValue(1)}; Chain = DAG.getNode(VEISD::GETTLSADDR, DL, NodeTys, Args); Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(64, DL, true), DAG.getIntPtrConstant(0, DL, true), Chain.getValue(1), DL); Chain = DAG.getCopyFromReg(Chain, DL, VE::SX0, PtrVT, Chain.getValue(1)); // GETTLSADDR will be codegen'ed as call. Inform MFI that function has calls. MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); MFI.setHasCalls(true); // Also generate code to prepare a GOT register if it is PIC. if (isPositionIndependent()) { MachineFunction &MF = DAG.getMachineFunction(); Subtarget->getInstrInfo()->getGlobalBaseReg(&MF); } return Chain; } SDValue VETargetLowering::lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { // The current implementation of nld (2.26) doesn't allow local exec model // code described in VE-tls_v1.1.pdf (*1) as its input. Instead, we always // generate the general dynamic model code sequence. // // *1: https://www.nec.com/en/global/prod/hpc/aurora/document/VE-tls_v1.1.pdf return lowerToTLSGeneralDynamicModel(Op, DAG); } SDValue VETargetLowering::lowerJumpTable(SDValue Op, SelectionDAG &DAG) const { return makeAddress(Op, DAG); } // Lower a f128 load into two f64 loads. static SDValue lowerLoadF128(SDValue Op, SelectionDAG &DAG) { SDLoc DL(Op); LoadSDNode *LdNode = dyn_cast(Op.getNode()); assert(LdNode && LdNode->getOffset().isUndef() && "Unexpected node type"); unsigned Alignment = LdNode->getAlign().value(); if (Alignment > 8) Alignment = 8; SDValue Lo64 = DAG.getLoad(MVT::f64, DL, LdNode->getChain(), LdNode->getBasePtr(), LdNode->getPointerInfo(), Alignment, LdNode->isVolatile() ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone); EVT AddrVT = LdNode->getBasePtr().getValueType(); SDValue HiPtr = DAG.getNode(ISD::ADD, DL, AddrVT, LdNode->getBasePtr(), DAG.getConstant(8, DL, AddrVT)); SDValue Hi64 = DAG.getLoad(MVT::f64, DL, LdNode->getChain(), HiPtr, LdNode->getPointerInfo(), Alignment, LdNode->isVolatile() ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone); SDValue SubRegEven = DAG.getTargetConstant(VE::sub_even, DL, MVT::i32); SDValue SubRegOdd = DAG.getTargetConstant(VE::sub_odd, DL, MVT::i32); // VE stores Hi64 to 8(addr) and Lo64 to 0(addr) SDNode *InFP128 = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::f128); InFP128 = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f128, SDValue(InFP128, 0), Hi64, SubRegEven); InFP128 = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f128, SDValue(InFP128, 0), Lo64, SubRegOdd); SDValue OutChains[2] = {SDValue(Lo64.getNode(), 1), SDValue(Hi64.getNode(), 1)}; SDValue OutChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains); SDValue Ops[2] = {SDValue(InFP128, 0), OutChain}; return DAG.getMergeValues(Ops, DL); } SDValue VETargetLowering::lowerLOAD(SDValue Op, SelectionDAG &DAG) const { LoadSDNode *LdNode = cast(Op.getNode()); SDValue BasePtr = LdNode->getBasePtr(); if (isa(BasePtr.getNode())) { // Do not expand store instruction with frame index here because of // dependency problems. We expand it later in eliminateFrameIndex(). return Op; } EVT MemVT = LdNode->getMemoryVT(); if (MemVT == MVT::f128) return lowerLoadF128(Op, DAG); return Op; } // Lower a f128 store into two f64 stores. static SDValue lowerStoreF128(SDValue Op, SelectionDAG &DAG) { SDLoc DL(Op); StoreSDNode *StNode = dyn_cast(Op.getNode()); assert(StNode && StNode->getOffset().isUndef() && "Unexpected node type"); SDValue SubRegEven = DAG.getTargetConstant(VE::sub_even, DL, MVT::i32); SDValue SubRegOdd = DAG.getTargetConstant(VE::sub_odd, DL, MVT::i32); SDNode *Hi64 = DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::i64, StNode->getValue(), SubRegEven); SDNode *Lo64 = DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::i64, StNode->getValue(), SubRegOdd); unsigned Alignment = StNode->getAlign().value(); if (Alignment > 8) Alignment = 8; // VE stores Hi64 to 8(addr) and Lo64 to 0(addr) SDValue OutChains[2]; OutChains[0] = DAG.getStore(StNode->getChain(), DL, SDValue(Lo64, 0), StNode->getBasePtr(), MachinePointerInfo(), Alignment, StNode->isVolatile() ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone); EVT AddrVT = StNode->getBasePtr().getValueType(); SDValue HiPtr = DAG.getNode(ISD::ADD, DL, AddrVT, StNode->getBasePtr(), DAG.getConstant(8, DL, AddrVT)); OutChains[1] = DAG.getStore(StNode->getChain(), DL, SDValue(Hi64, 0), HiPtr, MachinePointerInfo(), Alignment, StNode->isVolatile() ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone); return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains); } SDValue VETargetLowering::lowerSTORE(SDValue Op, SelectionDAG &DAG) const { StoreSDNode *StNode = cast(Op.getNode()); assert(StNode && StNode->getOffset().isUndef() && "Unexpected node type"); SDValue BasePtr = StNode->getBasePtr(); if (isa(BasePtr.getNode())) { // Do not expand store instruction with frame index here because of // dependency problems. We expand it later in eliminateFrameIndex(). return Op; } EVT MemVT = StNode->getMemoryVT(); if (MemVT == MVT::f128) return lowerStoreF128(Op, DAG); // Otherwise, ask llvm to expand it. return SDValue(); } SDValue VETargetLowering::lowerVASTART(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); VEMachineFunctionInfo *FuncInfo = MF.getInfo(); auto PtrVT = getPointerTy(DAG.getDataLayout()); // Need frame address to find the address of VarArgsFrameIndex. MF.getFrameInfo().setFrameAddressIsTaken(true); // vastart just stores the address of the VarArgsFrameIndex slot into the // memory location argument. SDLoc DL(Op); SDValue Offset = DAG.getNode(ISD::ADD, DL, PtrVT, DAG.getRegister(VE::SX9, PtrVT), DAG.getIntPtrConstant(FuncInfo->getVarArgsFrameOffset(), DL)); const Value *SV = cast(Op.getOperand(2))->getValue(); return DAG.getStore(Op.getOperand(0), DL, Offset, Op.getOperand(1), MachinePointerInfo(SV)); } SDValue VETargetLowering::lowerVAARG(SDValue Op, SelectionDAG &DAG) const { SDNode *Node = Op.getNode(); EVT VT = Node->getValueType(0); SDValue InChain = Node->getOperand(0); SDValue VAListPtr = Node->getOperand(1); EVT PtrVT = VAListPtr.getValueType(); const Value *SV = cast(Node->getOperand(2))->getValue(); SDLoc DL(Node); SDValue VAList = DAG.getLoad(PtrVT, DL, InChain, VAListPtr, MachinePointerInfo(SV)); SDValue Chain = VAList.getValue(1); SDValue NextPtr; if (VT == MVT::f128) { // VE f128 values must be stored with 16 bytes alignment. We doesn't // know the actual alignment of VAList, so we take alignment of it // dyanmically. int Align = 16; VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(Align - 1, DL, PtrVT)); VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList, DAG.getConstant(-Align, DL, PtrVT)); // Increment the pointer, VAList, by 16 to the next vaarg. NextPtr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getIntPtrConstant(16, DL)); } else if (VT == MVT::f32) { // float --> need special handling like below. // 0 4 // +------+------+ // | empty| float| // +------+------+ // Increment the pointer, VAList, by 8 to the next vaarg. NextPtr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getIntPtrConstant(8, DL)); // Then, adjust VAList. unsigned InternalOffset = 4; VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(InternalOffset, DL, PtrVT)); } else { // Increment the pointer, VAList, by 8 to the next vaarg. NextPtr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getIntPtrConstant(8, DL)); } // Store the incremented VAList to the legalized pointer. InChain = DAG.getStore(Chain, DL, NextPtr, VAListPtr, MachinePointerInfo(SV)); // Load the actual argument out of the pointer VAList. // We can't count on greater alignment than the word size. return DAG.getLoad(VT, DL, InChain, VAList, MachinePointerInfo(), std::min(PtrVT.getSizeInBits(), VT.getSizeInBits()) / 8); } SDValue VETargetLowering::lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { // Generate following code. // (void)__llvm_grow_stack(size); // ret = GETSTACKTOP; // pseudo instruction SDLoc DL(Op); // Get the inputs. SDNode *Node = Op.getNode(); SDValue Chain = Op.getOperand(0); SDValue Size = Op.getOperand(1); MaybeAlign Alignment(Op.getConstantOperandVal(2)); EVT VT = Node->getValueType(0); // Chain the dynamic stack allocation so that it doesn't modify the stack // pointer when other instructions are using the stack. Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL); const TargetFrameLowering &TFI = *Subtarget->getFrameLowering(); Align StackAlign = TFI.getStackAlign(); bool NeedsAlign = Alignment.valueOrOne() > StackAlign; // Prepare arguments TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; Entry.Node = Size; Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext()); Args.push_back(Entry); if (NeedsAlign) { Entry.Node = DAG.getConstant(~(Alignment->value() - 1ULL), DL, VT); Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext()); Args.push_back(Entry); } Type *RetTy = Type::getVoidTy(*DAG.getContext()); EVT PtrVT = Op.getValueType(); SDValue Callee; if (NeedsAlign) { Callee = DAG.getTargetExternalSymbol("__ve_grow_stack_align", PtrVT, 0); } else { Callee = DAG.getTargetExternalSymbol("__ve_grow_stack", PtrVT, 0); } TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(DL) .setChain(Chain) .setCallee(CallingConv::PreserveAll, RetTy, Callee, std::move(Args)) .setDiscardResult(true); std::pair pair = LowerCallTo(CLI); Chain = pair.second; SDValue Result = DAG.getNode(VEISD::GETSTACKTOP, DL, VT, Chain); if (NeedsAlign) { Result = DAG.getNode(ISD::ADD, DL, VT, Result, DAG.getConstant((Alignment->value() - 1ULL), DL, VT)); Result = DAG.getNode(ISD::AND, DL, VT, Result, DAG.getConstant(~(Alignment->value() - 1ULL), DL, VT)); } // Chain = Result.getValue(1); Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true), DAG.getIntPtrConstant(0, DL, true), SDValue(), DL); SDValue Ops[2] = {Result, Chain}; return DAG.getMergeValues(Ops, DL); } SDValue VETargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); return DAG.getNode(VEISD::EH_SJLJ_LONGJMP, DL, MVT::Other, Op.getOperand(0), Op.getOperand(1)); } SDValue VETargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); return DAG.getNode(VEISD::EH_SJLJ_SETJMP, DL, DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0), Op.getOperand(1)); } SDValue VETargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); return DAG.getNode(VEISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other, Op.getOperand(0)); } static SDValue lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG, const VETargetLowering &TLI, const VESubtarget *Subtarget) { SDLoc DL(Op); MachineFunction &MF = DAG.getMachineFunction(); EVT PtrVT = TLI.getPointerTy(MF.getDataLayout()); MachineFrameInfo &MFI = MF.getFrameInfo(); MFI.setFrameAddressIsTaken(true); unsigned Depth = Op.getConstantOperandVal(0); const VERegisterInfo *RegInfo = Subtarget->getRegisterInfo(); unsigned FrameReg = RegInfo->getFrameRegister(MF); SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), DL, FrameReg, PtrVT); while (Depth--) FrameAddr = DAG.getLoad(Op.getValueType(), DL, DAG.getEntryNode(), FrameAddr, MachinePointerInfo()); return FrameAddr; } static SDValue lowerRETURNADDR(SDValue Op, SelectionDAG &DAG, const VETargetLowering &TLI, const VESubtarget *Subtarget) { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); MFI.setReturnAddressIsTaken(true); if (TLI.verifyReturnAddressArgumentIsConstant(Op, DAG)) return SDValue(); SDValue FrameAddr = lowerFRAMEADDR(Op, DAG, TLI, Subtarget); SDLoc DL(Op); EVT VT = Op.getValueType(); SDValue Offset = DAG.getConstant(8, DL, VT); return DAG.getLoad(VT, DL, DAG.getEntryNode(), DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo()); } SDValue VETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); unsigned IntNo = cast(Op.getOperand(0))->getZExtValue(); switch (IntNo) { default: // Don't custom lower most intrinsics. return SDValue(); case Intrinsic::eh_sjlj_lsda: { MachineFunction &MF = DAG.getMachineFunction(); MVT VT = Op.getSimpleValueType(); const VETargetMachine *TM = static_cast(&DAG.getTarget()); // Create GCC_except_tableXX string. The real symbol for that will be // generated in EHStreamer::emitExceptionTable() later. So, we just // borrow it's name here. TM->getStrList()->push_back(std::string( (Twine("GCC_except_table") + Twine(MF.getFunctionNumber())).str())); SDValue Addr = DAG.getTargetExternalSymbol(TM->getStrList()->back().c_str(), VT, 0); if (isPositionIndependent()) { Addr = makeHiLoPair(Addr, VEMCExpr::VK_VE_GOTOFF_HI32, VEMCExpr::VK_VE_GOTOFF_LO32, DAG); SDValue GlobalBase = DAG.getNode(VEISD::GLOBAL_BASE_REG, DL, VT); return DAG.getNode(ISD::ADD, DL, VT, GlobalBase, Addr); } return makeHiLoPair(Addr, VEMCExpr::VK_VE_HI32, VEMCExpr::VK_VE_LO32, DAG); } } } static bool getUniqueInsertion(SDNode *N, unsigned &UniqueIdx) { if (!isa(N)) return false; const auto *BVN = cast(N); // Find first non-undef insertion. unsigned Idx; for (Idx = 0; Idx < BVN->getNumOperands(); ++Idx) { auto ElemV = BVN->getOperand(Idx); if (!ElemV->isUndef()) break; } // Catch the (hypothetical) all-undef case. if (Idx == BVN->getNumOperands()) return false; // Remember insertion. UniqueIdx = Idx++; // Verify that all other insertions are undef. for (; Idx < BVN->getNumOperands(); ++Idx) { auto ElemV = BVN->getOperand(Idx); if (!ElemV->isUndef()) return false; } return true; } static SDValue getSplatValue(SDNode *N) { if (auto *BuildVec = dyn_cast(N)) { return BuildVec->getSplatValue(); } return SDValue(); } SDValue VETargetLowering::lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); unsigned NumEls = Op.getValueType().getVectorNumElements(); MVT ElemVT = Op.getSimpleValueType().getVectorElementType(); // If there is just one element, expand to INSERT_VECTOR_ELT. unsigned UniqueIdx; if (getUniqueInsertion(Op.getNode(), UniqueIdx)) { SDValue AccuV = DAG.getUNDEF(Op.getValueType()); auto ElemV = Op->getOperand(UniqueIdx); SDValue IdxV = DAG.getConstant(UniqueIdx, DL, MVT::i64); return DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(), AccuV, ElemV, IdxV); } // Else emit a broadcast. if (SDValue ScalarV = getSplatValue(Op.getNode())) { // lower to VEC_BROADCAST MVT LegalResVT = MVT::getVectorVT(ElemVT, 256); auto AVL = DAG.getConstant(NumEls, DL, MVT::i32); return DAG.getNode(VEISD::VEC_BROADCAST, DL, LegalResVT, Op.getOperand(0), AVL); } // Expand return SDValue(); } SDValue VETargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { unsigned Opcode = Op.getOpcode(); if (ISD::isVPOpcode(Opcode)) return lowerToVVP(Op, DAG); switch (Opcode) { default: llvm_unreachable("Should not custom lower this!"); case ISD::ATOMIC_FENCE: return lowerATOMIC_FENCE(Op, DAG); case ISD::ATOMIC_SWAP: return lowerATOMIC_SWAP(Op, DAG); case ISD::BlockAddress: return lowerBlockAddress(Op, DAG); case ISD::ConstantPool: return lowerConstantPool(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return lowerDYNAMIC_STACKALLOC(Op, DAG); case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG); case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG); case ISD::EH_SJLJ_SETUP_DISPATCH: return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG); case ISD::FRAMEADDR: return lowerFRAMEADDR(Op, DAG, *this, Subtarget); case ISD::GlobalAddress: return lowerGlobalAddress(Op, DAG); case ISD::GlobalTLSAddress: return lowerGlobalTLSAddress(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return lowerINTRINSIC_WO_CHAIN(Op, DAG); case ISD::JumpTable: return lowerJumpTable(Op, DAG); case ISD::LOAD: return lowerLOAD(Op, DAG); case ISD::RETURNADDR: return lowerRETURNADDR(Op, DAG, *this, Subtarget); case ISD::BUILD_VECTOR: return lowerBUILD_VECTOR(Op, DAG); case ISD::STORE: return lowerSTORE(Op, DAG); case ISD::VASTART: return lowerVASTART(Op, DAG); case ISD::VAARG: return lowerVAARG(Op, DAG); case ISD::INSERT_VECTOR_ELT: return lowerINSERT_VECTOR_ELT(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return lowerEXTRACT_VECTOR_ELT(Op, DAG); #define ADD_BINARY_VVP_OP(VVP_NAME, ISD_NAME) case ISD::ISD_NAME: #include "VVPNodes.def" return lowerToVVP(Op, DAG); } } /// } Custom Lower void VETargetLowering::ReplaceNodeResults(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const { switch (N->getOpcode()) { case ISD::ATOMIC_SWAP: // Let LLVM expand atomic swap instruction through LowerOperation. return; default: LLVM_DEBUG(N->dumpr(&DAG)); llvm_unreachable("Do not know how to custom type legalize this operation!"); } } /// JumpTable for VE. /// /// VE cannot generate relocatable symbol in jump table. VE cannot /// generate expressions using symbols in both text segment and data /// segment like below. /// .4byte .LBB0_2-.LJTI0_0 /// So, we generate offset from the top of function like below as /// a custom label. /// .4byte .LBB0_2- unsigned VETargetLowering::getJumpTableEncoding() const { // Use custom label for PIC. if (isPositionIndependent()) return MachineJumpTableInfo::EK_Custom32; // Otherwise, use the normal jump table encoding heuristics. return TargetLowering::getJumpTableEncoding(); } const MCExpr *VETargetLowering::LowerCustomJumpTableEntry( const MachineJumpTableInfo *MJTI, const MachineBasicBlock *MBB, unsigned Uid, MCContext &Ctx) const { assert(isPositionIndependent()); // Generate custom label for PIC like below. // .4bytes .LBB0_2- const auto *Value = MCSymbolRefExpr::create(MBB->getSymbol(), Ctx); MCSymbol *Sym = Ctx.getOrCreateSymbol(MBB->getParent()->getName().data()); const auto *Base = MCSymbolRefExpr::create(Sym, Ctx); return MCBinaryExpr::createSub(Value, Base, Ctx); } SDValue VETargetLowering::getPICJumpTableRelocBase(SDValue Table, SelectionDAG &DAG) const { assert(isPositionIndependent()); SDLoc DL(Table); Function *Function = &DAG.getMachineFunction().getFunction(); assert(Function != nullptr); auto PtrTy = getPointerTy(DAG.getDataLayout(), Function->getAddressSpace()); // In the jump table, we have following values in PIC mode. // .4bytes .LBB0_2- // We need to add this value and the address of this function to generate // .LBB0_2 label correctly under PIC mode. So, we want to generate following // instructions: // lea %reg, fun@gotoff_lo // and %reg, %reg, (32)0 // lea.sl %reg, fun@gotoff_hi(%reg, %got) // In order to do so, we need to genarate correctly marked DAG node using // makeHiLoPair. SDValue Op = DAG.getGlobalAddress(Function, DL, PtrTy); SDValue HiLo = makeHiLoPair(Op, VEMCExpr::VK_VE_GOTOFF_HI32, VEMCExpr::VK_VE_GOTOFF_LO32, DAG); SDValue GlobalBase = DAG.getNode(VEISD::GLOBAL_BASE_REG, DL, PtrTy); return DAG.getNode(ISD::ADD, DL, PtrTy, GlobalBase, HiLo); } Register VETargetLowering::prepareMBB(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineBasicBlock *TargetBB, const DebugLoc &DL) const { MachineFunction *MF = MBB.getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); const VEInstrInfo *TII = Subtarget->getInstrInfo(); const TargetRegisterClass *RC = &VE::I64RegClass; Register Tmp1 = MRI.createVirtualRegister(RC); Register Tmp2 = MRI.createVirtualRegister(RC); Register Result = MRI.createVirtualRegister(RC); if (isPositionIndependent()) { // Create following instructions for local linkage PIC code. // lea %Tmp1, TargetBB@gotoff_lo // and %Tmp2, %Tmp1, (32)0 // lea.sl %Result, TargetBB@gotoff_hi(%Tmp2, %s15) ; %s15 is GOT BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1) .addImm(0) .addImm(0) .addMBB(TargetBB, VEMCExpr::VK_VE_GOTOFF_LO32); BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2) .addReg(Tmp1, getKillRegState(true)) .addImm(M0(32)); BuildMI(MBB, I, DL, TII->get(VE::LEASLrri), Result) .addReg(VE::SX15) .addReg(Tmp2, getKillRegState(true)) .addMBB(TargetBB, VEMCExpr::VK_VE_GOTOFF_HI32); } else { // Create following instructions for non-PIC code. // lea %Tmp1, TargetBB@lo // and %Tmp2, %Tmp1, (32)0 // lea.sl %Result, TargetBB@hi(%Tmp2) BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1) .addImm(0) .addImm(0) .addMBB(TargetBB, VEMCExpr::VK_VE_LO32); BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2) .addReg(Tmp1, getKillRegState(true)) .addImm(M0(32)); BuildMI(MBB, I, DL, TII->get(VE::LEASLrii), Result) .addReg(Tmp2, getKillRegState(true)) .addImm(0) .addMBB(TargetBB, VEMCExpr::VK_VE_HI32); } return Result; } Register VETargetLowering::prepareSymbol(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, StringRef Symbol, const DebugLoc &DL, bool IsLocal = false, bool IsCall = false) const { MachineFunction *MF = MBB.getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); const VEInstrInfo *TII = Subtarget->getInstrInfo(); const TargetRegisterClass *RC = &VE::I64RegClass; Register Result = MRI.createVirtualRegister(RC); if (isPositionIndependent()) { if (IsCall && !IsLocal) { // Create following instructions for non-local linkage PIC code function // calls. These instructions uses IC and magic number -24, so we expand // them in VEAsmPrinter.cpp from GETFUNPLT pseudo instruction. // lea %Reg, Symbol@plt_lo(-24) // and %Reg, %Reg, (32)0 // sic %s16 // lea.sl %Result, Symbol@plt_hi(%Reg, %s16) ; %s16 is PLT BuildMI(MBB, I, DL, TII->get(VE::GETFUNPLT), Result) .addExternalSymbol("abort"); } else if (IsLocal) { Register Tmp1 = MRI.createVirtualRegister(RC); Register Tmp2 = MRI.createVirtualRegister(RC); // Create following instructions for local linkage PIC code. // lea %Tmp1, Symbol@gotoff_lo // and %Tmp2, %Tmp1, (32)0 // lea.sl %Result, Symbol@gotoff_hi(%Tmp2, %s15) ; %s15 is GOT BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1) .addImm(0) .addImm(0) .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_GOTOFF_LO32); BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2) .addReg(Tmp1, getKillRegState(true)) .addImm(M0(32)); BuildMI(MBB, I, DL, TII->get(VE::LEASLrri), Result) .addReg(VE::SX15) .addReg(Tmp2, getKillRegState(true)) .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_GOTOFF_HI32); } else { Register Tmp1 = MRI.createVirtualRegister(RC); Register Tmp2 = MRI.createVirtualRegister(RC); // Create following instructions for not local linkage PIC code. // lea %Tmp1, Symbol@got_lo // and %Tmp2, %Tmp1, (32)0 // lea.sl %Tmp3, Symbol@gotoff_hi(%Tmp2, %s15) ; %s15 is GOT // ld %Result, 0(%Tmp3) Register Tmp3 = MRI.createVirtualRegister(RC); BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1) .addImm(0) .addImm(0) .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_GOT_LO32); BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2) .addReg(Tmp1, getKillRegState(true)) .addImm(M0(32)); BuildMI(MBB, I, DL, TII->get(VE::LEASLrri), Tmp3) .addReg(VE::SX15) .addReg(Tmp2, getKillRegState(true)) .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_GOT_HI32); BuildMI(MBB, I, DL, TII->get(VE::LDrii), Result) .addReg(Tmp3, getKillRegState(true)) .addImm(0) .addImm(0); } } else { Register Tmp1 = MRI.createVirtualRegister(RC); Register Tmp2 = MRI.createVirtualRegister(RC); // Create following instructions for non-PIC code. // lea %Tmp1, Symbol@lo // and %Tmp2, %Tmp1, (32)0 // lea.sl %Result, Symbol@hi(%Tmp2) BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1) .addImm(0) .addImm(0) .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_LO32); BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2) .addReg(Tmp1, getKillRegState(true)) .addImm(M0(32)); BuildMI(MBB, I, DL, TII->get(VE::LEASLrii), Result) .addReg(Tmp2, getKillRegState(true)) .addImm(0) .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_HI32); } return Result; } void VETargetLowering::setupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB, MachineBasicBlock *DispatchBB, int FI, int Offset) const { DebugLoc DL = MI.getDebugLoc(); const VEInstrInfo *TII = Subtarget->getInstrInfo(); Register LabelReg = prepareMBB(*MBB, MachineBasicBlock::iterator(MI), DispatchBB, DL); // Store an address of DispatchBB to a given jmpbuf[1] where has next IC // referenced by longjmp (throw) later. MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(VE::STrii)); addFrameReference(MIB, FI, Offset); // jmpbuf[1] MIB.addReg(LabelReg, getKillRegState(true)); } MachineBasicBlock * VETargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI.getDebugLoc(); MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget->getInstrInfo(); const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); const BasicBlock *BB = MBB->getBasicBlock(); MachineFunction::iterator I = ++MBB->getIterator(); // Memory Reference. SmallVector MMOs(MI.memoperands_begin(), MI.memoperands_end()); Register BufReg = MI.getOperand(1).getReg(); Register DstReg; DstReg = MI.getOperand(0).getReg(); const TargetRegisterClass *RC = MRI.getRegClass(DstReg); assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!"); (void)TRI; Register MainDestReg = MRI.createVirtualRegister(RC); Register RestoreDestReg = MRI.createVirtualRegister(RC); // For `v = call @llvm.eh.sjlj.setjmp(buf)`, we generate following // instructions. SP/FP must be saved in jmpbuf before `llvm.eh.sjlj.setjmp`. // // ThisMBB: // buf[3] = %s17 iff %s17 is used as BP // buf[1] = RestoreMBB as IC after longjmp // # SjLjSetup RestoreMBB // // MainMBB: // v_main = 0 // // SinkMBB: // v = phi(v_main, MainMBB, v_restore, RestoreMBB) // ... // // RestoreMBB: // %s17 = buf[3] = iff %s17 is used as BP // v_restore = 1 // goto SinkMBB MachineBasicBlock *ThisMBB = MBB; MachineBasicBlock *MainMBB = MF->CreateMachineBasicBlock(BB); MachineBasicBlock *SinkMBB = MF->CreateMachineBasicBlock(BB); MachineBasicBlock *RestoreMBB = MF->CreateMachineBasicBlock(BB); MF->insert(I, MainMBB); MF->insert(I, SinkMBB); MF->push_back(RestoreMBB); RestoreMBB->setHasAddressTaken(); // Transfer the remainder of BB and its successor edges to SinkMBB. SinkMBB->splice(SinkMBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), MBB->end()); SinkMBB->transferSuccessorsAndUpdatePHIs(MBB); // ThisMBB: Register LabelReg = prepareMBB(*MBB, MachineBasicBlock::iterator(MI), RestoreMBB, DL); // Store BP in buf[3] iff this function is using BP. const VEFrameLowering *TFI = Subtarget->getFrameLowering(); if (TFI->hasBP(*MF)) { MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(VE::STrii)); MIB.addReg(BufReg); MIB.addImm(0); MIB.addImm(24); MIB.addReg(VE::SX17); MIB.setMemRefs(MMOs); } // Store IP in buf[1]. MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(VE::STrii)); MIB.add(MI.getOperand(1)); // we can preserve the kill flags here. MIB.addImm(0); MIB.addImm(8); MIB.addReg(LabelReg, getKillRegState(true)); MIB.setMemRefs(MMOs); // SP/FP are already stored in jmpbuf before `llvm.eh.sjlj.setjmp`. // Insert setup. MIB = BuildMI(*ThisMBB, MI, DL, TII->get(VE::EH_SjLj_Setup)).addMBB(RestoreMBB); const VERegisterInfo *RegInfo = Subtarget->getRegisterInfo(); MIB.addRegMask(RegInfo->getNoPreservedMask()); ThisMBB->addSuccessor(MainMBB); ThisMBB->addSuccessor(RestoreMBB); // MainMBB: BuildMI(MainMBB, DL, TII->get(VE::LEAzii), MainDestReg) .addImm(0) .addImm(0) .addImm(0); MainMBB->addSuccessor(SinkMBB); // SinkMBB: BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(VE::PHI), DstReg) .addReg(MainDestReg) .addMBB(MainMBB) .addReg(RestoreDestReg) .addMBB(RestoreMBB); // RestoreMBB: // Restore BP from buf[3] iff this function is using BP. The address of // buf is in SX10. // FIXME: Better to not use SX10 here if (TFI->hasBP(*MF)) { MachineInstrBuilder MIB = BuildMI(RestoreMBB, DL, TII->get(VE::LDrii), VE::SX17); MIB.addReg(VE::SX10); MIB.addImm(0); MIB.addImm(24); MIB.setMemRefs(MMOs); } BuildMI(RestoreMBB, DL, TII->get(VE::LEAzii), RestoreDestReg) .addImm(0) .addImm(0) .addImm(1); BuildMI(RestoreMBB, DL, TII->get(VE::BRCFLa_t)).addMBB(SinkMBB); RestoreMBB->addSuccessor(SinkMBB); MI.eraseFromParent(); return SinkMBB; } MachineBasicBlock * VETargetLowering::emitEHSjLjLongJmp(MachineInstr &MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI.getDebugLoc(); MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget->getInstrInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); // Memory Reference. SmallVector MMOs(MI.memoperands_begin(), MI.memoperands_end()); Register BufReg = MI.getOperand(0).getReg(); Register Tmp = MRI.createVirtualRegister(&VE::I64RegClass); // Since FP is only updated here but NOT referenced, it's treated as GPR. Register FP = VE::SX9; Register SP = VE::SX11; MachineInstrBuilder MIB; MachineBasicBlock *ThisMBB = MBB; // For `call @llvm.eh.sjlj.longjmp(buf)`, we generate following instructions. // // ThisMBB: // %fp = load buf[0] // %jmp = load buf[1] // %s10 = buf ; Store an address of buf to SX10 for RestoreMBB // %sp = load buf[2] ; generated by llvm.eh.sjlj.setjmp. // jmp %jmp // Reload FP. MIB = BuildMI(*ThisMBB, MI, DL, TII->get(VE::LDrii), FP); MIB.addReg(BufReg); MIB.addImm(0); MIB.addImm(0); MIB.setMemRefs(MMOs); // Reload IP. MIB = BuildMI(*ThisMBB, MI, DL, TII->get(VE::LDrii), Tmp); MIB.addReg(BufReg); MIB.addImm(0); MIB.addImm(8); MIB.setMemRefs(MMOs); // Copy BufReg to SX10 for later use in setjmp. // FIXME: Better to not use SX10 here BuildMI(*ThisMBB, MI, DL, TII->get(VE::ORri), VE::SX10) .addReg(BufReg) .addImm(0); // Reload SP. MIB = BuildMI(*ThisMBB, MI, DL, TII->get(VE::LDrii), SP); MIB.add(MI.getOperand(0)); // we can preserve the kill flags here. MIB.addImm(0); MIB.addImm(16); MIB.setMemRefs(MMOs); // Jump. BuildMI(*ThisMBB, MI, DL, TII->get(VE::BCFLari_t)) .addReg(Tmp, getKillRegState(true)) .addImm(0); MI.eraseFromParent(); return ThisMBB; } MachineBasicBlock * VETargetLowering::emitSjLjDispatchBlock(MachineInstr &MI, MachineBasicBlock *BB) const { DebugLoc DL = MI.getDebugLoc(); MachineFunction *MF = BB->getParent(); MachineFrameInfo &MFI = MF->getFrameInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); const VEInstrInfo *TII = Subtarget->getInstrInfo(); int FI = MFI.getFunctionContextIndex(); // Get a mapping of the call site numbers to all of the landing pads they're // associated with. DenseMap> CallSiteNumToLPad; unsigned MaxCSNum = 0; for (auto &MBB : *MF) { if (!MBB.isEHPad()) continue; MCSymbol *Sym = nullptr; for (const auto &MI : MBB) { if (MI.isDebugInstr()) continue; assert(MI.isEHLabel() && "expected EH_LABEL"); Sym = MI.getOperand(0).getMCSymbol(); break; } if (!MF->hasCallSiteLandingPad(Sym)) continue; for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) { CallSiteNumToLPad[CSI].push_back(&MBB); MaxCSNum = std::max(MaxCSNum, CSI); } } // Get an ordered list of the machine basic blocks for the jump table. std::vector LPadList; SmallPtrSet InvokeBBs; LPadList.reserve(CallSiteNumToLPad.size()); for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) { for (auto &LP : CallSiteNumToLPad[CSI]) { LPadList.push_back(LP); InvokeBBs.insert(LP->pred_begin(), LP->pred_end()); } } assert(!LPadList.empty() && "No landing pad destinations for the dispatch jump table!"); // The %fn_context is allocated like below (from --print-after=sjljehprepare): // %fn_context = alloca { i8*, i64, [4 x i64], i8*, i8*, [5 x i8*] } // // This `[5 x i8*]` is jmpbuf, so jmpbuf[1] is FI+72. // First `i64` is callsite, so callsite is FI+8. static const int OffsetIC = 72; static const int OffsetCS = 8; // Create the MBBs for the dispatch code like following: // // ThisMBB: // Prepare DispatchBB address and store it to buf[1]. // ... // // DispatchBB: // %s15 = GETGOT iff isPositionIndependent // %callsite = load callsite // brgt.l.t #size of callsites, %callsite, DispContBB // // TrapBB: // Call abort. // // DispContBB: // %breg = address of jump table // %pc = load and calculate next pc from %breg and %callsite // jmp %pc // Shove the dispatch's address into the return slot in the function context. MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock(); DispatchBB->setIsEHPad(true); // Trap BB will causes trap like `assert(0)`. MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); DispatchBB->addSuccessor(TrapBB); MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock(); DispatchBB->addSuccessor(DispContBB); // Insert MBBs. MF->push_back(DispatchBB); MF->push_back(DispContBB); MF->push_back(TrapBB); // Insert code to call abort in the TrapBB. Register Abort = prepareSymbol(*TrapBB, TrapBB->end(), "abort", DL, /* Local */ false, /* Call */ true); BuildMI(TrapBB, DL, TII->get(VE::BSICrii), VE::SX10) .addReg(Abort, getKillRegState(true)) .addImm(0) .addImm(0); // Insert code into the entry block that creates and registers the function // context. setupEntryBlockForSjLj(MI, BB, DispatchBB, FI, OffsetIC); // Create the jump table and associated information unsigned JTE = getJumpTableEncoding(); MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE); unsigned MJTI = JTI->createJumpTableIndex(LPadList); const VERegisterInfo &RI = TII->getRegisterInfo(); // Add a register mask with no preserved registers. This results in all // registers being marked as clobbered. BuildMI(DispatchBB, DL, TII->get(VE::NOP)) .addRegMask(RI.getNoPreservedMask()); if (isPositionIndependent()) { // Force to generate GETGOT, since current implementation doesn't store GOT // register. BuildMI(DispatchBB, DL, TII->get(VE::GETGOT), VE::SX15); } // IReg is used as an index in a memory operand and therefore can't be SP const TargetRegisterClass *RC = &VE::I64RegClass; Register IReg = MRI.createVirtualRegister(RC); addFrameReference(BuildMI(DispatchBB, DL, TII->get(VE::LDLZXrii), IReg), FI, OffsetCS); if (LPadList.size() < 64) { BuildMI(DispatchBB, DL, TII->get(VE::BRCFLir_t)) .addImm(VECC::CC_ILE) .addImm(LPadList.size()) .addReg(IReg) .addMBB(TrapBB); } else { assert(LPadList.size() <= 0x7FFFFFFF && "Too large Landing Pad!"); Register TmpReg = MRI.createVirtualRegister(RC); BuildMI(DispatchBB, DL, TII->get(VE::LEAzii), TmpReg) .addImm(0) .addImm(0) .addImm(LPadList.size()); BuildMI(DispatchBB, DL, TII->get(VE::BRCFLrr_t)) .addImm(VECC::CC_ILE) .addReg(TmpReg, getKillRegState(true)) .addReg(IReg) .addMBB(TrapBB); } Register BReg = MRI.createVirtualRegister(RC); Register Tmp1 = MRI.createVirtualRegister(RC); Register Tmp2 = MRI.createVirtualRegister(RC); if (isPositionIndependent()) { // Create following instructions for local linkage PIC code. // lea %Tmp1, .LJTI0_0@gotoff_lo // and %Tmp2, %Tmp1, (32)0 // lea.sl %BReg, .LJTI0_0@gotoff_hi(%Tmp2, %s15) ; %s15 is GOT BuildMI(DispContBB, DL, TII->get(VE::LEAzii), Tmp1) .addImm(0) .addImm(0) .addJumpTableIndex(MJTI, VEMCExpr::VK_VE_GOTOFF_LO32); BuildMI(DispContBB, DL, TII->get(VE::ANDrm), Tmp2) .addReg(Tmp1, getKillRegState(true)) .addImm(M0(32)); BuildMI(DispContBB, DL, TII->get(VE::LEASLrri), BReg) .addReg(VE::SX15) .addReg(Tmp2, getKillRegState(true)) .addJumpTableIndex(MJTI, VEMCExpr::VK_VE_GOTOFF_HI32); } else { // Create following instructions for non-PIC code. // lea %Tmp1, .LJTI0_0@lo // and %Tmp2, %Tmp1, (32)0 // lea.sl %BReg, .LJTI0_0@hi(%Tmp2) BuildMI(DispContBB, DL, TII->get(VE::LEAzii), Tmp1) .addImm(0) .addImm(0) .addJumpTableIndex(MJTI, VEMCExpr::VK_VE_LO32); BuildMI(DispContBB, DL, TII->get(VE::ANDrm), Tmp2) .addReg(Tmp1, getKillRegState(true)) .addImm(M0(32)); BuildMI(DispContBB, DL, TII->get(VE::LEASLrii), BReg) .addReg(Tmp2, getKillRegState(true)) .addImm(0) .addJumpTableIndex(MJTI, VEMCExpr::VK_VE_HI32); } switch (JTE) { case MachineJumpTableInfo::EK_BlockAddress: { // Generate simple block address code for no-PIC model. // sll %Tmp1, %IReg, 3 // lds %TReg, 0(%Tmp1, %BReg) // bcfla %TReg Register TReg = MRI.createVirtualRegister(RC); Register Tmp1 = MRI.createVirtualRegister(RC); BuildMI(DispContBB, DL, TII->get(VE::SLLri), Tmp1) .addReg(IReg, getKillRegState(true)) .addImm(3); BuildMI(DispContBB, DL, TII->get(VE::LDrri), TReg) .addReg(BReg, getKillRegState(true)) .addReg(Tmp1, getKillRegState(true)) .addImm(0); BuildMI(DispContBB, DL, TII->get(VE::BCFLari_t)) .addReg(TReg, getKillRegState(true)) .addImm(0); break; } case MachineJumpTableInfo::EK_Custom32: { // Generate block address code using differences from the function pointer // for PIC model. // sll %Tmp1, %IReg, 2 // ldl.zx %OReg, 0(%Tmp1, %BReg) // Prepare function address in BReg2. // adds.l %TReg, %BReg2, %OReg // bcfla %TReg assert(isPositionIndependent()); Register OReg = MRI.createVirtualRegister(RC); Register TReg = MRI.createVirtualRegister(RC); Register Tmp1 = MRI.createVirtualRegister(RC); BuildMI(DispContBB, DL, TII->get(VE::SLLri), Tmp1) .addReg(IReg, getKillRegState(true)) .addImm(2); BuildMI(DispContBB, DL, TII->get(VE::LDLZXrri), OReg) .addReg(BReg, getKillRegState(true)) .addReg(Tmp1, getKillRegState(true)) .addImm(0); Register BReg2 = prepareSymbol(*DispContBB, DispContBB->end(), DispContBB->getParent()->getName(), DL, /* Local */ true); BuildMI(DispContBB, DL, TII->get(VE::ADDSLrr), TReg) .addReg(OReg, getKillRegState(true)) .addReg(BReg2, getKillRegState(true)); BuildMI(DispContBB, DL, TII->get(VE::BCFLari_t)) .addReg(TReg, getKillRegState(true)) .addImm(0); break; } default: llvm_unreachable("Unexpected jump table encoding"); } // Add the jump table entries as successors to the MBB. SmallPtrSet SeenMBBs; for (auto &LP : LPadList) if (SeenMBBs.insert(LP).second) DispContBB->addSuccessor(LP); // N.B. the order the invoke BBs are processed in doesn't matter here. SmallVector MBBLPads; const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs(); for (MachineBasicBlock *MBB : InvokeBBs) { // Remove the landing pad successor from the invoke block and replace it // with the new dispatch block. // Keep a copy of Successors since it's modified inside the loop. SmallVector Successors(MBB->succ_rbegin(), MBB->succ_rend()); // FIXME: Avoid quadratic complexity. for (auto MBBS : Successors) { if (MBBS->isEHPad()) { MBB->removeSuccessor(MBBS); MBBLPads.push_back(MBBS); } } MBB->addSuccessor(DispatchBB); // Find the invoke call and mark all of the callee-saved registers as // 'implicit defined' so that they're spilled. This prevents code from // moving instructions to before the EH block, where they will never be // executed. for (auto &II : reverse(*MBB)) { if (!II.isCall()) continue; DenseMap DefRegs; for (auto &MOp : II.operands()) if (MOp.isReg()) DefRegs[MOp.getReg()] = true; MachineInstrBuilder MIB(*MF, &II); for (unsigned RI = 0; SavedRegs[RI]; ++RI) { Register Reg = SavedRegs[RI]; if (!DefRegs[Reg]) MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead); } break; } } // Mark all former landing pads as non-landing pads. The dispatch is the only // landing pad now. for (auto &LP : MBBLPads) LP->setIsEHPad(false); // The instruction is gone now. MI.eraseFromParent(); return BB; } MachineBasicBlock * VETargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const { switch (MI.getOpcode()) { default: llvm_unreachable("Unknown Custom Instruction!"); case VE::EH_SjLj_LongJmp: return emitEHSjLjLongJmp(MI, BB); case VE::EH_SjLj_SetJmp: return emitEHSjLjSetJmp(MI, BB); case VE::EH_SjLj_Setup_Dispatch: return emitSjLjDispatchBlock(MI, BB); } } static bool isI32Insn(const SDNode *User, const SDNode *N) { switch (User->getOpcode()) { default: return false; case ISD::ADD: case ISD::SUB: case ISD::MUL: case ISD::SDIV: case ISD::UDIV: case ISD::SETCC: case ISD::SMIN: case ISD::SMAX: case ISD::SHL: case ISD::SRA: case ISD::BSWAP: case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: case ISD::BR_CC: case ISD::BITCAST: case ISD::ATOMIC_CMP_SWAP: case ISD::ATOMIC_SWAP: return true; case ISD::SRL: if (N->getOperand(0).getOpcode() != ISD::SRL) return true; // (srl (trunc (srl ...))) may be optimized by combining srl, so // doesn't optimize trunc now. return false; case ISD::SELECT_CC: if (User->getOperand(2).getNode() != N && User->getOperand(3).getNode() != N) return true; LLVM_FALLTHROUGH; case ISD::AND: case ISD::OR: case ISD::XOR: case ISD::SELECT: case ISD::CopyToReg: // Check all use of selections, bit operations, and copies. If all of them // are safe, optimize truncate to extract_subreg. for (SDNode::use_iterator UI = User->use_begin(), UE = User->use_end(); UI != UE; ++UI) { switch ((*UI)->getOpcode()) { default: // If the use is an instruction which treats the source operand as i32, // it is safe to avoid truncate here. if (isI32Insn(*UI, N)) continue; break; case ISD::ANY_EXTEND: case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: { // Special optimizations to the combination of ext and trunc. // (ext ... (select ... (trunc ...))) is safe to avoid truncate here // since this truncate instruction clears higher 32 bits which is filled // by one of ext instructions later. assert(N->getValueType(0) == MVT::i32 && "find truncate to not i32 integer"); if (User->getOpcode() == ISD::SELECT_CC || User->getOpcode() == ISD::SELECT) continue; break; } } return false; } return true; } } // Optimize TRUNCATE in DAG combining. Optimizing it in CUSTOM lower is // sometime too early. Optimizing it in DAG pattern matching in VEInstrInfo.td // is sometime too late. So, doing it at here. SDValue VETargetLowering::combineTRUNCATE(SDNode *N, DAGCombinerInfo &DCI) const { assert(N->getOpcode() == ISD::TRUNCATE && "Should be called with a TRUNCATE node"); SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); EVT VT = N->getValueType(0); // We prefer to do this when all types are legal. if (!DCI.isAfterLegalizeDAG()) return SDValue(); // Skip combine TRUNCATE atm if the operand of TRUNCATE might be a constant. if (N->getOperand(0)->getOpcode() == ISD::SELECT_CC && isa(N->getOperand(0)->getOperand(0)) && isa(N->getOperand(0)->getOperand(1))) return SDValue(); // Check all use of this TRUNCATE. for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE; ++UI) { SDNode *User = *UI; // Make sure that we're not going to replace TRUNCATE for non i32 // instructions. // // FIXME: Although we could sometimes handle this, and it does occur in // practice that one of the condition inputs to the select is also one of // the outputs, we currently can't deal with this. if (isI32Insn(User, N)) continue; return SDValue(); } SDValue SubI32 = DAG.getTargetConstant(VE::sub_i32, DL, MVT::i32); return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, VT, N->getOperand(0), SubI32), 0); } SDValue VETargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { switch (N->getOpcode()) { default: break; case ISD::TRUNCATE: return combineTRUNCATE(N, DCI); } return SDValue(); } //===----------------------------------------------------------------------===// // VE Inline Assembly Support //===----------------------------------------------------------------------===// VETargetLowering::ConstraintType VETargetLowering::getConstraintType(StringRef Constraint) const { if (Constraint.size() == 1) { switch (Constraint[0]) { default: break; case 'v': // vector registers return C_RegisterClass; } } return TargetLowering::getConstraintType(Constraint); } std::pair VETargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { const TargetRegisterClass *RC = nullptr; if (Constraint.size() == 1) { switch (Constraint[0]) { default: return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); case 'r': RC = &VE::I64RegClass; break; case 'v': RC = &VE::V64RegClass; break; } return std::make_pair(0U, RC); } return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); } //===----------------------------------------------------------------------===// // VE Target Optimization Support //===----------------------------------------------------------------------===// unsigned VETargetLowering::getMinimumJumpTableEntries() const { // Specify 8 for PIC model to relieve the impact of PIC load instructions. if (isJumpTableRelative()) return 8; return TargetLowering::getMinimumJumpTableEntries(); } bool VETargetLowering::hasAndNot(SDValue Y) const { EVT VT = Y.getValueType(); // VE doesn't have vector and not instruction. if (VT.isVector()) return false; // VE allows different immediate values for X and Y where ~X & Y. // Only simm7 works for X, and only mimm works for Y on VE. However, this // function is used to check whether an immediate value is OK for and-not // instruction as both X and Y. Generating additional instruction to // retrieve an immediate value is no good since the purpose of this // function is to convert a series of 3 instructions to another series of // 3 instructions with better parallelism. Therefore, we return false // for all immediate values now. // FIXME: Change hasAndNot function to have two operands to make it work // correctly with Aurora VE. if (isa(Y)) return false; // It's ok for generic registers. return true; } /// \returns the VVP_* SDNode opcode corresponsing to \p OC. static Optional getVVPOpcode(unsigned Opcode) { switch (Opcode) { #define HANDLE_VP_TO_VVP(VPOPC, VVPNAME) \ case ISD::VPOPC: \ return VEISD::VVPNAME; #define ADD_VVP_OP(VVPNAME, SDNAME) \ case VEISD::VVPNAME: \ case ISD::SDNAME: \ return VEISD::VVPNAME; #include "VVPNodes.def" } return None; } SDValue VETargetLowering::lowerToVVP(SDValue Op, SelectionDAG &DAG) const { // Can we represent this as a VVP node. const unsigned Opcode = Op->getOpcode(); auto VVPOpcodeOpt = getVVPOpcode(Opcode); if (!VVPOpcodeOpt.hasValue()) return SDValue(); unsigned VVPOpcode = VVPOpcodeOpt.getValue(); const bool FromVP = ISD::isVPOpcode(Opcode); // The representative and legalized vector type of this operation. SDLoc DL(Op); MVT MaskVT = MVT::v256i1; // TODO: packed mode. EVT OpVecVT = Op.getValueType(); EVT LegalVecVT = getTypeToTransformTo(*DAG.getContext(), OpVecVT); SDValue AVL; SDValue Mask; if (FromVP) { // All upstream VP SDNodes always have a mask and avl. auto MaskIdx = ISD::getVPMaskIdx(Opcode).getValue(); auto AVLIdx = ISD::getVPExplicitVectorLengthIdx(Opcode).getValue(); Mask = Op->getOperand(MaskIdx); AVL = Op->getOperand(AVLIdx); } else { // Materialize the VL parameter. AVL = DAG.getConstant(OpVecVT.getVectorNumElements(), DL, MVT::i32); SDValue ConstTrue = DAG.getConstant(1, DL, MVT::i32); Mask = DAG.getNode(VEISD::VEC_BROADCAST, DL, MaskVT, ConstTrue); // emit a VEISD::VEC_BROADCAST here. } // Categories we are interested in. bool IsBinaryOp = false; switch (VVPOpcode) { #define ADD_BINARY_VVP_OP(VVPNAME, ...) \ case VEISD::VVPNAME: \ IsBinaryOp = true; \ break; #include "VVPNodes.def" } if (IsBinaryOp) { assert(LegalVecVT.isSimple()); return DAG.getNode(VVPOpcode, DL, LegalVecVT, Op->getOperand(0), Op->getOperand(1), Mask, AVL); } llvm_unreachable("lowerToVVP called for unexpected SDNode."); } SDValue VETargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!"); MVT VT = Op.getOperand(0).getSimpleValueType(); // Special treatment for packed V64 types. assert(VT == MVT::v512i32 || VT == MVT::v512f32); // Example of codes: // %packed_v = extractelt %vr, %idx / 2 // %v = %packed_v >> (%idx % 2 * 32) // %res = %v & 0xffffffff SDValue Vec = Op.getOperand(0); SDValue Idx = Op.getOperand(1); SDLoc DL(Op); SDValue Result = Op; if (0 /* Idx->isConstant() */) { // TODO: optimized implementation using constant values } else { SDValue Const1 = DAG.getConstant(1, DL, MVT::i64); SDValue HalfIdx = DAG.getNode(ISD::SRL, DL, MVT::i64, {Idx, Const1}); SDValue PackedElt = SDValue(DAG.getMachineNode(VE::LVSvr, DL, MVT::i64, {Vec, HalfIdx}), 0); SDValue AndIdx = DAG.getNode(ISD::AND, DL, MVT::i64, {Idx, Const1}); SDValue Shift = DAG.getNode(ISD::XOR, DL, MVT::i64, {AndIdx, Const1}); SDValue Const5 = DAG.getConstant(5, DL, MVT::i64); Shift = DAG.getNode(ISD::SHL, DL, MVT::i64, {Shift, Const5}); PackedElt = DAG.getNode(ISD::SRL, DL, MVT::i64, {PackedElt, Shift}); SDValue Mask = DAG.getConstant(0xFFFFFFFFL, DL, MVT::i64); PackedElt = DAG.getNode(ISD::AND, DL, MVT::i64, {PackedElt, Mask}); SDValue SubI32 = DAG.getTargetConstant(VE::sub_i32, DL, MVT::i32); Result = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::i32, PackedElt, SubI32), 0); if (Op.getSimpleValueType() == MVT::f32) { Result = DAG.getBitcast(MVT::f32, Result); } else { assert(Op.getSimpleValueType() == MVT::i32); } } return Result; } SDValue VETargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!"); MVT VT = Op.getOperand(0).getSimpleValueType(); // Special treatment for packed V64 types. assert(VT == MVT::v512i32 || VT == MVT::v512f32); // The v512i32 and v512f32 starts from upper bits (0..31). This "upper // bits" required `val << 32` from C implementation's point of view. // // Example of codes: // %packed_elt = extractelt %vr, (%idx >> 1) // %shift = ((%idx & 1) ^ 1) << 5 // %packed_elt &= 0xffffffff00000000 >> shift // %packed_elt |= (zext %val) << shift // %vr = insertelt %vr, %packed_elt, (%idx >> 1) SDLoc DL(Op); SDValue Vec = Op.getOperand(0); SDValue Val = Op.getOperand(1); SDValue Idx = Op.getOperand(2); if (Idx.getSimpleValueType() == MVT::i32) Idx = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Idx); if (Val.getSimpleValueType() == MVT::f32) Val = DAG.getBitcast(MVT::i32, Val); assert(Val.getSimpleValueType() == MVT::i32); Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val); SDValue Result = Op; if (0 /* Idx->isConstant()*/) { // TODO: optimized implementation using constant values } else { SDValue Const1 = DAG.getConstant(1, DL, MVT::i64); SDValue HalfIdx = DAG.getNode(ISD::SRL, DL, MVT::i64, {Idx, Const1}); SDValue PackedElt = SDValue(DAG.getMachineNode(VE::LVSvr, DL, MVT::i64, {Vec, HalfIdx}), 0); SDValue AndIdx = DAG.getNode(ISD::AND, DL, MVT::i64, {Idx, Const1}); SDValue Shift = DAG.getNode(ISD::XOR, DL, MVT::i64, {AndIdx, Const1}); SDValue Const5 = DAG.getConstant(5, DL, MVT::i64); Shift = DAG.getNode(ISD::SHL, DL, MVT::i64, {Shift, Const5}); SDValue Mask = DAG.getConstant(0xFFFFFFFF00000000L, DL, MVT::i64); Mask = DAG.getNode(ISD::SRL, DL, MVT::i64, {Mask, Shift}); PackedElt = DAG.getNode(ISD::AND, DL, MVT::i64, {PackedElt, Mask}); Val = DAG.getNode(ISD::SHL, DL, MVT::i64, {Val, Shift}); PackedElt = DAG.getNode(ISD::OR, DL, MVT::i64, {PackedElt, Val}); Result = SDValue(DAG.getMachineNode(VE::LSVrr_v, DL, Vec.getSimpleValueType(), {HalfIdx, PackedElt, Vec}), 0); } return Result; }