//===--- AMDGPUPropagateAttributes.cpp --------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief This pass propagates attributes from kernels to the non-entry
/// functions. Most of the library functions were not compiled for a specific
/// ABI, yet will be correctly compiled if proper attributes are propagated from
/// the caller.
///
/// The pass analyzes the call graph and propagates ABI target features through
/// the call graph.
///
/// It can run in two modes: as a function or module pass. A function pass
/// simply propagates attributes. A module pass clones functions if there are
/// callers with different ABI. If a function is cloned all call sites will
/// be updated to use a correct clone.
///
/// A function pass is limited in functionality but can run early in the
/// pipeline. A module pass is more powerful but has to run late, so misses
/// library folding opportunities.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/Cloning.h"

#define DEBUG_TYPE "amdgpu-propagate-attributes"

using namespace llvm;

namespace llvm {
extern const SubtargetFeatureKV AMDGPUFeatureKV[AMDGPU::NumSubtargetFeatures-1];
}

namespace {

// Target features to propagate.
static constexpr const FeatureBitset TargetFeatures = { AMDGPU::FeatureWavefrontSize16, AMDGPU::FeatureWavefrontSize32, AMDGPU::FeatureWavefrontSize64 }; // Attributes to propagate. // TODO: Support conservative min/max merging instead of cloning. static constexpr const char* AttributeNames[] = { "amdgpu-waves-per-eu", "amdgpu-flat-work-group-size" }; static constexpr unsigned NumAttr = sizeof(AttributeNames) / sizeof(AttributeNames[0]); class AMDGPUPropagateAttributes { class FnProperties { private: explicit FnProperties(const FeatureBitset &&FB) : Features(FB) {} public: explicit FnProperties(const TargetMachine &TM, const Function &F) { Features = TM.getSubtargetImpl(F)->getFeatureBits(); for (unsigned I = 0; I < NumAttr; ++I) if (F.hasFnAttribute(AttributeNames[I])) Attributes[I] = F.getFnAttribute(AttributeNames[I]); } bool operator == (const FnProperties &Other) const { if ((Features & TargetFeatures) != (Other.Features & TargetFeatures)) return false; for (unsigned I = 0; I < NumAttr; ++I) if (Attributes[I] != Other.Attributes[I]) return false; return true; } FnProperties adjustToCaller(const FnProperties &CallerProps) const { FnProperties New((Features & ~TargetFeatures) | CallerProps.Features); for (unsigned I = 0; I < NumAttr; ++I) New.Attributes[I] = CallerProps.Attributes[I]; return New; } FeatureBitset Features; Optional Attributes[NumAttr]; }; class Clone { public: Clone(const FnProperties &Props, Function *OrigF, Function *NewF) : Properties(Props), OrigF(OrigF), NewF(NewF) {} FnProperties Properties; Function *OrigF; Function *NewF; }; const TargetMachine *TM; // Clone functions as needed or just set attributes. bool AllowClone; // Option propagation roots. SmallSet Roots; // Clones of functions with their attributes. SmallVector Clones; // Find a clone with required features. Function *findFunction(const FnProperties &PropsNeeded, Function *OrigF); // Clone function \p F and set \p NewProps on the clone. // Cole takes the name of original function. 
Function *cloneWithProperties(Function &F, const FnProperties &NewProps); // Set new function's features in place. void setFeatures(Function &F, const FeatureBitset &NewFeatures); // Set new function's attributes in place. void setAttributes(Function &F, const ArrayRef> NewAttrs); std::string getFeatureString(const FeatureBitset &Features) const; // Propagate attributes from Roots. bool process(); public: AMDGPUPropagateAttributes(const TargetMachine *TM, bool AllowClone) : TM(TM), AllowClone(AllowClone) {} // Use F as a root and propagate its attributes. bool process(Function &F); // Propagate attributes starting from kernel functions. bool process(Module &M); }; // Allows to propagate attributes early, but no clonning is allowed as it must // be a function pass to run before any optimizations. // TODO: We shall only need a one instance of module pass, but that needs to be // in the linker pipeline which is currently not possible. class AMDGPUPropagateAttributesEarly : public FunctionPass { const TargetMachine *TM; public: static char ID; // Pass identification AMDGPUPropagateAttributesEarly(const TargetMachine *TM = nullptr) : FunctionPass(ID), TM(TM) { initializeAMDGPUPropagateAttributesEarlyPass( *PassRegistry::getPassRegistry()); } bool runOnFunction(Function &F) override; }; // Allows to propagate attributes with clonning but does that late in the // pipeline. class AMDGPUPropagateAttributesLate : public ModulePass { const TargetMachine *TM; public: static char ID; // Pass identification AMDGPUPropagateAttributesLate(const TargetMachine *TM = nullptr) : ModulePass(ID), TM(TM) { initializeAMDGPUPropagateAttributesLatePass( *PassRegistry::getPassRegistry()); } bool runOnModule(Module &M) override; }; } // end anonymous namespace. 
char AMDGPUPropagateAttributesEarly::ID = 0; char AMDGPUPropagateAttributesLate::ID = 0; INITIALIZE_PASS(AMDGPUPropagateAttributesEarly, "amdgpu-propagate-attributes-early", "Early propagate attributes from kernels to functions", false, false) INITIALIZE_PASS(AMDGPUPropagateAttributesLate, "amdgpu-propagate-attributes-late", "Late propagate attributes from kernels to functions", false, false) Function * AMDGPUPropagateAttributes::findFunction(const FnProperties &PropsNeeded, Function *OrigF) { // TODO: search for clone's clones. for (Clone &C : Clones) if (C.OrigF == OrigF && PropsNeeded == C.Properties) return C.NewF; return nullptr; } bool AMDGPUPropagateAttributes::process(Module &M) { for (auto &F : M.functions()) if (AMDGPU::isEntryFunctionCC(F.getCallingConv())) Roots.insert(&F); return process(); } bool AMDGPUPropagateAttributes::process(Function &F) { Roots.insert(&F); return process(); } bool AMDGPUPropagateAttributes::process() { bool Changed = false; SmallSet NewRoots; SmallSet Replaced; if (Roots.empty()) return false; Module &M = *(*Roots.begin())->getParent(); do { Roots.insert(NewRoots.begin(), NewRoots.end()); NewRoots.clear(); for (auto &F : M.functions()) { if (F.isDeclaration()) continue; const FnProperties CalleeProps(*TM, F); SmallVector, 32> ToReplace; SmallSet Visited; for (User *U : F.users()) { Instruction *I = dyn_cast(U); if (!I) continue; CallBase *CI = dyn_cast(I); if (!CI) continue; Function *Caller = CI->getCaller(); if (!Caller || !Visited.insert(CI).second) continue; if (!Roots.count(Caller) && !NewRoots.count(Caller)) continue; const FnProperties CallerProps(*TM, *Caller); if (CalleeProps == CallerProps) { if (!Roots.count(&F)) NewRoots.insert(&F); continue; } Function *NewF = findFunction(CallerProps, &F); if (!NewF) { const FnProperties NewProps = CalleeProps.adjustToCaller(CallerProps); if (!AllowClone) { // This may set different features on different iteartions if // there is a contradiction in callers' attributes. 
In this case // we rely on a second pass running on Module, which is allowed // to clone. setFeatures(F, NewProps.Features); setAttributes(F, NewProps.Attributes); NewRoots.insert(&F); Changed = true; break; } NewF = cloneWithProperties(F, NewProps); Clones.push_back(Clone(CallerProps, &F, NewF)); NewRoots.insert(NewF); } ToReplace.push_back(std::make_pair(CI, NewF)); Replaced.insert(&F); Changed = true; } while (!ToReplace.empty()) { auto R = ToReplace.pop_back_val(); R.first->setCalledFunction(R.second); } } } while (!NewRoots.empty()); for (Function *F : Replaced) { if (F->use_empty()) F->eraseFromParent(); } Roots.clear(); Clones.clear(); return Changed; } Function * AMDGPUPropagateAttributes::cloneWithProperties(Function &F, const FnProperties &NewProps) { LLVM_DEBUG(dbgs() << "Cloning " << F.getName() << '\n'); ValueToValueMapTy dummy; Function *NewF = CloneFunction(&F, dummy); setFeatures(*NewF, NewProps.Features); setAttributes(*NewF, NewProps.Attributes); NewF->setVisibility(GlobalValue::DefaultVisibility); NewF->setLinkage(GlobalValue::InternalLinkage); // Swap names. If that is the only clone it will retain the name of now // dead value. Preserve original name for externally visible functions. 
if (F.hasName() && F.hasLocalLinkage()) { std::string NewName = std::string(NewF->getName()); NewF->takeName(&F); F.setName(NewName); } return NewF; } void AMDGPUPropagateAttributes::setFeatures(Function &F, const FeatureBitset &NewFeatures) { std::string NewFeatureStr = getFeatureString(NewFeatures); LLVM_DEBUG(dbgs() << "Set features " << getFeatureString(NewFeatures & TargetFeatures) << " on " << F.getName() << '\n'); F.removeFnAttr("target-features"); F.addFnAttr("target-features", NewFeatureStr); } void AMDGPUPropagateAttributes::setAttributes(Function &F, const ArrayRef> NewAttrs) { LLVM_DEBUG(dbgs() << "Set attributes on " << F.getName() << ":\n"); for (unsigned I = 0; I < NumAttr; ++I) { F.removeFnAttr(AttributeNames[I]); if (NewAttrs[I]) { LLVM_DEBUG(dbgs() << '\t' << NewAttrs[I]->getAsString() << '\n'); F.addFnAttr(*NewAttrs[I]); } } } std::string AMDGPUPropagateAttributes::getFeatureString(const FeatureBitset &Features) const { std::string Ret; for (const SubtargetFeatureKV &KV : AMDGPUFeatureKV) { if (Features[KV.Value]) Ret += (StringRef("+") + KV.Key + ",").str(); else if (TargetFeatures[KV.Value]) Ret += (StringRef("-") + KV.Key + ",").str(); } Ret.pop_back(); // Remove last comma. 
return Ret; } bool AMDGPUPropagateAttributesEarly::runOnFunction(Function &F) { if (!TM) { auto *TPC = getAnalysisIfAvailable(); if (!TPC) return false; TM = &TPC->getTM(); } if (!AMDGPU::isEntryFunctionCC(F.getCallingConv())) return false; return AMDGPUPropagateAttributes(TM, false).process(F); } bool AMDGPUPropagateAttributesLate::runOnModule(Module &M) { if (!TM) { auto *TPC = getAnalysisIfAvailable(); if (!TPC) return false; TM = &TPC->getTM(); } return AMDGPUPropagateAttributes(TM, true).process(M); } FunctionPass *llvm::createAMDGPUPropagateAttributesEarlyPass(const TargetMachine *TM) { return new AMDGPUPropagateAttributesEarly(TM); } ModulePass *llvm::createAMDGPUPropagateAttributesLatePass(const TargetMachine *TM) { return new AMDGPUPropagateAttributesLate(TM); } PreservedAnalyses AMDGPUPropagateAttributesEarlyPass::run(Function &F, FunctionAnalysisManager &AM) { if (!AMDGPU::isEntryFunctionCC(F.getCallingConv())) return PreservedAnalyses::all(); return AMDGPUPropagateAttributes(&TM, false).process(F) ? PreservedAnalyses::none() : PreservedAnalyses::all(); } PreservedAnalyses AMDGPUPropagateAttributesLatePass::run(Module &M, ModuleAnalysisManager &AM) { return AMDGPUPropagateAttributes(&TM, true).process(M) ? PreservedAnalyses::none() : PreservedAnalyses::all(); }