474 lines
15 KiB
C++
474 lines
15 KiB
C++
|
//===- VPlanSLP.cpp - SLP Analysis based on VPlan -------------------------===//
|
||
|
//
|
||
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||
|
// See https://llvm.org/LICENSE.txt for license information.
|
||
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||
|
//
|
||
|
//===----------------------------------------------------------------------===//
|
||
|
/// This file implements SLP analysis based on VPlan. The analysis is based on
|
||
|
/// the ideas described in
|
||
|
///
|
||
|
/// Look-ahead SLP: auto-vectorization in the presence of commutative
|
||
|
/// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
|
||
|
/// Luís F. W. Góes
|
||
|
///
|
||
|
//===----------------------------------------------------------------------===//
|
||
|
|
||
|
#include "VPlan.h"
|
||
|
#include "llvm/ADT/DepthFirstIterator.h"
|
||
|
#include "llvm/ADT/PostOrderIterator.h"
|
||
|
#include "llvm/ADT/SmallVector.h"
|
||
|
#include "llvm/ADT/Twine.h"
|
||
|
#include "llvm/Analysis/LoopInfo.h"
|
||
|
#include "llvm/Analysis/VectorUtils.h"
|
||
|
#include "llvm/IR/BasicBlock.h"
|
||
|
#include "llvm/IR/CFG.h"
|
||
|
#include "llvm/IR/Dominators.h"
|
||
|
#include "llvm/IR/InstrTypes.h"
|
||
|
#include "llvm/IR/Instruction.h"
|
||
|
#include "llvm/IR/Instructions.h"
|
||
|
#include "llvm/IR/Type.h"
|
||
|
#include "llvm/IR/Value.h"
|
||
|
#include "llvm/Support/Casting.h"
|
||
|
#include "llvm/Support/Debug.h"
|
||
|
#include "llvm/Support/ErrorHandling.h"
|
||
|
#include "llvm/Support/GraphWriter.h"
|
||
|
#include "llvm/Support/raw_ostream.h"
|
||
|
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
|
||
|
#include <cassert>
|
||
|
#include <iterator>
|
||
|
#include <string>
|
||
|
#include <vector>
|
||
|
|
||
|
using namespace llvm;
|
||
|
|
||
|
#define DEBUG_TYPE "vplan-slp"
|
||
|
|
||
|
// Number of levels to look ahead when re-ordering multi node operands.
|
||
|
// Used by VPlanSlp::getBest as the exclusive upper bound of its look-ahead
// loop: candidates are scored at depths 1 up to LookaheadMaxDepth - 1.
static unsigned LookaheadMaxDepth = 5;
|
||
|
|
||
|
/// Records that the current SLP attempt could not be completed by clearing
/// CompletelySLP. Always returns nullptr, so callers can simply write
/// `return markFailed();`.
VPInstruction *VPlanSlp::markFailed() {
  // FIXME: Currently this is used to signal we hit instructions we cannot
  // trivially SLP'ize.
  CompletelySLP = false;
  return nullptr;
}
|
||
|
|
||
|
/// Registers \p New as the combined instruction for the bundle \p Operands.
///
/// If every operand has an underlying IR instruction, the scalar bit widths of
/// those instructions are summed and WidestBundleBits is raised to that total
/// when it is larger. The bundle is then inserted into BundleToCombined; a
/// bundle must not be registered twice.
void VPlanSlp::addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New) {
  const bool AllHaveUnderlying = all_of(Operands, [](VPValue *Op) {
    return cast<VPInstruction>(Op)->getUnderlyingInstr() != nullptr;
  });

  if (AllHaveUnderlying) {
    unsigned TotalBits = 0;
    for (VPValue *Op : Operands) {
      Type *ScalarTy = cast<VPInstruction>(Op)->getUnderlyingInstr()->getType();
      assert(!ScalarTy->isVectorTy() && "Only scalar types supported for now");
      TotalBits += ScalarTy->getScalarSizeInBits();
    }
    // Keep track of the widest bundle seen so far.
    if (TotalBits > WidestBundleBits)
      WidestBundleBits = TotalBits;
  }

  auto Inserted = BundleToCombined.try_emplace(to_vector<4>(Operands), New);
  assert(Inserted.second &&
         "Already created a combined instruction for the operand bundle");
  (void)Inserted;
}
|
||
|
|
||
|
/// Returns true if the bundle \p Operands can be combined into a single SLP
/// node. The checks, applied in order, are:
///  1. every operand is a non-null VPInstruction with an underlying IR
///     instruction;
///  2. all underlying instructions share the opcode and primitive type width
///     of the first operand;
///  3. every operand is defined in this->BB;
///  4. no operand has more than one unique user;
///  5. for loads: no instruction that may write to memory sits between the
///     first and the last load of the bundle, and all loads are simple;
///  6. for stores: all stores are simple.
bool VPlanSlp::areVectorizable(ArrayRef<VPValue *> Operands) const {
  // Currently we only support VPInstructions.
  const bool AllBackedByIR = all_of(Operands, [](VPValue *Op) {
    return Op && isa<VPInstruction>(Op) &&
           cast<VPInstruction>(Op)->getUnderlyingInstr();
  });
  if (!AllBackedByIR) {
    LLVM_DEBUG(dbgs() << "VPSLP: not all operands are VPInstructions\n");
    return false;
  }

  // Check if opcodes and type width agree for all instructions in the bundle.
  // FIXME: Differing widths/opcodes can be handled by inserting additional
  //        instructions.
  // FIXME: Deal with non-primitive types.
  const Instruction *FirstInstr =
      cast<VPInstruction>(Operands[0])->getUnderlyingInstr();
  unsigned Opcode = FirstInstr->getOpcode();
  unsigned Width = FirstInstr->getType()->getPrimitiveSizeInBits();
  for (VPValue *Op : Operands) {
    const Instruction *Instr = cast<VPInstruction>(Op)->getUnderlyingInstr();
    const bool Agrees = Instr->getOpcode() == Opcode &&
                        Instr->getType()->getPrimitiveSizeInBits() == Width;
    if (!Agrees) {
      LLVM_DEBUG(dbgs() << "VPSLP: Opcodes do not agree \n");
      return false;
    }
  }

  // For now, all operands must be defined in the same BB.
  for (VPValue *Op : Operands) {
    if (cast<VPInstruction>(Op)->getParent() != &this->BB) {
      LLVM_DEBUG(dbgs() << "VPSLP: operands in different BBs\n");
      return false;
    }
  }

  for (VPValue *Op : Operands) {
    if (Op->hasMoreThanOneUniqueUser()) {
      LLVM_DEBUG(dbgs() << "VPSLP: Some operands have multiple users.\n");
      return false;
    }
  }

  // For loads, check that there are no instructions writing to memory in
  // between them.
  // TODO: we only have to forbid instructions writing to memory that could
  //       interfere with any of the loads in the bundle
  if (Opcode == Instruction::Load) {
    unsigned LoadsSeen = 0;
    VPBasicBlock *Parent = cast<VPInstruction>(Operands[0])->getParent();
    for (auto &Recipe : *Parent) {
      auto *VPI = cast<VPInstruction>(&Recipe);
      if (VPI->getOpcode() == Instruction::Load &&
          llvm::is_contained(Operands, VPI))
        LoadsSeen++;

      // All loads of the bundle have been passed; nothing later matters.
      if (LoadsSeen == Operands.size())
        break;
      if (LoadsSeen > 0 && VPI->mayWriteToMemory()) {
        LLVM_DEBUG(
            dbgs() << "VPSLP: instruction modifying memory between loads\n");
        return false;
      }
    }

    for (VPValue *Op : Operands) {
      auto *Load =
          cast<LoadInst>(cast<VPInstruction>(Op)->getUnderlyingInstr());
      if (!Load->isSimple()) {
        LLVM_DEBUG(dbgs() << "VPSLP: only simple loads are supported.\n");
        return false;
      }
    }
  }

  if (Opcode == Instruction::Store) {
    for (VPValue *Op : Operands) {
      auto *Store =
          cast<StoreInst>(cast<VPInstruction>(Op)->getUnderlyingInstr());
      if (!Store->isSimple()) {
        LLVM_DEBUG(dbgs() << "VPSLP: only simple stores are supported.\n");
        return false;
      }
    }
  }

  return true;
}
|
||
|
|
||
|
/// Collects operand number \p OperandIndex of every value in \p Values, in
/// order. Every value must be a VPInstruction.
static SmallVector<VPValue *, 4> getOperands(ArrayRef<VPValue *> Values,
                                             unsigned OperandIndex) {
  SmallVector<VPValue *, 4> Result;
  Result.reserve(Values.size());
  // Currently we only support VPInstructions.
  for (VPValue *Val : Values)
    Result.push_back(cast<VPInstruction>(Val)->getOperand(OperandIndex));
  return Result;
}
|
||
|
|
||
|
/// Returns true if the opcode of the first value in \p Values is commutative.
/// Only the first value is inspected.
static bool areCommutative(ArrayRef<VPValue *> Values) {
  const unsigned FirstOpcode = cast<VPInstruction>(Values[0])->getOpcode();
  return Instruction::isCommutative(FirstOpcode);
}
|
||
|
|
||
|
/// Returns one operand bundle per operand position of the instructions in
/// \p Values. The opcode and operand count are taken from the first value.
/// For stores only operand 0 is collected; calling this on loads is invalid.
static SmallVector<SmallVector<VPValue *, 4>, 4>
getOperands(ArrayRef<VPValue *> Values) {
  auto *First = cast<VPInstruction>(Values[0]);
  const unsigned Opcode = First->getOpcode();

  if (Opcode == Instruction::Load)
    llvm_unreachable("Loads terminate a tree, no need to get operands");

  // Stores contribute only their first operand; everything else contributes
  // all of its operands.
  const unsigned NumBundles =
      Opcode == Instruction::Store ? 1 : First->getNumOperands();

  SmallVector<SmallVector<VPValue *, 4>, 4> PerOperand;
  for (unsigned Idx = 0; Idx < NumBundles; ++Idx)
    PerOperand.push_back(getOperands(Values, Idx));
  return PerOperand;
}
|
||
|
|
||
|
/// Returns the opcode of Values or ~0 if they do not all agree.
|
||
|
static Optional<unsigned> getOpcode(ArrayRef<VPValue *> Values) {
|
||
|
unsigned Opcode = cast<VPInstruction>(Values[0])->getOpcode();
|
||
|
if (any_of(Values, [Opcode](VPValue *V) {
|
||
|
return cast<VPInstruction>(V)->getOpcode() != Opcode;
|
||
|
}))
|
||
|
return None;
|
||
|
return {Opcode};
|
||
|
}
|
||
|
|
||
|
/// Returns true if A and B access sequential memory if they are loads or
|
||
|
/// stores or if they have identical opcodes otherwise.
|
||
|
static bool areConsecutiveOrMatch(VPInstruction *A, VPInstruction *B,
|
||
|
VPInterleavedAccessInfo &IAI) {
|
||
|
if (A->getOpcode() != B->getOpcode())
|
||
|
return false;
|
||
|
|
||
|
if (A->getOpcode() != Instruction::Load &&
|
||
|
A->getOpcode() != Instruction::Store)
|
||
|
return true;
|
||
|
auto *GA = IAI.getInterleaveGroup(A);
|
||
|
auto *GB = IAI.getInterleaveGroup(B);
|
||
|
|
||
|
return GA && GB && GA == GB && GA->getIndex(A) + 1 == GB->getIndex(B);
|
||
|
}
|
||
|
|
||
|
/// Implements getLAScore from Listing 7 in the paper.
|
||
|
/// Traverses and compares operands of V1 and V2 to MaxLevel.
|
||
|
static unsigned getLAScore(VPValue *V1, VPValue *V2, unsigned MaxLevel,
|
||
|
VPInterleavedAccessInfo &IAI) {
|
||
|
auto *I1 = dyn_cast<VPInstruction>(V1);
|
||
|
auto *I2 = dyn_cast<VPInstruction>(V2);
|
||
|
// Currently we only support VPInstructions.
|
||
|
if (!I1 || !I2)
|
||
|
return 0;
|
||
|
|
||
|
if (MaxLevel == 0)
|
||
|
return (unsigned)areConsecutiveOrMatch(I1, I2, IAI);
|
||
|
|
||
|
unsigned Score = 0;
|
||
|
for (unsigned I = 0, EV1 = I1->getNumOperands(); I < EV1; ++I)
|
||
|
for (unsigned J = 0, EV2 = I2->getNumOperands(); J < EV2; ++J)
|
||
|
Score +=
|
||
|
getLAScore(I1->getOperand(I), I2->getOperand(J), MaxLevel - 1, IAI);
|
||
|
return Score;
|
||
|
}
|
||
|
|
||
|
std::pair<VPlanSlp::OpMode, VPValue *>
|
||
|
VPlanSlp::getBest(OpMode Mode, VPValue *Last,
|
||
|
SmallPtrSetImpl<VPValue *> &Candidates,
|
||
|
VPInterleavedAccessInfo &IAI) {
|
||
|
assert((Mode == OpMode::Load || Mode == OpMode::Opcode) &&
|
||
|
"Currently we only handle load and commutative opcodes");
|
||
|
LLVM_DEBUG(dbgs() << " getBest\n");
|
||
|
|
||
|
SmallVector<VPValue *, 4> BestCandidates;
|
||
|
LLVM_DEBUG(dbgs() << " Candidates for "
|
||
|
<< *cast<VPInstruction>(Last)->getUnderlyingInstr() << " ");
|
||
|
for (auto *Candidate : Candidates) {
|
||
|
auto *LastI = cast<VPInstruction>(Last);
|
||
|
auto *CandidateI = cast<VPInstruction>(Candidate);
|
||
|
if (areConsecutiveOrMatch(LastI, CandidateI, IAI)) {
|
||
|
LLVM_DEBUG(dbgs() << *cast<VPInstruction>(Candidate)->getUnderlyingInstr()
|
||
|
<< " ");
|
||
|
BestCandidates.push_back(Candidate);
|
||
|
}
|
||
|
}
|
||
|
LLVM_DEBUG(dbgs() << "\n");
|
||
|
|
||
|
if (BestCandidates.empty())
|
||
|
return {OpMode::Failed, nullptr};
|
||
|
|
||
|
if (BestCandidates.size() == 1)
|
||
|
return {Mode, BestCandidates[0]};
|
||
|
|
||
|
VPValue *Best = nullptr;
|
||
|
unsigned BestScore = 0;
|
||
|
for (unsigned Depth = 1; Depth < LookaheadMaxDepth; Depth++) {
|
||
|
unsigned PrevScore = ~0u;
|
||
|
bool AllSame = true;
|
||
|
|
||
|
// FIXME: Avoid visiting the same operands multiple times.
|
||
|
for (auto *Candidate : BestCandidates) {
|
||
|
unsigned Score = getLAScore(Last, Candidate, Depth, IAI);
|
||
|
if (PrevScore == ~0u)
|
||
|
PrevScore = Score;
|
||
|
if (PrevScore != Score)
|
||
|
AllSame = false;
|
||
|
PrevScore = Score;
|
||
|
|
||
|
if (Score > BestScore) {
|
||
|
BestScore = Score;
|
||
|
Best = Candidate;
|
||
|
}
|
||
|
}
|
||
|
if (!AllSame)
|
||
|
break;
|
||
|
}
|
||
|
LLVM_DEBUG(dbgs() << "Found best "
|
||
|
<< *cast<VPInstruction>(Best)->getUnderlyingInstr()
|
||
|
<< "\n");
|
||
|
Candidates.erase(Best);
|
||
|
|
||
|
return {Mode, Best};
|
||
|
}
|
||
|
|
||
|
/// Re-orders the lanes of the operand bundles collected in MultiNodeOps.
///
/// Lane 0 of every bundle is kept as is. For each further lane, the values
/// found at that lane across all bundles form the candidate set, and each
/// bundle in turn asks getBest for the candidate that best follows the value
/// it chose for the previous lane. When no candidate is found the attempt is
/// marked as failed via markFailed(), a nullptr is appended to that bundle
/// and the bundle is skipped for the remaining lanes.
///
/// Returns one (placeholder instruction, re-ordered bundle) pair per entry of
/// MultiNodeOps, or an empty vector if MultiNodeOps is empty.
SmallVector<VPlanSlp::MultiNodeOpTy, 4> VPlanSlp::reorderMultiNodeOps() {
  SmallVector<MultiNodeOpTy, 4> FinalOrder;
  SmallVector<OpMode, 4> Mode;
  FinalOrder.reserve(MultiNodeOps.size());
  Mode.reserve(MultiNodeOps.size());

  LLVM_DEBUG(dbgs() << "Reordering multinode\n");

  // The lane count below is read from the first bundle, so bail out before
  // indexing into an empty list.
  if (MultiNodeOps.empty())
    return FinalOrder;

  for (auto &Operands : MultiNodeOps) {
    FinalOrder.push_back({Operands.first, {Operands.second[0]}});
    if (cast<VPInstruction>(Operands.second[0])->getOpcode() ==
        Instruction::Load)
      Mode.push_back(OpMode::Load);
    else
      Mode.push_back(OpMode::Opcode);
  }

  for (unsigned Lane = 1, E = MultiNodeOps[0].second.size(); Lane < E; ++Lane) {
    LLVM_DEBUG(dbgs() << " Finding best value for lane " << Lane << "\n");
    SmallPtrSet<VPValue *, 4> Candidates;
    LLVM_DEBUG(dbgs() << " Candidates ");
    // Iterate by reference: each entry owns a SmallVector that would
    // otherwise be copied on every iteration.
    for (const auto &Ops : MultiNodeOps) {
      LLVM_DEBUG(
          dbgs() << *cast<VPInstruction>(Ops.second[Lane])->getUnderlyingInstr()
                 << " ");
      Candidates.insert(Ops.second[Lane]);
    }
    LLVM_DEBUG(dbgs() << "\n");

    for (unsigned Op = 0, NumOps = MultiNodeOps.size(); Op < NumOps; ++Op) {
      LLVM_DEBUG(dbgs() << " Checking " << Op << "\n");
      if (Mode[Op] == OpMode::Failed)
        continue;

      VPValue *Last = FinalOrder[Op].second[Lane - 1];
      std::pair<OpMode, VPValue *> Res =
          getBest(Mode[Op], Last, Candidates, IAI);
      if (Res.second) {
        FinalOrder[Op].second.push_back(Res.second);
      } else {
        // TODO: handle this case
        FinalOrder[Op].second.push_back(markFailed());
        // Remember the failure so the nullptr just stored is never handed
        // back to getBest as the previous lane's value.
        Mode[Op] = OpMode::Failed;
      }
    }
  }

  return FinalOrder;
}
|
||
|
|
||
|
/// Prints the bundle \p Values to dbgs() on a single line. Each value that is
/// a VPInstruction with an underlying IR instruction is printed as that
/// instruction; null values and VPInstructions without an underlying
/// instruction are printed as " nullptr".
void VPlanSlp::dumpBundle(ArrayRef<VPValue *> Values) {
  raw_ostream &OS = dbgs();
  OS << " Ops: ";
  for (VPValue *Val : Values) {
    auto *VPI = cast_or_null<VPInstruction>(Val);
    auto *Underlying = VPI ? VPI->getUnderlyingInstr() : nullptr;
    if (Underlying)
      OS << *Underlying << " | ";
    else
      OS << " nullptr | ";
  }
  OS << "\n";
}
|
||
|
|
||
|
/// Builds the combined VPInstruction for the bundle \p Values, recursing into
/// the operand bundles.
///
/// A bundle that was combined before is returned from BundleToCombined.
/// Otherwise the bundle must pass areVectorizable. For commutative opcodes,
/// operand bundles with the same opcode are built recursively, while the
/// others are recorded in MultiNodeOps behind a placeholder instruction; the
/// outermost commutative call then re-orders them with reorderMultiNodeOps
/// and builds the re-ordered bundles. For loads the address operands are used
/// directly; all other opcodes recurse into every operand bundle.
///
/// Returns the new instruction, or nullptr (via markFailed) if the bundle or
/// anything below it could not be combined.
VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) {
  assert(!Values.empty() && "Need some operands!");

  // If we already visited this instruction bundle, re-use the existing node
  auto Cached = BundleToCombined.find(to_vector<4>(Values));
  if (Cached != BundleToCombined.end()) {
#ifndef NDEBUG
    // Check that the resulting graph is a tree. If we re-use a node, this means
    // its values have multiple users. We only allow this, if all users of each
    // value are the same instruction.
    for (auto *V : Values) {
      auto UserIt = V->user_begin();
      auto *FirstUser = *UserIt++;
      for (; UserIt != V->user_end(); UserIt++)
        assert(*UserIt == FirstUser && "Currently we only support SLP trees.");
    }
#endif
    return Cached->second;
  }

  // Dump inputs
  LLVM_DEBUG({
    dbgs() << "buildGraph: ";
    dumpBundle(Values);
  });

  if (!areVectorizable(Values))
    return markFailed();

  assert(getOpcode(Values) && "Opcodes for all values must match");
  const unsigned ValuesOpcode = getOpcode(Values).getValue();

  SmallVector<VPValue *, 4> CombinedOperands;
  if (!areCommutative(Values)) {
    LLVM_DEBUG(dbgs() << " NonCommuntative\n");
    if (ValuesOpcode == Instruction::Load) {
      // Loads terminate the tree: take their address operands as they are.
      for (VPValue *V : Values)
        CombinedOperands.push_back(cast<VPInstruction>(V)->getOperand(0));
    } else {
      for (auto &Operands : getOperands(Values))
        CombinedOperands.push_back(buildGraph(Operands));
    }
  } else {
    // Only the outermost commutative bundle performs the re-ordering.
    const bool IsMultiNodeRoot = !MultiNodeActive;
    MultiNodeActive = true;
    for (auto &Operands : getOperands(Values)) {
      LLVM_DEBUG({
        dbgs() << " Visiting Commutative";
        dumpBundle(Operands);
      });

      auto OperandsOpcode = getOpcode(Operands);
      if (OperandsOpcode && *OperandsOpcode == ValuesOpcode) {
        LLVM_DEBUG(dbgs() << " Same opcode, continue building\n");
        CombinedOperands.push_back(buildGraph(Operands));
        continue;
      }

      LLVM_DEBUG(dbgs() << " Adding multinode Ops\n");
      // Create dummy VPInstruction, which we will replace later by the
      // re-ordered operand.
      VPInstruction *Placeholder = new VPInstruction(0, {});
      CombinedOperands.push_back(Placeholder);
      MultiNodeOps.emplace_back(Placeholder, Operands);
    }

    if (IsMultiNodeRoot) {
      LLVM_DEBUG(dbgs() << "Reorder \n");
      MultiNodeActive = false;

      auto FinalOrder = reorderMultiNodeOps();

      MultiNodeOps.clear();
      for (auto &Ops : FinalOrder) {
        VPInstruction *NewOp = buildGraph(Ops.second);
        Ops.first->replaceAllUsesWith(NewOp);
        // Swap the placeholder for the node built from the re-ordered bundle.
        for (VPValue *&Slot : CombinedOperands)
          if (Slot == Ops.first)
            Slot = NewOp;
        delete Ops.first;
        Ops.first = NewOp;
      }
      LLVM_DEBUG(dbgs() << "Found final order\n");
    }
  }

  // Loads and stores map to dedicated SLP opcodes; everything else keeps its
  // original opcode.
  unsigned Opcode = ValuesOpcode;
  if (ValuesOpcode == Instruction::Load)
    Opcode = VPInstruction::SLPLoad;
  else if (ValuesOpcode == Instruction::Store)
    Opcode = VPInstruction::SLPStore;

  if (!CompletelySLP)
    return markFailed();

  assert(CombinedOperands.size() > 0 && "Need more some operands");
  auto *Combined = new VPInstruction(Opcode, CombinedOperands);
  Combined->setUnderlyingInstr(
      cast<VPInstruction>(Values[0])->getUnderlyingInstr());

  LLVM_DEBUG(dbgs() << "Create VPInstruction " << *Combined << " "
                    << *cast<VPInstruction>(Values[0]) << "\n");
  addCombined(Values, Combined);
  return Combined;
}
|