940 lines
32 KiB
C++
940 lines
32 KiB
C++
|
//===-- Target.cpp ----------------------------------------------*- C++ -*-===//
|
||
|
//
|
||
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||
|
// See https://llvm.org/LICENSE.txt for license information.
|
||
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||
|
//
|
||
|
//===----------------------------------------------------------------------===//
|
||
|
#include "../Target.h"
|
||
|
|
||
|
#include "../Error.h"
|
||
|
#include "../ParallelSnippetGenerator.h"
|
||
|
#include "../SerialSnippetGenerator.h"
|
||
|
#include "../SnippetGenerator.h"
|
||
|
#include "MCTargetDesc/X86BaseInfo.h"
|
||
|
#include "MCTargetDesc/X86MCTargetDesc.h"
|
||
|
#include "X86.h"
|
||
|
#include "X86Counter.h"
|
||
|
#include "X86RegisterInfo.h"
|
||
|
#include "X86Subtarget.h"
|
||
|
#include "llvm/ADT/Sequence.h"
|
||
|
#include "llvm/MC/MCInstBuilder.h"
|
||
|
#include "llvm/Support/Errc.h"
|
||
|
#include "llvm/Support/Error.h"
|
||
|
#include "llvm/Support/FormatVariadic.h"
|
||
|
|
||
|
#include <memory>
|
||
|
#include <string>
|
||
|
#include <vector>
|
||
|
#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
#include <float.h>
#include <immintrin.h>
#include <intrin.h>
#endif
|
||
|
|
||
|
namespace llvm {
|
||
|
namespace exegesis {
|
||
|
|
||
|
// Command-line category that groups the x86-specific benchmark options.
static cl::OptionCategory
    BenchmarkOptions("llvm-exegesis benchmark x86-options");

// LBR sampling period. The default of 0 leaves LBR disabled; any strictly
// positive value switches counter creation to the LBR counter (latency mode).
//
// Choosing a value:
//  - Smaller values are preferred, but a value that is too small may cause
//    throttling.
//  - Prime numbers are preferred so that the same blocks are not always
//    skipped.
static cl::opt<unsigned> LbrSamplingPeriod(
    "x86-lbr-sample-period",
    cl::desc("The sample period (nbranches/sample), used for LBR sampling"),
    cl::cat(BenchmarkOptions), cl::init(0));
|
||
|
|
||
|
// FIXME: Validate that repetition-mode is loop if LBR is requested.
|
||
|
|
||
|
// Returns a non-null reason if we cannot handle the memory references in this
|
||
|
// instruction.
|
||
|
static const char *isInvalidMemoryInstr(const Instruction &Instr) {
|
||
|
switch (Instr.Description.TSFlags & X86II::FormMask) {
|
||
|
default:
|
||
|
return "Unknown FormMask value";
|
||
|
// These have no memory access.
|
||
|
case X86II::Pseudo:
|
||
|
case X86II::RawFrm:
|
||
|
case X86II::AddCCFrm:
|
||
|
case X86II::PrefixByte:
|
||
|
case X86II::MRMDestReg:
|
||
|
case X86II::MRMSrcReg:
|
||
|
case X86II::MRMSrcReg4VOp3:
|
||
|
case X86II::MRMSrcRegOp4:
|
||
|
case X86II::MRMSrcRegCC:
|
||
|
case X86II::MRMXrCC:
|
||
|
case X86II::MRMr0:
|
||
|
case X86II::MRMXr:
|
||
|
case X86II::MRM0r:
|
||
|
case X86II::MRM1r:
|
||
|
case X86II::MRM2r:
|
||
|
case X86II::MRM3r:
|
||
|
case X86II::MRM4r:
|
||
|
case X86II::MRM5r:
|
||
|
case X86II::MRM6r:
|
||
|
case X86II::MRM7r:
|
||
|
case X86II::MRM0X:
|
||
|
case X86II::MRM1X:
|
||
|
case X86II::MRM2X:
|
||
|
case X86II::MRM3X:
|
||
|
case X86II::MRM4X:
|
||
|
case X86II::MRM5X:
|
||
|
case X86II::MRM6X:
|
||
|
case X86II::MRM7X:
|
||
|
case X86II::MRM_C0:
|
||
|
case X86II::MRM_C1:
|
||
|
case X86II::MRM_C2:
|
||
|
case X86II::MRM_C3:
|
||
|
case X86II::MRM_C4:
|
||
|
case X86II::MRM_C5:
|
||
|
case X86II::MRM_C6:
|
||
|
case X86II::MRM_C7:
|
||
|
case X86II::MRM_C8:
|
||
|
case X86II::MRM_C9:
|
||
|
case X86II::MRM_CA:
|
||
|
case X86II::MRM_CB:
|
||
|
case X86II::MRM_CC:
|
||
|
case X86II::MRM_CD:
|
||
|
case X86II::MRM_CE:
|
||
|
case X86II::MRM_CF:
|
||
|
case X86II::MRM_D0:
|
||
|
case X86II::MRM_D1:
|
||
|
case X86II::MRM_D2:
|
||
|
case X86II::MRM_D3:
|
||
|
case X86II::MRM_D4:
|
||
|
case X86II::MRM_D5:
|
||
|
case X86II::MRM_D6:
|
||
|
case X86II::MRM_D7:
|
||
|
case X86II::MRM_D8:
|
||
|
case X86II::MRM_D9:
|
||
|
case X86II::MRM_DA:
|
||
|
case X86II::MRM_DB:
|
||
|
case X86II::MRM_DC:
|
||
|
case X86II::MRM_DD:
|
||
|
case X86II::MRM_DE:
|
||
|
case X86II::MRM_DF:
|
||
|
case X86II::MRM_E0:
|
||
|
case X86II::MRM_E1:
|
||
|
case X86II::MRM_E2:
|
||
|
case X86II::MRM_E3:
|
||
|
case X86II::MRM_E4:
|
||
|
case X86II::MRM_E5:
|
||
|
case X86II::MRM_E6:
|
||
|
case X86II::MRM_E7:
|
||
|
case X86II::MRM_E8:
|
||
|
case X86II::MRM_E9:
|
||
|
case X86II::MRM_EA:
|
||
|
case X86II::MRM_EB:
|
||
|
case X86II::MRM_EC:
|
||
|
case X86II::MRM_ED:
|
||
|
case X86II::MRM_EE:
|
||
|
case X86II::MRM_EF:
|
||
|
case X86II::MRM_F0:
|
||
|
case X86II::MRM_F1:
|
||
|
case X86II::MRM_F2:
|
||
|
case X86II::MRM_F3:
|
||
|
case X86II::MRM_F4:
|
||
|
case X86II::MRM_F5:
|
||
|
case X86II::MRM_F6:
|
||
|
case X86II::MRM_F7:
|
||
|
case X86II::MRM_F8:
|
||
|
case X86II::MRM_F9:
|
||
|
case X86II::MRM_FA:
|
||
|
case X86II::MRM_FB:
|
||
|
case X86II::MRM_FC:
|
||
|
case X86II::MRM_FD:
|
||
|
case X86II::MRM_FE:
|
||
|
case X86II::MRM_FF:
|
||
|
case X86II::RawFrmImm8:
|
||
|
return nullptr;
|
||
|
case X86II::AddRegFrm:
|
||
|
return (Instr.Description.Opcode == X86::POP16r ||
|
||
|
Instr.Description.Opcode == X86::POP32r ||
|
||
|
Instr.Description.Opcode == X86::PUSH16r ||
|
||
|
Instr.Description.Opcode == X86::PUSH32r)
|
||
|
? "unsupported opcode: unsupported memory access"
|
||
|
: nullptr;
|
||
|
// These access memory and are handled.
|
||
|
case X86II::MRMDestMem:
|
||
|
case X86II::MRMSrcMem:
|
||
|
case X86II::MRMSrcMem4VOp3:
|
||
|
case X86II::MRMSrcMemOp4:
|
||
|
case X86II::MRMSrcMemCC:
|
||
|
case X86II::MRMXmCC:
|
||
|
case X86II::MRMXm:
|
||
|
case X86II::MRM0m:
|
||
|
case X86II::MRM1m:
|
||
|
case X86II::MRM2m:
|
||
|
case X86II::MRM3m:
|
||
|
case X86II::MRM4m:
|
||
|
case X86II::MRM5m:
|
||
|
case X86II::MRM6m:
|
||
|
case X86II::MRM7m:
|
||
|
return nullptr;
|
||
|
// These access memory and are not handled yet.
|
||
|
case X86II::RawFrmImm16:
|
||
|
case X86II::RawFrmMemOffs:
|
||
|
case X86II::RawFrmSrc:
|
||
|
case X86II::RawFrmDst:
|
||
|
case X86II::RawFrmDstSrc:
|
||
|
return "unsupported opcode: non uniform memory access";
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// If the opcode is invalid, returns a pointer to a character literal indicating
|
||
|
// the reason. nullptr indicates a valid opcode.
|
||
|
static const char *isInvalidOpcode(const Instruction &Instr) {
|
||
|
const auto OpcodeName = Instr.Name;
|
||
|
if ((Instr.Description.TSFlags & X86II::FormMask) == X86II::Pseudo)
|
||
|
return "unsupported opcode: pseudo instruction";
|
||
|
if (OpcodeName.startswith("POP") || OpcodeName.startswith("PUSH") ||
|
||
|
OpcodeName.startswith("ADJCALLSTACK") || OpcodeName.startswith("LEAVE"))
|
||
|
return "unsupported opcode: Push/Pop/AdjCallStack/Leave";
|
||
|
if (const auto reason = isInvalidMemoryInstr(Instr))
|
||
|
return reason;
|
||
|
// We do not handle instructions with OPERAND_PCREL.
|
||
|
for (const Operand &Op : Instr.Operands)
|
||
|
if (Op.isExplicit() &&
|
||
|
Op.getExplicitOperandInfo().OperandType == MCOI::OPERAND_PCREL)
|
||
|
return "unsupported opcode: PC relative operand";
|
||
|
// We do not handle second-form X87 instructions. We only handle first-form
|
||
|
// ones (_Fp), see comment in X86InstrFPStack.td.
|
||
|
for (const Operand &Op : Instr.Operands)
|
||
|
if (Op.isReg() && Op.isExplicit() &&
|
||
|
Op.getExplicitOperandInfo().RegClass == X86::RSTRegClassID)
|
||
|
return "unsupported second-form X87 instruction";
|
||
|
return nullptr;
|
||
|
}
|
||
|
|
||
|
static unsigned getX86FPFlags(const Instruction &Instr) {
|
||
|
return Instr.Description.TSFlags & X86II::FPTypeMask;
|
||
|
}
|
||
|
|
||
|
// Helper to fill a memory operand with a value.
|
||
|
static void setMemOp(InstructionTemplate &IT, int OpIdx,
|
||
|
const MCOperand &OpVal) {
|
||
|
const auto Op = IT.getInstr().Operands[OpIdx];
|
||
|
assert(Op.isExplicit() && "invalid memory pattern");
|
||
|
IT.getValueFor(Op) = OpVal;
|
||
|
}
|
||
|
|
||
|
// Common (latency, uops) code for LEA templates. `GetDestReg` takes the
|
||
|
// addressing base and index registers and returns the LEA destination register.
|
||
|
static Expected<std::vector<CodeTemplate>> generateLEATemplatesCommon(
|
||
|
const Instruction &Instr, const BitVector &ForbiddenRegisters,
|
||
|
const LLVMState &State, const SnippetGenerator::Options &Opts,
|
||
|
std::function<void(unsigned, unsigned, BitVector &CandidateDestRegs)>
|
||
|
RestrictDestRegs) {
|
||
|
assert(Instr.Operands.size() == 6 && "invalid LEA");
|
||
|
assert(X86II::getMemoryOperandNo(Instr.Description.TSFlags) == 1 &&
|
||
|
"invalid LEA");
|
||
|
|
||
|
constexpr const int kDestOp = 0;
|
||
|
constexpr const int kBaseOp = 1;
|
||
|
constexpr const int kIndexOp = 3;
|
||
|
auto PossibleDestRegs =
|
||
|
Instr.Operands[kDestOp].getRegisterAliasing().sourceBits();
|
||
|
remove(PossibleDestRegs, ForbiddenRegisters);
|
||
|
auto PossibleBaseRegs =
|
||
|
Instr.Operands[kBaseOp].getRegisterAliasing().sourceBits();
|
||
|
remove(PossibleBaseRegs, ForbiddenRegisters);
|
||
|
auto PossibleIndexRegs =
|
||
|
Instr.Operands[kIndexOp].getRegisterAliasing().sourceBits();
|
||
|
remove(PossibleIndexRegs, ForbiddenRegisters);
|
||
|
|
||
|
const auto &RegInfo = State.getRegInfo();
|
||
|
std::vector<CodeTemplate> Result;
|
||
|
for (const unsigned BaseReg : PossibleBaseRegs.set_bits()) {
|
||
|
for (const unsigned IndexReg : PossibleIndexRegs.set_bits()) {
|
||
|
for (int LogScale = 0; LogScale <= 3; ++LogScale) {
|
||
|
// FIXME: Add an option for controlling how we explore immediates.
|
||
|
for (const int Disp : {0, 42}) {
|
||
|
InstructionTemplate IT(&Instr);
|
||
|
const int64_t Scale = 1ull << LogScale;
|
||
|
setMemOp(IT, 1, MCOperand::createReg(BaseReg));
|
||
|
setMemOp(IT, 2, MCOperand::createImm(Scale));
|
||
|
setMemOp(IT, 3, MCOperand::createReg(IndexReg));
|
||
|
setMemOp(IT, 4, MCOperand::createImm(Disp));
|
||
|
// SegmentReg must be 0 for LEA.
|
||
|
setMemOp(IT, 5, MCOperand::createReg(0));
|
||
|
|
||
|
// Output reg candidates are selected by the caller.
|
||
|
auto PossibleDestRegsNow = PossibleDestRegs;
|
||
|
RestrictDestRegs(BaseReg, IndexReg, PossibleDestRegsNow);
|
||
|
assert(PossibleDestRegsNow.set_bits().begin() !=
|
||
|
PossibleDestRegsNow.set_bits().end() &&
|
||
|
"no remaining registers");
|
||
|
setMemOp(
|
||
|
IT, 0,
|
||
|
MCOperand::createReg(*PossibleDestRegsNow.set_bits().begin()));
|
||
|
|
||
|
CodeTemplate CT;
|
||
|
CT.Instructions.push_back(std::move(IT));
|
||
|
CT.Config = formatv("{3}(%{0}, %{1}, {2})", RegInfo.getName(BaseReg),
|
||
|
RegInfo.getName(IndexReg), Scale, Disp)
|
||
|
.str();
|
||
|
Result.push_back(std::move(CT));
|
||
|
if (Result.size() >= Opts.MaxConfigsPerOpcode)
|
||
|
return std::move(Result);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return std::move(Result);
|
||
|
}
|
||
|
|
||
|
namespace {
|
||
|
class X86SerialSnippetGenerator : public SerialSnippetGenerator {
|
||
|
public:
|
||
|
using SerialSnippetGenerator::SerialSnippetGenerator;
|
||
|
|
||
|
Expected<std::vector<CodeTemplate>>
|
||
|
generateCodeTemplates(InstructionTemplate Variant,
|
||
|
const BitVector &ForbiddenRegisters) const override;
|
||
|
};
|
||
|
} // namespace
|
||
|
|
||
|
// Serial template generation for x86:
//  - invalid opcodes are rejected with the reason from isInvalidOpcode();
//  - LEA64r / LEA64_32r go through the shared LEA generator, with the
//    destination restricted to registers aliasing the base register;
//  - any other instruction with memory operands is rejected;
//  - the remaining instructions are dispatched on their x87 FP type.
Expected<std::vector<CodeTemplate>>
X86SerialSnippetGenerator::generateCodeTemplates(
    InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const {
  const Instruction &Instr = Variant.getInstr();

  const char *const InvalidReason = isInvalidOpcode(Instr);
  if (InvalidReason)
    return make_error<Failure>(InvalidReason);

  // LEA gets special attention.
  const auto Opcode = Instr.Description.getOpcode();
  const bool IsLEA = Opcode == X86::LEA64r || Opcode == X86::LEA64_32r;
  if (IsLEA) {
    // Keep only destination candidates that alias the base register.
    const auto KeepBaseAliases = [this](unsigned BaseReg, unsigned IndexReg,
                                        BitVector &CandidateDestRegs) {
      CandidateDestRegs &= State.getRATC().getRegister(BaseReg).aliasedBits();
    };
    return generateLEATemplatesCommon(Instr, ForbiddenRegisters, State, Opts,
                                      KeepBaseAliases);
  }

  if (Instr.hasMemoryOperands())
    return make_error<Failure>(
        "unsupported memory operand in latency measurements");

  switch (getX86FPFlags(Instr)) {
  case X86II::NotFP:
    // Not an x87 instruction: the generic serial generator handles it.
    return SerialSnippetGenerator::generateCodeTemplates(Variant,
                                                         ForbiddenRegisters);
  case X86II::OneArgFPRW:
  case X86II::TwoArgFP:
    // Instructions such as
    //   - `ST(0) = fsqrt(ST(0))` (OneArgFPRW)
    //   - `ST(0) = ST(0) + ST(i)` (TwoArgFP)
    // are intrinsically serial and leave the state of the stack unchanged.
    return generateSelfAliasingCodeTemplates(Variant);
  case X86II::ZeroArgFP:
  case X86II::OneArgFP:
  case X86II::SpecialFP:
  case X86II::CompareFP:
  case X86II::CondMovFP:
    return make_error<Failure>("Unsupported x87 Instruction");
  default:
    llvm_unreachable("Unknown FP Type!");
  }
}
|
||
|
|
||
|
namespace {
|
||
|
class X86ParallelSnippetGenerator : public ParallelSnippetGenerator {
|
||
|
public:
|
||
|
using ParallelSnippetGenerator::ParallelSnippetGenerator;
|
||
|
|
||
|
Expected<std::vector<CodeTemplate>>
|
||
|
generateCodeTemplates(InstructionTemplate Variant,
|
||
|
const BitVector &ForbiddenRegisters) const override;
|
||
|
};
|
||
|
|
||
|
} // namespace
|
||
|
|
||
|
// Parallel template generation for x86:
//  - invalid opcodes are rejected with the reason from isInvalidOpcode();
//  - LEA64r / LEA64_32r go through the shared LEA generator, with the
//    destination restricted to registers aliasing neither the base nor the
//    index register;
//  - the remaining instructions are dispatched on their x87 FP type.
Expected<std::vector<CodeTemplate>>
X86ParallelSnippetGenerator::generateCodeTemplates(
    InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const {
  const Instruction &Instr = Variant.getInstr();

  const char *const InvalidReason = isInvalidOpcode(Instr);
  if (InvalidReason)
    return make_error<Failure>(InvalidReason);

  // LEA gets special attention.
  const auto Opcode = Instr.Description.getOpcode();
  const bool IsLEA = Opcode == X86::LEA64r || Opcode == X86::LEA64_32r;
  if (IsLEA) {
    // Any destination register that is not used for addressing is fine.
    const auto DropAddressingRegs = [this](unsigned BaseReg, unsigned IndexReg,
                                           BitVector &CandidateDestRegs) {
      remove(CandidateDestRegs,
             State.getRATC().getRegister(BaseReg).aliasedBits());
      remove(CandidateDestRegs,
             State.getRATC().getRegister(IndexReg).aliasedBits());
    };
    return generateLEATemplatesCommon(Instr, ForbiddenRegisters, State, Opts,
                                      DropAddressingRegs);
  }

  switch (getX86FPFlags(Instr)) {
  case X86II::NotFP:
    // Not an x87 instruction: the generic parallel generator handles it.
    return ParallelSnippetGenerator::generateCodeTemplates(Variant,
                                                           ForbiddenRegisters);
  case X86II::OneArgFPRW:
  case X86II::TwoArgFP:
    // Instructions such as
    //   - `ST(0) = fsqrt(ST(0))` (OneArgFPRW)
    //   - `ST(0) = ST(0) + ST(i)` (TwoArgFP)
    // are intrinsically serial and leave the state of the stack unchanged.
    // The same code is generated for latency and uops.
    return generateSelfAliasingCodeTemplates(Variant);
  case X86II::CompareFP:
  case X86II::CondMovFP:
    // Uops can be computed for any FP instruction that neither grows nor
    // shrinks the stack (it does not touch the stack, or pushes as much as it
    // pops).
    return generateUnconstrainedCodeTemplates(
        Variant, "instruction does not grow/shrink the FP stack");
  case X86II::ZeroArgFP:
  case X86II::OneArgFP:
  case X86II::SpecialFP:
    return make_error<Failure>("Unsupported x87 Instruction");
  default:
    llvm_unreachable("Unknown FP Type!");
  }
}
|
||
|
|
||
|
static unsigned getLoadImmediateOpcode(unsigned RegBitWidth) {
|
||
|
switch (RegBitWidth) {
|
||
|
case 8:
|
||
|
return X86::MOV8ri;
|
||
|
case 16:
|
||
|
return X86::MOV16ri;
|
||
|
case 32:
|
||
|
return X86::MOV32ri;
|
||
|
case 64:
|
||
|
return X86::MOV64ri;
|
||
|
}
|
||
|
llvm_unreachable("Invalid Value Width");
|
||
|
}
|
||
|
|
||
|
// Generates instruction to load an immediate value into a register.
|
||
|
static MCInst loadImmediate(unsigned Reg, unsigned RegBitWidth,
|
||
|
const APInt &Value) {
|
||
|
if (Value.getBitWidth() > RegBitWidth)
|
||
|
llvm_unreachable("Value must fit in the Register");
|
||
|
return MCInstBuilder(getLoadImmediateOpcode(RegBitWidth))
|
||
|
.addReg(Reg)
|
||
|
.addImm(Value.getZExtValue());
|
||
|
}
|
||
|
|
||
|
// Allocates scratch memory on the stack.
|
||
|
static MCInst allocateStackSpace(unsigned Bytes) {
|
||
|
return MCInstBuilder(X86::SUB64ri8)
|
||
|
.addReg(X86::RSP)
|
||
|
.addReg(X86::RSP)
|
||
|
.addImm(Bytes);
|
||
|
}
|
||
|
|
||
|
// Fills scratch memory at offset `OffsetBytes` with value `Imm`.
|
||
|
static MCInst fillStackSpace(unsigned MovOpcode, unsigned OffsetBytes,
|
||
|
uint64_t Imm) {
|
||
|
return MCInstBuilder(MovOpcode)
|
||
|
// Address = ESP
|
||
|
.addReg(X86::RSP) // BaseReg
|
||
|
.addImm(1) // ScaleAmt
|
||
|
.addReg(0) // IndexReg
|
||
|
.addImm(OffsetBytes) // Disp
|
||
|
.addReg(0) // Segment
|
||
|
// Immediate.
|
||
|
.addImm(Imm);
|
||
|
}
|
||
|
|
||
|
// Loads scratch memory into register `Reg` using opcode `RMOpcode`.
|
||
|
static MCInst loadToReg(unsigned Reg, unsigned RMOpcode) {
|
||
|
return MCInstBuilder(RMOpcode)
|
||
|
.addReg(Reg)
|
||
|
// Address = ESP
|
||
|
.addReg(X86::RSP) // BaseReg
|
||
|
.addImm(1) // ScaleAmt
|
||
|
.addReg(0) // IndexReg
|
||
|
.addImm(0) // Disp
|
||
|
.addReg(0); // Segment
|
||
|
}
|
||
|
|
||
|
// Releases scratch memory.
|
||
|
static MCInst releaseStackSpace(unsigned Bytes) {
|
||
|
return MCInstBuilder(X86::ADD64ri8)
|
||
|
.addReg(X86::RSP)
|
||
|
.addReg(X86::RSP)
|
||
|
.addImm(Bytes);
|
||
|
}
|
||
|
|
||
|
// Reserves some space on the stack, fills it with the content of the provided
|
||
|
// constant and provide methods to load the stack value into a register.
|
||
|
namespace {
|
||
|
struct ConstantInliner {
|
||
|
explicit ConstantInliner(const APInt &Constant) : Constant_(Constant) {}
|
||
|
|
||
|
std::vector<MCInst> loadAndFinalize(unsigned Reg, unsigned RegBitWidth,
|
||
|
unsigned Opcode);
|
||
|
|
||
|
std::vector<MCInst> loadX87STAndFinalize(unsigned Reg);
|
||
|
|
||
|
std::vector<MCInst> loadX87FPAndFinalize(unsigned Reg);
|
||
|
|
||
|
std::vector<MCInst> popFlagAndFinalize();
|
||
|
|
||
|
std::vector<MCInst> loadImplicitRegAndFinalize(unsigned Opcode,
|
||
|
unsigned Value);
|
||
|
|
||
|
private:
|
||
|
ConstantInliner &add(const MCInst &Inst) {
|
||
|
Instructions.push_back(Inst);
|
||
|
return *this;
|
||
|
}
|
||
|
|
||
|
void initStack(unsigned Bytes);
|
||
|
|
||
|
static constexpr const unsigned kF80Bytes = 10; // 80 bits.
|
||
|
|
||
|
APInt Constant_;
|
||
|
std::vector<MCInst> Instructions;
|
||
|
};
|
||
|
} // namespace
|
||
|
|
||
|
std::vector<MCInst> ConstantInliner::loadAndFinalize(unsigned Reg,
|
||
|
unsigned RegBitWidth,
|
||
|
unsigned Opcode) {
|
||
|
assert((RegBitWidth & 7) == 0 && "RegBitWidth must be a multiple of 8 bits");
|
||
|
initStack(RegBitWidth / 8);
|
||
|
add(loadToReg(Reg, Opcode));
|
||
|
add(releaseStackSpace(RegBitWidth / 8));
|
||
|
return std::move(Instructions);
|
||
|
}
|
||
|
|
||
|
std::vector<MCInst> ConstantInliner::loadX87STAndFinalize(unsigned Reg) {
|
||
|
initStack(kF80Bytes);
|
||
|
add(MCInstBuilder(X86::LD_F80m)
|
||
|
// Address = ESP
|
||
|
.addReg(X86::RSP) // BaseReg
|
||
|
.addImm(1) // ScaleAmt
|
||
|
.addReg(0) // IndexReg
|
||
|
.addImm(0) // Disp
|
||
|
.addReg(0)); // Segment
|
||
|
if (Reg != X86::ST0)
|
||
|
add(MCInstBuilder(X86::ST_Frr).addReg(Reg));
|
||
|
add(releaseStackSpace(kF80Bytes));
|
||
|
return std::move(Instructions);
|
||
|
}
|
||
|
|
||
|
std::vector<MCInst> ConstantInliner::loadX87FPAndFinalize(unsigned Reg) {
|
||
|
initStack(kF80Bytes);
|
||
|
add(MCInstBuilder(X86::LD_Fp80m)
|
||
|
.addReg(Reg)
|
||
|
// Address = ESP
|
||
|
.addReg(X86::RSP) // BaseReg
|
||
|
.addImm(1) // ScaleAmt
|
||
|
.addReg(0) // IndexReg
|
||
|
.addImm(0) // Disp
|
||
|
.addReg(0)); // Segment
|
||
|
add(releaseStackSpace(kF80Bytes));
|
||
|
return std::move(Instructions);
|
||
|
}
|
||
|
|
||
|
std::vector<MCInst> ConstantInliner::popFlagAndFinalize() {
|
||
|
initStack(8);
|
||
|
add(MCInstBuilder(X86::POPF64));
|
||
|
return std::move(Instructions);
|
||
|
}
|
||
|
|
||
|
std::vector<MCInst>
|
||
|
ConstantInliner::loadImplicitRegAndFinalize(unsigned Opcode, unsigned Value) {
|
||
|
add(allocateStackSpace(4));
|
||
|
add(fillStackSpace(X86::MOV32mi, 0, Value)); // Mask all FP exceptions
|
||
|
add(MCInstBuilder(Opcode)
|
||
|
// Address = ESP
|
||
|
.addReg(X86::RSP) // BaseReg
|
||
|
.addImm(1) // ScaleAmt
|
||
|
.addReg(0) // IndexReg
|
||
|
.addImm(0) // Disp
|
||
|
.addReg(0)); // Segment
|
||
|
add(releaseStackSpace(4));
|
||
|
return std::move(Instructions);
|
||
|
}
|
||
|
|
||
|
void ConstantInliner::initStack(unsigned Bytes) {
|
||
|
assert(Constant_.getBitWidth() <= Bytes * 8 &&
|
||
|
"Value does not have the correct size");
|
||
|
const APInt WideConstant = Constant_.getBitWidth() < Bytes * 8
|
||
|
? Constant_.sext(Bytes * 8)
|
||
|
: Constant_;
|
||
|
add(allocateStackSpace(Bytes));
|
||
|
size_t ByteOffset = 0;
|
||
|
for (; Bytes - ByteOffset >= 4; ByteOffset += 4)
|
||
|
add(fillStackSpace(
|
||
|
X86::MOV32mi, ByteOffset,
|
||
|
WideConstant.extractBits(32, ByteOffset * 8).getZExtValue()));
|
||
|
if (Bytes - ByteOffset >= 2) {
|
||
|
add(fillStackSpace(
|
||
|
X86::MOV16mi, ByteOffset,
|
||
|
WideConstant.extractBits(16, ByteOffset * 8).getZExtValue()));
|
||
|
ByteOffset += 2;
|
||
|
}
|
||
|
if (Bytes - ByteOffset >= 1)
|
||
|
add(fillStackSpace(
|
||
|
X86::MOV8mi, ByteOffset,
|
||
|
WideConstant.extractBits(8, ByteOffset * 8).getZExtValue()));
|
||
|
}
|
||
|
|
||
|
#include "X86GenExegesis.inc"
|
||
|
|
||
|
namespace {
|
||
|
|
||
|
class X86SavedState : public ExegesisTarget::SavedState {
|
||
|
public:
|
||
|
X86SavedState() {
|
||
|
#ifdef __x86_64__
|
||
|
# if defined(_MSC_VER)
|
||
|
_fxsave64(FPState);
|
||
|
Eflags = __readeflags();
|
||
|
# elif defined(__GNUC__)
|
||
|
__builtin_ia32_fxsave64(FPState);
|
||
|
Eflags = __builtin_ia32_readeflags_u64();
|
||
|
# endif
|
||
|
#else
|
||
|
llvm_unreachable("X86 exegesis running on non-X86 target");
|
||
|
#endif
|
||
|
}
|
||
|
|
||
|
~X86SavedState() {
|
||
|
// Restoring the X87 state does not flush pending exceptions, make sure
|
||
|
// these exceptions are flushed now.
|
||
|
#ifdef __x86_64__
|
||
|
# if defined(_MSC_VER)
|
||
|
_clearfp();
|
||
|
_fxrstor64(FPState);
|
||
|
__writeeflags(Eflags);
|
||
|
# elif defined(__GNUC__)
|
||
|
asm volatile("fwait");
|
||
|
__builtin_ia32_fxrstor64(FPState);
|
||
|
__builtin_ia32_writeeflags_u64(Eflags);
|
||
|
# endif
|
||
|
#else
|
||
|
llvm_unreachable("X86 exegesis running on non-X86 target");
|
||
|
#endif
|
||
|
}
|
||
|
|
||
|
private:
|
||
|
#ifdef __x86_64__
|
||
|
alignas(16) char FPState[512];
|
||
|
uint64_t Eflags;
|
||
|
#endif
|
||
|
};
|
||
|
|
||
|
class ExegesisX86Target : public ExegesisTarget {
|
||
|
public:
|
||
|
ExegesisX86Target() : ExegesisTarget(X86CpuPfmCounters) {}
|
||
|
|
||
|
Expected<std::unique_ptr<pfm::Counter>>
|
||
|
createCounter(StringRef CounterName, const LLVMState &State) const override {
|
||
|
// If LbrSamplingPeriod was provided, then ignore the
|
||
|
// CounterName because we only have one for LBR.
|
||
|
if (LbrSamplingPeriod > 0) {
|
||
|
// Can't use LBR without HAVE_LIBPFM, LIBPFM_HAS_FIELD_CYCLES, or without
|
||
|
// __linux__ (for now)
|
||
|
#if defined(HAVE_LIBPFM) && defined(LIBPFM_HAS_FIELD_CYCLES) && \
|
||
|
defined(__linux__)
|
||
|
return std::make_unique<X86LbrCounter>(
|
||
|
X86LbrPerfEvent(LbrSamplingPeriod));
|
||
|
#else
|
||
|
return llvm::make_error<llvm::StringError>(
|
||
|
"LBR counter requested without HAVE_LIBPFM, LIBPFM_HAS_FIELD_CYCLES, "
|
||
|
"or running on Linux.",
|
||
|
llvm::errc::invalid_argument);
|
||
|
#endif
|
||
|
}
|
||
|
return ExegesisTarget::createCounter(CounterName, State);
|
||
|
}
|
||
|
|
||
|
private:
|
||
|
void addTargetSpecificPasses(PassManagerBase &PM) const override;
|
||
|
|
||
|
unsigned getScratchMemoryRegister(const Triple &TT) const override;
|
||
|
|
||
|
unsigned getLoopCounterRegister(const Triple &) const override;
|
||
|
|
||
|
unsigned getMaxMemoryAccessSize() const override { return 64; }
|
||
|
|
||
|
Error randomizeTargetMCOperand(const Instruction &Instr, const Variable &Var,
|
||
|
MCOperand &AssignedValue,
|
||
|
const BitVector &ForbiddenRegs) const override;
|
||
|
|
||
|
void fillMemoryOperands(InstructionTemplate &IT, unsigned Reg,
|
||
|
unsigned Offset) const override;
|
||
|
|
||
|
void decrementLoopCounterAndJump(MachineBasicBlock &MBB,
|
||
|
MachineBasicBlock &TargetMBB,
|
||
|
const MCInstrInfo &MII) const override;
|
||
|
|
||
|
std::vector<MCInst> setRegTo(const MCSubtargetInfo &STI, unsigned Reg,
|
||
|
const APInt &Value) const override;
|
||
|
|
||
|
ArrayRef<unsigned> getUnavailableRegisters() const override {
|
||
|
return makeArrayRef(kUnavailableRegisters,
|
||
|
sizeof(kUnavailableRegisters) /
|
||
|
sizeof(kUnavailableRegisters[0]));
|
||
|
}
|
||
|
|
||
|
bool allowAsBackToBack(const Instruction &Instr) const override {
|
||
|
const unsigned Opcode = Instr.Description.Opcode;
|
||
|
return !isInvalidOpcode(Instr) && Opcode != X86::LEA64r &&
|
||
|
Opcode != X86::LEA64_32r && Opcode != X86::LEA16r;
|
||
|
}
|
||
|
|
||
|
std::vector<InstructionTemplate>
|
||
|
generateInstructionVariants(const Instruction &Instr,
|
||
|
unsigned MaxConfigsPerOpcode) const override;
|
||
|
|
||
|
std::unique_ptr<SnippetGenerator> createSerialSnippetGenerator(
|
||
|
const LLVMState &State,
|
||
|
const SnippetGenerator::Options &Opts) const override {
|
||
|
return std::make_unique<X86SerialSnippetGenerator>(State, Opts);
|
||
|
}
|
||
|
|
||
|
std::unique_ptr<SnippetGenerator> createParallelSnippetGenerator(
|
||
|
const LLVMState &State,
|
||
|
const SnippetGenerator::Options &Opts) const override {
|
||
|
return std::make_unique<X86ParallelSnippetGenerator>(State, Opts);
|
||
|
}
|
||
|
|
||
|
bool matchesArch(Triple::ArchType Arch) const override {
|
||
|
return Arch == Triple::x86_64 || Arch == Triple::x86;
|
||
|
}
|
||
|
|
||
|
Error checkFeatureSupport() const override {
|
||
|
// LBR is the only feature we conditionally support now.
|
||
|
// So if LBR is not requested, then we should be able to run the benchmarks.
|
||
|
if (LbrSamplingPeriod == 0)
|
||
|
return Error::success();
|
||
|
|
||
|
#if defined(__linux__) && defined(HAVE_LIBPFM) && \
|
||
|
defined(LIBPFM_HAS_FIELD_CYCLES)
|
||
|
// If the kernel supports it, the hardware still may not have it.
|
||
|
return X86LbrCounter::checkLbrSupport();
|
||
|
#else
|
||
|
return llvm::make_error<llvm::StringError>(
|
||
|
"LBR not supported on this kernel and/or platform",
|
||
|
llvm::errc::not_supported);
|
||
|
#endif
|
||
|
}
|
||
|
|
||
|
std::unique_ptr<SavedState> withSavedState() const override {
|
||
|
return std::make_unique<X86SavedState>();
|
||
|
}
|
||
|
|
||
|
static const unsigned kUnavailableRegisters[4];
|
||
|
};
|
||
|
|
||
|
// We disable a few registers that cannot be encoded on instructions with a REX
|
||
|
// prefix.
|
||
|
const unsigned ExegesisX86Target::kUnavailableRegisters[4] = {X86::AH, X86::BH,
|
||
|
X86::CH, X86::DH};
|
||
|
|
||
|
// We're using one of R8-R15 because these registers are never hardcoded in
|
||
|
// instructions (e.g. MOVS writes to EDI, ESI, EDX), so they have less
|
||
|
// conflicts.
|
||
|
constexpr const unsigned kLoopCounterReg = X86::R8;
|
||
|
|
||
|
} // namespace
|
||
|
|
||
|
// Registers the x86-specific passes: the floating-point stackifier, which
// lowers FP pseudo-instructions (e.g. ABS_Fp32 -> ABS_F).
void ExegesisX86Target::addTargetSpecificPasses(PassManagerBase &PM) const {
  auto *const Stackifier = createX86FloatingPointStackifierPass();
  PM.add(Stackifier);
}
|
||
|
|
||
|
unsigned ExegesisX86Target::getScratchMemoryRegister(const Triple &TT) const {
|
||
|
if (!TT.isArch64Bit()) {
|
||
|
// FIXME: This would require popping from the stack, so we would have to
|
||
|
// add some additional setup code.
|
||
|
return 0;
|
||
|
}
|
||
|
return TT.isOSWindows() ? X86::RCX : X86::RDI;
|
||
|
}
|
||
|
|
||
|
unsigned ExegesisX86Target::getLoopCounterRegister(const Triple &TT) const {
|
||
|
if (!TT.isArch64Bit()) {
|
||
|
return 0;
|
||
|
}
|
||
|
return kLoopCounterReg;
|
||
|
}
|
||
|
|
||
|
// Assigns a random value to `AssignedValue` for the target-specific operand
// type of `Var`'s primary operand. Only OPERAND_ROUNDING_CONTROL is handled:
// it receives an immediate produced by
// randomIndex(X86::STATIC_ROUNDING::TO_ZERO). Any other operand type yields a
// Failure whose message carries the numeric operand type.
Error ExegesisX86Target::randomizeTargetMCOperand(
    const Instruction &Instr, const Variable &Var, MCOperand &AssignedValue,
    const BitVector &ForbiddenRegs) const {
  const Operand &Op = Instr.getPrimaryOperand(Var);
  switch (Op.getExplicitOperandInfo().OperandType) {
  case X86::OperandType::OPERAND_ROUNDING_CONTROL:
    AssignedValue =
        MCOperand::createImm(randomIndex(X86::STATIC_ROUNDING::TO_ZERO));
    return Error::success();
  default:
    break;
  }
  // Widen the operand type before handing it to Twine. Twine has dedicated
  // constructors for the character types that format the value as a single
  // character, so a narrow integer field (presumably uint8_t here -- confirm
  // against MCOperandInfo) would print as a raw byte instead of a number.
  const unsigned OperandTypeValue =
      static_cast<unsigned>(Op.getExplicitOperandInfo().OperandType);
  return make_error<Failure>(
      Twine("unimplemented operand type ").concat(Twine(OperandTypeValue)));
}
|
||
|
|
||
|
// Fills the five memory operands of `IT` so that the instruction addresses
// `[Reg + Offset]`: base = Reg, scale = 1, no index register,
// displacement = Offset, no segment register.
void ExegesisX86Target::fillMemoryOperands(InstructionTemplate &IT,
                                           unsigned Reg,
                                           unsigned Offset) const {
  const Instruction &Instr = IT.getInstr();
  assert(!isInvalidMemoryInstr(Instr) &&
         "fillMemoryOperands requires a valid memory instruction");

  const int RawMemOpIdx = X86II::getMemoryOperandNo(Instr.Description.TSFlags);
  assert(RawMemOpIdx >= 0 && "invalid memory operand index");
  // getMemoryOperandNo() ignores tied operands, so they are added back here.
  const int FirstMemOp = RawMemOpIdx + X86II::getOperandBias(Instr.Description);

  // Offsets of the address components relative to the first memory operand.
  constexpr int kBaseRegOff = 0;
  constexpr int kScaleAmtOff = 1;
  constexpr int kIndexRegOff = 2;
  constexpr int kDispOff = 3;
  constexpr int kSegmentOff = 4;

  setMemOp(IT, FirstMemOp + kBaseRegOff, MCOperand::createReg(Reg));
  setMemOp(IT, FirstMemOp + kScaleAmtOff, MCOperand::createImm(1));
  setMemOp(IT, FirstMemOp + kIndexRegOff, MCOperand::createReg(0));
  setMemOp(IT, FirstMemOp + kDispOff, MCOperand::createImm(Offset));
  setMemOp(IT, FirstMemOp + kSegmentOff, MCOperand::createReg(0));
}
|
||
|
|
||
|
// Appends the loop back-edge to `MBB`: an ADD64ri8 that adds -1 to the loop
// counter register, followed by a JCC_1 to `TargetMBB` on COND_NE.
void ExegesisX86Target::decrementLoopCounterAndJump(
    MachineBasicBlock &MBB, MachineBasicBlock &TargetMBB,
    const MCInstrInfo &MII) const {
  // Counter update: kLoopCounterReg = kLoopCounterReg + (-1).
  const auto &AddDesc = MII.get(X86::ADD64ri8);
  BuildMI(&MBB, DebugLoc(), AddDesc)
      .addDef(kLoopCounterReg)
      .addUse(kLoopCounterReg)
      .addImm(-1);

  // Conditional jump back to the target block.
  const auto &JumpDesc = MII.get(X86::JCC_1);
  BuildMI(&MBB, DebugLoc(), JumpDesc)
      .addMBB(&TargetMBB)
      .addImm(X86::COND_NE);
}
|
||
|
|
||
|
std::vector<MCInst> ExegesisX86Target::setRegTo(const MCSubtargetInfo &STI,
|
||
|
unsigned Reg,
|
||
|
const APInt &Value) const {
|
||
|
if (X86::GR8RegClass.contains(Reg))
|
||
|
return {loadImmediate(Reg, 8, Value)};
|
||
|
if (X86::GR16RegClass.contains(Reg))
|
||
|
return {loadImmediate(Reg, 16, Value)};
|
||
|
if (X86::GR32RegClass.contains(Reg))
|
||
|
return {loadImmediate(Reg, 32, Value)};
|
||
|
if (X86::GR64RegClass.contains(Reg))
|
||
|
return {loadImmediate(Reg, 64, Value)};
|
||
|
ConstantInliner CI(Value);
|
||
|
if (X86::VR64RegClass.contains(Reg))
|
||
|
return CI.loadAndFinalize(Reg, 64, X86::MMX_MOVQ64rm);
|
||
|
if (X86::VR128XRegClass.contains(Reg)) {
|
||
|
if (STI.getFeatureBits()[X86::FeatureAVX512])
|
||
|
return CI.loadAndFinalize(Reg, 128, X86::VMOVDQU32Z128rm);
|
||
|
if (STI.getFeatureBits()[X86::FeatureAVX])
|
||
|
return CI.loadAndFinalize(Reg, 128, X86::VMOVDQUrm);
|
||
|
return CI.loadAndFinalize(Reg, 128, X86::MOVDQUrm);
|
||
|
}
|
||
|
if (X86::VR256XRegClass.contains(Reg)) {
|
||
|
if (STI.getFeatureBits()[X86::FeatureAVX512])
|
||
|
return CI.loadAndFinalize(Reg, 256, X86::VMOVDQU32Z256rm);
|
||
|
if (STI.getFeatureBits()[X86::FeatureAVX])
|
||
|
return CI.loadAndFinalize(Reg, 256, X86::VMOVDQUYrm);
|
||
|
}
|
||
|
if (X86::VR512RegClass.contains(Reg))
|
||
|
if (STI.getFeatureBits()[X86::FeatureAVX512])
|
||
|
return CI.loadAndFinalize(Reg, 512, X86::VMOVDQU32Zrm);
|
||
|
if (X86::RSTRegClass.contains(Reg)) {
|
||
|
return CI.loadX87STAndFinalize(Reg);
|
||
|
}
|
||
|
if (X86::RFP32RegClass.contains(Reg) || X86::RFP64RegClass.contains(Reg) ||
|
||
|
X86::RFP80RegClass.contains(Reg)) {
|
||
|
return CI.loadX87FPAndFinalize(Reg);
|
||
|
}
|
||
|
if (Reg == X86::EFLAGS)
|
||
|
return CI.popFlagAndFinalize();
|
||
|
if (Reg == X86::MXCSR)
|
||
|
return CI.loadImplicitRegAndFinalize(
|
||
|
STI.getFeatureBits()[X86::FeatureAVX] ? X86::VLDMXCSR : X86::LDMXCSR,
|
||
|
0x1f80);
|
||
|
if (Reg == X86::FPCW)
|
||
|
return CI.loadImplicitRegAndFinalize(X86::FLDCW16m, 0x37f);
|
||
|
return {}; // Not yet implemented.
|
||
|
}
|
||
|
|
||
|
// Instruction can have some variable operands, and we may want to see how
|
||
|
// different operands affect performance. So for each operand position,
|
||
|
// precompute all the possible choices we might care about,
|
||
|
// and greedily generate all the possible combinations of choices.
|
||
|
std::vector<InstructionTemplate> ExegesisX86Target::generateInstructionVariants(
|
||
|
const Instruction &Instr, unsigned MaxConfigsPerOpcode) const {
|
||
|
bool Exploration = false;
|
||
|
SmallVector<SmallVector<MCOperand, 1>, 4> VariableChoices;
|
||
|
VariableChoices.resize(Instr.Variables.size());
|
||
|
for (auto I : llvm::zip(Instr.Variables, VariableChoices)) {
|
||
|
const Variable &Var = std::get<0>(I);
|
||
|
SmallVectorImpl<MCOperand> &Choices = std::get<1>(I);
|
||
|
|
||
|
switch (Instr.getPrimaryOperand(Var).getExplicitOperandInfo().OperandType) {
|
||
|
default:
|
||
|
// We don't wish to explicitly explore this variable.
|
||
|
Choices.emplace_back(); // But add invalid MCOperand to simplify logic.
|
||
|
continue;
|
||
|
case X86::OperandType::OPERAND_COND_CODE: {
|
||
|
Exploration = true;
|
||
|
auto CondCodes = seq((int)X86::CondCode::COND_O,
|
||
|
1 + (int)X86::CondCode::LAST_VALID_COND);
|
||
|
Choices.reserve(std::distance(CondCodes.begin(), CondCodes.end()));
|
||
|
for (int CondCode : CondCodes)
|
||
|
Choices.emplace_back(MCOperand::createImm(CondCode));
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// If we don't wish to explore any variables, defer to the baseline method.
|
||
|
if (!Exploration)
|
||
|
return ExegesisTarget::generateInstructionVariants(Instr,
|
||
|
MaxConfigsPerOpcode);
|
||
|
|
||
|
std::vector<InstructionTemplate> Variants;
|
||
|
size_t NumVariants;
|
||
|
CombinationGenerator<MCOperand, decltype(VariableChoices)::value_type, 4> G(
|
||
|
VariableChoices);
|
||
|
|
||
|
// How many operand combinations can we produce, within the limit?
|
||
|
NumVariants = std::min(G.numCombinations(), (size_t)MaxConfigsPerOpcode);
|
||
|
// And actually produce all the wanted operand combinations.
|
||
|
Variants.reserve(NumVariants);
|
||
|
G.generate([&](ArrayRef<MCOperand> State) -> bool {
|
||
|
Variants.emplace_back(&Instr);
|
||
|
Variants.back().setVariableValues(State);
|
||
|
// Did we run out of space for variants?
|
||
|
return Variants.size() >= NumVariants;
|
||
|
});
|
||
|
|
||
|
assert(Variants.size() == NumVariants &&
|
||
|
Variants.size() <= MaxConfigsPerOpcode &&
|
||
|
"Should not produce too many variants");
|
||
|
return Variants;
|
||
|
}
|
||
|
|
||
|
// Returns the single ExegesisX86Target instance, a function-local static that
// is created on first use.
static ExegesisTarget *getTheExegesisX86Target() {
  static ExegesisX86Target TheTarget;
  ExegesisTarget *const Result = &TheTarget;
  return Result;
}
|
||
|
|
||
|
void InitializeX86ExegesisTarget() {
|
||
|
ExegesisTarget::registerTarget(getTheExegesisX86Target());
|
||
|
}
|
||
|
|
||
|
} // namespace exegesis
|
||
|
} // namespace llvm
|