//===-- ParallelSnippetGenerator.cpp ----------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "ParallelSnippetGenerator.h"

#include "BenchmarkRunner.h"
#include "MCInstrDescView.h"
#include "Target.h"

// FIXME: Load constants into registers (e.g. with fld1) to not break
// instructions like x87.

// Ideally we would like the only limitation on executing instructions to be the
// availability of the CPU resources (e.g. execution ports) needed to execute
// them, instead of the availability of their data dependencies.

// To achieve that, one approach is to generate instructions that do not have
// data dependencies between them.
//
// For some instructions, this is trivial:
//    mov rax, qword ptr [rsi]
//    mov rax, qword ptr [rsi]
//    mov rax, qword ptr [rsi]
//    mov rax, qword ptr [rsi]
// For the above snippet, Haswell just renames rax four times and executes the
// four instructions two at a time on P23 and P0126.
//
// For some instructions, we just need to make sure that the source is
// different from the destination. For example, IDIV8r reads from a GPR and
// writes to AX. We just need to ensure that the Var is assigned a
// register which is different from AX:
//    idiv bx
//    idiv bx
//    idiv bx
//    idiv bx
// The above snippet will be able to fully saturate the ports, while the same
// with ax would issue one uop every `latency(IDIV8r)` cycles.
//
// Some instructions make this harder because they both read from and write to
// the same register:
//    inc rax
//    inc rax
//    inc rax
//    inc rax
// This has a data dependency from each instruction to the next, limiting the
// number of instructions that can be issued in parallel.
// It turns out that this is not a big issue on recent Intel CPUs because they
// have heuristics to balance port pressure. In the snippet above, subsequent
// instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs
// might end up executing them all on P0 (just because they can), or try
// avoiding P5 because it's usually under high pressure from vector
// instructions.
// This issue is even more important for high-latency instructions because
// they increase the idle time of the CPU, e.g.:
//    imul rax, rbx
//    imul rax, rbx
//    imul rax, rbx
//    imul rax, rbx
//
// To avoid that, we do the renaming statically by generating as many
// independent exclusive assignments as possible (until all possible registers
// are exhausted), e.g.:
//    imul rax, rbx
//    imul rcx, rbx
//    imul rdx, rbx
//    imul r8,  rbx
//
// Some instructions even make the above static renaming impossible because
// they implicitly read and write from the same operand, e.g. ADC16rr reads
// from and writes to EFLAGS.
// In that case we just use a greedy register assignment and hope for the
// best.

namespace llvm {
namespace exegesis {

static SmallVector<const Variable *, 8>
getVariablesWithTiedOperands(const Instruction &Instr) {
  SmallVector<const Variable *, 8> Result;
  for (const auto &Var : Instr.Variables)
    if (Var.hasTiedOperands())
      Result.push_back(&Var);
  return Result;
}

ParallelSnippetGenerator::~ParallelSnippetGenerator() = default;

void ParallelSnippetGenerator::instantiateMemoryOperands(
    const unsigned ScratchSpacePointerInReg,
    std::vector<InstructionTemplate> &Instructions) const {
  if (ScratchSpacePointerInReg == 0)
    return; // no memory operands.
  const auto &ET = State.getExegesisTarget();
  const unsigned MemStep = ET.getMaxMemoryAccessSize();
  const size_t OriginalInstructionsSize = Instructions.size();
  size_t I = 0;
  for (InstructionTemplate &IT : Instructions) {
    ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
    ++I;
  }

  while (Instructions.size() < kMinNumDifferentAddresses) {
    InstructionTemplate IT = Instructions[I % OriginalInstructionsSize];
    ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
    ++I;
    Instructions.push_back(std::move(IT));
  }
  assert(I * MemStep < BenchmarkRunner::ScratchSpace::kSize &&
         "not enough scratch space");
}
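
// As an illustration (an assumed x86-64 example, not necessarily actual
// generator output): with a 64-byte MemStep and the scratch pointer assumed
// to live in R8, four copies of a load would be rewritten above so that each
// one touches a distinct chunk of the scratch space:
//    mov rax, qword ptr [r8]
//    mov rax, qword ptr [r8 + 64]
//    mov rax, qword ptr [r8 + 128]
//    mov rax, qword ptr [r8 + 192]
// Giving each copy its own address keeps the memory accesses independent, in
// the same spirit as the register renaming described in the file header.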

static std::vector<InstructionTemplate> generateSnippetUsingStaticRenaming(
    const LLVMState &State, const InstructionTemplate &IT,
    const ArrayRef<const Variable *> TiedVariables,
    const BitVector &ForbiddenRegisters) {
  std::vector<InstructionTemplate> Instructions;
  // Assign registers to variables in a round-robin manner. This is simple but
  // ensures that the most register-constrained variable does not get starved.
  std::vector<BitVector> PossibleRegsForVar;
  for (const Variable *Var : TiedVariables) {
    assert(Var);
    const Operand &Op = IT.getInstr().getPrimaryOperand(*Var);
    assert(Op.isReg());
    BitVector PossibleRegs = Op.getRegisterAliasing().sourceBits();
    remove(PossibleRegs, ForbiddenRegisters);
    PossibleRegsForVar.push_back(std::move(PossibleRegs));
  }
  SmallVector<int, 2> Iterators(TiedVariables.size(), 0);
  while (true) {
    InstructionTemplate TmpIT = IT;
    // Find a possible register for each variable in turn, marking the
    // register as taken.
    for (size_t VarId = 0; VarId < TiedVariables.size(); ++VarId) {
      const int NextPossibleReg =
          PossibleRegsForVar[VarId].find_next(Iterators[VarId]);
      if (NextPossibleReg <= 0) {
        return Instructions;
      }
      TmpIT.getValueFor(*TiedVariables[VarId]) =
          MCOperand::createReg(NextPossibleReg);
      // Bump iterator.
      Iterators[VarId] = NextPossibleReg;
      // Prevent other variables from using the register.
      for (BitVector &OtherPossibleRegs : PossibleRegsForVar) {
        OtherPossibleRegs.reset(NextPossibleReg);
      }
    }
    Instructions.push_back(std::move(TmpIT));
  }
}
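
// Purely illustrative (not verbatim generator output): for an instruction
// with one tied variable, such as the imul case from the file header, the
// helper above yields one instruction per still-available register for that
// variable, e.g.:
//    imul rax, rbx
//    imul rcx, rbx
//    imul rdx, rbx
//    ...
// Generation stops as soon as the most constrained tied variable has no
// candidate register left (after removing ForbiddenRegisters and registers
// already handed out), so the snippet length depends on the register class.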

Expected<std::vector<CodeTemplate>>
ParallelSnippetGenerator::generateCodeTemplates(
    InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const {
  const Instruction &Instr = Variant.getInstr();
  CodeTemplate CT;
  CT.ScratchSpacePointerInReg =
      Instr.hasMemoryOperands()
          ? State.getExegesisTarget().getScratchMemoryRegister(
                State.getTargetMachine().getTargetTriple())
          : 0;
  const AliasingConfigurations SelfAliasing(Instr, Instr);
  if (SelfAliasing.empty()) {
    CT.Info = "instruction is parallel, repeating a random one.";
    CT.Instructions.push_back(std::move(Variant));
    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
    return getSingleton(std::move(CT));
  }
  if (SelfAliasing.hasImplicitAliasing()) {
    CT.Info = "instruction is serial, repeating a random one.";
    CT.Instructions.push_back(std::move(Variant));
    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
    return getSingleton(std::move(CT));
  }
  const auto TiedVariables = getVariablesWithTiedOperands(Instr);
  if (!TiedVariables.empty()) {
    CT.Info = "instruction has tied variables, using static renaming.";
    CT.Instructions = generateSnippetUsingStaticRenaming(
        State, Variant, TiedVariables, ForbiddenRegisters);
    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
    return getSingleton(std::move(CT));
  }
  // No tied variables, we pick random values for defs.
  BitVector Defs(State.getRegInfo().getNumRegs());
  for (const auto &Op : Instr.Operands) {
    if (Op.isReg() && Op.isExplicit() && Op.isDef() && !Op.isMemory()) {
      auto PossibleRegisters = Op.getRegisterAliasing().sourceBits();
      // Do not use forbidden registers.
      remove(PossibleRegisters, ForbiddenRegisters);
      assert(PossibleRegisters.any() && "No register left to choose from");
      const auto RandomReg = randomBit(PossibleRegisters);
      Defs.set(RandomReg);
      Variant.getValueFor(Op) = MCOperand::createReg(RandomReg);
    }
  }
  // And pick random use values that are not reserved and don't alias with defs.
  const auto DefAliases = getAliasedBits(State.getRegInfo(), Defs);
  for (const auto &Op : Instr.Operands) {
    if (Op.isReg() && Op.isExplicit() && Op.isUse() && !Op.isMemory()) {
      auto PossibleRegisters = Op.getRegisterAliasing().sourceBits();
      remove(PossibleRegisters, ForbiddenRegisters);
      remove(PossibleRegisters, DefAliases);
      assert(PossibleRegisters.any() && "No register left to choose from");
      const auto RandomReg = randomBit(PossibleRegisters);
      Variant.getValueFor(Op) = MCOperand::createReg(RandomReg);
    }
  }
  CT.Info =
      "instruction has no tied variables picking Uses different from defs";
  CT.Instructions.push_back(std::move(Variant));
  instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
  return getSingleton(std::move(CT));
}
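
// Illustrative only: for an instruction whose explicit def and use operands
// are not tied (e.g. a three-operand imul such as "imul rax, rbx, 4"), the
// code above picks random registers so that the use (rbx here) does not alias
// the def (rax), in the same spirit as the idiv example in the file header
// where the source register is kept distinct from the implicit AX destination.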

constexpr const size_t ParallelSnippetGenerator::kMinNumDifferentAddresses;

} // namespace exegesis
} // namespace llvm