llvm-for-llvmta/tools/llvm-mca/Views/BottleneckAnalysis.h

//===--------------------- BottleneckAnalysis.h -----------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements the bottleneck analysis view.
///
/// This view internally observes backend pressure increase events in order to
/// identify problematic data dependencies and processor resource interferences.
///
/// Example of bottleneck analysis report for a dot-product on X86 btver2:
///
/// Cycles with backend pressure increase [ 40.76% ]
/// Throughput Bottlenecks: 
///   Resource Pressure       [ 39.34% ]
///   - JFPA  [ 39.34% ]
///   - JFPU0  [ 39.34% ]
///   Data Dependencies:      [ 1.42% ]
///   - Register Dependencies [ 1.42% ]
///   - Memory Dependencies   [ 0.00% ]
///
/// According to the example, backend pressure increased during the 40.76% of
/// the simulated cycles.  In particular, the major cause of backend pressure
/// increases was the contention on floating point adder JFPA accessible from
/// pipeline resource JFPU0.
///
/// At the end of each cycle, if pressure on the simulated out-of-order buffers
/// has increased, a backend pressure event is reported.
/// In particular, this occurs when there is a delta between the number of uOps
/// dispatched and the number of uOps issued to the underlying pipelines.
///
/// The bottleneck analysis view is also responsible for identifying and printing
/// the most "critical" sequence of dependent instructions according to the
/// simulated run.
///
/// Below is the critical sequence computed for the dot-product example on
/// btver2:
///
///              Instruction                     Dependency Information
/// +----< 2.    vhaddps %xmm3, %xmm3, %xmm4
/// |
/// |    < loop carried > 
/// |
/// |      0.    vmulps	 %xmm0, %xmm0, %xmm2
/// +----> 1.    vhaddps %xmm2, %xmm2, %xmm3     ## RESOURCE interference:  JFPA [ probability: 73% ]
/// +----> 2.    vhaddps %xmm3, %xmm3, %xmm4     ## REGISTER dependency:  %xmm3
/// |
/// |    < loop carried > 
/// |
/// +----> 1.    vhaddps %xmm2, %xmm2, %xmm3     ## RESOURCE interference:  JFPA [ probability: 73% ]
///
///
/// The algorithm that computes the critical sequence is very similar to a
/// critical path analysis.
/// 
/// A dependency graph is used internally to track dependencies between nodes.
/// Nodes of the graph represent instructions from the input assembly sequence,
/// and edges of the graph represent data dependencies or processor resource
/// interferences.
///
/// Edges are dynamically 'discovered' by observing instruction state transitions
/// and backend pressure increase events. Edges are internally ranked based on
/// their "criticality". A dependency is considered to be critical if it takes a
/// long time to execute, and if it contributes to backend pressure increases.
/// Criticality is internally measured in terms of cycles; it is computed for
/// every edge in the graph as a function of the edge latency and the number of
/// backend pressure increase cycles contributed by that edge.
///
/// At the end of simulation, costs are propagated to nodes through the edges of
/// the graph, and the most expensive path connecting the root-set (a
/// set of nodes with no predecessors) to a leaf node is reported as critical
/// sequence.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_TOOLS_LLVM_MCA_BOTTLENECK_ANALYSIS_H
#define LLVM_TOOLS_LLVM_MCA_BOTTLENECK_ANALYSIS_H

#include "Views/InstructionView.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCInstPrinter.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/raw_ostream.h"

namespace llvm {
namespace mca {

class PressureTracker {
  const MCSchedModel &SM;

  // Resource pressure distribution. There is an element for every processor
  // resource declared by the scheduling model. Quantities are number of cycles.
  SmallVector<unsigned, 4> ResourcePressureDistribution;

  // Each processor resource is associated with a so-called processor resource
  // mask. This vector allows to correlate processor resource IDs with processor
  // resource masks. There is exactly one element per each processor resource
  // declared by the scheduling model.
  SmallVector<uint64_t, 4> ProcResID2Mask;

  // Maps processor resource state indices (returned by calls to
  // `getResourceStateIndex(Mask)` to processor resource identifiers.
  SmallVector<unsigned, 4> ResIdx2ProcResID;

  // Maps Processor Resource identifiers to ResourceUsers indices.
  SmallVector<unsigned, 4> ProcResID2ResourceUsersIndex;

  // Identifies the last user of a processor resource unit.
  // This vector is updated on every instruction issued event.
  // There is one entry for every processor resource unit declared by the
  // processor model. An all_ones value is treated like an invalid instruction
  // identifier.
  using User = std::pair<unsigned, unsigned>;
  SmallVector<User, 4> ResourceUsers;

  struct InstructionPressureInfo {
    unsigned RegisterPressureCycles;
    unsigned MemoryPressureCycles;
    unsigned ResourcePressureCycles;
  };
  DenseMap<unsigned, InstructionPressureInfo> IPI;

  void updateResourcePressureDistribution(uint64_t CumulativeMask);

  User getResourceUser(unsigned ProcResID, unsigned UnitID) const {
    unsigned Index = ProcResID2ResourceUsersIndex[ProcResID];
    return ResourceUsers[Index + UnitID];
  }

public:
  PressureTracker(const MCSchedModel &Model);

  ArrayRef<unsigned> getResourcePressureDistribution() const {
    return ResourcePressureDistribution;
  }

  void getResourceUsers(uint64_t ResourceMask,
                        SmallVectorImpl<User> &Users) const;

  unsigned getRegisterPressureCycles(unsigned IID) const {
    assert(IPI.find(IID) != IPI.end() && "Instruction is not tracked!");
    const InstructionPressureInfo &Info = IPI.find(IID)->second;
    return Info.RegisterPressureCycles;
  }

  unsigned getMemoryPressureCycles(unsigned IID) const {
    assert(IPI.find(IID) != IPI.end() && "Instruction is not tracked!");
    const InstructionPressureInfo &Info = IPI.find(IID)->second;
    return Info.MemoryPressureCycles;
  }

  unsigned getResourcePressureCycles(unsigned IID) const {
    assert(IPI.find(IID) != IPI.end() && "Instruction is not tracked!");
    const InstructionPressureInfo &Info = IPI.find(IID)->second;
    return Info.ResourcePressureCycles;
  }

  const char *resolveResourceName(uint64_t ResourceMask) const {
    unsigned Index = getResourceStateIndex(ResourceMask);
    unsigned ProcResID = ResIdx2ProcResID[Index];
    const MCProcResourceDesc &PRDesc = *SM.getProcResource(ProcResID);
    return PRDesc.Name;
  }

  void onInstructionDispatched(unsigned IID);
  void onInstructionExecuted(unsigned IID);

  void handlePressureEvent(const HWPressureEvent &Event);
  void handleInstructionIssuedEvent(const HWInstructionIssuedEvent &Event);
};

// A dependency edge.
struct DependencyEdge {
  enum DependencyType { DT_INVALID, DT_REGISTER, DT_MEMORY, DT_RESOURCE };

  // Dependency edge descriptor.
  //
  // It specifies the dependency type, as well as the edge cost in cycles.
  struct Dependency {
    DependencyType Type;
    uint64_t ResourceOrRegID;
    uint64_t Cost;
  };
  Dependency Dep;

  unsigned FromIID;
  unsigned ToIID;

  // Used by the bottleneck analysis to compute the interference
  // probability for processor resources.
  unsigned Frequency;
};

// A dependency graph used by the bottleneck analysis to describe data
// dependencies and processor resource interferences between instructions.
//
// There is a node (an instance of struct DGNode) for every instruction in the
// input assembly sequence. Edges of the graph represent dependencies between
// instructions.
//
// Each edge of the graph is associated with a cost value which is used
// internally to rank dependency based on their impact on the runtime
// performance (see field DependencyEdge::Dependency::Cost). In general, the
// higher the cost of an edge, the higher the impact on performance.
//
// The cost of a dependency is a function of both the latency and the number of
// cycles where the dependency has been seen as critical (i.e. contributing to
// back-pressure increases).
//
// Loop carried dependencies are carefully expanded by the bottleneck analysis
// to guarantee that the graph stays acyclic. To this end, extra nodes are
// pre-allocated at construction time to describe instructions from "past and
// future" iterations. The graph is kept acyclic mainly because it simplifies the
// complexity of the algorithm that computes the critical sequence.
class DependencyGraph {
  struct DGNode {
    unsigned NumPredecessors;
    unsigned NumVisitedPredecessors;
    uint64_t Cost;
    unsigned Depth;

    DependencyEdge CriticalPredecessor;
    SmallVector<DependencyEdge, 8> OutgoingEdges;
  };
  SmallVector<DGNode, 16> Nodes;

  DependencyGraph(const DependencyGraph &) = delete;
  DependencyGraph &operator=(const DependencyGraph &) = delete;

  void addDependency(unsigned From, unsigned To,
                     DependencyEdge::Dependency &&DE);

  void pruneEdges(unsigned Iterations);
  void initializeRootSet(SmallVectorImpl<unsigned> &RootSet) const;
  void propagateThroughEdges(SmallVectorImpl<unsigned> &RootSet, unsigned Iterations);

#ifndef NDEBUG
  void dumpDependencyEdge(raw_ostream &OS, const DependencyEdge &DE,
                          MCInstPrinter &MCIP) const;
#endif

public:
  DependencyGraph(unsigned Size) : Nodes(Size) {}

  void addRegisterDep(unsigned From, unsigned To, unsigned RegID,
                      unsigned Cost) {
    addDependency(From, To, {DependencyEdge::DT_REGISTER, RegID, Cost});
  }

  void addMemoryDep(unsigned From, unsigned To, unsigned Cost) {
    addDependency(From, To, {DependencyEdge::DT_MEMORY, /* unused */ 0, Cost});
  }

  void addResourceDep(unsigned From, unsigned To, uint64_t Mask,
                      unsigned Cost) {
    addDependency(From, To, {DependencyEdge::DT_RESOURCE, Mask, Cost});
  }

  // Called by the bottleneck analysis at the end of simulation to propagate
  // costs through the edges of the graph, and compute a critical path.
  void finalizeGraph(unsigned Iterations) {
    SmallVector<unsigned, 16> RootSet;
    pruneEdges(Iterations);
    initializeRootSet(RootSet);
    propagateThroughEdges(RootSet, Iterations);
  }

  // Returns a sequence of edges representing the critical sequence based on the
  // simulated run. It assumes that the graph has already been finalized (i.e.
  // method `finalizeGraph()` has already been called on this graph).
  void getCriticalSequence(SmallVectorImpl<const DependencyEdge *> &Seq) const;

#ifndef NDEBUG
  void dump(raw_ostream &OS, MCInstPrinter &MCIP) const;
#endif
};

/// A view that collects and prints a few performance numbers.
class BottleneckAnalysis : public InstructionView {
  PressureTracker Tracker;
  DependencyGraph DG;

  unsigned Iterations;
  unsigned TotalCycles;

  bool PressureIncreasedBecauseOfResources;
  bool PressureIncreasedBecauseOfRegisterDependencies;
  bool PressureIncreasedBecauseOfMemoryDependencies;
  // True if throughput was affected by dispatch stalls.
  bool SeenStallCycles;

  struct BackPressureInfo {
    // Cycles where backpressure increased.
    unsigned PressureIncreaseCycles;
    // Cycles where backpressure increased because of pipeline pressure.
    unsigned ResourcePressureCycles;
    // Cycles where backpressure increased because of data dependencies.
    unsigned DataDependencyCycles;
    // Cycles where backpressure increased because of register dependencies.
    unsigned RegisterDependencyCycles;
    // Cycles where backpressure increased because of memory dependencies.
    unsigned MemoryDependencyCycles;
  };
  BackPressureInfo BPI;

  // Used to populate the dependency graph DG.
  void addRegisterDep(unsigned From, unsigned To, unsigned RegID, unsigned Cy);
  void addMemoryDep(unsigned From, unsigned To, unsigned Cy);
  void addResourceDep(unsigned From, unsigned To, uint64_t Mask, unsigned Cy);

  void printInstruction(formatted_raw_ostream &FOS, const MCInst &MCI,
                        bool UseDifferentColor = false) const;

  // Prints a bottleneck message to OS.
  void printBottleneckHints(raw_ostream &OS) const;
  void printCriticalSequence(raw_ostream &OS) const;

public:
  BottleneckAnalysis(const MCSubtargetInfo &STI, MCInstPrinter &MCIP,
                     ArrayRef<MCInst> Sequence, unsigned Iterations);

  void onCycleEnd() override;
  void onEvent(const HWStallEvent &Event) override { SeenStallCycles = true; }
  void onEvent(const HWPressureEvent &Event) override;
  void onEvent(const HWInstructionEvent &Event) override;

  void printView(raw_ostream &OS) const override;
  StringRef getNameAsString() const override { return "BottleneckAnalysis"; }
  json::Value toJSON() const override { return "not implemented"; }

#ifndef NDEBUG
  void dump(raw_ostream &OS, MCInstPrinter &MCIP) const { DG.dump(OS, MCIP); }
#endif
};

} // namespace mca
} // namespace llvm

#endif
first commit 2022-04-25 10:02:23 +02:00			`//===--------------------- BottleneckAnalysis.h ------------------ C++ --===//`
			`//`
			`// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.`
			`// See https://llvm.org/LICENSE.txt for license information.`
			`// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception`
			`//`
			`//===----------------------------------------------------------------------===//`
			`/// \file`
			`///`
			`/// This file implements the bottleneck analysis view.`
			`///`
			`/// This view internally observes backend pressure increase events in order to`
			`/// identify problematic data dependencies and processor resource interferences.`
			`///`
			`/// Example of bottleneck analysis report for a dot-product on X86 btver2:`
			`///`
			`/// Cycles with backend pressure increase [ 40.76% ]`
			`/// Throughput Bottlenecks:`
			`/// Resource Pressure [ 39.34% ]`
			`/// - JFPA [ 39.34% ]`
			`/// - JFPU0 [ 39.34% ]`
			`/// Data Dependencies: [ 1.42% ]`
			`/// - Register Dependencies [ 1.42% ]`
			`/// - Memory Dependencies [ 0.00% ]`
			`///`
			`/// According to the example, backend pressure increased during the 40.76% of`
			`/// the simulated cycles. In particular, the major cause of backend pressure`
			`/// increases was the contention on floating point adder JFPA accessible from`
			`/// pipeline resource JFPU0.`
			`///`
			`/// At the end of each cycle, if pressure on the simulated out-of-order buffers`
			`/// has increased, a backend pressure event is reported.`
			`/// In particular, this occurs when there is a delta between the number of uOps`
			`/// dispatched and the number of uOps issued to the underlying pipelines.`
			`///`
			`/// The bottleneck analysis view is also responsible for identifying and printing`
			`/// the most "critical" sequence of dependent instructions according to the`
			`/// simulated run.`
			`///`
			`/// Below is the critical sequence computed for the dot-product example on`
			`/// btver2:`
			`///`
			`/// Instruction Dependency Information`
			`/// +----< 2. vhaddps %xmm3, %xmm3, %xmm4`
			`/// \|`
			`/// \| < loop carried >`
			`/// \|`
			`/// \| 0. vmulps %xmm0, %xmm0, %xmm2`
			`/// +----> 1. vhaddps %xmm2, %xmm2, %xmm3 ## RESOURCE interference: JFPA [ probability: 73% ]`
			`/// +----> 2. vhaddps %xmm3, %xmm3, %xmm4 ## REGISTER dependency: %xmm3`
			`/// \|`
			`/// \| < loop carried >`
			`/// \|`
			`/// +----> 1. vhaddps %xmm2, %xmm2, %xmm3 ## RESOURCE interference: JFPA [ probability: 73% ]`
			`///`
			`///`
			`/// The algorithm that computes the critical sequence is very similar to a`
			`/// critical path analysis.`
			`///`
			`/// A dependency graph is used internally to track dependencies between nodes.`
			`/// Nodes of the graph represent instructions from the input assembly sequence,`
			`/// and edges of the graph represent data dependencies or processor resource`
			`/// interferences.`
			`///`
			`/// Edges are dynamically 'discovered' by observing instruction state transitions`
			`/// and backend pressure increase events. Edges are internally ranked based on`
			`/// their "criticality". A dependency is considered to be critical if it takes a`
			`/// long time to execute, and if it contributes to backend pressure increases.`
			`/// Criticality is internally measured in terms of cycles; it is computed for`
			`/// every edge in the graph as a function of the edge latency and the number of`
			`/// backend pressure increase cycles contributed by that edge.`
			`///`
			`/// At the end of simulation, costs are propagated to nodes through the edges of`
			`/// the graph, and the most expensive path connecting the root-set (a`
			`/// set of nodes with no predecessors) to a leaf node is reported as critical`
			`/// sequence.`
			`//`
			`//===----------------------------------------------------------------------===//`

			`#ifndef LLVM_TOOLS_LLVM_MCA_BOTTLENECK_ANALYSIS_H`
			`#define LLVM_TOOLS_LLVM_MCA_BOTTLENECK_ANALYSIS_H`

			`#include "Views/InstructionView.h"`
			`#include "llvm/ADT/DenseMap.h"`
			`#include "llvm/ADT/SmallVector.h"`
			`#include "llvm/MC/MCInstPrinter.h"`
			`#include "llvm/MC/MCSchedule.h"`
			`#include "llvm/MC/MCSubtargetInfo.h"`
			`#include "llvm/Support/FormattedStream.h"`
			`#include "llvm/Support/raw_ostream.h"`

			`namespace llvm {`
			`namespace mca {`

			`class PressureTracker {`
			`const MCSchedModel &SM;`

			`// Resource pressure distribution. There is an element for every processor`
			`// resource declared by the scheduling model. Quantities are number of cycles.`
			`SmallVector<unsigned, 4> ResourcePressureDistribution;`

			`// Each processor resource is associated with a so-called processor resource`
			`// mask. This vector allows to correlate processor resource IDs with processor`
			`// resource masks. There is exactly one element per each processor resource`
			`// declared by the scheduling model.`
			`SmallVector<uint64_t, 4> ProcResID2Mask;`

			`// Maps processor resource state indices (returned by calls to`
			// `getResourceStateIndex(Mask)` to processor resource identifiers.
			`SmallVector<unsigned, 4> ResIdx2ProcResID;`

			`// Maps Processor Resource identifiers to ResourceUsers indices.`
			`SmallVector<unsigned, 4> ProcResID2ResourceUsersIndex;`

			`// Identifies the last user of a processor resource unit.`
			`// This vector is updated on every instruction issued event.`
			`// There is one entry for every processor resource unit declared by the`
			`// processor model. An all_ones value is treated like an invalid instruction`
			`// identifier.`
			`using User = std::pair<unsigned, unsigned>;`
			`SmallVector<User, 4> ResourceUsers;`

			`struct InstructionPressureInfo {`
			`unsigned RegisterPressureCycles;`
			`unsigned MemoryPressureCycles;`
			`unsigned ResourcePressureCycles;`
			`};`
			`DenseMap<unsigned, InstructionPressureInfo> IPI;`

			`void updateResourcePressureDistribution(uint64_t CumulativeMask);`

			`User getResourceUser(unsigned ProcResID, unsigned UnitID) const {`
			`unsigned Index = ProcResID2ResourceUsersIndex[ProcResID];`
			`return ResourceUsers[Index + UnitID];`
			`}`

			`public:`
			`PressureTracker(const MCSchedModel &Model);`

			`ArrayRef<unsigned> getResourcePressureDistribution() const {`
			`return ResourcePressureDistribution;`
			`}`

			`void getResourceUsers(uint64_t ResourceMask,`
			`SmallVectorImpl<User> &Users) const;`

			`unsigned getRegisterPressureCycles(unsigned IID) const {`
			`assert(IPI.find(IID) != IPI.end() && "Instruction is not tracked!");`
			`const InstructionPressureInfo &Info = IPI.find(IID)->second;`
			`return Info.RegisterPressureCycles;`
			`}`

			`unsigned getMemoryPressureCycles(unsigned IID) const {`
			`assert(IPI.find(IID) != IPI.end() && "Instruction is not tracked!");`
			`const InstructionPressureInfo &Info = IPI.find(IID)->second;`
			`return Info.MemoryPressureCycles;`
			`}`

			`unsigned getResourcePressureCycles(unsigned IID) const {`
			`assert(IPI.find(IID) != IPI.end() && "Instruction is not tracked!");`
			`const InstructionPressureInfo &Info = IPI.find(IID)->second;`
			`return Info.ResourcePressureCycles;`
			`}`

			`const char *resolveResourceName(uint64_t ResourceMask) const {`
			`unsigned Index = getResourceStateIndex(ResourceMask);`
			`unsigned ProcResID = ResIdx2ProcResID[Index];`
			`const MCProcResourceDesc &PRDesc = *SM.getProcResource(ProcResID);`
			`return PRDesc.Name;`
			`}`

			`void onInstructionDispatched(unsigned IID);`
			`void onInstructionExecuted(unsigned IID);`

			`void handlePressureEvent(const HWPressureEvent &Event);`
			`void handleInstructionIssuedEvent(const HWInstructionIssuedEvent &Event);`
			`};`

			`// A dependency edge.`
			`struct DependencyEdge {`
			`enum DependencyType { DT_INVALID, DT_REGISTER, DT_MEMORY, DT_RESOURCE };`

			`// Dependency edge descriptor.`
			`//`
			`// It specifies the dependency type, as well as the edge cost in cycles.`
			`struct Dependency {`
			`DependencyType Type;`
			`uint64_t ResourceOrRegID;`
			`uint64_t Cost;`
			`};`
			`Dependency Dep;`

			`unsigned FromIID;`
			`unsigned ToIID;`

			`// Used by the bottleneck analysis to compute the interference`
			`// probability for processor resources.`
			`unsigned Frequency;`
			`};`

			`// A dependency graph used by the bottleneck analysis to describe data`
			`// dependencies and processor resource interferences between instructions.`
			`//`
			`// There is a node (an instance of struct DGNode) for every instruction in the`
			`// input assembly sequence. Edges of the graph represent dependencies between`
			`// instructions.`
			`//`
			`// Each edge of the graph is associated with a cost value which is used`
			`// internally to rank dependency based on their impact on the runtime`
			`// performance (see field DependencyEdge::Dependency::Cost). In general, the`
			`// higher the cost of an edge, the higher the impact on performance.`
			`//`
			`// The cost of a dependency is a function of both the latency and the number of`
			`// cycles where the dependency has been seen as critical (i.e. contributing to`
			`// back-pressure increases).`
			`//`
			`// Loop carried dependencies are carefully expanded by the bottleneck analysis`
			`// to guarantee that the graph stays acyclic. To this end, extra nodes are`
			`// pre-allocated at construction time to describe instructions from "past and`
			`// future" iterations. The graph is kept acyclic mainly because it simplifies the`
			`// complexity of the algorithm that computes the critical sequence.`
			`class DependencyGraph {`
			`struct DGNode {`
			`unsigned NumPredecessors;`
			`unsigned NumVisitedPredecessors;`
			`uint64_t Cost;`
			`unsigned Depth;`

			`DependencyEdge CriticalPredecessor;`
			`SmallVector<DependencyEdge, 8> OutgoingEdges;`
			`};`
			`SmallVector<DGNode, 16> Nodes;`

			`DependencyGraph(const DependencyGraph &) = delete;`
			`DependencyGraph &operator=(const DependencyGraph &) = delete;`

			`void addDependency(unsigned From, unsigned To,`
			`DependencyEdge::Dependency &&DE);`

			`void pruneEdges(unsigned Iterations);`
			`void initializeRootSet(SmallVectorImpl<unsigned> &RootSet) const;`
			`void propagateThroughEdges(SmallVectorImpl<unsigned> &RootSet, unsigned Iterations);`

			`#ifndef NDEBUG`
			`void dumpDependencyEdge(raw_ostream &OS, const DependencyEdge &DE,`
			`MCInstPrinter &MCIP) const;`
			`#endif`

			`public:`
			`DependencyGraph(unsigned Size) : Nodes(Size) {}`

			`void addRegisterDep(unsigned From, unsigned To, unsigned RegID,`
			`unsigned Cost) {`
			`addDependency(From, To, {DependencyEdge::DT_REGISTER, RegID, Cost});`
			`}`

			`void addMemoryDep(unsigned From, unsigned To, unsigned Cost) {`
			`addDependency(From, To, {DependencyEdge::DT_MEMORY, /* unused */ 0, Cost});`
			`}`

			`void addResourceDep(unsigned From, unsigned To, uint64_t Mask,`
			`unsigned Cost) {`
			`addDependency(From, To, {DependencyEdge::DT_RESOURCE, Mask, Cost});`
			`}`

			`// Called by the bottleneck analysis at the end of simulation to propagate`
			`// costs through the edges of the graph, and compute a critical path.`
			`void finalizeGraph(unsigned Iterations) {`
			`SmallVector<unsigned, 16> RootSet;`
			`pruneEdges(Iterations);`
			`initializeRootSet(RootSet);`
			`propagateThroughEdges(RootSet, Iterations);`
			`}`

			`// Returns a sequence of edges representing the critical sequence based on the`
			`// simulated run. It assumes that the graph has already been finalized (i.e.`
			// method `finalizeGraph()` has already been called on this graph).
			`void getCriticalSequence(SmallVectorImpl<const DependencyEdge *> &Seq) const;`

			`#ifndef NDEBUG`
			`void dump(raw_ostream &OS, MCInstPrinter &MCIP) const;`
			`#endif`
			`};`

			`/// A view that collects and prints a few performance numbers.`
			`class BottleneckAnalysis : public InstructionView {`
			`PressureTracker Tracker;`
			`DependencyGraph DG;`

			`unsigned Iterations;`
			`unsigned TotalCycles;`

			`bool PressureIncreasedBecauseOfResources;`
			`bool PressureIncreasedBecauseOfRegisterDependencies;`
			`bool PressureIncreasedBecauseOfMemoryDependencies;`
			`// True if throughput was affected by dispatch stalls.`
			`bool SeenStallCycles;`

			`struct BackPressureInfo {`
			`// Cycles where backpressure increased.`
			`unsigned PressureIncreaseCycles;`
			`// Cycles where backpressure increased because of pipeline pressure.`
			`unsigned ResourcePressureCycles;`
			`// Cycles where backpressure increased because of data dependencies.`
			`unsigned DataDependencyCycles;`
			`// Cycles where backpressure increased because of register dependencies.`
			`unsigned RegisterDependencyCycles;`
			`// Cycles where backpressure increased because of memory dependencies.`
			`unsigned MemoryDependencyCycles;`
			`};`
			`BackPressureInfo BPI;`

			`// Used to populate the dependency graph DG.`
			`void addRegisterDep(unsigned From, unsigned To, unsigned RegID, unsigned Cy);`
			`void addMemoryDep(unsigned From, unsigned To, unsigned Cy);`
			`void addResourceDep(unsigned From, unsigned To, uint64_t Mask, unsigned Cy);`

			`void printInstruction(formatted_raw_ostream &FOS, const MCInst &MCI,`
			`bool UseDifferentColor = false) const;`

			`// Prints a bottleneck message to OS.`
			`void printBottleneckHints(raw_ostream &OS) const;`
			`void printCriticalSequence(raw_ostream &OS) const;`

			`public:`
			`BottleneckAnalysis(const MCSubtargetInfo &STI, MCInstPrinter &MCIP,`
			`ArrayRef<MCInst> Sequence, unsigned Iterations);`

			`void onCycleEnd() override;`
			`void onEvent(const HWStallEvent &Event) override { SeenStallCycles = true; }`
			`void onEvent(const HWPressureEvent &Event) override;`
			`void onEvent(const HWInstructionEvent &Event) override;`

			`void printView(raw_ostream &OS) const override;`
			`StringRef getNameAsString() const override { return "BottleneckAnalysis"; }`
			`json::Value toJSON() const override { return "not implemented"; }`

			`#ifndef NDEBUG`
			`void dump(raw_ostream &OS, MCInstPrinter &MCIP) const { DG.dump(OS, MCIP); }`
			`#endif`
			`};`

			`} // namespace mca`
			`} // namespace llvm`

			`#endif`