//===-- SIAddIMGInit.cpp - Add any required IMG inits ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Any MIMG instructions that use tfe or lwe require an initialization of the
/// result register that will be written in the case of a memory access failure
/// The required code is also added to tie this init code to the result of the
/// img instruction
///
//===----------------------------------------------------------------------===//
//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineFunctionPass.h"

#define DEBUG_TYPE "si-img-init"

using namespace llvm;

namespace {

class SIAddIMGInit : public MachineFunctionPass {
public:
  static char ID;

public:
  SIAddIMGInit() : MachineFunctionPass(ID) {
    initializeSIAddIMGInitPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIAddIMGInit, DEBUG_TYPE, "SI Add IMG Init", false, false)

char SIAddIMGInit::ID = 0;

char &llvm::SIAddIMGInitID = SIAddIMGInit::ID;

FunctionPass *llvm::createSIAddIMGInitPass() { return new SIAddIMGInit(); }

bool SIAddIMGInit::runOnMachineFunction(MachineFunction &MF) {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *RI = ST.getRegisterInfo();
  bool Changed = false;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
       ++BI) {
    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      auto Opcode = MI.getOpcode();
      if (TII->isMIMG(Opcode) && !MI.mayStore()) {
        MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
        MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
        MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);

        if (!TFE && !LWE) // intersect_ray
          continue;

        unsigned TFEVal = TFE->getImm();
        unsigned LWEVal = LWE->getImm();
        unsigned D16Val = D16 ? D16->getImm() : 0;

        if (TFEVal || LWEVal) {
          // At least one of TFE or LWE are non-zero
          // We have to insert a suitable initialization of the result value and
          // tie this to the dest of the image instruction.

          const DebugLoc &DL = MI.getDebugLoc();

          int DstIdx =
              AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);

          // Calculate which dword we have to initialize to 0.
          MachineOperand *MO_Dmask =
              TII->getNamedOperand(MI, AMDGPU::OpName::dmask);

          // check that dmask operand is found.
          assert(MO_Dmask && "Expected dmask operand in instruction");

          unsigned dmask = MO_Dmask->getImm();
          // Determine the number of active lanes taking into account the
          // Gather4 special case
          unsigned ActiveLanes =
              TII->isGather4(Opcode) ? 4 : countPopulation(dmask);

          bool Packed = !ST.hasUnpackedD16VMem();

          unsigned InitIdx =
              D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;

          // Abandon attempt if the dst size isn't large enough
          // - this is in fact an error but this is picked up elsewhere and
          // reported correctly.
          uint32_t DstSize =
              RI->getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
          if (DstSize < InitIdx)
            continue;

          // Create a register for the intialization value.
          Register PrevDst =
              MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
          unsigned NewDst = 0; // Final initialized value will be in here

          // If PRTStrictNull feature is enabled (the default) then initialize
          // all the result registers to 0, otherwise just the error indication
          // register (VGPRn+1)
          unsigned SizeLeft = ST.usePRTStrictNull() ? InitIdx : 1;
          unsigned CurrIdx = ST.usePRTStrictNull() ? 0 : (InitIdx - 1);

          if (DstSize == 1) {
            // In this case we can just initialize the result directly
            BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), PrevDst)
                .addImm(0);
            NewDst = PrevDst;
          } else {
            BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
            for (; SizeLeft; SizeLeft--, CurrIdx++) {
              NewDst =
                  MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
              // Initialize dword
              Register SubReg =
                  MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
              BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
                  .addImm(0);
              // Insert into the super-reg
              BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
                  .addReg(PrevDst)
                  .addReg(SubReg)
                  .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));

              PrevDst = NewDst;
            }
          }

          // Add as an implicit operand
          MachineInstrBuilder(MF, MI).addReg(NewDst, RegState::Implicit);

          // Tie the just added implicit operand to the dst
          MI.tieOperands(DstIdx, MI.getNumOperands() - 1);

          Changed = true;
        }
      }
    }
  }

  return Changed;
}