//====--- OMPGridValues.h - Language-specific address spaces --*- C++ -*-====//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// \brief Provides definitions for Target specific Grid Values
///
//===----------------------------------------------------------------------===//

#ifndef LLVM_OPENMP_GRIDVALUES_H
#define LLVM_OPENMP_GRIDVALUES_H

namespace llvm {

namespace omp {

/// \brief Defines various target-specific GPU grid values that must be
///        consistent between host RTL (plugin), device RTL, and clang.
///        We can change grid values for a "fat" binary so that different
///        passes get the correct values when generating code for a
///        multi-target binary. Both amdgcn and nvptx values are stored in
///        this file. In the future, should there be differences between GPUs
///        of the same architecture, then simply make a different array and
///        use the new array name.
///
/// Example usage in clang:
///   const unsigned slot_size =
///   ctx.GetTargetInfo().getGridValue(llvm::omp::GVIDX::GV_Warp_Size);
///
/// Example usage in libomptarget/deviceRTLs:
///   #include "llvm/Frontend/OpenMP/OMPGridValues.h"
///   #ifdef __AMDGPU__
///     #define GRIDVAL AMDGPUGpuGridValues
///   #else
///     #define GRIDVAL NVPTXGpuGridValues
///   #endif
///   ... Then use this reference for GV_Warp_Size in the deviceRTL source.
///   llvm::omp::GRIDVAL[llvm::omp::GVIDX::GV_Warp_Size]
///
/// Example usage in libomptarget hsa plugin:
///   #include "llvm/Frontend/OpenMP/OMPGridValues.h"
///   #define GRIDVAL AMDGPUGpuGridValues
///   ... Then use this reference to access GV_Warp_Size in the hsa plugin.
///   llvm::omp::GRIDVAL[llvm::omp::GVIDX::GV_Warp_Size]
///
/// Example usage in libomptarget cuda plugin:
///    #include "llvm/Frontend/OpenMP/OMPGridValues.h"
///    #define GRIDVAL NVPTXGpuGridValues
///   ... Then use this reference to access GV_Warp_Size in the cuda plugin.
///    llvm::omp::GRIDVAL[llvm::omp::GVIDX::GV_Warp_Size]
///
enum GVIDX {
  /// The maximum number of workers in a kernel.
  /// (THREAD_ABSOLUTE_LIMIT) - (GV_Warp_Size), might be issue for blockDim.z
  GV_Threads,
  /// The size reserved for data in a shared memory slot.
  GV_Slot_Size,
  /// The default value of maximum number of threads in a worker warp.
  GV_Warp_Size,
  /// Alternate warp size for some AMDGCN architectures. Same as GV_Warp_Size
  /// for NVPTX.
  GV_Warp_Size_32,
  /// The number of bits required to represent the max number of threads in warp
  GV_Warp_Size_Log2,
  /// GV_Warp_Size * GV_Slot_Size,
  GV_Warp_Slot_Size,
  /// the maximum number of teams.
  GV_Max_Teams,
  /// Global Memory Alignment
  GV_Mem_Align,
  /// (~0u >> (GV_Warp_Size - GV_Warp_Size_Log2))
  GV_Warp_Size_Log2_Mask,
  // An alternative to the heavy data sharing infrastructure that uses global
  // memory is one that uses device __shared__ memory.  The amount of such space
  // (in bytes) reserved by the OpenMP runtime is noted here.
  GV_SimpleBufferSize,
  // The absolute maximum team size for a working group
  GV_Max_WG_Size,
  // The default maximum team size for a working group
  GV_Default_WG_Size,
  // This is GV_Max_WG_Size / GV_WarpSize. 32 for NVPTX and 16 for AMDGCN.
  GV_Max_Warp_Number,
  /// The slot size that should be reserved for a working warp.
  /// (~0u >> (GV_Warp_Size - GV_Warp_Size_Log2))
  GV_Warp_Size_Log2_MaskL
};

/// For AMDGPU GPUs
static constexpr unsigned AMDGPUGpuGridValues[] = {
    448,       // GV_Threads
    256,       // GV_Slot_Size
    64,        // GV_Warp_Size
    32,        // GV_Warp_Size_32
    6,         // GV_Warp_Size_Log2
    64 * 256,  // GV_Warp_Slot_Size
    128,       // GV_Max_Teams
    256,       // GV_Mem_Align
    63,        // GV_Warp_Size_Log2_Mask
    896,       // GV_SimpleBufferSize
    1024,      // GV_Max_WG_Size,
    256,       // GV_Defaut_WG_Size
    1024 / 64, // GV_Max_WG_Size / GV_WarpSize
    63         // GV_Warp_Size_Log2_MaskL
};

/// For Nvidia GPUs
static constexpr unsigned NVPTXGpuGridValues[] = {
    992,               // GV_Threads
    256,               // GV_Slot_Size
    32,                // GV_Warp_Size
    32,                // GV_Warp_Size_32
    5,                 // GV_Warp_Size_Log2
    32 * 256,          // GV_Warp_Slot_Size
    1024,              // GV_Max_Teams
    256,               // GV_Mem_Align
    (~0u >> (32 - 5)), // GV_Warp_Size_Log2_Mask
    896,               // GV_SimpleBufferSize
    1024,              // GV_Max_WG_Size
    128,               // GV_Defaut_WG_Size
    1024 / 32,         // GV_Max_WG_Size / GV_WarpSize
    31                 // GV_Warp_Size_Log2_MaskL
};

} // namespace omp
} // namespace llvm

#endif // LLVM_OPENMP_GRIDVALUES_H