llvm-for-llvmta/test/CodeGen/NVPTX/wmma.py

# This test generates all variants of wmma intrinsics and verifies that LLVM
# generates correct instructions for them.

# Check all variants of instructions supported by PTX60 on SM70
# RUN: python %s --ptx=60 --gpu-arch=70 > %t-ptx60-sm_70.ll
# RUN: FileCheck %t-ptx60-sm_70.ll < %t-ptx60-sm_70.ll \
# RUN:           --check-prefixes=INTRINSICS,M16N16
# RUN: FileCheck %t-ptx60-sm_70.ll < %t-ptx60-sm_70.ll \
# RUN:           --check-prefixes=INTRINSICS,NOEXTGEOM,NOINT,NOSUBINT,NOMMA
# RUN: llc < %t-ptx60-sm_70.ll -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 \
# RUN:           | FileCheck %t-ptx60-sm_70.ll

# Check all variants of instructions supported by PTX61 on SM70
# RUN: python %s --ptx=61 --gpu-arch=70 > %t-ptx61-sm_70.ll
# RUN: FileCheck %t-ptx61-sm_70.ll < %t-ptx61-sm_70.ll \
# RUN:           --check-prefixes=INTRINSICS,M16N16,EXTGEOM
# RUN: FileCheck %t-ptx61-sm_70.ll < %t-ptx61-sm_70.ll \
# RUN:           --check-prefixes=INTRINSICS,NOINT,NOSUBINT,NOMMA
# RUN: llc < %t-ptx61-sm_70.ll -march=nvptx64 -mcpu=sm_70 -mattr=+ptx61 \
# RUN:           | FileCheck %t-ptx61-sm_70.ll

# Check all variants of instructions supported by PTX63 on SM72
# RUN: python %s --ptx=63 --gpu-arch=72 > %t-ptx63-sm_72.ll
# RUN: FileCheck %t-ptx63-sm_72.ll < %t-ptx63-sm_72.ll \
# RUN:           --check-prefixes=INTRINSICS,M16N16,EXTGEOM,INT
# RUN: FileCheck %t-ptx63-sm_72.ll < %t-ptx63-sm_72.ll \
# RUN:           --check-prefixes=INTRINSICS,NOSUBINT,NOMMA
# RUN: llc < %t-ptx63-sm_72.ll -march=nvptx64 -mcpu=sm_72 -mattr=+ptx63 \
# RUN:           | FileCheck %t-ptx63-sm_72.ll

# Check all variants of instructions supported by PTX63 on SM75
# RUN: python %s --ptx=63 --gpu-arch=75 > %t-ptx63-sm_75.ll
# RUN: FileCheck %t-ptx63-sm_75.ll < %t-ptx63-sm_75.ll \
# RUN:           --check-prefixes=INTRINSICS,M16N16,EXTGEOM,INT,SUBINT
# RUN: FileCheck %t-ptx63-sm_75.ll < %t-ptx63-sm_75.ll \
# RUN:           --check-prefixes=INTRINSICS,NOMMA
# RUN: llc < %t-ptx63-sm_75.ll -march=nvptx64 -mcpu=sm_75 -mattr=+ptx63 \
# RUN:           | FileCheck %t-ptx63-sm_75.ll

# Check all variants of instructions supported by PTX64 on SM70+
# RUN: python %s --ptx=64 --gpu-arch=70 > %t-ptx64-sm_70.ll
# RUN: FileCheck %t-ptx64-sm_70.ll < %t-ptx64-sm_70.ll \
# RUN:           --check-prefixes=INTRINSICS,M16N16,EXTGEOM,MMA
# RUN: FileCheck %t-ptx64-sm_70.ll < %t-ptx64-sm_70.ll \
# RUN:           --check-prefixes=INTRINSICS,NOINT,NOSUBINT
# RUN: llc < %t-ptx64-sm_70.ll -march=nvptx64 -mcpu=sm_70 -mattr=+ptx64 \
# RUN:           | FileCheck %t-ptx64-sm_70.ll

from __future__ import print_function

import argparse
from itertools import product
from string import Template

class MMAType:
  """Element type of a matrix fragment.

  Attributes:
    ptx_type: PTX spelling of the element type, e.g. "f16" or "s8".
    llvm_type: LLVM IR type of one register holding such elements.
    ptx_reg_pattern: regular expression fragment used to match a PTX register
      holding this type in the generated checks.

  Raises:
    KeyError: if ptx_type has no known LLVM type.
  """

  def __init__(self, ptx_type):
    self.ptx_type = ptx_type

    # Only the floating point types have their own LLVM type; every integer
    # and sub-integer type is carried in i32 registers.
    llvm_types = {"f16" : "<2 x half>", "f32" : "float"}
    for int_type in ("s32", "s8", "u8", "s4", "u4", "b1"):
      llvm_types[int_type] = "i32"
    self.llvm_type = llvm_types[ptx_type]

    if ptx_type == "f16":
      self.ptx_reg_pattern = "%hh[0-9]+"
    elif ptx_type == "f32":
      self.ptx_reg_pattern = "%f[0-9]+"
    else:
      self.ptx_reg_pattern = "%r[0-9]+"

  def __repr__(self):
    fields = (self.ptx_type, self.llvm_type)
    return "%s/%s" % fields

class MMAFrag:
  """One matrix fragment of a given geometry and element type.

  Attributes:
    geom: geometry name, e.g. "m16n16k16".
    frag: fragment name ("a", "b", "c" or "d").
    is_mma: True only for the "m8n8k4" geometry.
    mma_type: MMAType describing the element type.
    nregs: number of registers the fragment occupies.

  Construction asserts that the geometry/fragment/type combination has a
  known register count.
  """

  def __init__(self, geom, frag, ptx_elt_type):
    self.geom = geom
    self.frag = frag
    self.is_mma = (geom == "m8n8k4")
    self.mma_type = MMAType(ptx_elt_type)

    # FP fragments: the register count depends on fragment and type only,
    # except that a/b use fewer registers for the m8n8k4 geometry.
    ab_f16_regs = 2 if self.is_mma else 8
    fp_nregs = {
        "a:f16" : ab_f16_regs,
        "b:f16" : ab_f16_regs,
        "c:f16" : 4,
        "d:f16" : 4,
        "c:f32" : 8,
        "d:f32" : 8,
    }

    # Integer fragments: the register count also depends on the geometry.
    int_nregs = {}
    # u8/s8 -> s32 @ m16n16k16/m8n32k16/m32n8k16
    for int_geom, a_regs, b_regs in (("m16n16k16", 2, 2),
                                     ("m8n32k16", 1, 4),
                                     ("m32n8k16", 4, 1)):
      for elt in ("u8", "s8"):
        int_nregs["%s:a:%s" % (int_geom, elt)] = a_regs
        int_nregs["%s:b:%s" % (int_geom, elt)] = b_regs
      for acc in ("c", "d"):
        int_nregs["%s:%s:s32" % (int_geom, acc)] = 8
    # u4/s4/b1 -> s32 @ m8n8k32 (u4/s4), m8n8k128(b1)
    for int_geom, elts in (("m8n8k32", ("u4", "s4")),
                           ("m8n8k128", ("b1",))):
      for elt in elts:
        for ab in ("a", "b"):
          int_nregs["%s:%s:%s" % (int_geom, ab, elt)] = 1
      for acc in ("c", "d"):
        int_nregs["%s:%s:s32" % (int_geom, acc)] = 2

    # The FP table takes precedence; the per-geometry table is the fallback.
    frag_key = "%s:%s" % (frag, ptx_elt_type)
    if frag_key in fp_nregs:
      self.nregs = fp_nregs[frag_key]
    else:
      self.nregs = int_nregs.get("%s:%s:%s" % (geom, frag, ptx_elt_type))
    assert self.nregs

  def __repr__(self):
    # A single register is the default and is not spelled out.
    suffix = "" if self.nregs == 1 else "*%d" % self.nregs
    return "%s:%s:%s%s" % (self.geom, self.frag, self.mma_type, suffix)

class MMAOp:
  """Groups the four fragments (a, b, c, d) of one matrix operation."""

  def __init__(self, a, b, c, d):
    self.a, self.b, self.c, self.d = a, b, c, d

  def __repr__(self):
    labelled = zip("ABCD", (self.a, self.b, self.c, self.d))
    body = ", ".join("%s:%s" % (label, frag) for label, frag in labelled)
    return "{" + body + "}"

def make_mma_ops(geoms, types_a, types_b, types_c, types_d):
  """Build an MMAOp for every combination of geometry and fragment types.

  An empty types_b means "B has the same type as A"; an empty types_d means
  "D has the same type as C".
  """
  return [MMAOp(MMAFrag(geom, "a", type_a),
                MMAFrag(geom, "b", type_b),
                MMAFrag(geom, "c", type_c),
                MMAFrag(geom, "d", type_d))
          for geom, type_a, type_c in product(geoms, types_a, types_c)
          for type_b, type_d in product(types_b or [type_a],
                                        types_d or [type_c])]

def make_ldst_ops(geoms, frags, types):
  """Build an MMAFrag for every (geometry, fragment, type) combination."""
  fragments = []
  for geom, frag, ptx_type in product(geoms, frags, types):
    fragments.append(MMAFrag(geom, frag, ptx_type))
  return fragments

def get_mma_ops():
  """Return the candidate MMAOps for every geometry this test knows about."""
  wmma_geoms = ["m16n16k16", "m32n8k16", "m8n32k16"]
  fp_types = ["f16", "f32"]

  ops = []
  # FP ops: C and D types are combined independently.
  ops += make_mma_ops(["m8n8k4"], ["f16"], [], fp_types, fp_types)
  ops += make_mma_ops(wmma_geoms, ["f16"], [], fp_types, fp_types)
  # Integer and sub-integer ops accumulate into s32; D follows C.
  ops += make_mma_ops(wmma_geoms, ["s8", "u8"], [], ["s32"], [])
  ops += make_mma_ops(["m8n8k32"], ["s4", "u4"], [], ["s32"], [])
  ops += make_mma_ops(["m8n8k128"], ["b1"], [], ["s32"], [])
  return ops
def get_ldst_ops(kind):
  """Return the fragments to load or store.

  With kind == "store" only the "d" fragments are returned; any other kind
  returns every fragment except "d".
  """
  wmma_geoms = ["m16n16k16", "m32n8k16", "m8n32k16"]

  candidates = []
  candidates.extend(make_ldst_ops(wmma_geoms,
                                  ["a", "b"], ["f16", "u8", "s8"]))
  candidates.extend(make_ldst_ops(wmma_geoms,
                                  ["c", "d"], ["f16", "f32", "s32"]))
  candidates.extend(make_ldst_ops(["m8n8k32"], ["a", "b"], ["s4","u4"]))
  candidates.extend(make_ldst_ops(["m8n8k128"], ["a", "b"], ["b1"]))
  candidates.extend(make_ldst_ops(["m8n8k32", "m8n8k128"],
                                  ["c", "d"], ["s32"]))

  want_store = (kind == "store")
  return [frag for frag in candidates if (frag.frag == "d") == want_store]

def is_geom_supported(geom):
  """Tell whether geom is available for the selected PTX version and GPU.

  Reads the module-level ptx_version and gpu_arch.

  Raises:
    AssertionError: if geom is not one of the known geometries.
  """
  # geometries for FP and ints.
  if geom == "m8n8k4":
    return ptx_version >= 64
  if geom in ["m8n32k16", "m32n8k16"]:
    return ptx_version >= 61
  # geometries for sub-ints.
  if geom in ["m8n8k32", "m8n8k128"]:
    return ptx_version >= 63 and gpu_arch >= 75
  if geom == "m16n16k16":
    return ptx_version >= 60
  # Raised explicitly rather than via `assert`, which is stripped under
  # `python -O` and would let an unknown geometry silently return None.
  raise AssertionError("Unexpected geometry: %s" % (geom,))

def is_type_supported(ptx_type):
  """Tell whether ptx_type is available for the selected PTX version and GPU.

  Reads the module-level ptx_version and gpu_arch.
  """
  if ptx_type in ("s8", "u8", "s32"):
    min_ptx, min_arch = 63, 72
  elif ptx_type in ("s4", "u4", "b1"):
    min_ptx, min_arch = 63, 75
  else:
    # Everything else is treated like the FP types.
    min_ptx, min_arch = 60, 70
  return ptx_version >= min_ptx and gpu_arch >= min_arch


def is_mma_variant_supported(op, layout_a, layout_b, satf):
  """Tell whether an MMA op exists with the given layouts and satf suffix.

  The type and geometry of fragment A decide basic availability; the
  remaining rules restrict particular geometries and element types.
  """
  a_type = op.a.mma_type.ptx_type
  if not is_type_supported(a_type):
    return False
  if not is_geom_supported(op.a.geom):
    return False

  if op.a.geom == "m8n8k4":
    if satf:
      return False
    if op.c.mma_type.ptx_type == "f32":
      # If C is f32, D must be, too.
      return op.d.mma_type.ptx_type == "f32"

  if a_type not in ("s4", "u4", "b1"):
    return True

  # sub-integer ops require row/col layout; b1 additionally has no satf.
  if a_type == "b1" and satf:
    return False
  return layout_a == "row" and layout_b == "col"

def is_ldst_variant_supported(frag, layout):
  """Tell whether a load/store of frag exists with the given layout."""
  elt_type = frag.mma_type.ptx_type
  if not is_type_supported(elt_type):
    return False
  if not is_geom_supported(frag.geom):
    return False

  if elt_type not in ("s4", "u4", "b1"):
    return True

  # sub-integer fragments: "a" must be row-major, "b" column-major; "c" and
  # "d" accept either layout.
  if frag.frag == "a":
    return layout == "row"
  if frag.frag == "b":
    return layout == "col"
  return frag.frag in ["c", "d"]

def make_wmma_slice_ty(frag):
  """Return the LLVM type of each register of frag, as a list."""
  reg_ty = frag.mma_type.llvm_type
  return [reg_ty] * frag.nregs

def make_wmma_ld_ret_ty(frag):
  """Return the LLVM type holding all registers of frag.

  A single register is returned as its bare type; several registers are
  wrapped in a "{...}" struct type.
  """
  reg_tys = make_wmma_slice_ty(frag)
  if len(reg_tys) != 1:
    return "{%s}" % ", ".join(reg_tys)
  return "%s" % reg_tys[0]

# returns address space
def get_aspace(space):
  """Map a state-space suffix such as ".shared" to its address space number.

  Raises:
    KeyError: if space is not a known suffix.
  """
  # The empty suffix and ".generic" both name address space 0.
  if space in ("", ".generic"):
    return 0
  named_spaces = {
      ".global" : 1,
      ".shared" : 3,
      ".const"  : 4,
      ".local"  : 5,
      ".param"  : 101,
  }
  return named_spaces[space]

def get_pspace(space):
  """Return the "p<N>i8" pointer suffix used in intrinsic names."""
  aspace = get_aspace(space)
  return "p%di8" % (aspace,)

def check_pattern(frag):
  """Return a "{{...}}" regex block matching all registers of frag.

  The per-register pattern is repeated once per register, with ", *"
  between consecutive registers.
  """
  reg_patterns = [frag.mma_type.ptx_reg_pattern] * frag.nregs
  return "{{%s}}" % ", *".join(reg_patterns)

def gen_wmma_load_tests():
  """Print the tests for every supported wmma load variant.

  For each supported combination of fragment, layout, address space and
  stride, two test functions are printed: one loading from the pointer
  argument and one (suffix "_o") loading from that pointer plus 128 bytes.

  Returns:
    A list of (intrinsic name, instruction name) tuples, one per variant.
  """
  load_template = """
declare ${ret_ty} @${intrinsic}(i8 ${as}* %src ${extra_args});

; CHECK-LABEL: .func {{.*}}test_${function}(
define ${ret_ty} @test_${function}(i8 ${as}* %src ${extra_args}) {
; CHECK: ${instruction}
; CHECK: {${check_result}}
; CHECK: [%rd{{[0-9]+}}]${stride_pattern}
  %v0 = call ${ret_ty} @${intrinsic}(i8 ${as}* %src ${extra_args});
  ret ${ret_ty} %v0;
}

; CHECK-LABEL: .func{{.*}}test_${function}_o(
define ${ret_ty} @test_${function}_o(i8 ${as}* %src ${extra_args}) {
; CHECK: ${instruction}
; CHECK: {${check_result}}
; CHECK: [%rd{{[0-9]+}}+128]${stride_pattern}
  %src1 = getelementptr i8, i8 ${as}* %src, i32 128;
  %v0 = call ${ret_ty} @${intrinsic}(i8 ${as}* %src1 ${extra_args});
  ret ${ret_ty} %v0;
}
"""
  test_tmpl = Template(load_template)
  intrinsic_tmpl = Template(
      "llvm.nvvm.wmma.${geom}.load.${abc}.${layout}${stride}.${itype}.${pspace}")
  instruction_tmpl = Template(
      "wmma.load.${abc}.sync${aligned}.${layout}.${geom}${space}.${itype}")

  layouts = ["row","col"]
  spaces = ["",".shared",".global"]
  strides = ["", ".stride"]

  generated_items = []
  for frag, layout, space, stride in product(
      get_ldst_ops("load"), layouts, spaces, strides):
    if not is_ldst_variant_supported(frag, layout):
      continue

    # Values that appear in the intrinsic and instruction names.
    test_params = {
        "abc" : frag.frag,
        "aligned" : ".aligned" if ptx_version >= 63 else "",
        "layout" : layout,
        "space" : space,
        "stride" : stride,
        "itype" : frag.mma_type.ptx_type,
        "pspace" : get_pspace(space),
        "as"     : "addrspace(%d)" % get_aspace(space),
        "geom"   : frag.geom,
    }
    intrinsic = intrinsic_tmpl.substitute(test_params)
    instruction = instruction_tmpl.substitute(test_params)

    # Values needed only by the test body; the strided variants take an
    # extra i32 argument and expect an extra register operand.
    test_params.update({
        "intrinsic" : intrinsic,
        "function" : intrinsic.replace(".","_"),
        "instruction" : instruction,
        "ret_ty" : make_wmma_ld_ret_ty(frag),
        "check_result" : check_pattern(frag),
        "extra_args" : ", i32 %stride" if stride else "",
        "stride_pattern" : ", %r{{[0-9]+}}" if stride else "",
    })

    print(test_tmpl.substitute(test_params))
    generated_items.append((intrinsic, instruction))

  return generated_items

def make_wmma_slice_args(frag):
  """Return the typed argument list for frag, e.g. "i32 %a0, i32 %a1"."""
  typed_args = []
  for index, reg_ty in enumerate(make_wmma_slice_ty(frag)):
    typed_args.append("%s %%%s%d" % (reg_ty, frag.frag, index))
  return ", ".join(typed_args)

def gen_wmma_store_tests():
  """Print the tests for every supported wmma store variant.

  For each supported combination of fragment, layout, address space and
  stride, two test functions are printed: one storing to the pointer
  argument and one (suffix "_o") storing to that pointer plus 128 bytes.

  Returns:
    A list of (intrinsic name, instruction name) tuples, one per variant.
  """
  # The address register is matched with {{[0-9]+}}. A one-character class
  # such as [0-9+] would fail as soon as the register number has two digits.
  store_template = """
declare void @${intrinsic}(i8 ${as}* %src, ${args}${extra_args});

; CHECK-LABEL: .func {{.*}}test_${function}(
define void @test_${function}(i8 ${as}* %src, ${args}${extra_args}) {
; CHECK: ${instruction} {{.*}}[%rd{{[0-9]+}}
; CHECK: {${check_args}}
; CHECK: ${stride_pattern}
  call void @${intrinsic}(i8 ${as}* %src, ${args} ${extra_args});
  ret void
}

; CHECK-LABEL: .func{{.*}}test_${function}_o(
define void @test_${function}_o(i8 ${as}* %src, ${args}${extra_args}) {
; CHECK: ${instruction} {{.*}}[%rd{{[0-9]+}}+128]
; CHECK: ${check_args}
; CHECK: ${stride_pattern}
  %src1 = getelementptr i8, i8 ${as}* %src, i32 128;
  call void @${intrinsic}(i8 ${as}* %src1, ${args}${extra_args});
  ret void
}
"""
  intrinsic_template = "llvm.nvvm.wmma.${geom}.store.${abc}.${layout}${stride}.${itype}.${pspace}"
  instruction_template = "wmma.store.${abc}.sync${aligned}.${layout}.${geom}${space}.${itype}"

  generated_items = []

  for frag, layout, space, stride in product(
      get_ldst_ops("store"),
      ["row","col"],
      ["",".shared",".global"],
      ["", ".stride"]):

    if not is_ldst_variant_supported(frag, layout):
      continue

    # Values that appear in the intrinsic and instruction names.
    test_params = {
        "abc" : frag.frag,
        "aligned" : ".aligned" if ptx_version >= 63 else "",
        "layout" : layout,
        "space" : space,
        "stride" : stride,
        "itype" : frag.mma_type.ptx_type,
        "pspace" : get_pspace(space),
        "as"     : "addrspace(%d)" % get_aspace(space),
        "geom"   : frag.geom,
    }
    intrinsic = Template(intrinsic_template).substitute(test_params)
    instruction = Template(instruction_template).substitute(test_params)

    test_params["intrinsic"] = intrinsic
    test_params["function"] = intrinsic.replace(".","_")
    test_params["instruction"] = instruction
    test_params["check_args"] = check_pattern(frag)
    if stride:
      # Strided variants take an extra i32 argument, which shows up as one
      # more register operand before the terminating semicolon.
      test_params["extra_args"] = ", i32 %stride"
      test_params["stride_pattern"] = ", %r{{[0-9]+}};"
    else:
      test_params["extra_args"] = ""
      test_params["stride_pattern"] = ";"
    test_params["args"] = make_wmma_slice_args(frag)

    print(Template(store_template).substitute(test_params))
    generated_items.append((intrinsic, instruction))

  return generated_items

def mma_signature(op):
  """Return the type signature used in the intrinsic name for op."""
  a_type = op.a.mma_type.ptx_type
  if a_type not in ("s8", "u8", "s4", "u4", "b1"):
    # FP ops are identified by result & accumulator type.
    return "%s.%s" % (op.d.mma_type.ptx_type, op.c.mma_type.ptx_type)
  # int and sub-int ops are identified by input type.
  return a_type

def mma_ptx_signature(op):
  """Return the type signature used in the PTX instruction name for op."""
  a_type = op.a.mma_type.ptx_type
  if a_type in ("s8", "u8", "s4", "u4", "b1"):
    # int and sub-int instructions encode all four types as D.A.B.C
    ordered_frags = (op.d, op.a, op.b, op.c)
    return ".".join([frag.mma_type.ptx_type for frag in ordered_frags])

  d_and_c = (op.d.mma_type.ptx_type, op.c.mma_type.ptx_type)
  if op.a.geom == "m8n8k4":
    # m8n8k4 spells out the f16 types of A and B between D and C.
    signature_format = "%s.f16.f16.%s"
  else:
    # the remaining FP instructions use D.C
    signature_format = "%s.%s"
  return signature_format % d_and_c

def gen_wmma_mma_tests():
  """Print the tests for every supported wmma.mma / mma variant.

  One test function is printed for each supported combination of op,
  A layout, B layout and satfinite suffix. The "m8n8k4" geometry uses the
  mma intrinsic and instruction names; every other geometry uses the wmma
  ones.

  Returns:
    A list of (intrinsic name, instruction name) tuples, one per variant.
  """
  mma_template = """
declare ${ret_ty} @${intrinsic}(
        ${args});

; CHECK-LABEL: .func {{.*}}test_${function}(
define ${ret_ty} @test_${function}(
        ${args}) {
; CHECK: ${instruction}
; CHECK-NEXT: ${check_d}
; CHECK-NEXT: ${check_a}
; CHECK-NEXT: ${check_b}
; CHECK-NEXT: ${check_c}
  %r = call ${ret_ty} @${intrinsic}(
        ${args});
  ret ${ret_ty} %r;
}
"""
  test_tmpl = Template(mma_template)
  # (intrinsic name template, instruction name template) per family.
  wmma_tmpls = (
      Template("llvm.nvvm.wmma.${geom}.mma.${alayout}.${blayout}.${intrinsic_signature}${satf}"),
      Template("wmma.mma${mma_variant}.sync${aligned}.${alayout}.${blayout}.${geom}.${ptx_signature}${satf}"))
  mma_tmpls = (
      Template("llvm.nvvm.mma.${geom}.${alayout}.${blayout}.${intrinsic_signature}"),
      Template("mma.sync${aligned}.${geom}.${alayout}.${blayout}.${ptx_signature}"))

  layouts = ["row","col"]
  generated_items=[]

  for op, alayout, blayout, satf in product(
      get_mma_ops(), layouts, layouts, [".satfinite", ""]):

    if not is_mma_variant_supported(op, alayout, blayout, satf):
      continue

    # Values that appear in the intrinsic and instruction names.
    test_params = {
        "aligned" : ".aligned" if ptx_version >= 63 else "",
        "alayout" : alayout,
        "blayout" : blayout,
        "intrinsic_signature" : mma_signature(op),
        "ptx_signature" : mma_ptx_signature(op),
        "satf"  : satf,
        "geom"  : op.a.geom,
        "mma_variant" : ".xor.popc" if op.a.mma_type.ptx_type == "b1" else "",
    }

    name_tmpls = mma_tmpls if op.a.geom == "m8n8k4" else wmma_tmpls
    intrinsic = name_tmpls[0].substitute(test_params)
    instruction = name_tmpls[1].substitute(test_params)

    # Values needed only by the test body. The call takes the registers of
    # A, B and C; D is the return value.
    test_params.update({
        "intrinsic" : intrinsic,
        "function" : intrinsic.replace(".", "_"),
        "instruction" : instruction,
        "ret_ty" : make_wmma_ld_ret_ty(op.d),
        "check_a" : check_pattern(op.a),
        "check_b" : check_pattern(op.b),
        "check_c" : check_pattern(op.c),
        "check_d" : check_pattern(op.d),
        "args" : ",\n        ".join(make_wmma_slice_args(frag)
                                    for frag in (op.a, op.b, op.c)),
    })

    print(test_tmpl.substitute(test_params))
    generated_items.append((intrinsic, instruction))

  return generated_items

# Append the complete list of intrinsics and instructions we've generated
# tests for. Also generate a set of checks to verify that we generated a
# sensible set of tests for the given combination of PTX and SM variants.
#
def gen_check_unsupported_ops(items):
  """Print the list of generated items together with the checks on it.

  The output consists of the prefixed check lines (INTRINSICS, NO*-NOT and
  *-DAG) followed by a sorted "intrinsic -> instruction" listing enclosed
  between INTRINSICS_LIST_BEGIN and INTRINSICS_LIST_END marker lines.

  Args:
    items: iterable of (intrinsic name, instruction name) tuples.
  """
  print("; Complete list of intrinsics supported by PTX%d on sm_%d"
        % (ptx_version, gpu_arch))
  print("; INTRINSICS: {{^; INTRINSICS_LIST_BEGIN}}")
  # Raw string: the patterns below contain "\." which is an invalid escape
  # sequence in a regular string literal.
  print(r"""

; NOEXTGEOM-NOT: {{m8n32|m32n8}}
; NOINT-NOT: .{{s32|s8}}
; NOSUBINT-NOT: {{s4|u4|b1}}
; NOMMA-NOT: .m8n8k4.

; M16N16-DAG: m16n16k16.load.{{[ab].*}}.f16.p
; M16N16-DAG: m16n16k16.{{load|store}}.{{[cd].*\.(f16|f32)}}.p
; M16N16-DAG: m16n16k16.mma.{{.*}}.f16.f32
; M16N16-DAG: m16n16k16.mma.{{.*}}.f32.f16
; M16N16-DAG: m16n16k16.mma.{{.*}}.f16.f16
; M16N16-DAG: m16n16k16.mma.{{.*}}.f32.f32

; PTX61 adds support for m32n8k16/m8n32k16 geometries.
; EXTGEOM-DAG: m32n8k16.load.{{[ab].*}}.f16.p
; EXTGEOM-DAG: m32n8k16.{{load|store}}.{{[cd].*\.(f16|f32)}}.p
; EXTGEOM-DAG: m32n8k16.mma.{{.*}}.f16.f32
; EXTGEOM-DAG: m32n8k16.mma.{{.*}}.f32.f16
; EXTGEOM-DAG: m32n8k16.mma.{{.*}}.f16.f16
; EXTGEOM-DAG: m32n8k16.mma.{{.*}}.f32.f32

; EXTGEOM-DAG: m8n32k16.load.{{[ab].*}}.f16.p
; EXTGEOM-DAG: m8n32k16.{{load|store}}.{{[cd].*\.(f16|f32)}}.p
; EXTGEOM-DAG: m8n32k16.mma.{{.*}}.f16.f32
; EXTGEOM-DAG: m8n32k16.mma.{{.*}}.f32.f16
; EXTGEOM-DAG: m8n32k16.mma.{{.*}}.f16.f16
; EXTGEOM-DAG: m8n32k16.mma.{{.*}}.f32.f32

; INT-DAG: m16n16k16.load.{{[ab].*}}.s8.p
; INT-DAG: m8n32k16.load.{{[ab].*}}.s8.p
; INT-DAG: m32n8k16.load.{{[ab].*}}.s8.p
; INT-DAG: m16n16k16.load.{{[ab].*}}.u8.p
; INT-DAG: m8n32k16.load.{{[ab].*}}.u8.p
; INT-DAG: m32n8k16.load.{{[ab].*}}.u8.p
; INT-DAG: m32n8k16.{{load|store}}.{{[cd].*\.s32}}.p
; INT-DAG: m16n16k16.mma.{{.*}}.u8
; INT-DAG: m16n16k16.mma.{{.*}}.s8
; INT-DAG: m8n32k16.mma.{{.*}}.u8
; INT-DAG: m8n32k16.mma.{{.*}}.s8
; INT-DAG: m32n8k16.mma.{{.*}}.u8
; INT-DAG: m32n8k16.mma.{{.*}}.s8

; SUBINT-DAG: m8n8k128.load.{{[ab].*}}.b1.p
; SUBINT-DAG: m8n8k32.load.{{[ab].*}}.s4.p
; SUBINT-DAG: m8n8k32.load.{{[ab].*}}.u4.p
; SUBINT-DAG: m8n8k128.{{load|store}}.{{[cd].*\.s32}}.p
; SUBINT-DAG: m8n8k32.{{load|store}}.{{[cd].*\.s32}}.p
; SUBINT-DAG: m8n8k32.mma.{{.*}}.u4
; SUBINT-DAG: m8n8k32.mma.{{.*}}.s4
; SUBINT-DAG: m8n8k128.mma.{{.*}}.b1

; MMA-DAG: mma.m8n8k4.{{.*}}.f16.f32
; MMA-DAG: mma.m8n8k4.{{.*}}.f32.f16
; MMA-DAG: mma.m8n8k4.{{.*}}.f16.f16
; MMA-DAG: mma.m8n8k4.{{.*}}.f32.f32
;

""")

  print("; INTRINSICS_LIST_BEGIN")
  for intrinsic, instruction in sorted(items):
    # The doubled spaces and the trailing space are kept on purpose so the
    # listing stays byte-identical to what earlier versions emitted.
    print(";  %s  ->  %s " % (intrinsic, instruction))
  print("; INTRINSICS_LIST_END")
  print("; INTRINSICS: ; INTRINSICS_LIST_END")

def gen_tests():
  """Print all load, store and mma tests, then the summary checks."""
  items = []
  # Order matters: each generator prints its tests as it runs.
  for generate in (gen_wmma_load_tests,
                   gen_wmma_store_tests,
                   gen_wmma_mma_tests):
    items.extend(generate())
  gen_check_unsupported_ops(items)

# Command line: the PTX version and GPU architecture select which variants
# are generated.
parser = argparse.ArgumentParser()
parser.add_argument("--ptx", type=int, default=60)
parser.add_argument("--gpu-arch", type=int, default=70)
args = parser.parse_args()

# Module-level settings read by the is_*_supported() and gen_*() functions.
ptx_version, gpu_arch = args.ptx, args.gpu_arch

gen_tests()
first commit 2022-04-25 10:02:23 +02:00			`# This test generates all variants of wmma intrinsics and verifies that LLVM`
			`# generates correct instructions for them.`

			`# Check all variants of instructions supported by PTX60 on SM70`
			`# RUN: python %s --ptx=60 --gpu-arch=70 > %t-ptx60-sm_70.ll`
			`# RUN: FileCheck %t-ptx60-sm_70.ll < %t-ptx60-sm_70.ll \`
			`# RUN: --check-prefixes=INTRINSICS,M16N16`
			`# RUN: FileCheck %t-ptx60-sm_70.ll < %t-ptx60-sm_70.ll \`
			`# RUN: --check-prefixes=INTRINSICS,NOEXTGEOM,NOINT,NOSUBINT,NOMMA`
			`# RUN: llc < %t-ptx60-sm_70.ll -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 \`
			`# RUN: \| FileCheck %t-ptx60-sm_70.ll`

			`# Check all variants of instructions supported by PTX61 on SM70`
			`# RUN: python %s --ptx=61 --gpu-arch=70 > %t-ptx61-sm_70.ll`
			`# RUN: FileCheck %t-ptx61-sm_70.ll < %t-ptx61-sm_70.ll \`
			`# RUN: --check-prefixes=INTRINSICS,M16N16,EXTGEOM`
			`# RUN: FileCheck %t-ptx61-sm_70.ll < %t-ptx61-sm_70.ll \`
			`# RUN: --check-prefixes=INTRINSICS,NOINT,NOSUBINT,NOMMA`
			`# RUN: llc < %t-ptx61-sm_70.ll -march=nvptx64 -mcpu=sm_70 -mattr=+ptx61 \`
			`# RUN: \| FileCheck %t-ptx61-sm_70.ll`

			`# Check all variants of instructions supported by PTX63 on SM72`
			`# RUN: python %s --ptx=63 --gpu-arch=72 > %t-ptx63-sm_72.ll`
			`# RUN: FileCheck %t-ptx63-sm_72.ll < %t-ptx63-sm_72.ll \`
			`# RUN: --check-prefixes=INTRINSICS,M16N16,EXTGEOM,INT`
			`# RUN: FileCheck %t-ptx63-sm_72.ll < %t-ptx63-sm_72.ll \`
			`# RUN: --check-prefixes=INTRINSICS,NOSUBINT,NOMMA`
			`# RUN: llc < %t-ptx63-sm_72.ll -march=nvptx64 -mcpu=sm_72 -mattr=+ptx63 \`
			`# RUN: \| FileCheck %t-ptx63-sm_72.ll`

			`# Check all variants of instructions supported by PTX63 on SM75`
			`# RUN: python %s --ptx=63 --gpu-arch=75 > %t-ptx63-sm_75.ll`
			`# RUN: FileCheck %t-ptx63-sm_75.ll < %t-ptx63-sm_75.ll \`
			`# RUN: --check-prefixes=INTRINSICS,M16N16,EXTGEOM,INT,SUBINT`
			`# RUN: FileCheck %t-ptx63-sm_75.ll < %t-ptx63-sm_75.ll \`
			`# RUN: --check-prefixes=INTRINSICS,NOMMA`
			`# RUN: llc < %t-ptx63-sm_75.ll -march=nvptx64 -mcpu=sm_75 -mattr=+ptx63 \`
			`# RUN: \| FileCheck %t-ptx63-sm_75.ll`

			`# Check all variants of instructions supported by PTX64 on SM70+`
			`# RUN: python %s --ptx=64 --gpu-arch=70 > %t-ptx64-sm_70.ll`
			`# RUN: FileCheck %t-ptx64-sm_70.ll < %t-ptx64-sm_70.ll \`
			`# RUN: --check-prefixes=INTRINSICS,M16N16,EXTGEOM,MMA`
			`# RUN: FileCheck %t-ptx64-sm_70.ll < %t-ptx64-sm_70.ll \`
			`# RUN: --check-prefixes=INTRINSICS,NOINT,NOSUBINT`
			`# RUN: llc < %t-ptx64-sm_70.ll -march=nvptx64 -mcpu=sm_70 -mattr=+ptx64 \`
			`# RUN: \| FileCheck %t-ptx64-sm_70.ll`

			`from __future__ import print_function`

			`import argparse`
			`from itertools import product`
			`from string import Template`

			`class MMAType:`
			`def __init__(self, ptx_type):`
			`self.ptx_type = ptx_type`
			`self.llvm_type = {`
			`"f16" : "<2 x half>",`
			`"f32" : "float",`
			`"s32" : "i32",`
			`"s8" : "i32",`
			`"u8" : "i32",`
			`"s4" : "i32",`
			`"u4" : "i32",`
			`"b1" : "i32",`
			`}[ptx_type];`

			`self.ptx_reg_pattern = {`
			`"f16" : "%hh[0-9]+",`
			`"f32" : "%f[0-9]+",`
			`}.get(ptx_type, "%r[0-9]+")`

			`def __repr__(self):`
			`return "%s/%s" % (self.ptx_type, self.llvm_type)`

			`class MMAFrag:`
			`def __init__(self, geom, frag, ptx_elt_type):`
			`self.geom = geom`
			`self.frag = frag`
			`self.is_mma = True if geom == "m8n8k4" else False;`
			`self.mma_type = MMAType(ptx_elt_type);`
			`self.nregs = {`
			`"a:f16" : 2 if self.is_mma else 8,`
			`"b:f16" : 2 if self.is_mma else 8,`
			`"c:f16" : 4,`
			`"d:f16" : 4,`
			`"c:f32" : 8,`
			`"d:f32" : 8,`
			`}.get("%s:%s" % (frag, ptx_elt_type), {`
			`# u8/s8 -> s32 @ m16n16k16/m8n32k16/m32n8k16`
			`"m16n16k16:a:u8" : 2,`
			`"m16n16k16:a:s8" : 2,`
			`"m16n16k16:b:u8" : 2,`
			`"m16n16k16:b:s8" : 2,`
			`"m16n16k16:c:s32" : 8,`
			`"m16n16k16:d:s32" : 8,`

			`"m8n32k16:a:u8" : 1,`
			`"m8n32k16:a:s8" : 1,`
			`"m8n32k16:b:u8" : 4,`
			`"m8n32k16:b:s8" : 4,`
			`"m8n32k16:c:s32" : 8,`
			`"m8n32k16:d:s32" : 8,`

			`"m32n8k16:a:u8" : 4,`
			`"m32n8k16:a:s8" : 4,`
			`"m32n8k16:b:u8" : 1,`
			`"m32n8k16:b:s8" : 1,`
			`"m32n8k16:c:s32" : 8,`
			`"m32n8k16:d:s32" : 8,`

			`# u4/s4/b1 -> s32 @ m8n8k32 (u4/s4), m8n8k128(b1)`
			`"m8n8k128:a:b1" : 1,`
			`"m8n8k32:a:u4" : 1,`
			`"m8n8k32:a:s4" : 1,`
			`"m8n8k128:b:b1" : 1,`
			`"m8n8k32:b:u4" : 1,`
			`"m8n8k32:b:s4" : 1,`
			`"m8n8k128:c:s32" : 2,`
			`"m8n8k128:d:s32" : 2,`
			`"m8n8k32:c:s32" : 2,`
			`"m8n8k32:d:s32" : 2,`
			`}.get("%s:%s:%s" % (geom, frag, ptx_elt_type), None));`
			`assert(self.nregs);`

			`def __repr__(self):`
			`return "%s:%s:%s%s" % (self.geom, self.frag, self.mma_type,`
			`"" if self.nregs == 1 else ("*%d" % self.nregs))`

			`class MMAOp:`
			`def __init__(self, a, b, c, d):`
			`self.a = a`
			`self.b = b`
			`self.c = c`
			`self.d = d`

			`def __repr__(self):`
			`return ("{A:%s, B:%s, C:%s, D:%s}" % (self.a, self.b, self.c, self.d ))`

			`def make_mma_ops(geoms, types_a, types_b, types_c, types_d):`
			`ops = []`
			`for geom, type_a, type_c in product( geoms, types_a, types_c):`
			`for type_b, type_d in product(types_b if types_b else [type_a],`
			`types_d if types_d else [type_c]):`
			`ops.append(MMAOp(MMAFrag(geom, "a", type_a),`
			`MMAFrag(geom, "b", type_b),`
			`MMAFrag(geom, "c", type_c),`
			`MMAFrag(geom, "d", type_d)))`
			`return ops`

			`def make_ldst_ops(geoms, frags, types):`
			`return [MMAFrag(geom, frag, ptx_type) for (geom, frag, ptx_type)`
			`in product(geoms, frags, types)]`

			`def get_mma_ops():`
			`return (make_mma_ops(["m8n8k4"],`
			`["f16"], [], ["f16", "f32"], ["f16", "f32"]) +`
			`make_mma_ops(["m16n16k16", "m32n8k16", "m8n32k16"],`
			`["f16"], [], ["f16", "f32"], ["f16", "f32"]) +`
			`make_mma_ops(["m16n16k16", "m32n8k16", "m8n32k16"],`
			`["s8", "u8"], [], ["s32"], []) +`
			`make_mma_ops(["m8n8k32"],`
			`["s4", "u4"], [], ["s32"], []) +`
			`make_mma_ops(["m8n8k128"],`
			`["b1"], [], ["s32"], []))`
			`def get_ldst_ops(kind):`
			`ldst_ops = (make_ldst_ops(["m16n16k16", "m32n8k16", "m8n32k16"],`
			`["a", "b"], ["f16", "u8", "s8"]) +`
			`make_ldst_ops(["m16n16k16", "m32n8k16", "m8n32k16"],`
			`["c", "d"], ["f16", "f32", "s32"]) +`
			`make_ldst_ops(["m8n8k32"], ["a", "b"], ["s4","u4"]) +`
			`make_ldst_ops(["m8n8k128"], ["a", "b"], ["b1"]) +`
			`make_ldst_ops(["m8n8k32", "m8n8k128"], ["c", "d"], ["s32"]))`
			`return [ x for x in ldst_ops if (x.frag == "d") == (kind == "store")]`

			`def is_geom_supported(geom):`
			`# geometries for FP and ints.`
			`if geom == "m8n8k4":`
			`return ptx_version >= 64`
			`if geom in ["m8n32k16", "m32n8k16"]:`
			`return ptx_version >= 61`
			`# geometries for sub-ints.`
			`if geom in ["m8n8k32", "m8n8k128"]:`
			`return ptx_version >= 63 and gpu_arch >= 75`
			`if geom == "m16n16k16":`
			`return ptx_version >= 60`
			`assert(False) # Unexpected geometry.`

			`def is_type_supported(ptx_type):`
			`if ptx_type in ["s8", "u8", "s32"]:`
			`return ptx_version >= 63 and gpu_arch >= 72`
			`if ptx_type in ["s4", "u4", "b1"]:`
			`return ptx_version >= 63 and gpu_arch >= 75`
			`return ptx_version >= 60 and gpu_arch >= 70`


			`def is_mma_variant_supported(op, layout_a, layout_b, satf):`
			`if not (is_type_supported(op.a.mma_type.ptx_type)`
			`and is_geom_supported(op.a.geom)):`
			`return False`
			`if op.a.geom == "m8n8k4":`
			`if satf:`
			`return False`
			`if op.c.mma_type.ptx_type == "f32":`
			`# If C is f32, D must be, too.`
			`return op.d.mma_type.ptx_type == "f32"`

			`# sub-integer require row/col layout, and no satf.`
			`if op.a.mma_type.ptx_type in ["s4", "u4", "b1"]:`
			`if op.a.mma_type.ptx_type == "b1" and satf:`
			`return False`
			`return layout_a == "row" and layout_b == "col"`
			`return True`

			`def is_ldst_variant_supported(frag, layout):`
			`if not (is_type_supported(frag.mma_type.ptx_type)`
			`and is_geom_supported(frag.geom)):`
			`return False`
			`if frag.mma_type.ptx_type in ["s4", "u4", "b1"]:`
			`# sub-integer require sm_75 and ptx63, row/col layout for a/b.`
			`return ((frag.frag == "a" and layout == "row")`
			`or (frag.frag == "b" and layout == "col")`
			`or frag.frag in ["c", "d"])`
			`return True`

			`def make_wmma_slice_ty(frag):`
			`return [frag.mma_type.llvm_type] * frag.nregs`

			`def make_wmma_ld_ret_ty(frag):`
			`results = make_wmma_slice_ty(frag)`
			`if len(results) == 1:`
			`return "%s" % results[0]`
			`return "{%s}" % ", ".join(results)`

			`# returns address space`
			`def get_aspace(space):`
			`space_map = {`
			`".global" : 1,`
			`".shared" : 3,`
			`".const" : 4,`
			`".local" : 5,`
			`".param" : 101,`
			`"" : 0,`
			`".generic": 0`
			`}`
			`return space_map[space];`

			`def get_pspace(space):`
			`return "p%di8" % get_aspace(space);`

			`def check_pattern(frag):`
			`return "{{%s}}" % ", ".join([frag.mma_type.ptx_reg_pattern] frag.nregs)`

			`def gen_wmma_load_tests():`
			`load_template = """`
			`declare ${ret_ty} @${intrinsic}(i8 ${as}* %src ${extra_args});`

			`; CHECK-LABEL: .func {{.*}}test_${function}(`
			`define ${ret_ty} @test_${function}(i8 ${as}* %src ${extra_args}) {`
			`; CHECK: ${instruction}`
			`; CHECK: {${check_result}}`
			`; CHECK: [%rd{{[0-9]+}}]${stride_pattern}`
			`%v0 = call ${ret_ty} @${intrinsic}(i8 ${as}* %src ${extra_args});`
			`ret ${ret_ty} %v0;`
			`}`

			`; CHECK-LABEL: .func{{.*}}test_${function}_o(`
			`define ${ret_ty} @test_${function}_o(i8 ${as}* %src ${extra_args}) {`
			`; CHECK: ${instruction}`
			`; CHECK: {${check_result}}`
			`; CHECK: [%rd{{[0-9]+}}+128]${stride_pattern}`
			`%src1 = getelementptr i8, i8 ${as}* %src, i32 128;`
			`%v0 = call ${ret_ty} @${intrinsic}(i8 ${as}* %src1 ${extra_args});`
			`ret ${ret_ty} %v0;`
			`}`
			`"""`
			`intrinsic_template = "llvm.nvvm.wmma.${geom}.load.${abc}.${layout}${stride}.${itype}.${pspace}"`
			`instruction_template = "wmma.load.${abc}.sync${aligned}.${layout}.${geom}${space}.${itype}"`

			`generated_items = []`

			`for frag, layout, space, stride in product(`
			`get_ldst_ops("load"),`
			`["row","col"],`
			`["",".shared",".global"],`
			`["", ".stride"],`
			`):`
			`if not is_ldst_variant_supported(frag, layout):`
			`continue`

			`params = {`
			`"abc" : frag.frag,`
			`"aligned" : ".aligned" if ptx_version >= 63 else "",`
			`"layout" : layout,`
			`"space" : space,`
			`"stride" : stride,`
			`"itype" : frag.mma_type.ptx_type,`
			`"pspace" : get_pspace(space),`
			`"as" : "addrspace(%d)" % get_aspace(space),`
			`"geom" : frag.geom,`
			`}`

			`test_params = params`
			`test_params["intrinsic"] = Template(intrinsic_template).substitute(params)`
			`test_params["function"] = test_params["intrinsic"].replace(".","_")`
			`test_params["instruction"] = Template(instruction_template).substitute(params)`
			`test_params["ret_ty"] = make_wmma_ld_ret_ty(frag)`
			`test_params["check_result"] = check_pattern(frag)`

			`if stride:`
			`test_params["extra_args"] = ", i32 %stride";`
			`test_params["stride_pattern"] = ", %r{{[0-9]+}}"`
			`else:`
			`test_params["extra_args"] = ""`
			`test_params["stride_pattern"] = ""`

			`print(Template(load_template).substitute(test_params))`

			`generated_items.append((test_params["intrinsic"],`
			`test_params["instruction"]))`

			`return generated_items`

			`def make_wmma_slice_args(frag):`
			`return ", ".join(["%s %%%s%d" % (t, frag.frag, i) for i,t`
			`in enumerate(make_wmma_slice_ty(frag))])`

			def gen_wmma_store_tests():
			    """Print IR tests for every supported wmma store variant.

			    Iterates over all combinations of store fragment, layout, address space
			    and stride flavor, skipping those rejected by
			    is_ldst_variant_supported(). For each remaining variant two test
			    functions are printed: one that stores through the incoming pointer and
			    one (suffix "_o") that stores through the pointer advanced by 128 bytes.

			    Returns:
			        A list of (intrinsic name, PTX instruction) string pairs, one per
			        generated variant.
			    """
			    # The address register number is matched with [0-9]+ (one or more
			    # digits) so the checks keep working when the register number has more
			    # than one digit.
			    store_template = """
			declare void @${intrinsic}(i8 ${as}* %src, ${args}${extra_args});

			; CHECK-LABEL: .func {{.*}}test_${function}(
			define void @test_${function}(i8 ${as}* %src, ${args}${extra_args}) {
			; CHECK: ${instruction} {{.*}}[%rd{{[0-9]+}}
			; CHECK: {${check_args}}
			; CHECK: ${stride_pattern}
			call void @${intrinsic}(i8 ${as}* %src, ${args} ${extra_args});
			ret void
			}

			; CHECK-LABEL: .func{{.*}}test_${function}_o(
			define void @test_${function}_o(i8 ${as}* %src, ${args}${extra_args}) {
			; CHECK: ${instruction} {{.*}}[%rd{{[0-9]+}}+128]
			; CHECK: ${check_args}
			; CHECK: ${stride_pattern}
			%src1 = getelementptr i8, i8 ${as}* %src, i32 128;
			call void @${intrinsic}(i8 ${as}* %src1, ${args}${extra_args});
			ret void
			}
			"""
			    intrinsic_template = "llvm.nvvm.wmma.${geom}.store.${abc}.${layout}${stride}.${itype}.${pspace}"
			    instruction_template = "wmma.store.${abc}.sync${aligned}.${layout}.${geom}${space}.${itype}"

			    generated_items = []

			    for frag, layout, space, stride in product(
			            get_ldst_ops("store"),
			            ["row", "col"],
			            ["", ".shared", ".global"],
			            ["", ".stride"]):

			        if not is_ldst_variant_supported(frag, layout):
			            continue

			        params = {
			            "abc": frag.frag,
			            # The ".aligned" qualifier is only emitted for PTX 6.3 and newer.
			            "aligned": ".aligned" if ptx_version >= 63 else "",
			            "layout": layout,
			            "space": space,
			            "stride": stride,
			            "itype": frag.mma_type.ptx_type,
			            "pspace": get_pspace(space),
			            "as": "addrspace(%d)" % get_aspace(space),
			            "geom": frag.geom,
			        }

			        # The names derived below go into the same dictionary; Template
			        # ignores keys that a given template does not reference.
			        params["intrinsic"] = Template(intrinsic_template).substitute(params)
			        params["function"] = params["intrinsic"].replace(".", "_")
			        params["instruction"] = Template(instruction_template).substitute(params)
			        params["check_args"] = check_pattern(frag)
			        if stride:
			            params["extra_args"] = ", i32 %stride"
			            params["stride_pattern"] = ", %r{{[0-9]+}};"
			        else:
			            params["extra_args"] = ""
			            params["stride_pattern"] = ";"
			        params["args"] = make_wmma_slice_args(frag)

			        print(Template(store_template).substitute(params))
			        generated_items.append((params["intrinsic"],
			                                params["instruction"]))

			    return generated_items

			def mma_signature(op):
			    """Return the type suffix used in the intrinsic name for op."""
			    input_type = op.a.mma_type.ptx_type
			    # Integer and sub-integer ops are identified by their input type alone.
			    if input_type in ("s8", "u8", "s4", "u4", "b1"):
			        return input_type
			    # Everything else is an FP op, named after the D and C fragment types.
			    return ".".join((op.d.mma_type.ptx_type, op.c.mma_type.ptx_type))

			def mma_ptx_signature(op):
			    """Return the type suffix used in the PTX instruction name for op."""
			    if op.a.mma_type.ptx_type in ("s8", "u8", "s4", "u4", "b1"):
			        # Integer and sub-integer instructions spell out all four
			        # fragment types in D.A.B.C order.
			        operand_types = [frag.mma_type.ptx_type
			                         for frag in (op.d, op.a, op.b, op.c)]
			        return ".".join(operand_types)

			    # FP instructions: the m8n8k4 geometry puts fixed f16 types between
			    # the D and C types; every other geometry uses plain D.C.
			    pattern = "%s.f16.f16.%s" if op.a.geom == "m8n8k4" else "%s.%s"
			    return pattern % (op.d.mma_type.ptx_type, op.c.mma_type.ptx_type)

			def gen_wmma_mma_tests():
			    """Print IR tests for every supported wmma.mma / mma variant.

			    Walks all combinations of mma op, A layout, B layout and saturation
			    suffix, skipping those rejected by is_mma_variant_supported(), and
			    prints one test function per remaining variant.

			    Returns:
			        A list of (intrinsic name, PTX instruction) string pairs, one per
			        generated variant.
			    """
			    mma_template = """
			declare ${ret_ty} @${intrinsic}(
			${args});

			; CHECK-LABEL: .func {{.*}}test_${function}(
			define ${ret_ty} @test_${function}(
			${args}) {
			; CHECK: ${instruction}
			; CHECK-NEXT: ${check_d}
			; CHECK-NEXT: ${check_a}
			; CHECK-NEXT: ${check_b}
			; CHECK-NEXT: ${check_c}
			%r = call ${ret_ty} @${intrinsic}(
			${args});
			ret ${ret_ty} %r;
			}
			"""
			    # (intrinsic template, instruction template) for each naming scheme.
			    wmma_templates = (
			        "llvm.nvvm.wmma.${geom}.mma.${alayout}.${blayout}.${intrinsic_signature}${satf}",
			        "wmma.mma${mma_variant}.sync${aligned}.${alayout}.${blayout}.${geom}.${ptx_signature}${satf}",
			    )
			    mma_templates = (
			        "llvm.nvvm.mma.${geom}.${alayout}.${blayout}.${intrinsic_signature}",
			        "mma.sync${aligned}.${geom}.${alayout}.${blayout}.${ptx_signature}",
			    )

			    emitted = []
			    layouts = ["row", "col"]
			    variants = product(get_mma_ops(), layouts, layouts, [".satfinite", ""])

			    for op, alayout, blayout, satf in variants:
			        if not is_mma_variant_supported(op, alayout, blayout, satf):
			            continue

			        fields = {
			            "aligned": ".aligned" if ptx_version >= 63 else "",
			            "alayout": alayout,
			            "blayout": blayout,
			            "intrinsic_signature": mma_signature(op),
			            "ptx_signature": mma_ptx_signature(op),
			            "satf": satf,
			            "geom": op.a.geom,
			            "mma_variant": ".xor.popc" if op.a.mma_type.ptx_type == "b1" else "",
			        }

			        # The m8n8k4 geometry uses the mma naming scheme; every other
			        # geometry uses the wmma one.
			        if op.a.geom == "m8n8k4":
			            intrinsic_fmt, instruction_fmt = mma_templates
			        else:
			            intrinsic_fmt, instruction_fmt = wmma_templates

			        intrinsic = Template(intrinsic_fmt).substitute(fields)
			        instruction = Template(instruction_fmt).substitute(fields)

			        fields["intrinsic"] = intrinsic
			        fields["function"] = intrinsic.replace(".", "_")
			        fields["instruction"] = instruction
			        fields["ret_ty"] = make_wmma_ld_ret_ty(op.d)
			        for name in ("a", "b", "c", "d"):
			            fields["check_" + name] = check_pattern(getattr(op, name))
			        # Only the A, B and C fragments are passed in as arguments.
			        fields["args"] = ",\n ".join(
			            [make_wmma_slice_args(frag) for frag in (op.a, op.b, op.c)])

			        print(Template(mma_template).substitute(fields))
			        emitted.append((intrinsic, instruction))

			    return emitted

			# Append the complete list of intrinsics and instructions we've generated
			# tests for. Generate a set of checks to verify that we did generate a
			# sensible set of tests for the given combination of PTX and SM variants.
			#
			def gen_check_unsupported_ops(items):
			    """Print the summary FileCheck directives and the list of generated tests.

			    The output consists of a header naming the PTX version and SM
			    architecture (read from the module-level ptx_version and gpu_arch), a
			    fixed block of -NOT / -DAG directives grouped under several check
			    prefixes, and the sorted list of generated tests bracketed by
			    INTRINSICS_LIST_BEGIN / INTRINSICS_LIST_END marker lines.

			    Args:
			        items: iterable of (intrinsic, instruction) string pairs; they are
			            printed in sorted order.
			    """
			    print("; Complete list of intrinsics supported by PTX%d on sm_%d"
			          % (ptx_version, gpu_arch))
			    print("; INTRINSICS: {{^; INTRINSICS_LIST_BEGIN}}")
			    # Raw string, so that the regex escapes for a literal dot reach FileCheck
			    # unchanged without relying on Python passing unknown escape sequences
			    # through. FileCheck patterns are POSIX extended regexes, where
			    # alternation is a bare '|'; an escaped pipe would only match a literal
			    # '|' character.
			    print(r"""

			; NOEXTGEOM-NOT: {{m8n32|m32n8}}
			; NOINT-NOT: .{{s32|s8}}
			; NOSUBINT-NOT: {{s4|u4|b1}}
			; NOMMA-NOT: .m8n8k4.

			; M16N16-DAG: m16n16k16.load.{{[ab].*}}.f16.p
			; M16N16-DAG: m16n16k16.{{load|store}}.{{[cd].*\.(f16|f32)}}.p
			; M16N16-DAG: m16n16k16.mma.{{.*}}.f16.f32
			; M16N16-DAG: m16n16k16.mma.{{.*}}.f32.f16
			; M16N16-DAG: m16n16k16.mma.{{.*}}.f16.f16
			; M16N16-DAG: m16n16k16.mma.{{.*}}.f32.f32

			; PTX61 adds support for m32n8k16/m8n32k16 geometries.
			; EXTGEOM-DAG: m32n8k16.load.{{[ab].*}}.f16.p
			; EXTGEOM-DAG: m32n8k16.{{load|store}}.{{[cd].*\.(f16|f32)}}.p
			; EXTGEOM-DAG: m32n8k16.mma.{{.*}}.f16.f32
			; EXTGEOM-DAG: m32n8k16.mma.{{.*}}.f32.f16
			; EXTGEOM-DAG: m32n8k16.mma.{{.*}}.f16.f16
			; EXTGEOM-DAG: m32n8k16.mma.{{.*}}.f32.f32

			; EXTGEOM-DAG: m8n32k16.load.{{[ab].*}}.f16.p
			; EXTGEOM-DAG: m8n32k16.{{load|store}}.{{[cd].*\.(f16|f32)}}.p
			; EXTGEOM-DAG: m8n32k16.mma.{{.*}}.f16.f32
			; EXTGEOM-DAG: m8n32k16.mma.{{.*}}.f32.f16
			; EXTGEOM-DAG: m8n32k16.mma.{{.*}}.f16.f16
			; EXTGEOM-DAG: m8n32k16.mma.{{.*}}.f32.f32

			; INT-DAG: m16n16k16.load.{{[ab].*}}.s8.p
			; INT-DAG: m8n32k16.load.{{[ab].*}}.s8.p
			; INT-DAG: m32n8k16.load.{{[ab].*}}.s8.p
			; INT-DAG: m16n16k16.load.{{[ab].*}}.u8.p
			; INT-DAG: m8n32k16.load.{{[ab].*}}.u8.p
			; INT-DAG: m32n8k16.load.{{[ab].*}}.u8.p
			; INT-DAG: m32n8k16.{{load|store}}.{{[cd].*\.s32}}.p
			; INT-DAG: m16n16k16.mma.{{.*}}.u8
			; INT-DAG: m16n16k16.mma.{{.*}}.s8
			; INT-DAG: m8n32k16.mma.{{.*}}.u8
			; INT-DAG: m8n32k16.mma.{{.*}}.s8
			; INT-DAG: m32n8k16.mma.{{.*}}.u8
			; INT-DAG: m32n8k16.mma.{{.*}}.s8

			; SUBINT-DAG: m8n8k128.load.{{[ab].*}}.b1.p
			; SUBINT-DAG: m8n8k32.load.{{[ab].*}}.s4.p
			; SUBINT-DAG: m8n8k32.load.{{[ab].*}}.u4.p
			; SUBINT-DAG: m8n8k128.{{load|store}}.{{[cd].*\.s32}}.p
			; SUBINT-DAG: m8n8k32.{{load|store}}.{{[cd].*\.s32}}.p
			; SUBINT-DAG: m8n8k32.mma.{{.*}}.u4
			; SUBINT-DAG: m8n8k32.mma.{{.*}}.s4
			; SUBINT-DAG: m8n8k128.mma.{{.*}}.b1

			; MMA-DAG: mma.m8n8k4.{{.*}}.f16.f32
			; MMA-DAG: mma.m8n8k4.{{.*}}.f32.f16
			; MMA-DAG: mma.m8n8k4.{{.*}}.f16.f16
			; MMA-DAG: mma.m8n8k4.{{.*}}.f32.f32
			;

			""")

			    print("; INTRINSICS_LIST_BEGIN")
			    for intrinsic, instruction in sorted(items):
			        print("; ", intrinsic, " -> ", instruction, "")
			    print("; INTRINSICS_LIST_END")
			    print("; INTRINSICS: ; INTRINSICS_LIST_END")

			def gen_tests():
			    """Emit the load, store and mma tests, then the summary checks."""
			    generated = gen_wmma_load_tests()
			    generated.extend(gen_wmma_store_tests())
			    generated.extend(gen_wmma_mma_tests())
			    # The summary lists every (intrinsic, instruction) pair collected above.
			    gen_check_unsupported_ops(generated)

			# Script entry: the PTX version and GPU architecture come from the command
			# line and are stored in module-level globals read by the generators.
			parser = argparse.ArgumentParser()
			parser.add_argument("--ptx", default=60, type=int)
			parser.add_argument("--gpu-arch", default=70, type=int)
			args = parser.parse_args()
			ptx_version, gpu_arch = args.ptx, args.gpu_arch

			gen_tests()