llvm-for-llvmta/test/CodeGen/AMDGPU/no-shrink-extloads.ll

; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s

declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone

; Make sure we don't turn the 32-bit argument load into a 16-bit
; load. There aren't extending scalar loads, so that would require
; using a buffer_load instruction.

; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i16:
; SI: s_load_dword s
; SI: buffer_store_short v
; Narrows the i32 kernel argument %arg to i16 and writes it through %out.
define amdgpu_kernel void @truncate_kernarg_i32_to_i16(i16 addrspace(1)* %out, i32 %arg) nounwind {
  %narrow = trunc i32 %arg to i16
  store i16 %narrow, i16 addrspace(1)* %out
  ret void
}

; It should be OK (and probably performance neutral) to reduce this,
; but we don't know if the load is uniform yet.

; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i16:
; SI: buffer_load_dword v
; SI: buffer_store_short v
; Reads an i32 from %in at the index returned by llvm.amdgcn.workitem.id.x,
; narrows it to i16, and writes it to %out at the same index.
define amdgpu_kernel void @truncate_buffer_load_i32_to_i16(i16 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
  %id = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %src = getelementptr i32, i32 addrspace(1)* %in, i32 %id
  %dst = getelementptr i16, i16 addrspace(1)* %out, i32 %id
  %wide = load i32, i32 addrspace(1)* %src
  %narrow = trunc i32 %wide to i16
  store i16 %narrow, i16 addrspace(1)* %dst
  ret void
}

; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i8:
; SI: s_load_dword s
; SI: buffer_store_byte v
; Narrows the i32 kernel argument %arg to i8 and writes it through %out.
define amdgpu_kernel void @truncate_kernarg_i32_to_i8(i8 addrspace(1)* %out, i32 %arg) nounwind {
  %narrow = trunc i32 %arg to i8
  store i8 %narrow, i8 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i8:
; SI: buffer_load_dword v
; SI: buffer_store_byte v
; Reads an i32 from %in at the index returned by llvm.amdgcn.workitem.id.x,
; narrows it to i8, and writes it to %out at the same index.
define amdgpu_kernel void @truncate_buffer_load_i32_to_i8(i8 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
  %id = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %src = getelementptr i32, i32 addrspace(1)* %in, i32 %id
  %dst = getelementptr i8, i8 addrspace(1)* %out, i32 %id
  %wide = load i32, i32 addrspace(1)* %src
  %narrow = trunc i32 %wide to i8
  store i8 %narrow, i8 addrspace(1)* %dst
  ret void
}

; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i1:
; SI: s_load_dword s
; SI: buffer_store_byte v
; Narrows the i32 kernel argument %arg to a single bit (i1) and writes it
; through %out.
define amdgpu_kernel void @truncate_kernarg_i32_to_i1(i1 addrspace(1)* %out, i32 %arg) nounwind {
  %bit = trunc i32 %arg to i1
  store i1 %bit, i1 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i1:
; SI: buffer_load_dword v
; SI: buffer_store_byte v
; Reads an i32 from %in at the index returned by llvm.amdgcn.workitem.id.x,
; narrows it to i1, and writes it to %out at the same index.
define amdgpu_kernel void @truncate_buffer_load_i32_to_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
  %id = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %src = getelementptr i32, i32 addrspace(1)* %in, i32 %id
  %dst = getelementptr i1, i1 addrspace(1)* %out, i32 %id
  %wide = load i32, i32 addrspace(1)* %src
  %bit = trunc i32 %wide to i1
  store i1 %bit, i1 addrspace(1)* %dst
  ret void
}

; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i32:
; SI: s_load_dword s
; SI: buffer_store_dword v
; Narrows the i64 kernel argument %arg to i32 and writes it through %out.
; An unnamed [8 x i32] parameter sits between %out and %arg.
define amdgpu_kernel void @truncate_kernarg_i64_to_i32(i32 addrspace(1)* %out, [8 x i32], i64 %arg) nounwind {
  %lo = trunc i64 %arg to i32
  store i32 %lo, i32 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}truncate_buffer_load_i64_to_i32:
; SI: buffer_load_dword v
; SI: buffer_store_dword v
; Reads an i64 from %in at the index returned by llvm.amdgcn.workitem.id.x,
; narrows it to i32, and writes it to %out at the same index.
define amdgpu_kernel void @truncate_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
  %id = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %src = getelementptr i64, i64 addrspace(1)* %in, i32 %id
  %dst = getelementptr i32, i32 addrspace(1)* %out, i32 %id
  %wide = load i64, i64 addrspace(1)* %src
  %lo = trunc i64 %wide to i32
  store i32 %lo, i32 addrspace(1)* %dst
  ret void
}

; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i32:
; SI: s_load_dword s
; SI: buffer_store_dword v
; Shifts the i64 kernel argument %arg right by 32 and narrows to i32, so the
; value written through %out is bits 32..63 of %arg.
define amdgpu_kernel void @srl_kernarg_i64_to_i32(i32 addrspace(1)* %out, [8 x i32], i64 %arg) nounwind {
  %shifted = lshr i64 %arg, 32
  %hi = trunc i64 %shifted to i32
  store i32 %hi, i32 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}srl_buffer_load_i64_to_i32:
; SI: buffer_load_dword v
; SI: buffer_store_dword v
; Reads an i64 from %in at the index returned by llvm.amdgcn.workitem.id.x,
; shifts it right by 32, narrows to i32 (bits 32..63 of the loaded value),
; and writes it to %out at the same index.
define amdgpu_kernel void @srl_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
  %id = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %src = getelementptr i64, i64 addrspace(1)* %in, i32 %id
  %dst = getelementptr i32, i32 addrspace(1)* %out, i32 %id
  %wide = load i64, i64 addrspace(1)* %src
  %shifted = lshr i64 %wide, 32
  %hi = trunc i64 %shifted to i32
  store i32 %hi, i32 addrspace(1)* %dst
  ret void
}

; Might as well reduce to 8-bit loads.
; FUNC-LABEL: {{^}}truncate_kernarg_i16_to_i8:
; SI: s_load_dword s
; SI: buffer_store_byte v
; Narrows the i16 kernel argument %arg to i8 and writes it through %out.
define amdgpu_kernel void @truncate_kernarg_i16_to_i8(i8 addrspace(1)* %out, i16 %arg) nounwind {
  %narrow = trunc i16 %arg to i8
  store i8 %narrow, i8 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}truncate_buffer_load_i16_to_i8:
; SI: buffer_load_ubyte v
; SI: buffer_store_byte v
; Reads an i16 from %in at the index returned by llvm.amdgcn.workitem.id.x,
; narrows it to i8, and writes it to %out at the same index.
define amdgpu_kernel void @truncate_buffer_load_i16_to_i8(i8 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
  %id = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %src = getelementptr i16, i16 addrspace(1)* %in, i32 %id
  %dst = getelementptr i8, i8 addrspace(1)* %out, i32 %id
  %wide = load i16, i16 addrspace(1)* %src
  %narrow = trunc i16 %wide to i8
  store i8 %narrow, i8 addrspace(1)* %dst
  ret void
}

; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i8:
; SI: s_load_dword s
; SI: buffer_store_byte v
; Shifts the i64 kernel argument %arg right by 32 and narrows to i8, so the
; byte written through %out is bits 32..39 of %arg.
define amdgpu_kernel void @srl_kernarg_i64_to_i8(i8 addrspace(1)* %out, [8 x i32], i64 %arg) nounwind {
  %shifted = lshr i64 %arg, 32
  %byte = trunc i64 %shifted to i8
  store i8 %byte, i8 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}srl_buffer_load_i64_to_i8:
; SI: buffer_load_dword v
; SI: buffer_store_byte v
; Reads an i64 from %in at the index returned by llvm.amdgcn.workitem.id.x,
; shifts it right by 32, narrows to i8 (bits 32..39 of the loaded value),
; and writes it to %out at the same index.
define amdgpu_kernel void @srl_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
  %id = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %src = getelementptr i64, i64 addrspace(1)* %in, i32 %id
  %dst = getelementptr i8, i8 addrspace(1)* %out, i32 %id
  %wide = load i64, i64 addrspace(1)* %src
  %shifted = lshr i64 %wide, 32
  %byte = trunc i64 %shifted to i8
  store i8 %byte, i8 addrspace(1)* %dst
  ret void
}

; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i8:
; SI: s_load_dword s
; SI: buffer_store_byte v
; Narrows the i64 kernel argument %arg to i8 and writes it through %out.
; An unnamed [8 x i32] parameter sits between %out and %arg.
define amdgpu_kernel void @truncate_kernarg_i64_to_i8(i8 addrspace(1)* %out, [8 x i32], i64 %arg) nounwind {
  %byte = trunc i64 %arg to i8
  store i8 %byte, i8 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}truncate_buffer_load_i64_to_i8:
; SI: buffer_load_dword v
; SI: buffer_store_byte v
; Reads an i64 from %in at the index returned by llvm.amdgcn.workitem.id.x,
; narrows it to i8, and writes it to %out at the same index.
define amdgpu_kernel void @truncate_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
  %id = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %src = getelementptr i64, i64 addrspace(1)* %in, i32 %id
  %dst = getelementptr i8, i8 addrspace(1)* %out, i32 %id
  %wide = load i64, i64 addrspace(1)* %src
  %byte = trunc i64 %wide to i8
  store i8 %byte, i8 addrspace(1)* %dst
  ret void
}

; FUNC-LABEL: {{^}}smrd_mask_i32_to_i16:
; SI: s_load_dword [[LOAD:s[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0x0
; SI: s_waitcnt lgkmcnt(0)
; SI: s_and_b32 s{{[0-9]+}}, [[LOAD]], 0xffff
; Loads an i32 through the address-space-4 pointer %in, keeps only the low
; 16 bits (mask 65535 == 0xffff), and stores the full i32 result to %out.
; The label check above ends with ':' like every other label check in this
; file, so it only matches the function's own label line.
define amdgpu_kernel void @smrd_mask_i32_to_i16(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
entry:
  %val = load i32, i32 addrspace(4)* %in
  %mask = and i32 %val, 65535
  store i32 %mask, i32 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}extract_hi_i64_bitcast_v2i32:
; SI: buffer_load_dword v
; SI: buffer_store_dword v
; Loads a <2 x i32> from %in, reinterprets it as an i64, shifts right by 32
; and narrows to i32, then writes that value (bits 32..63 of the i64)
; through %out.
define amdgpu_kernel void @extract_hi_i64_bitcast_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind {
  %vec = load <2 x i32>, <2 x i32> addrspace(1)* %in
  %wide = bitcast <2 x i32> %vec to i64
  %shifted = lshr i64 %wide, 32
  %upper = trunc i64 %shifted to i32
  store i32 %upper, i32 addrspace(1)* %out
  ret void
}
first commit 2022-04-25 10:02:23 +02:00			`; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefix=SI -check-prefix=FUNC %s`

			`declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone`

			`; Make sure we don't turn the 32-bit argument load into a 16-bit`
			`; load. There aren't extending scalar loads, so that would require`
			`; using a buffer_load instruction.`

			`; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i16:`
			`; SI: s_load_dword s`
			`; SI: buffer_store_short v`
			`define amdgpu_kernel void @truncate_kernarg_i32_to_i16(i16 addrspace(1)* %out, i32 %arg) nounwind {`
			`%trunc = trunc i32 %arg to i16`
			`store i16 %trunc, i16 addrspace(1)* %out`
			`ret void`
			`}`

			`; It should be OK (and probably performance neutral) to reduce this,`
			`; but we don't know if the load is uniform yet.`

			`; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i16:`
			`; SI: buffer_load_dword v`
			`; SI: buffer_store_short v`
			`define amdgpu_kernel void @truncate_buffer_load_i32_to_i16(i16 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {`
			`%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone`
			`%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid`
			`%gep.out = getelementptr i16, i16 addrspace(1)* %out, i32 %tid`
			`%load = load i32, i32 addrspace(1)* %gep.in`
			`%trunc = trunc i32 %load to i16`
			`store i16 %trunc, i16 addrspace(1)* %gep.out`
			`ret void`
			`}`

			`; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i8:`
			`; SI: s_load_dword s`
			`; SI: buffer_store_byte v`
			`define amdgpu_kernel void @truncate_kernarg_i32_to_i8(i8 addrspace(1)* %out, i32 %arg) nounwind {`
			`%trunc = trunc i32 %arg to i8`
			`store i8 %trunc, i8 addrspace(1)* %out`
			`ret void`
			`}`

			`; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i8:`
			`; SI: buffer_load_dword v`
			`; SI: buffer_store_byte v`
			`define amdgpu_kernel void @truncate_buffer_load_i32_to_i8(i8 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {`
			`%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone`
			`%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid`
			`%gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid`
			`%load = load i32, i32 addrspace(1)* %gep.in`
			`%trunc = trunc i32 %load to i8`
			`store i8 %trunc, i8 addrspace(1)* %gep.out`
			`ret void`
			`}`

			`; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i1:`
			`; SI: s_load_dword s`
			`; SI: buffer_store_byte v`
			`define amdgpu_kernel void @truncate_kernarg_i32_to_i1(i1 addrspace(1)* %out, i32 %arg) nounwind {`
			`%trunc = trunc i32 %arg to i1`
			`store i1 %trunc, i1 addrspace(1)* %out`
			`ret void`
			`}`

			`; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i1:`
			`; SI: buffer_load_dword v`
			`; SI: buffer_store_byte v`
			`define amdgpu_kernel void @truncate_buffer_load_i32_to_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {`
			`%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone`
			`%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid`
			`%gep.out = getelementptr i1, i1 addrspace(1)* %out, i32 %tid`
			`%load = load i32, i32 addrspace(1)* %gep.in`
			`%trunc = trunc i32 %load to i1`
			`store i1 %trunc, i1 addrspace(1)* %gep.out`
			`ret void`
			`}`

			`; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i32:`
			`; SI: s_load_dword s`
			`; SI: buffer_store_dword v`
			`define amdgpu_kernel void @truncate_kernarg_i64_to_i32(i32 addrspace(1)* %out, [8 x i32], i64 %arg) nounwind {`
			`%trunc = trunc i64 %arg to i32`
			`store i32 %trunc, i32 addrspace(1)* %out`
			`ret void`
			`}`

			`; FUNC-LABEL: {{^}}truncate_buffer_load_i64_to_i32:`
			`; SI: buffer_load_dword v`
			`; SI: buffer_store_dword v`
			`define amdgpu_kernel void @truncate_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {`
			`%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone`
			`%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid`
			`%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid`
			`%load = load i64, i64 addrspace(1)* %gep.in`
			`%trunc = trunc i64 %load to i32`
			`store i32 %trunc, i32 addrspace(1)* %gep.out`
			`ret void`
			`}`

			`; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i32:`
			`; SI: s_load_dword s`
			`; SI: buffer_store_dword v`
			`define amdgpu_kernel void @srl_kernarg_i64_to_i32(i32 addrspace(1)* %out, [8 x i32], i64 %arg) nounwind {`
			`%srl = lshr i64 %arg, 32`
			`%trunc = trunc i64 %srl to i32`
			`store i32 %trunc, i32 addrspace(1)* %out`
			`ret void`
			`}`

			`; FUNC-LABEL: {{^}}srl_buffer_load_i64_to_i32:`
			`; SI: buffer_load_dword v`
			`; SI: buffer_store_dword v`
			`define amdgpu_kernel void @srl_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {`
			`%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone`
			`%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid`
			`%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid`
			`%load = load i64, i64 addrspace(1)* %gep.in`
			`%srl = lshr i64 %load, 32`
			`%trunc = trunc i64 %srl to i32`
			`store i32 %trunc, i32 addrspace(1)* %gep.out`
			`ret void`
			`}`

			`; Might as well reduce to 8-bit loads.`
			`; FUNC-LABEL: {{^}}truncate_kernarg_i16_to_i8:`
			`; SI: s_load_dword s`
			`; SI: buffer_store_byte v`
			`define amdgpu_kernel void @truncate_kernarg_i16_to_i8(i8 addrspace(1)* %out, i16 %arg) nounwind {`
			`%trunc = trunc i16 %arg to i8`
			`store i8 %trunc, i8 addrspace(1)* %out`
			`ret void`
			`}`

			`; FUNC-LABEL: {{^}}truncate_buffer_load_i16_to_i8:`
			`; SI: buffer_load_ubyte v`
			`; SI: buffer_store_byte v`
			`define amdgpu_kernel void @truncate_buffer_load_i16_to_i8(i8 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {`
			`%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone`
			`%gep.in = getelementptr i16, i16 addrspace(1)* %in, i32 %tid`
			`%gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid`
			`%load = load i16, i16 addrspace(1)* %gep.in`
			`%trunc = trunc i16 %load to i8`
			`store i8 %trunc, i8 addrspace(1)* %gep.out`
			`ret void`
			`}`

			`; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i8:`
			`; SI: s_load_dword s`
			`; SI: buffer_store_byte v`
			`define amdgpu_kernel void @srl_kernarg_i64_to_i8(i8 addrspace(1)* %out, [8 x i32], i64 %arg) nounwind {`
			`%srl = lshr i64 %arg, 32`
			`%trunc = trunc i64 %srl to i8`
			`store i8 %trunc, i8 addrspace(1)* %out`
			`ret void`
			`}`

			`; FUNC-LABEL: {{^}}srl_buffer_load_i64_to_i8:`
			`; SI: buffer_load_dword v`
			`; SI: buffer_store_byte v`
			`define amdgpu_kernel void @srl_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {`
			`%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone`
			`%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid`
			`%gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid`
			`%load = load i64, i64 addrspace(1)* %gep.in`
			`%srl = lshr i64 %load, 32`
			`%trunc = trunc i64 %srl to i8`
			`store i8 %trunc, i8 addrspace(1)* %gep.out`
			`ret void`
			`}`

			`; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i8:`
			`; SI: s_load_dword s`
			`; SI: buffer_store_byte v`
			`define amdgpu_kernel void @truncate_kernarg_i64_to_i8(i8 addrspace(1)* %out, [8 x i32], i64 %arg) nounwind {`
			`%trunc = trunc i64 %arg to i8`
			`store i8 %trunc, i8 addrspace(1)* %out`
			`ret void`
			`}`

			`; FUNC-LABEL: {{^}}truncate_buffer_load_i64_to_i8:`
			`; SI: buffer_load_dword v`
			`; SI: buffer_store_byte v`
			`define amdgpu_kernel void @truncate_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {`
			`%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone`
			`%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid`
			`%gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid`
			`%load = load i64, i64 addrspace(1)* %gep.in`
			`%trunc = trunc i64 %load to i8`
			`store i8 %trunc, i8 addrspace(1)* %gep.out`
			`ret void`
			`}`

			`; FUNC-LABEL: {{^}}smrd_mask_i32_to_i16`
			`; SI: s_load_dword [[LOAD:s[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0x0`
			`; SI: s_waitcnt lgkmcnt(0)`
			`; SI: s_and_b32 s{{[0-9]+}}, [[LOAD]], 0xffff`
			`define amdgpu_kernel void @smrd_mask_i32_to_i16(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {`
			`entry:`
			`%val = load i32, i32 addrspace(4)* %in`
			`%mask = and i32 %val, 65535`
			`store i32 %mask, i32 addrspace(1)* %out`
			`ret void`
			`}`

			`; FUNC-LABEL: {{^}}extract_hi_i64_bitcast_v2i32:`
			`; SI: buffer_load_dword v`
			`; SI: buffer_store_dword v`
			`define amdgpu_kernel void @extract_hi_i64_bitcast_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind {`
			`%ld = load <2 x i32>, <2 x i32> addrspace(1)* %in`
			`%bc = bitcast <2 x i32> %ld to i64`
			`%hi = lshr i64 %bc, 32`
			`%trunc = trunc i64 %hi to i32`
			`store i32 %trunc, i32 addrspace(1)* %out`
			`ret void`
			`}`