llvm-for-llvmta/test/CodeGen/Thumb2/LowOverheadLoops/lsr-profitable-chain.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -tail-predication=enabled -mtriple=thumbv8.1m.main -mattr=+mve,+mve.fp %s -o - | FileCheck %s

; Module-level target description: data layout string and target triple.
; NOTE(review): the llc invocation at the top of this file passes its own
; -mtriple (thumbv8.1m.main), which is spelled differently from the triple
; below - presumably the command-line value takes precedence; confirm
; against the llc documentation.
target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
target triple = "thumbv8.1m-arm-none-eabi"

; Tests that LSR will not interfere with the VCTP intrinsic,
; and that this loop will correctly become tail-predicated.

; @vctpi32 sums floats fetched by a predicated MVE gather-with-writeback loop
; and returns fabs(sitofp(vecAddAcrossF32Mve(sum))).
;   %0 - float pointer; converted to an integer and used as the gather base
;   %1 - i32 count; the loop counter fed to vctp32 starts at %1 - 1
; The assertions below expect the loop to be emitted as a dlstp.32 / letp
; pair whose body holds only the gather load and the vector add, i.e. with no
; separate predicate-generating instruction left inside the loop.
define arm_aapcs_vfpcc float @vctpi32(float* %0, i32 %1) {
; CHECK-LABEL: vctpi32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    vmvn.i32 q1, #0x1f
; CHECK-NEXT:    vmov.32 q3[0], r0
; CHECK-NEXT:    movs r2, #0
; CHECK-NEXT:    vadd.i32 q1, q3, q1
; CHECK-NEXT:    subs r3, r1, #1
; CHECK-NEXT:    vidup.u32 q2, r2, #8
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vadd.i32 q1, q2, r0
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:  .LBB0_1: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q2, [q1, #32]!
; CHECK-NEXT:    vadd.f32 q0, q0, q2
; CHECK-NEXT:    letp lr, .LBB0_1
; CHECK-NEXT:  @ %bb.2:
; CHECK-NEXT:    bl vecAddAcrossF32Mve
; CHECK-NEXT:    vmov s0, r0
; CHECK-NEXT:    vcvt.f32.s32 s0, s0
; CHECK-NEXT:    vabs.f32 s0, s0
; CHECK-NEXT:    pop {r7, pc}
  ; Entry block (implicitly numbered %2, as the phis below refer to it):
  ; builds the initial vector of gather base addresses.
  ; vidup is called with arguments (0, 8); only the vector result (field 0)
  ; of the returned pair is used in this function.
  %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
  %4 = extractvalue { <4 x i32>, i32 } %3, 0
  ; Initial value of the counter that drives vctp32 in the loop.
  %5 = add nsw i32 %1, -1
  ; Place (pointer - 32) in lane 0, then broadcast lane 0 to all four lanes
  ; with the all-zero shuffle mask.
  ; NOTE(review): the -32 bias presumably pre-compensates for the 32 that the
  ; writeback gather applies on every iteration (see `[q1, #32]!` in the
  ; expected output) - confirm against the intrinsic's semantics.
  %6 = ptrtoint float* %0 to i32
  %7 = insertelement <4 x i32> undef, i32 %6, i32 0
  %8 = add <4 x i32> %7, <i32 -32, i32 undef, i32 undef, i32 undef>
  %9 = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> zeroinitializer
  ; Per-lane base addresses = vidup vector + broadcast (pointer - 32).
  %10 = add <4 x i32> %4, %9
  br label %11

; Loop block: one predicated gather and one predicated accumulate per trip.
11:
  ; %12 - counter fed to vctp32; reduced by 4 on each iteration
  ; %13 - running <4 x float> sum, zero on entry
  ; %14 - gather base vector; replaced by the written-back bases each trip
  %12 = phi i32 [ %5, %2 ], [ %20, %11 ]
  %13 = phi <4 x float> [ zeroinitializer, %2 ], [ %19, %11 ]
  %14 = phi <4 x i32> [ %10, %2 ], [ %17, %11 ]
  ; Lane predicate derived from the counter; shared by the gather and the add.
  %15 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %12)
  ; Predicated gather from the base vector with immediate offset 32 and base
  ; writeback: field 1 is the updated base vector, field 0 the loaded data.
  %16 = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> %14, i32 32, <4 x i1> %15)
  %17 = extractvalue { <4 x float>, <4 x i32> } %16, 1
  %18 = extractvalue { <4 x float>, <4 x i32> } %16, 0
  ; Predicated add of the running sum and the loaded data under %15.
  ; NOTE(review): the trailing %13 operand is presumably the value kept in
  ; inactive lanes - confirm against the MVE predicated-intrinsic convention.
  %19 = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %13, <4 x float> %18, <4 x i1> %15, <4 x float> %13)
  ; Step the counter by 4 and keep looping while the counter value at the
  ; start of this iteration (%12, not the decremented %20) was above 4,
  ; using a signed comparison.
  %20 = add nsw i32 %12, -4
  %21 = icmp sgt i32 %12, 4
  br i1 %21, label %11, label %22

; Exit block: hand the vector sum to the external helper and post-process.
22:
  ; @vecAddAcrossF32Mve is declared variadic, so the callee is bitcast to the
  ; i32 (<4 x float>) function type actually used for this call.
  %23 = tail call arm_aapcs_vfpcc i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> %19)
  ; Convert the helper's i32 result to float and return its absolute value.
  %24 = sitofp i32 %23 to float
  %25 = tail call float @llvm.fabs.f32(float %24)
  ret float %25
}

; MVE intrinsics used by @vctpi32.
; vidup returns a { vector, i32 } pair; @vctpi32 uses only the vector field.
declare { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32, i32)
; Produces the <4 x i1> predicate from an i32 counter.
declare <4 x i1> @llvm.arm.mve.vctp32(i32)
; Predicated gather load with base writeback; returns { data, updated bases }.
declare { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32>, i32, <4 x i1>)
; Predicated vector float add taking two operands, a predicate and a fourth
; vector operand.
declare <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>)
; External helper, declared variadic; @vctpi32 calls it through a bitcast to
; i32 (<4 x float>)*.
declare arm_aapcs_vfpcc i32 @vecAddAcrossF32Mve(...)
; Floating-point absolute-value intrinsic applied to the final result.
declare float @llvm.fabs.f32(float)
first commit 2022-04-25 10:02:23 +02:00			`; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py`
			`; RUN: llc -O3 -tail-predication=enabled -mtriple=thumbv8.1m.main -mattr=+mve,+mve.fp %s -o - \| FileCheck %s`

			`target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"`
			`target triple = "thumbv8.1m-arm-none-eabi"`

			`; Tests that LSR will not interfere with the VCTP intrinsic,`
			`; and that this loop will correctly become tail-predicated.`

			`define arm_aapcs_vfpcc float @vctpi32(float* %0, i32 %1) {`
			`; CHECK-LABEL: vctpi32:`
			`; CHECK: @ %bb.0:`
			`; CHECK-NEXT: push {r7, lr}`
			`; CHECK-NEXT: vmvn.i32 q1, #0x1f`
			`; CHECK-NEXT: vmov.32 q3[0], r0`
			`; CHECK-NEXT: movs r2, #0`
			`; CHECK-NEXT: vadd.i32 q1, q3, q1`
			`; CHECK-NEXT: subs r3, r1, #1`
			`; CHECK-NEXT: vidup.u32 q2, r2, #8`
			`; CHECK-NEXT: vmov r0, s4`
			`; CHECK-NEXT: vadd.i32 q1, q2, r0`
			`; CHECK-NEXT: vmov.i32 q0, #0x0`
			`; CHECK-NEXT: dlstp.32 lr, r3`
			`; CHECK-NEXT: .LBB0_1: @ =>This Inner Loop Header: Depth=1`
			`; CHECK-NEXT: vldrw.u32 q2, [q1, #32]!`
			`; CHECK-NEXT: vadd.f32 q0, q0, q2`
			`; CHECK-NEXT: letp lr, .LBB0_1`
			`; CHECK-NEXT: @ %bb.2:`
			`; CHECK-NEXT: bl vecAddAcrossF32Mve`
			`; CHECK-NEXT: vmov s0, r0`
			`; CHECK-NEXT: vcvt.f32.s32 s0, s0`
			`; CHECK-NEXT: vabs.f32 s0, s0`
			`; CHECK-NEXT: pop {r7, pc}`
			`%3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)`
			`%4 = extractvalue { <4 x i32>, i32 } %3, 0`
			`%5 = add nsw i32 %1, -1`
			`%6 = ptrtoint float* %0 to i32`
			`%7 = insertelement <4 x i32> undef, i32 %6, i32 0`
			`%8 = add <4 x i32> %7, <i32 -32, i32 undef, i32 undef, i32 undef>`
			`%9 = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> zeroinitializer`
			`%10 = add <4 x i32> %4, %9`
			`br label %11`

			`11:`
			`%12 = phi i32 [ %5, %2 ], [ %20, %11 ]`
			`%13 = phi <4 x float> [ zeroinitializer, %2 ], [ %19, %11 ]`
			`%14 = phi <4 x i32> [ %10, %2 ], [ %17, %11 ]`
			`%15 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %12)`
			`%16 = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> %14, i32 32, <4 x i1> %15)`
			`%17 = extractvalue { <4 x float>, <4 x i32> } %16, 1`
			`%18 = extractvalue { <4 x float>, <4 x i32> } %16, 0`
			`%19 = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %13, <4 x float> %18, <4 x i1> %15, <4 x float> %13)`
			`%20 = add nsw i32 %12, -4`
			`%21 = icmp sgt i32 %12, 4`
			`br i1 %21, label %11, label %22`

			`22:`
			`%23 = tail call arm_aapcs_vfpcc i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> %19)`
			`%24 = sitofp i32 %23 to float`
			`%25 = tail call float @llvm.fabs.f32(float %24)`
			`ret float %25`
			`}`

			`declare { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32, i32)`
			`declare <4 x i1> @llvm.arm.mve.vctp32(i32)`
			`declare { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32>, i32, <4 x i1>)`
			`declare <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>)`
			`declare arm_aapcs_vfpcc i32 @vecAddAcrossF32Mve(...)`
			`declare float @llvm.fabs.f32(float)`