; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK
; RUN: llc -mtriple=thumbebv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK

; Masked <4 x i32> load from %x, returning %x + 4.
; Expected output folds the increment into a post-indexed vldrwt.u32 (#4).
define i8* @ldrwu32_4(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrwu32_4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrwt.u32 q0, [r0], #4
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 4
  %0 = bitcast i8* %x to <4 x i32>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
  %2 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %1, <4 x i32>* %2, align 4
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i32> load from %x, returning %x + 3.
; Expected output keeps a plain vldrwt.u32 and a separate "adds r0, #3"
; (the increment is not folded into the load).
define i8* @ldrwu32_3(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrwu32_3:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrwt.u32 q0, [r0]
; CHECK-NEXT:    adds r0, #3
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 3
  %0 = bitcast i8* %x to <4 x i32>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
  %2 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %1, <4 x i32>* %2, align 4
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i32> load from %x, returning %x + 2.
; Expected output keeps a plain vldrwt.u32 and a separate "adds r0, #2"
; (the increment is not folded into the load).
define i8* @ldrwu32_2(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrwu32_2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrwt.u32 q0, [r0]
; CHECK-NEXT:    adds r0, #2
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 2
  %0 = bitcast i8* %x to <4 x i32>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
  %2 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %1, <4 x i32>* %2, align 4
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i32> load from %x, returning %x + 508.
; Expected output folds the increment into a post-indexed vldrwt.u32 (#508).
define i8* @ldrwu32_508(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrwu32_508:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrwt.u32 q0, [r0], #508
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 508
  %0 = bitcast i8* %x to <4 x i32>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
  %2 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %1, <4 x i32>* %2, align 4
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i32> load from %x, returning %x + 512.
; Expected output keeps a plain vldrwt.u32 and a separate
; "add.w r0, r0, #512" (the increment is not folded into the load).
define i8* @ldrwu32_512(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrwu32_512:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrwt.u32 q0, [r0]
; CHECK-NEXT:    add.w r0, r0, #512
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 512
  %0 = bitcast i8* %x to <4 x i32>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
  %2 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %1, <4 x i32>* %2, align 4
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i32> load from %x, returning %x - 508.
; Expected output folds the decrement into a post-indexed vldrwt.u32 (#-508).
define i8* @ldrwu32_m508(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrwu32_m508:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrwt.u32 q0, [r0], #-508
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 -508
  %0 = bitcast i8* %x to <4 x i32>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
  %2 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %1, <4 x i32>* %2, align 4
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i32> load from %x, returning %x - 512.
; Expected output keeps a plain vldrwt.u32 and a separate
; "sub.w r0, r0, #512" (the decrement is not folded into the load).
define i8* @ldrwu32_m512(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrwu32_m512:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrwt.u32 q0, [r0]
; CHECK-NEXT:    sub.w r0, r0, #512
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 -512
  %0 = bitcast i8* %x to <4 x i32>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
  %2 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %1, <4 x i32>* %2, align 4
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i16> load from %x, zero-extended to <4 x i32>, returning %x + 4.
; Expected output folds the increment into a post-indexed vldrht.u32 (#4).
define i8* @ldrhu32_4(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrhu32_4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrht.u32 q0, [r0], #4
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 4
  %0 = bitcast i8* %x to <4 x i16>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
  %2 = zext <4 x i16> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i16> load from %x, zero-extended to <4 x i32>, returning %x + 3.
; Expected output keeps a plain vldrht.u32 and a separate "adds r0, #3"
; (the increment is not folded into the load).
define i8* @ldrhu32_3(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrhu32_3:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrht.u32 q0, [r0]
; CHECK-NEXT:    adds r0, #3
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 3
  %0 = bitcast i8* %x to <4 x i16>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
  %2 = zext <4 x i16> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i16> load from %x, zero-extended to <4 x i32>, returning %x + 2.
; Expected output folds the increment into a post-indexed vldrht.u32 (#2).
define i8* @ldrhu32_2(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrhu32_2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrht.u32 q0, [r0], #2
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 2
  %0 = bitcast i8* %x to <4 x i16>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
  %2 = zext <4 x i16> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i16> load from %x, zero-extended to <4 x i32>, returning %x + 254.
; Expected output folds the increment into a post-indexed vldrht.u32 (#254).
define i8* @ldrhu32_254(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrhu32_254:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrht.u32 q0, [r0], #254
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 254
  %0 = bitcast i8* %x to <4 x i16>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
  %2 = zext <4 x i16> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i16> load from %x, zero-extended to <4 x i32>, returning %x + 256.
; Expected output keeps a plain vldrht.u32 and a separate
; "add.w r0, r0, #256" (the increment is not folded into the load).
define i8* @ldrhu32_256(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrhu32_256:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrht.u32 q0, [r0]
; CHECK-NEXT:    add.w r0, r0, #256
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 256
  %0 = bitcast i8* %x to <4 x i16>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
  %2 = zext <4 x i16> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i16> load from %x, zero-extended to <4 x i32>, returning %x - 254.
; Expected output folds the decrement into a post-indexed vldrht.u32 (#-254).
define i8* @ldrhu32_m254(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrhu32_m254:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrht.u32 q0, [r0], #-254
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 -254
  %0 = bitcast i8* %x to <4 x i16>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
  %2 = zext <4 x i16> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i16> load from %x, zero-extended to <4 x i32>, returning %x - 256.
; Expected output keeps a plain vldrht.u32 and a separate
; "sub.w r0, r0, #256" (the decrement is not folded into the load).
define i8* @ldrhu32_m256(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrhu32_m256:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrht.u32 q0, [r0]
; CHECK-NEXT:    sub.w r0, r0, #256
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 -256
  %0 = bitcast i8* %x to <4 x i16>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
  %2 = zext <4 x i16> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i16> load from %x, sign-extended to <4 x i32>, returning %x + 4.
; Expected output folds the increment into a post-indexed vldrht.s32 (#4).
define i8* @ldrhs32_4(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrhs32_4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrht.s32 q0, [r0], #4
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 4
  %0 = bitcast i8* %x to <4 x i16>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
  %2 = sext <4 x i16> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i16> load from %x, sign-extended to <4 x i32>, returning %x + 3.
; Expected output keeps a plain vldrht.s32 and a separate "adds r0, #3"
; (the increment is not folded into the load).
define i8* @ldrhs32_3(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrhs32_3:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrht.s32 q0, [r0]
; CHECK-NEXT:    adds r0, #3
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 3
  %0 = bitcast i8* %x to <4 x i16>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
  %2 = sext <4 x i16> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i16> load from %x, sign-extended to <4 x i32>, returning %x + 2.
; Expected output folds the increment into a post-indexed vldrht.s32 (#2).
define i8* @ldrhs32_2(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrhs32_2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrht.s32 q0, [r0], #2
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 2
  %0 = bitcast i8* %x to <4 x i16>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
  %2 = sext <4 x i16> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i16> load from %x, sign-extended to <4 x i32>, returning %x + 254.
; Expected output folds the increment into a post-indexed vldrht.s32 (#254).
define i8* @ldrhs32_254(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrhs32_254:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrht.s32 q0, [r0], #254
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 254
  %0 = bitcast i8* %x to <4 x i16>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
  %2 = sext <4 x i16> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i16> load from %x, sign-extended to <4 x i32>, returning %x + 256.
; Expected output keeps a plain vldrht.s32 and a separate
; "add.w r0, r0, #256" (the increment is not folded into the load).
define i8* @ldrhs32_256(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrhs32_256:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrht.s32 q0, [r0]
; CHECK-NEXT:    add.w r0, r0, #256
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 256
  %0 = bitcast i8* %x to <4 x i16>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
  %2 = sext <4 x i16> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i16> load from %x, sign-extended to <4 x i32>, returning %x - 254.
; Expected output folds the decrement into a post-indexed vldrht.s32 (#-254).
define i8* @ldrhs32_m254(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrhs32_m254:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrht.s32 q0, [r0], #-254
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 -254
  %0 = bitcast i8* %x to <4 x i16>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
  %2 = sext <4 x i16> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i16> load from %x, sign-extended to <4 x i32>, returning %x - 256.
; Expected output keeps a plain vldrht.s32 and a separate
; "sub.w r0, r0, #256" (the decrement is not folded into the load).
define i8* @ldrhs32_m256(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrhs32_m256:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrht.s32 q0, [r0]
; CHECK-NEXT:    sub.w r0, r0, #256
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 -256
  %0 = bitcast i8* %x to <4 x i16>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
  %2 = sext <4 x i16> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %z
}
|
||
|
|
||
|
; Masked <8 x i16> load from %x (mask derived from an <8 x i16> compare),
; returning %x + 4.
; Expected output folds the increment into a post-indexed vldrht.u16 (#4).
define i8* @ldrhu16_4(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK-LABEL: ldrhu16_4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    vpt.i16 ne, q0, zr
; CHECK-NEXT:    vldrht.u16 q0, [r0], #4
; CHECK-NEXT:    vstrh.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 4
  %0 = bitcast i8* %x to <8 x i16>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
  %2 = bitcast i8* %y to <8 x i16>*
  store <8 x i16> %1, <8 x i16>* %2, align 2
  ret i8* %z
}
|
||
|
|
||
|
; Masked <8 x i16> load from %x (mask derived from an <8 x i16> compare),
; returning %x + 3.
; Expected output keeps a plain vldrht.u16 and a separate "adds r0, #3"
; (the increment is not folded into the load).
define i8* @ldrhu16_3(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK-LABEL: ldrhu16_3:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    vpt.i16 ne, q0, zr
; CHECK-NEXT:    vldrht.u16 q0, [r0]
; CHECK-NEXT:    adds r0, #3
; CHECK-NEXT:    vstrh.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 3
  %0 = bitcast i8* %x to <8 x i16>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
  %2 = bitcast i8* %y to <8 x i16>*
  store <8 x i16> %1, <8 x i16>* %2, align 2
  ret i8* %z
}
|
||
|
|
||
|
; Masked <8 x i16> load from %x (mask derived from an <8 x i16> compare),
; returning %x + 2.
; Expected output folds the increment into a post-indexed vldrht.u16 (#2).
define i8* @ldrhu16_2(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK-LABEL: ldrhu16_2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    vpt.i16 ne, q0, zr
; CHECK-NEXT:    vldrht.u16 q0, [r0], #2
; CHECK-NEXT:    vstrh.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 2
  %0 = bitcast i8* %x to <8 x i16>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
  %2 = bitcast i8* %y to <8 x i16>*
  store <8 x i16> %1, <8 x i16>* %2, align 2
  ret i8* %z
}
|
||
|
|
||
|
; Masked <8 x i16> load from %x (mask derived from an <8 x i16> compare),
; returning %x + 254.
; Expected output folds the increment into a post-indexed vldrht.u16 (#254).
define i8* @ldrhu16_254(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK-LABEL: ldrhu16_254:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    vpt.i16 ne, q0, zr
; CHECK-NEXT:    vldrht.u16 q0, [r0], #254
; CHECK-NEXT:    vstrh.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 254
  %0 = bitcast i8* %x to <8 x i16>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
  %2 = bitcast i8* %y to <8 x i16>*
  store <8 x i16> %1, <8 x i16>* %2, align 2
  ret i8* %z
}
|
||
|
|
||
|
; Masked <8 x i16> load from %x (mask derived from an <8 x i16> compare),
; returning %x + 256.
; Expected output keeps a plain vldrht.u16 and a separate
; "add.w r0, r0, #256" (the increment is not folded into the load).
define i8* @ldrhu16_256(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK-LABEL: ldrhu16_256:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    vpt.i16 ne, q0, zr
; CHECK-NEXT:    vldrht.u16 q0, [r0]
; CHECK-NEXT:    add.w r0, r0, #256
; CHECK-NEXT:    vstrh.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 256
  %0 = bitcast i8* %x to <8 x i16>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
  %2 = bitcast i8* %y to <8 x i16>*
  store <8 x i16> %1, <8 x i16>* %2, align 2
  ret i8* %z
}
|
||
|
|
||
|
; Masked <8 x i16> load from %x (mask derived from an <8 x i16> compare),
; returning %x - 254.
; Expected output folds the decrement into a post-indexed vldrht.u16 (#-254).
define i8* @ldrhu16_m254(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK-LABEL: ldrhu16_m254:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    vpt.i16 ne, q0, zr
; CHECK-NEXT:    vldrht.u16 q0, [r0], #-254
; CHECK-NEXT:    vstrh.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 -254
  %0 = bitcast i8* %x to <8 x i16>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
  %2 = bitcast i8* %y to <8 x i16>*
  store <8 x i16> %1, <8 x i16>* %2, align 2
  ret i8* %z
}
|
||
|
|
||
|
; Masked <8 x i16> load from %x (mask derived from an <8 x i16> compare),
; returning %x - 256.
; Expected output keeps a plain vldrht.u16 and a separate
; "sub.w r0, r0, #256" (the decrement is not folded into the load).
define i8* @ldrhu16_m256(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK-LABEL: ldrhu16_m256:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    vpt.i16 ne, q0, zr
; CHECK-NEXT:    vldrht.u16 q0, [r0]
; CHECK-NEXT:    sub.w r0, r0, #256
; CHECK-NEXT:    vstrh.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 -256
  %0 = bitcast i8* %x to <8 x i16>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
  %2 = bitcast i8* %y to <8 x i16>*
  store <8 x i16> %1, <8 x i16>* %2, align 2
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i8> load from %x, zero-extended to <4 x i32>, returning %x + 4.
; Expected output folds the increment into a post-indexed vldrbt.u32 (#4).
define i8* @ldrbu32_4(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrbu32_4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrbt.u32 q0, [r0], #4
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 4
  %0 = bitcast i8* %x to <4 x i8>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
  %2 = zext <4 x i8> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i8> load from %x, zero-extended to <4 x i32>, returning %x + 3.
; Expected output folds the increment into a post-indexed vldrbt.u32 (#3).
define i8* @ldrbu32_3(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrbu32_3:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrbt.u32 q0, [r0], #3
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 3
  %0 = bitcast i8* %x to <4 x i8>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
  %2 = zext <4 x i8> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i8> load from %x, zero-extended to <4 x i32>, returning %x + 2.
; Expected output folds the increment into a post-indexed vldrbt.u32 (#2).
define i8* @ldrbu32_2(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrbu32_2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrbt.u32 q0, [r0], #2
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 2
  %0 = bitcast i8* %x to <4 x i8>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
  %2 = zext <4 x i8> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i8> load from %x, zero-extended to <4 x i32>, returning %x + 127.
; Expected output folds the increment into a post-indexed vldrbt.u32 (#127).
define i8* @ldrbu32_127(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrbu32_127:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrbt.u32 q0, [r0], #127
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 127
  %0 = bitcast i8* %x to <4 x i8>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
  %2 = zext <4 x i8> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i8> load from %x, zero-extended to <4 x i32>, returning %x + 128.
; Expected output keeps a plain vldrbt.u32 and a separate "adds r0, #128"
; (the increment is not folded into the load).
define i8* @ldrbu32_128(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrbu32_128:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrbt.u32 q0, [r0]
; CHECK-NEXT:    adds r0, #128
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 128
  %0 = bitcast i8* %x to <4 x i8>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
  %2 = zext <4 x i8> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i8> load from %x, zero-extended to <4 x i32>, returning %x - 127.
; Expected output folds the decrement into a post-indexed vldrbt.u32 (#-127).
define i8* @ldrbu32_m127(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrbu32_m127:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrbt.u32 q0, [r0], #-127
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 -127
  %0 = bitcast i8* %x to <4 x i8>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
  %2 = zext <4 x i8> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i8> load from %x, zero-extended to <4 x i32>, returning %x - 128.
; Expected output keeps a plain vldrbt.u32 and a separate "subs r0, #128"
; (the decrement is not folded into the load).
define i8* @ldrbu32_m128(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrbu32_m128:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrbt.u32 q0, [r0]
; CHECK-NEXT:    subs r0, #128
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 -128
  %0 = bitcast i8* %x to <4 x i8>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
  %2 = zext <4 x i8> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i8> load from %x, sign-extended to <4 x i32>, returning %x + 4.
; Expected output folds the increment into a post-indexed vldrbt.s32 (#4).
define i8* @ldrbs32_4(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrbs32_4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrbt.s32 q0, [r0], #4
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 4
  %0 = bitcast i8* %x to <4 x i8>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
  %2 = sext <4 x i8> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i8> load from %x, sign-extended to <4 x i32>, returning %x + 3.
; Expected output folds the increment into a post-indexed vldrbt.s32 (#3).
define i8* @ldrbs32_3(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrbs32_3:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrbt.s32 q0, [r0], #3
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 3
  %0 = bitcast i8* %x to <4 x i8>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
  %2 = sext <4 x i8> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i8> load from %x, sign-extended to <4 x i32>, returning %x + 2.
; Expected output folds the increment into a post-indexed vldrbt.s32 (#2).
define i8* @ldrbs32_2(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrbs32_2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrbt.s32 q0, [r0], #2
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 2
  %0 = bitcast i8* %x to <4 x i8>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
  %2 = sext <4 x i8> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i8> load from %x, sign-extended to <4 x i32>, returning %x + 127.
; Expected output folds the increment into a post-indexed vldrbt.s32 (#127).
define i8* @ldrbs32_127(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrbs32_127:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrbt.s32 q0, [r0], #127
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 127
  %0 = bitcast i8* %x to <4 x i8>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
  %2 = sext <4 x i8> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i8> load from %x (predicate: lanes of *%m that are non-zero),
; sign-extended to <4 x i32> and stored to %y; returns %x + 128.
; The CHECK lines expect NO post-indexed form here: the load uses plain [r0]
; and the pointer is advanced by a separate `adds r0, #128`.
define i8* @ldrbs32_128(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrbs32_128:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
; CHECK-NEXT: vldrbt.s32 q0, [r0]
; CHECK-NEXT: adds r0, #128
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 128
  %0 = bitcast i8* %x to <4 x i8>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
  %2 = sext <4 x i8> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i8> load from %x (predicate: lanes of *%m that are non-zero),
; sign-extended to <4 x i32> and stored to %y; returns %x - 127.
; The CHECK lines expect the -127 to be folded into a post-indexed vldrbt.s32.
define i8* @ldrbs32_m127(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrbs32_m127:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
; CHECK-NEXT: vldrbt.s32 q0, [r0], #-127
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 -127
  %0 = bitcast i8* %x to <4 x i8>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
  %2 = sext <4 x i8> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i8> load from %x (predicate: lanes of *%m that are non-zero),
; sign-extended to <4 x i32> and stored to %y; returns %x - 128.
; The CHECK lines expect NO post-indexed form here: the load uses plain [r0]
; and the pointer is adjusted by a separate `subs r0, #128`.
define i8* @ldrbs32_m128(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrbs32_m128:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
; CHECK-NEXT: vldrbt.s32 q0, [r0]
; CHECK-NEXT: subs r0, #128
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 -128
  %0 = bitcast i8* %x to <4 x i8>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
  %2 = sext <4 x i8> %1 to <4 x i32>
  %3 = bitcast i8* %y to <4 x i32>*
  store <4 x i32> %2, <4 x i32>* %3, align 4
  ret i8* %z
}
|
||
|
|
||
|
; Masked <8 x i8> load from %x (predicate: lanes of *%m that are non-zero),
; zero-extended to <8 x i16> and stored to %y; returns %x + 4.
; The CHECK lines expect the +4 to be folded into a post-indexed vldrbt.u16.
define i8* @ldrbu16_4(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK-LABEL: ldrbu16_4:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
; CHECK-NEXT: vldrbt.u16 q0, [r0], #4
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 4
  %0 = bitcast i8* %x to <8 x i8>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
  %2 = zext <8 x i8> %1 to <8 x i16>
  %3 = bitcast i8* %y to <8 x i16>*
  store <8 x i16> %2, <8 x i16>* %3, align 2
  ret i8* %z
}
|
||
|
|
||
|
; Masked <8 x i8> load from %x (predicate: lanes of *%m that are non-zero),
; zero-extended to <8 x i16> and stored to %y; returns %x + 3.
; The CHECK lines expect the +3 to be folded into a post-indexed vldrbt.u16.
define i8* @ldrbu16_3(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK-LABEL: ldrbu16_3:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
; CHECK-NEXT: vldrbt.u16 q0, [r0], #3
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 3
  %0 = bitcast i8* %x to <8 x i8>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
  %2 = zext <8 x i8> %1 to <8 x i16>
  %3 = bitcast i8* %y to <8 x i16>*
  store <8 x i16> %2, <8 x i16>* %3, align 2
  ret i8* %z
}
|
||
|
|
||
|
; Masked <8 x i8> load from %x (predicate: lanes of *%m that are non-zero),
; zero-extended to <8 x i16> and stored to %y; returns %x + 2.
; The CHECK lines expect the +2 to be folded into a post-indexed vldrbt.u16.
define i8* @ldrbu16_2(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK-LABEL: ldrbu16_2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
; CHECK-NEXT: vldrbt.u16 q0, [r0], #2
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 2
  %0 = bitcast i8* %x to <8 x i8>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
  %2 = zext <8 x i8> %1 to <8 x i16>
  %3 = bitcast i8* %y to <8 x i16>*
  store <8 x i16> %2, <8 x i16>* %3, align 2
  ret i8* %z
}
|
||
|
|
||
|
; Masked <8 x i8> load from %x (predicate: lanes of *%m that are non-zero),
; zero-extended to <8 x i16> and stored to %y; returns %x + 127.
; The CHECK lines expect the +127 to be folded into a post-indexed vldrbt.u16.
define i8* @ldrbu16_127(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK-LABEL: ldrbu16_127:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
; CHECK-NEXT: vldrbt.u16 q0, [r0], #127
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 127
  %0 = bitcast i8* %x to <8 x i8>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
  %2 = zext <8 x i8> %1 to <8 x i16>
  %3 = bitcast i8* %y to <8 x i16>*
  store <8 x i16> %2, <8 x i16>* %3, align 2
  ret i8* %z
}
|
||
|
|
||
|
; Masked <8 x i8> load from %x (predicate: lanes of *%m that are non-zero),
; zero-extended to <8 x i16> and stored to %y; returns %x + 128.
; The CHECK lines expect NO post-indexed form here: the load uses plain [r0]
; and the pointer is advanced by a separate `adds r0, #128`.
define i8* @ldrbu16_128(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK-LABEL: ldrbu16_128:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
; CHECK-NEXT: vldrbt.u16 q0, [r0]
; CHECK-NEXT: adds r0, #128
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 128
  %0 = bitcast i8* %x to <8 x i8>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
  %2 = zext <8 x i8> %1 to <8 x i16>
  %3 = bitcast i8* %y to <8 x i16>*
  store <8 x i16> %2, <8 x i16>* %3, align 2
  ret i8* %z
}
|
||
|
|
||
|
; Masked <8 x i8> load from %x (predicate: lanes of *%m that are non-zero),
; zero-extended to <8 x i16> and stored to %y; returns %x - 127.
; The CHECK lines expect the -127 to be folded into a post-indexed vldrbt.u16.
define i8* @ldrbu16_m127(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK-LABEL: ldrbu16_m127:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
; CHECK-NEXT: vldrbt.u16 q0, [r0], #-127
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 -127
  %0 = bitcast i8* %x to <8 x i8>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
  %2 = zext <8 x i8> %1 to <8 x i16>
  %3 = bitcast i8* %y to <8 x i16>*
  store <8 x i16> %2, <8 x i16>* %3, align 2
  ret i8* %z
}
|
||
|
|
||
|
; Masked <8 x i8> load from %x (predicate: lanes of *%m that are non-zero),
; zero-extended to <8 x i16> and stored to %y; returns %x - 128.
; The CHECK lines expect NO post-indexed form here: the load uses plain [r0]
; and the pointer is adjusted by a separate `subs r0, #128`.
define i8* @ldrbu16_m128(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK-LABEL: ldrbu16_m128:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
; CHECK-NEXT: vldrbt.u16 q0, [r0]
; CHECK-NEXT: subs r0, #128
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 -128
  %0 = bitcast i8* %x to <8 x i8>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
  %2 = zext <8 x i8> %1 to <8 x i16>
  %3 = bitcast i8* %y to <8 x i16>*
  store <8 x i16> %2, <8 x i16>* %3, align 2
  ret i8* %z
}
|
||
|
|
||
|
; Masked <8 x i8> load from %x (predicate: lanes of *%m that are non-zero),
; sign-extended to <8 x i16> and stored to %y; returns %x + 4.
; The CHECK lines expect the +4 to be folded into a post-indexed vldrbt.s16.
define i8* @ldrbs16_4(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK-LABEL: ldrbs16_4:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
; CHECK-NEXT: vldrbt.s16 q0, [r0], #4
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 4
  %0 = bitcast i8* %x to <8 x i8>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
  %2 = sext <8 x i8> %1 to <8 x i16>
  %3 = bitcast i8* %y to <8 x i16>*
  store <8 x i16> %2, <8 x i16>* %3, align 2
  ret i8* %z
}
|
||
|
|
||
|
; Masked <8 x i8> load from %x (predicate: lanes of *%m that are non-zero),
; sign-extended to <8 x i16> and stored to %y; returns %x + 3.
; The CHECK lines expect the +3 to be folded into a post-indexed vldrbt.s16.
define i8* @ldrbs16_3(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK-LABEL: ldrbs16_3:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
; CHECK-NEXT: vldrbt.s16 q0, [r0], #3
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 3
  %0 = bitcast i8* %x to <8 x i8>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
  %2 = sext <8 x i8> %1 to <8 x i16>
  %3 = bitcast i8* %y to <8 x i16>*
  store <8 x i16> %2, <8 x i16>* %3, align 2
  ret i8* %z
}
|
||
|
|
||
|
; Masked <8 x i8> load from %x (predicate: lanes of *%m that are non-zero),
; sign-extended to <8 x i16> and stored to %y; returns %x + 2.
; The CHECK lines expect the +2 to be folded into a post-indexed vldrbt.s16.
define i8* @ldrbs16_2(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK-LABEL: ldrbs16_2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
; CHECK-NEXT: vldrbt.s16 q0, [r0], #2
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 2
  %0 = bitcast i8* %x to <8 x i8>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
  %2 = sext <8 x i8> %1 to <8 x i16>
  %3 = bitcast i8* %y to <8 x i16>*
  store <8 x i16> %2, <8 x i16>* %3, align 2
  ret i8* %z
}
|
||
|
|
||
|
; Masked <8 x i8> load from %x (predicate: lanes of *%m that are non-zero),
; sign-extended to <8 x i16> and stored to %y; returns %x + 127.
; The CHECK lines expect the +127 to be folded into a post-indexed vldrbt.s16.
define i8* @ldrbs16_127(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK-LABEL: ldrbs16_127:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
; CHECK-NEXT: vldrbt.s16 q0, [r0], #127
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 127
  %0 = bitcast i8* %x to <8 x i8>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
  %2 = sext <8 x i8> %1 to <8 x i16>
  %3 = bitcast i8* %y to <8 x i16>*
  store <8 x i16> %2, <8 x i16>* %3, align 2
  ret i8* %z
}
|
||
|
|
||
|
; Masked <8 x i8> load from %x (predicate: lanes of *%m that are non-zero),
; sign-extended to <8 x i16> and stored to %y; returns %x + 128.
; The CHECK lines expect NO post-indexed form here: the load uses plain [r0]
; and the pointer is advanced by a separate `adds r0, #128`.
define i8* @ldrbs16_128(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK-LABEL: ldrbs16_128:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
; CHECK-NEXT: vldrbt.s16 q0, [r0]
; CHECK-NEXT: adds r0, #128
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 128
  %0 = bitcast i8* %x to <8 x i8>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
  %2 = sext <8 x i8> %1 to <8 x i16>
  %3 = bitcast i8* %y to <8 x i16>*
  store <8 x i16> %2, <8 x i16>* %3, align 2
  ret i8* %z
}
|
||
|
|
||
|
; Masked <8 x i8> load from %x (predicate: lanes of *%m that are non-zero),
; sign-extended to <8 x i16> and stored to %y; returns %x - 127.
; The CHECK lines expect the -127 to be folded into a post-indexed vldrbt.s16.
define i8* @ldrbs16_m127(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK-LABEL: ldrbs16_m127:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
; CHECK-NEXT: vldrbt.s16 q0, [r0], #-127
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 -127
  %0 = bitcast i8* %x to <8 x i8>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
  %2 = sext <8 x i8> %1 to <8 x i16>
  %3 = bitcast i8* %y to <8 x i16>*
  store <8 x i16> %2, <8 x i16>* %3, align 2
  ret i8* %z
}
|
||
|
|
||
|
; Masked <8 x i8> load from %x (predicate: lanes of *%m that are non-zero),
; sign-extended to <8 x i16> and stored to %y; returns %x - 128.
; The CHECK lines expect NO post-indexed form here: the load uses plain [r0]
; and the pointer is adjusted by a separate `subs r0, #128`.
define i8* @ldrbs16_m128(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK-LABEL: ldrbs16_m128:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
; CHECK-NEXT: vldrbt.s16 q0, [r0]
; CHECK-NEXT: subs r0, #128
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 -128
  %0 = bitcast i8* %x to <8 x i8>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
  %2 = sext <8 x i8> %1 to <8 x i16>
  %3 = bitcast i8* %y to <8 x i16>*
  store <8 x i16> %2, <8 x i16>* %3, align 2
  ret i8* %z
}
|
||
|
|
||
|
; Masked <16 x i8> load from %x (predicate: lanes of *%m that are non-zero),
; stored unchanged to %y; returns %x + 4.
; The CHECK lines expect the +4 to be folded into a post-indexed vldrbt.u8.
define i8* @ldrbu8_4(i8* %x, i8* %y, <16 x i8> *%m) {
; CHECK-LABEL: ldrbu8_4:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q0, [r2]
; CHECK-NEXT: vpt.i8 ne, q0, zr
; CHECK-NEXT: vldrbt.u8 q0, [r0], #4
; CHECK-NEXT: vstrb.8 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 4
  %0 = bitcast i8* %x to <16 x i8>*
  %mask = load <16 x i8>, <16 x i8>* %m, align 1
  %c = icmp ne <16 x i8> %mask, zeroinitializer
  %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef)
  %2 = bitcast i8* %y to <16 x i8>*
  store <16 x i8> %1, <16 x i8>* %2, align 1
  ret i8* %z
}
|
||
|
|
||
|
; Masked <16 x i8> load from %x (predicate: lanes of *%m that are non-zero),
; stored unchanged to %y; returns %x + 3.
; The CHECK lines expect the +3 to be folded into a post-indexed vldrbt.u8.
define i8* @ldrbu8_3(i8* %x, i8* %y, <16 x i8> *%m) {
; CHECK-LABEL: ldrbu8_3:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q0, [r2]
; CHECK-NEXT: vpt.i8 ne, q0, zr
; CHECK-NEXT: vldrbt.u8 q0, [r0], #3
; CHECK-NEXT: vstrb.8 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 3
  %0 = bitcast i8* %x to <16 x i8>*
  %mask = load <16 x i8>, <16 x i8>* %m, align 1
  %c = icmp ne <16 x i8> %mask, zeroinitializer
  %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef)
  %2 = bitcast i8* %y to <16 x i8>*
  store <16 x i8> %1, <16 x i8>* %2, align 1
  ret i8* %z
}
|
||
|
|
||
|
; Masked <16 x i8> load from %x (predicate: lanes of *%m that are non-zero),
; stored unchanged to %y; returns %x + 2.
; The CHECK lines expect the +2 to be folded into a post-indexed vldrbt.u8.
define i8* @ldrbu8_2(i8* %x, i8* %y, <16 x i8> *%m) {
; CHECK-LABEL: ldrbu8_2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q0, [r2]
; CHECK-NEXT: vpt.i8 ne, q0, zr
; CHECK-NEXT: vldrbt.u8 q0, [r0], #2
; CHECK-NEXT: vstrb.8 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 2
  %0 = bitcast i8* %x to <16 x i8>*
  %mask = load <16 x i8>, <16 x i8>* %m, align 1
  %c = icmp ne <16 x i8> %mask, zeroinitializer
  %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef)
  %2 = bitcast i8* %y to <16 x i8>*
  store <16 x i8> %1, <16 x i8>* %2, align 1
  ret i8* %z
}
|
||
|
|
||
|
; Masked <16 x i8> load from %x (predicate: lanes of *%m that are non-zero),
; stored unchanged to %y; returns %x + 127.
; The CHECK lines expect the +127 to be folded into a post-indexed vldrbt.u8.
define i8* @ldrbu8_127(i8* %x, i8* %y, <16 x i8> *%m) {
; CHECK-LABEL: ldrbu8_127:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q0, [r2]
; CHECK-NEXT: vpt.i8 ne, q0, zr
; CHECK-NEXT: vldrbt.u8 q0, [r0], #127
; CHECK-NEXT: vstrb.8 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 127
  %0 = bitcast i8* %x to <16 x i8>*
  %mask = load <16 x i8>, <16 x i8>* %m, align 1
  %c = icmp ne <16 x i8> %mask, zeroinitializer
  %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef)
  %2 = bitcast i8* %y to <16 x i8>*
  store <16 x i8> %1, <16 x i8>* %2, align 1
  ret i8* %z
}
|
||
|
|
||
|
; Masked <16 x i8> load from %x (predicate: lanes of *%m that are non-zero),
; stored unchanged to %y; returns %x + 128.
; The CHECK lines expect NO post-indexed form here: the load uses plain [r0]
; and the pointer is advanced by a separate `adds r0, #128`.
define i8* @ldrbu8_128(i8* %x, i8* %y, <16 x i8> *%m) {
; CHECK-LABEL: ldrbu8_128:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q0, [r2]
; CHECK-NEXT: vpt.i8 ne, q0, zr
; CHECK-NEXT: vldrbt.u8 q0, [r0]
; CHECK-NEXT: adds r0, #128
; CHECK-NEXT: vstrb.8 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 128
  %0 = bitcast i8* %x to <16 x i8>*
  %mask = load <16 x i8>, <16 x i8>* %m, align 1
  %c = icmp ne <16 x i8> %mask, zeroinitializer
  %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef)
  %2 = bitcast i8* %y to <16 x i8>*
  store <16 x i8> %1, <16 x i8>* %2, align 1
  ret i8* %z
}
|
||
|
|
||
|
; Masked <16 x i8> load from %x (predicate: lanes of *%m that are non-zero),
; stored unchanged to %y; returns %x - 127.
; The CHECK lines expect the -127 to be folded into a post-indexed vldrbt.u8.
define i8* @ldrbu8_m127(i8* %x, i8* %y, <16 x i8> *%m) {
; CHECK-LABEL: ldrbu8_m127:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q0, [r2]
; CHECK-NEXT: vpt.i8 ne, q0, zr
; CHECK-NEXT: vldrbt.u8 q0, [r0], #-127
; CHECK-NEXT: vstrb.8 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 -127
  %0 = bitcast i8* %x to <16 x i8>*
  %mask = load <16 x i8>, <16 x i8>* %m, align 1
  %c = icmp ne <16 x i8> %mask, zeroinitializer
  %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef)
  %2 = bitcast i8* %y to <16 x i8>*
  store <16 x i8> %1, <16 x i8>* %2, align 1
  ret i8* %z
}
|
||
|
|
||
|
; Masked <16 x i8> load from %x (predicate: lanes of *%m that are non-zero),
; stored unchanged to %y; returns %x - 128.
; The CHECK lines expect NO post-indexed form here: the load uses plain [r0]
; and the pointer is adjusted by a separate `subs r0, #128`.
define i8* @ldrbu8_m128(i8* %x, i8* %y, <16 x i8> *%m) {
; CHECK-LABEL: ldrbu8_m128:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q0, [r2]
; CHECK-NEXT: vpt.i8 ne, q0, zr
; CHECK-NEXT: vldrbt.u8 q0, [r0]
; CHECK-NEXT: subs r0, #128
; CHECK-NEXT: vstrb.8 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 -128
  %0 = bitcast i8* %x to <16 x i8>*
  %mask = load <16 x i8>, <16 x i8>* %m, align 1
  %c = icmp ne <16 x i8> %mask, zeroinitializer
  %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef)
  %2 = bitcast i8* %y to <16 x i8>*
  store <16 x i8> %1, <16 x i8>* %2, align 1
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x float> load from %x (predicate: lanes of *%m that are non-zero),
; stored unchanged to %y; returns %x + 4.
; The CHECK lines expect the +4 to be folded into a post-indexed vldrwt.u32.
define i8* @ldrwf32_4(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrwf32_4:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
; CHECK-NEXT: vldrwt.u32 q0, [r0], #4
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 4
  %0 = bitcast i8* %x to <4 x float>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef)
  %2 = bitcast i8* %y to <4 x float>*
  store <4 x float> %1, <4 x float>* %2, align 4
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x float> load from %x (predicate: lanes of *%m that are non-zero),
; stored unchanged to %y; returns %x + 3.
; The CHECK lines expect NO post-indexed form here: the load uses plain [r0]
; and the pointer is advanced by a separate `adds r0, #3`.
define i8* @ldrwf32_3(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrwf32_3:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
; CHECK-NEXT: vldrwt.u32 q0, [r0]
; CHECK-NEXT: adds r0, #3
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 3
  %0 = bitcast i8* %x to <4 x float>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef)
  %2 = bitcast i8* %y to <4 x float>*
  store <4 x float> %1, <4 x float>* %2, align 4
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x float> load from %x (predicate: lanes of *%m that are non-zero),
; stored unchanged to %y; returns %x + 2.
; The CHECK lines expect NO post-indexed form here: the load uses plain [r0]
; and the pointer is advanced by a separate `adds r0, #2`.
define i8* @ldrwf32_2(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrwf32_2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
; CHECK-NEXT: vldrwt.u32 q0, [r0]
; CHECK-NEXT: adds r0, #2
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 2
  %0 = bitcast i8* %x to <4 x float>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef)
  %2 = bitcast i8* %y to <4 x float>*
  store <4 x float> %1, <4 x float>* %2, align 4
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x float> load from %x (predicate: lanes of *%m that are non-zero),
; stored unchanged to %y; returns %x + 508.
; The CHECK lines expect the +508 to be folded into a post-indexed vldrwt.u32.
define i8* @ldrwf32_508(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrwf32_508:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
; CHECK-NEXT: vldrwt.u32 q0, [r0], #508
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 508
  %0 = bitcast i8* %x to <4 x float>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef)
  %2 = bitcast i8* %y to <4 x float>*
  store <4 x float> %1, <4 x float>* %2, align 4
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x float> load from %x (predicate: lanes of *%m that are non-zero),
; stored unchanged to %y; returns %x + 512.
; The CHECK lines expect NO post-indexed form here: the load uses plain [r0]
; and the pointer is advanced by a separate `add.w r0, r0, #512`.
define i8* @ldrwf32_512(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrwf32_512:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
; CHECK-NEXT: vldrwt.u32 q0, [r0]
; CHECK-NEXT: add.w r0, r0, #512
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 512
  %0 = bitcast i8* %x to <4 x float>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef)
  %2 = bitcast i8* %y to <4 x float>*
  store <4 x float> %1, <4 x float>* %2, align 4
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x float> load from %x (predicate: lanes of *%m that are non-zero),
; stored unchanged to %y; returns %x - 508.
; The CHECK lines expect the -508 to be folded into a post-indexed vldrwt.u32.
define i8* @ldrwf32_m508(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrwf32_m508:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
; CHECK-NEXT: vldrwt.u32 q0, [r0], #-508
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 -508
  %0 = bitcast i8* %x to <4 x float>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef)
  %2 = bitcast i8* %y to <4 x float>*
  store <4 x float> %1, <4 x float>* %2, align 4
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x float> load from %x (predicate: lanes of *%m that are non-zero),
; stored unchanged to %y; returns %x - 512.
; The CHECK lines expect NO post-indexed form here: the load uses plain [r0]
; and the pointer is adjusted by a separate `sub.w r0, r0, #512`.
define i8* @ldrwf32_m512(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK-LABEL: ldrwf32_m512:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
; CHECK-NEXT: vldrwt.u32 q0, [r0]
; CHECK-NEXT: sub.w r0, r0, #512
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 -512
  %0 = bitcast i8* %x to <4 x float>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef)
  %2 = bitcast i8* %y to <4 x float>*
  store <4 x float> %1, <4 x float>* %2, align 4
  ret i8* %z
}
|
||
|
|
||
|
; Masked <8 x half> load from %x (predicate: lanes of *%m that are non-zero),
; stored unchanged to %y; returns %x + 4.
; The CHECK lines expect the +4 to be folded into a post-indexed vldrht.u16.
define i8* @ldrhf16_4(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK-LABEL: ldrhf16_4:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
; CHECK-NEXT: vldrht.u16 q0, [r0], #4
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 4
  %0 = bitcast i8* %x to <8 x half>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef)
  %2 = bitcast i8* %y to <8 x half>*
  store <8 x half> %1, <8 x half>* %2, align 2
  ret i8* %z
}
|
||
|
|
||
|
; Masked <8 x half> load from %x (predicate: lanes of *%m that are non-zero),
; stored unchanged to %y; returns %x + 3.
; The CHECK lines expect NO post-indexed form here: the load uses plain [r0]
; and the pointer is advanced by a separate `adds r0, #3`.
define i8* @ldrhf16_3(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK-LABEL: ldrhf16_3:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
; CHECK-NEXT: vldrht.u16 q0, [r0]
; CHECK-NEXT: adds r0, #3
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 3
  %0 = bitcast i8* %x to <8 x half>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef)
  %2 = bitcast i8* %y to <8 x half>*
  store <8 x half> %1, <8 x half>* %2, align 2
  ret i8* %z
}
|
||
|
|
||
|
; Masked <8 x half> load from %x (predicate: lanes of *%m that are non-zero),
; stored unchanged to %y; returns %x + 2.
; The CHECK lines expect the +2 to be folded into a post-indexed vldrht.u16.
define i8* @ldrhf16_2(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK-LABEL: ldrhf16_2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
; CHECK-NEXT: vldrht.u16 q0, [r0], #2
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 2
  %0 = bitcast i8* %x to <8 x half>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef)
  %2 = bitcast i8* %y to <8 x half>*
  store <8 x half> %1, <8 x half>* %2, align 2
  ret i8* %z
}
|
||
|
|
||
|
; Masked <8 x half> load from %x (predicate: lanes of *%m that are non-zero),
; stored unchanged to %y; returns %x + 254.
; The CHECK lines expect the +254 to be folded into a post-indexed vldrht.u16.
define i8* @ldrhf16_254(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK-LABEL: ldrhf16_254:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
; CHECK-NEXT: vldrht.u16 q0, [r0], #254
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 254
  %0 = bitcast i8* %x to <8 x half>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef)
  %2 = bitcast i8* %y to <8 x half>*
  store <8 x half> %1, <8 x half>* %2, align 2
  ret i8* %z
}
|
||
|
|
||
|
; Masked <8 x half> load from %x (predicate: lanes of *%m that are non-zero),
; stored unchanged to %y; returns %x + 256.
; The CHECK lines expect NO post-indexed form here: the load uses plain [r0]
; and the pointer is advanced by a separate `add.w r0, r0, #256`.
define i8* @ldrhf16_256(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK-LABEL: ldrhf16_256:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
; CHECK-NEXT: vldrht.u16 q0, [r0]
; CHECK-NEXT: add.w r0, r0, #256
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 256
  %0 = bitcast i8* %x to <8 x half>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef)
  %2 = bitcast i8* %y to <8 x half>*
  store <8 x half> %1, <8 x half>* %2, align 2
  ret i8* %z
}
|
||
|
|
||
|
; Masked <8 x half> load from %x (predicate: lanes of *%m that are non-zero),
; stored unchanged to %y; returns %x - 254.
; The CHECK lines expect the -254 to be folded into a post-indexed vldrht.u16.
define i8* @ldrhf16_m254(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK-LABEL: ldrhf16_m254:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
; CHECK-NEXT: vldrht.u16 q0, [r0], #-254
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 -254
  %0 = bitcast i8* %x to <8 x half>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef)
  %2 = bitcast i8* %y to <8 x half>*
  store <8 x half> %1, <8 x half>* %2, align 2
  ret i8* %z
}
|
||
|
|
||
|
; Masked <8 x half> load from %x (predicate: lanes of *%m that are non-zero),
; stored unchanged to %y; returns %x - 256.
; The CHECK lines expect NO post-indexed form here: the load uses plain [r0]
; and the pointer is adjusted by a separate `sub.w r0, r0, #256`.
define i8* @ldrhf16_m256(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK-LABEL: ldrhf16_m256:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
; CHECK-NEXT: vldrht.u16 q0, [r0]
; CHECK-NEXT: sub.w r0, r0, #256
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %x, i32 -256
  %0 = bitcast i8* %x to <8 x half>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef)
  %2 = bitcast i8* %y to <8 x half>*
  store <8 x half> %1, <8 x half>* %2, align 2
  ret i8* %z
}
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
; Loads <4 x i32> from %x and masked-stores it to %y (predicate: lanes of *%m
; that are non-zero); returns %y + 4. Note the argument order: %y is first.
; The CHECK lines expect the +4 to be folded into a post-indexed vstrwt.32.
define i8* @strw32_4(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-LABEL: strw32_4:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
; CHECK-NEXT: vstrwt.32 q0, [r0], #4
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 4
  %0 = bitcast i8* %x to <4 x i32>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = load <4 x i32>, <4 x i32>* %0, align 4
  %2 = bitcast i8* %y to <4 x i32>*
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Loads <4 x i32> from %x and masked-stores it to %y (predicate: lanes of *%m
; that are non-zero); returns %y + 3. Note the argument order: %y is first.
; The CHECK lines expect NO post-indexed form here: the store uses plain [r0]
; and the pointer is advanced by a separate `adds r0, #3`.
define i8* @strw32_3(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-LABEL: strw32_3:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
; CHECK-NEXT: vstrwt.32 q0, [r0]
; CHECK-NEXT: adds r0, #3
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 3
  %0 = bitcast i8* %x to <4 x i32>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = load <4 x i32>, <4 x i32>* %0, align 4
  %2 = bitcast i8* %y to <4 x i32>*
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i32> store to %y (mask = %m != 0) of data loaded from %x;
; returns %y + 2. The checks expect an unindexed predicated store followed
; by a separate `adds r0, #2` (no post-increment).
; NOTE(review): presumably #2 is not a valid vstrw post-index offset -- confirm.
define i8* @strw32_2(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-LABEL: strw32_2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
; CHECK-NEXT: vstrwt.32 q0, [r0]
; CHECK-NEXT: adds r0, #2
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 2
  %0 = bitcast i8* %x to <4 x i32>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = load <4 x i32>, <4 x i32>* %0, align 4
  %2 = bitcast i8* %y to <4 x i32>*
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i32> store to %y (mask = %m != 0) of data loaded from %x;
; returns %y + 508. The checks expect the pointer update folded into the
; predicated store as a post-increment of #508.
define i8* @strw32_508(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-LABEL: strw32_508:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
; CHECK-NEXT: vstrwt.32 q0, [r0], #508
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 508
  %0 = bitcast i8* %x to <4 x i32>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = load <4 x i32>, <4 x i32>* %0, align 4
  %2 = bitcast i8* %y to <4 x i32>*
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i32> store to %y (mask = %m != 0) of data loaded from %x;
; returns %y + 512. The checks expect an unindexed predicated store followed
; by a separate `add.w r0, r0, #512` (no post-increment).
; NOTE(review): presumably #512 is outside the vstrw post-index range -- confirm.
define i8* @strw32_512(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-LABEL: strw32_512:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
; CHECK-NEXT: vstrwt.32 q0, [r0]
; CHECK-NEXT: add.w r0, r0, #512
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 512
  %0 = bitcast i8* %x to <4 x i32>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = load <4 x i32>, <4 x i32>* %0, align 4
  %2 = bitcast i8* %y to <4 x i32>*
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i32> store to %y (mask = %m != 0) of data loaded from %x;
; returns %y - 508. The checks expect the pointer update folded into the
; predicated store as a post-increment of #-508.
define i8* @strw32_m508(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-LABEL: strw32_m508:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
; CHECK-NEXT: vstrwt.32 q0, [r0], #-508
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 -508
  %0 = bitcast i8* %x to <4 x i32>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = load <4 x i32>, <4 x i32>* %0, align 4
  %2 = bitcast i8* %y to <4 x i32>*
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i32> store to %y (mask = %m != 0) of data loaded from %x;
; returns %y - 512. The checks expect an unindexed predicated store followed
; by a separate `sub.w r0, r0, #512` (no post-increment).
; NOTE(review): presumably #-512 is outside the vstrw post-index range -- confirm.
define i8* @strw32_m512(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-LABEL: strw32_m512:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
; CHECK-NEXT: vstrwt.32 q0, [r0]
; CHECK-NEXT: sub.w r0, r0, #512
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 -512
  %0 = bitcast i8* %x to <4 x i32>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = load <4 x i32>, <4 x i32>* %0, align 4
  %2 = bitcast i8* %y to <4 x i32>*
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i16> store to %y (mask = <4 x i32> %m != 0) of data loaded
; from %x; returns %y + 4. The checks expect the pointer update folded into
; the predicated vstrh.32 as a post-increment of #4.
define i8* @strh32_4(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-LABEL: strh32_4:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
; CHECK-NEXT: vstrht.32 q0, [r0], #4
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 4
  %0 = bitcast i8* %x to <4 x i16>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = load <4 x i16>, <4 x i16>* %0, align 2
  %2 = bitcast i8* %y to <4 x i16>*
  call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i16> store to %y (mask = <4 x i32> %m != 0) of data loaded
; from %x; returns %y + 3. The checks expect an unindexed predicated store
; followed by a separate `adds r0, #3` (no post-increment).
; NOTE(review): presumably #3 is not a valid vstrh post-index offset -- confirm.
define i8* @strh32_3(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-LABEL: strh32_3:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
; CHECK-NEXT: vstrht.32 q0, [r0]
; CHECK-NEXT: adds r0, #3
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 3
  %0 = bitcast i8* %x to <4 x i16>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = load <4 x i16>, <4 x i16>* %0, align 2
  %2 = bitcast i8* %y to <4 x i16>*
  call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i16> store to %y (mask = <4 x i32> %m != 0) of data loaded
; from %x; returns %y + 2. The checks expect the pointer update folded into
; the predicated vstrh.32 as a post-increment of #2.
define i8* @strh32_2(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-LABEL: strh32_2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
; CHECK-NEXT: vstrht.32 q0, [r0], #2
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 2
  %0 = bitcast i8* %x to <4 x i16>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = load <4 x i16>, <4 x i16>* %0, align 2
  %2 = bitcast i8* %y to <4 x i16>*
  call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i16> store to %y (mask = <4 x i32> %m != 0) of data loaded
; from %x; returns %y + 254. The checks expect the pointer update folded
; into the predicated vstrh.32 as a post-increment of #254.
define i8* @strh32_254(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-LABEL: strh32_254:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
; CHECK-NEXT: vstrht.32 q0, [r0], #254
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 254
  %0 = bitcast i8* %x to <4 x i16>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = load <4 x i16>, <4 x i16>* %0, align 2
  %2 = bitcast i8* %y to <4 x i16>*
  call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i16> store to %y (mask = <4 x i32> %m != 0) of data loaded
; from %x; returns %y + 256. The checks expect an unindexed predicated store
; followed by a separate `add.w r0, r0, #256` (no post-increment).
; NOTE(review): presumably #256 is outside the vstrh post-index range -- confirm.
define i8* @strh32_256(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-LABEL: strh32_256:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
; CHECK-NEXT: vstrht.32 q0, [r0]
; CHECK-NEXT: add.w r0, r0, #256
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 256
  %0 = bitcast i8* %x to <4 x i16>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = load <4 x i16>, <4 x i16>* %0, align 2
  %2 = bitcast i8* %y to <4 x i16>*
  call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i16> store to %y (mask = <4 x i32> %m != 0) of data loaded
; from %x; returns %y - 254. The checks expect the pointer update folded
; into the predicated vstrh.32 as a post-increment of #-254.
define i8* @strh32_m254(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-LABEL: strh32_m254:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
; CHECK-NEXT: vstrht.32 q0, [r0], #-254
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 -254
  %0 = bitcast i8* %x to <4 x i16>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = load <4 x i16>, <4 x i16>* %0, align 2
  %2 = bitcast i8* %y to <4 x i16>*
  call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i16> store to %y (mask = <4 x i32> %m != 0) of data loaded
; from %x; returns %y - 256. The checks expect an unindexed predicated store
; followed by a separate `sub.w r0, r0, #256` (no post-increment).
; NOTE(review): presumably #-256 is outside the vstrh post-index range -- confirm.
define i8* @strh32_m256(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-LABEL: strh32_m256:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
; CHECK-NEXT: vstrht.32 q0, [r0]
; CHECK-NEXT: sub.w r0, r0, #256
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 -256
  %0 = bitcast i8* %x to <4 x i16>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = load <4 x i16>, <4 x i16>* %0, align 2
  %2 = bitcast i8* %y to <4 x i16>*
  call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked <8 x i16> store to %y (mask = %m != 0) of data loaded from %x;
; returns %y + 4. The checks expect the pointer update folded into the
; predicated vstrh.16 as a post-increment of #4.
define i8* @strh16_4(i8* %y, i8* %x, <8 x i16> *%m) {
; CHECK-LABEL: strh16_4:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r1]
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vpt.i16 ne, q1, zr
; CHECK-NEXT: vstrht.16 q0, [r0], #4
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 4
  %0 = bitcast i8* %x to <8 x i16>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = load <8 x i16>, <8 x i16>* %0, align 2
  %2 = bitcast i8* %y to <8 x i16>*
  call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked <8 x i16> store to %y (mask = %m != 0) of data loaded from %x;
; returns %y + 3. The checks expect an unindexed predicated store followed
; by a separate `adds r0, #3` (no post-increment).
; NOTE(review): presumably #3 is not a valid vstrh post-index offset -- confirm.
define i8* @strh16_3(i8* %y, i8* %x, <8 x i16> *%m) {
; CHECK-LABEL: strh16_3:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r1]
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vpt.i16 ne, q1, zr
; CHECK-NEXT: vstrht.16 q0, [r0]
; CHECK-NEXT: adds r0, #3
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 3
  %0 = bitcast i8* %x to <8 x i16>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = load <8 x i16>, <8 x i16>* %0, align 2
  %2 = bitcast i8* %y to <8 x i16>*
  call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked <8 x i16> store to %y (mask = %m != 0) of data loaded from %x;
; returns %y + 2. The checks expect the pointer update folded into the
; predicated vstrh.16 as a post-increment of #2.
define i8* @strh16_2(i8* %y, i8* %x, <8 x i16> *%m) {
; CHECK-LABEL: strh16_2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r1]
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vpt.i16 ne, q1, zr
; CHECK-NEXT: vstrht.16 q0, [r0], #2
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 2
  %0 = bitcast i8* %x to <8 x i16>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = load <8 x i16>, <8 x i16>* %0, align 2
  %2 = bitcast i8* %y to <8 x i16>*
  call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked <8 x i16> store to %y (mask = %m != 0) of data loaded from %x;
; returns %y + 254. The checks expect the pointer update folded into the
; predicated vstrh.16 as a post-increment of #254.
define i8* @strh16_254(i8* %y, i8* %x, <8 x i16> *%m) {
; CHECK-LABEL: strh16_254:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r1]
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vpt.i16 ne, q1, zr
; CHECK-NEXT: vstrht.16 q0, [r0], #254
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 254
  %0 = bitcast i8* %x to <8 x i16>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = load <8 x i16>, <8 x i16>* %0, align 2
  %2 = bitcast i8* %y to <8 x i16>*
  call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked <8 x i16> store to %y (mask = %m != 0) of data loaded from %x;
; returns %y + 256. The checks expect an unindexed predicated store followed
; by a separate `add.w r0, r0, #256` (no post-increment).
; NOTE(review): presumably #256 is outside the vstrh post-index range -- confirm.
define i8* @strh16_256(i8* %y, i8* %x, <8 x i16> *%m) {
; CHECK-LABEL: strh16_256:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r1]
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vpt.i16 ne, q1, zr
; CHECK-NEXT: vstrht.16 q0, [r0]
; CHECK-NEXT: add.w r0, r0, #256
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 256
  %0 = bitcast i8* %x to <8 x i16>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = load <8 x i16>, <8 x i16>* %0, align 2
  %2 = bitcast i8* %y to <8 x i16>*
  call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked <8 x i16> store to %y (mask = %m != 0) of data loaded from %x;
; returns %y - 254. The checks expect the pointer update folded into the
; predicated vstrh.16 as a post-increment of #-254.
define i8* @strh16_m254(i8* %y, i8* %x, <8 x i16> *%m) {
; CHECK-LABEL: strh16_m254:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r1]
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vpt.i16 ne, q1, zr
; CHECK-NEXT: vstrht.16 q0, [r0], #-254
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 -254
  %0 = bitcast i8* %x to <8 x i16>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = load <8 x i16>, <8 x i16>* %0, align 2
  %2 = bitcast i8* %y to <8 x i16>*
  call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked <8 x i16> store to %y (mask = %m != 0) of data loaded from %x;
; returns %y - 256. The checks expect an unindexed predicated store followed
; by a separate `sub.w r0, r0, #256` (no post-increment).
; NOTE(review): presumably #-256 is outside the vstrh post-index range -- confirm.
define i8* @strh16_m256(i8* %y, i8* %x, <8 x i16> *%m) {
; CHECK-LABEL: strh16_m256:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r1]
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vpt.i16 ne, q1, zr
; CHECK-NEXT: vstrht.16 q0, [r0]
; CHECK-NEXT: sub.w r0, r0, #256
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 -256
  %0 = bitcast i8* %x to <8 x i16>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = load <8 x i16>, <8 x i16>* %0, align 2
  %2 = bitcast i8* %y to <8 x i16>*
  call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i8> store to %y (mask = <4 x i32> %m != 0) of data loaded
; from %x; returns %y + 4. The checks expect the pointer update folded into
; the predicated vstrb.32 as a post-increment of #4.
define i8* @strb32_4(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-LABEL: strb32_4:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
; CHECK-NEXT: vstrbt.32 q0, [r0], #4
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 4
  %0 = bitcast i8* %x to <4 x i8>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = load <4 x i8>, <4 x i8>* %0, align 1
  %2 = bitcast i8* %y to <4 x i8>*
  call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i8> store to %y (mask = <4 x i32> %m != 0) of data loaded
; from %x; returns %y + 3. The checks expect the pointer update folded into
; the predicated vstrb.32 as a post-increment of #3.
define i8* @strb32_3(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-LABEL: strb32_3:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
; CHECK-NEXT: vstrbt.32 q0, [r0], #3
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 3
  %0 = bitcast i8* %x to <4 x i8>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = load <4 x i8>, <4 x i8>* %0, align 1
  %2 = bitcast i8* %y to <4 x i8>*
  call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i8> store to %y (mask = <4 x i32> %m != 0) of data loaded
; from %x; returns %y + 2. The checks expect the pointer update folded into
; the predicated vstrb.32 as a post-increment of #2.
define i8* @strb32_2(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-LABEL: strb32_2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
; CHECK-NEXT: vstrbt.32 q0, [r0], #2
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 2
  %0 = bitcast i8* %x to <4 x i8>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = load <4 x i8>, <4 x i8>* %0, align 1
  %2 = bitcast i8* %y to <4 x i8>*
  call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i8> store to %y (mask = <4 x i32> %m != 0) of data loaded
; from %x; returns %y + 127. The checks expect the pointer update folded
; into the predicated vstrb.32 as a post-increment of #127.
define i8* @strb32_127(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-LABEL: strb32_127:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
; CHECK-NEXT: vstrbt.32 q0, [r0], #127
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 127
  %0 = bitcast i8* %x to <4 x i8>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = load <4 x i8>, <4 x i8>* %0, align 1
  %2 = bitcast i8* %y to <4 x i8>*
  call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i8> store to %y (mask = <4 x i32> %m != 0) of data loaded
; from %x; returns %y + 128. The checks expect an unindexed predicated store
; followed by a separate `adds r0, #128` (no post-increment).
; NOTE(review): presumably #128 is outside the vstrb post-index range -- confirm.
define i8* @strb32_128(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-LABEL: strb32_128:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
; CHECK-NEXT: vstrbt.32 q0, [r0]
; CHECK-NEXT: adds r0, #128
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 128
  %0 = bitcast i8* %x to <4 x i8>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = load <4 x i8>, <4 x i8>* %0, align 1
  %2 = bitcast i8* %y to <4 x i8>*
  call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i8> store to %y (mask = <4 x i32> %m != 0) of data loaded
; from %x; returns %y - 127. The checks expect the pointer update folded
; into the predicated vstrb.32 as a post-increment of #-127.
define i8* @strb32_m127(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-LABEL: strb32_m127:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
; CHECK-NEXT: vstrbt.32 q0, [r0], #-127
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 -127
  %0 = bitcast i8* %x to <4 x i8>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = load <4 x i8>, <4 x i8>* %0, align 1
  %2 = bitcast i8* %y to <4 x i8>*
  call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x i8> store to %y (mask = <4 x i32> %m != 0) of data loaded
; from %x; returns %y - 128. The checks expect an unindexed predicated store
; followed by a separate `subs r0, #128` (no post-increment).
; NOTE(review): presumably #-128 is outside the vstrb post-index range -- confirm.
define i8* @strb32_m128(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-LABEL: strb32_m128:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
; CHECK-NEXT: vstrbt.32 q0, [r0]
; CHECK-NEXT: subs r0, #128
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 -128
  %0 = bitcast i8* %x to <4 x i8>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = load <4 x i8>, <4 x i8>* %0, align 1
  %2 = bitcast i8* %y to <4 x i8>*
  call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked <8 x i8> store to %y (mask = <8 x i16> %m != 0) of data loaded
; from %x; returns %y + 4. The checks expect the pointer update folded into
; the predicated vstrb.16 as a post-increment of #4.
define i8* @strb16_4(i8* %y, i8* %x, <8 x i16> *%m) {
; CHECK-LABEL: strb16_4:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u16 q0, [r1]
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vpt.i16 ne, q1, zr
; CHECK-NEXT: vstrbt.16 q0, [r0], #4
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 4
  %0 = bitcast i8* %x to <8 x i8>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = load <8 x i8>, <8 x i8>* %0, align 1
  %2 = bitcast i8* %y to <8 x i8>*
  call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked <8 x i8> store to %y (mask = <8 x i16> %m != 0) of data loaded
; from %x; returns %y + 3. The checks expect the pointer update folded into
; the predicated vstrb.16 as a post-increment of #3.
define i8* @strb16_3(i8* %y, i8* %x, <8 x i16> *%m) {
; CHECK-LABEL: strb16_3:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u16 q0, [r1]
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vpt.i16 ne, q1, zr
; CHECK-NEXT: vstrbt.16 q0, [r0], #3
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 3
  %0 = bitcast i8* %x to <8 x i8>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = load <8 x i8>, <8 x i8>* %0, align 1
  %2 = bitcast i8* %y to <8 x i8>*
  call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked <8 x i8> store to %y (mask = <8 x i16> %m != 0) of data loaded
; from %x; returns %y + 2. The checks expect the pointer update folded into
; the predicated vstrb.16 as a post-increment of #2.
define i8* @strb16_2(i8* %y, i8* %x, <8 x i16> *%m) {
; CHECK-LABEL: strb16_2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u16 q0, [r1]
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vpt.i16 ne, q1, zr
; CHECK-NEXT: vstrbt.16 q0, [r0], #2
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 2
  %0 = bitcast i8* %x to <8 x i8>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = load <8 x i8>, <8 x i8>* %0, align 1
  %2 = bitcast i8* %y to <8 x i8>*
  call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked <8 x i8> store to %y (mask = <8 x i16> %m != 0) of data loaded
; from %x; returns %y + 127. The checks expect the pointer update folded
; into the predicated vstrb.16 as a post-increment of #127.
define i8* @strb16_127(i8* %y, i8* %x, <8 x i16> *%m) {
; CHECK-LABEL: strb16_127:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u16 q0, [r1]
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vpt.i16 ne, q1, zr
; CHECK-NEXT: vstrbt.16 q0, [r0], #127
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 127
  %0 = bitcast i8* %x to <8 x i8>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = load <8 x i8>, <8 x i8>* %0, align 1
  %2 = bitcast i8* %y to <8 x i8>*
  call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked <8 x i8> store to %y (mask = <8 x i16> %m != 0) of data loaded
; from %x; returns %y + 128. The checks expect an unindexed predicated store
; followed by a separate `adds r0, #128` (no post-increment).
; NOTE(review): presumably #128 is outside the vstrb post-index range -- confirm.
define i8* @strb16_128(i8* %y, i8* %x, <8 x i16> *%m) {
; CHECK-LABEL: strb16_128:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u16 q0, [r1]
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vpt.i16 ne, q1, zr
; CHECK-NEXT: vstrbt.16 q0, [r0]
; CHECK-NEXT: adds r0, #128
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 128
  %0 = bitcast i8* %x to <8 x i8>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = load <8 x i8>, <8 x i8>* %0, align 1
  %2 = bitcast i8* %y to <8 x i8>*
  call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked <8 x i8> store to %y (mask = <8 x i16> %m != 0) of data loaded
; from %x; returns %y - 127. The checks expect the pointer update folded
; into the predicated vstrb.16 as a post-increment of #-127.
define i8* @strb16_m127(i8* %y, i8* %x, <8 x i16> *%m) {
; CHECK-LABEL: strb16_m127:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u16 q0, [r1]
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vpt.i16 ne, q1, zr
; CHECK-NEXT: vstrbt.16 q0, [r0], #-127
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 -127
  %0 = bitcast i8* %x to <8 x i8>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = load <8 x i8>, <8 x i8>* %0, align 1
  %2 = bitcast i8* %y to <8 x i8>*
  call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked <8 x i8> store to %y (mask = <8 x i16> %m != 0) of data loaded
; from %x; returns %y - 128. The checks expect an unindexed predicated store
; followed by a separate `subs r0, #128` (no post-increment).
; NOTE(review): presumably #-128 is outside the vstrb post-index range -- confirm.
define i8* @strb16_m128(i8* %y, i8* %x, <8 x i16> *%m) {
; CHECK-LABEL: strb16_m128:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u16 q0, [r1]
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vpt.i16 ne, q1, zr
; CHECK-NEXT: vstrbt.16 q0, [r0]
; CHECK-NEXT: subs r0, #128
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 -128
  %0 = bitcast i8* %x to <8 x i8>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = load <8 x i8>, <8 x i8>* %0, align 1
  %2 = bitcast i8* %y to <8 x i8>*
  call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked <16 x i8> store to %y (mask = %m != 0) of data loaded from %x;
; returns %y + 4. The checks expect the pointer update folded into the
; predicated vstrb.8 as a post-increment of #4.
define i8* @strb8_4(i8* %y, i8* %x, <16 x i8> *%m) {
; CHECK-LABEL: strb8_4:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q0, [r1]
; CHECK-NEXT: vldrb.u8 q1, [r2]
; CHECK-NEXT: vpt.i8 ne, q1, zr
; CHECK-NEXT: vstrbt.8 q0, [r0], #4
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 4
  %0 = bitcast i8* %x to <16 x i8>*
  %mask = load <16 x i8>, <16 x i8>* %m, align 1
  %c = icmp ne <16 x i8> %mask, zeroinitializer
  %1 = load <16 x i8>, <16 x i8>* %0, align 1
  %2 = bitcast i8* %y to <16 x i8>*
  call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked <16 x i8> store to %y (mask = %m != 0) of data loaded from %x;
; returns %y + 3. The checks expect the pointer update folded into the
; predicated vstrb.8 as a post-increment of #3.
define i8* @strb8_3(i8* %y, i8* %x, <16 x i8> *%m) {
; CHECK-LABEL: strb8_3:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q0, [r1]
; CHECK-NEXT: vldrb.u8 q1, [r2]
; CHECK-NEXT: vpt.i8 ne, q1, zr
; CHECK-NEXT: vstrbt.8 q0, [r0], #3
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 3
  %0 = bitcast i8* %x to <16 x i8>*
  %mask = load <16 x i8>, <16 x i8>* %m, align 1
  %c = icmp ne <16 x i8> %mask, zeroinitializer
  %1 = load <16 x i8>, <16 x i8>* %0, align 1
  %2 = bitcast i8* %y to <16 x i8>*
  call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked <16 x i8> store to %y (mask = %m != 0) of data loaded from %x;
; returns %y + 2. The checks expect the pointer update folded into the
; predicated vstrb.8 as a post-increment of #2.
define i8* @strb8_2(i8* %y, i8* %x, <16 x i8> *%m) {
; CHECK-LABEL: strb8_2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q0, [r1]
; CHECK-NEXT: vldrb.u8 q1, [r2]
; CHECK-NEXT: vpt.i8 ne, q1, zr
; CHECK-NEXT: vstrbt.8 q0, [r0], #2
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 2
  %0 = bitcast i8* %x to <16 x i8>*
  %mask = load <16 x i8>, <16 x i8>* %m, align 1
  %c = icmp ne <16 x i8> %mask, zeroinitializer
  %1 = load <16 x i8>, <16 x i8>* %0, align 1
  %2 = bitcast i8* %y to <16 x i8>*
  call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked <16 x i8> store to %y (mask = %m != 0) of data loaded from %x;
; returns %y + 127. The checks expect the pointer update folded into the
; predicated vstrb.8 as a post-increment of #127.
define i8* @strb8_127(i8* %y, i8* %x, <16 x i8> *%m) {
; CHECK-LABEL: strb8_127:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q0, [r1]
; CHECK-NEXT: vldrb.u8 q1, [r2]
; CHECK-NEXT: vpt.i8 ne, q1, zr
; CHECK-NEXT: vstrbt.8 q0, [r0], #127
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 127
  %0 = bitcast i8* %x to <16 x i8>*
  %mask = load <16 x i8>, <16 x i8>* %m, align 1
  %c = icmp ne <16 x i8> %mask, zeroinitializer
  %1 = load <16 x i8>, <16 x i8>* %0, align 1
  %2 = bitcast i8* %y to <16 x i8>*
  call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked <16 x i8> store to %y (mask = %m != 0) of data loaded from %x;
; returns %y + 128. The checks expect an unindexed predicated store followed
; by a separate `adds r0, #128` (no post-increment).
; NOTE(review): presumably #128 is outside the vstrb post-index range -- confirm.
define i8* @strb8_128(i8* %y, i8* %x, <16 x i8> *%m) {
; CHECK-LABEL: strb8_128:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q0, [r1]
; CHECK-NEXT: vldrb.u8 q1, [r2]
; CHECK-NEXT: vpt.i8 ne, q1, zr
; CHECK-NEXT: vstrbt.8 q0, [r0]
; CHECK-NEXT: adds r0, #128
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 128
  %0 = bitcast i8* %x to <16 x i8>*
  %mask = load <16 x i8>, <16 x i8>* %m, align 1
  %c = icmp ne <16 x i8> %mask, zeroinitializer
  %1 = load <16 x i8>, <16 x i8>* %0, align 1
  %2 = bitcast i8* %y to <16 x i8>*
  call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked <16 x i8> store to %y (mask = %m != 0) of data loaded from %x;
; returns %y - 127. The checks expect the pointer update folded into the
; predicated vstrb.8 as a post-increment of #-127.
define i8* @strb8_m127(i8* %y, i8* %x, <16 x i8> *%m) {
; CHECK-LABEL: strb8_m127:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q0, [r1]
; CHECK-NEXT: vldrb.u8 q1, [r2]
; CHECK-NEXT: vpt.i8 ne, q1, zr
; CHECK-NEXT: vstrbt.8 q0, [r0], #-127
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 -127
  %0 = bitcast i8* %x to <16 x i8>*
  %mask = load <16 x i8>, <16 x i8>* %m, align 1
  %c = icmp ne <16 x i8> %mask, zeroinitializer
  %1 = load <16 x i8>, <16 x i8>* %0, align 1
  %2 = bitcast i8* %y to <16 x i8>*
  call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked <16 x i8> store to %y (mask = %m != 0) of data loaded from %x;
; returns %y - 128. The checks expect an unindexed predicated store followed
; by a separate `subs r0, #128` (no post-increment).
; NOTE(review): presumably #-128 is outside the vstrb post-index range -- confirm.
define i8* @strb8_m128(i8* %y, i8* %x, <16 x i8> *%m) {
; CHECK-LABEL: strb8_m128:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q0, [r1]
; CHECK-NEXT: vldrb.u8 q1, [r2]
; CHECK-NEXT: vpt.i8 ne, q1, zr
; CHECK-NEXT: vstrbt.8 q0, [r0]
; CHECK-NEXT: subs r0, #128
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 -128
  %0 = bitcast i8* %x to <16 x i8>*
  %mask = load <16 x i8>, <16 x i8>* %m, align 1
  %c = icmp ne <16 x i8> %mask, zeroinitializer
  %1 = load <16 x i8>, <16 x i8>* %0, align 1
  %2 = bitcast i8* %y to <16 x i8>*
  call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked <4 x float> store to %y (mask = <4 x i32> %m != 0) of data loaded
; from %x; returns %y + 4. The checks expect the pointer update folded into
; the predicated vstrw.32 as a post-increment of #4.
define i8* @strwf32_4(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-LABEL: strwf32_4:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
; CHECK-NEXT: vstrwt.32 q0, [r0], #4
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 4
  %0 = bitcast i8* %x to <4 x float>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = load <4 x float>, <4 x float>* %0, align 4
  %2 = bitcast i8* %y to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked store of a <4 x float> vector (loaded from %x) to %y, with the lane
; predicate computed as (*%m != 0). Returns %y + 3.
; The expected assembly keeps the predicated vstrwt.32 un-indexed and updates
; r0 with a separate adds, i.e. no post-increment store is formed here.
; NOTE(review): presumably because 3 is not a multiple of the 4-byte element
; size -- confirm against the MVE VSTRW encoding.
define i8* @strwf32_3(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-LABEL: strwf32_3:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
; CHECK-NEXT: vstrwt.32 q0, [r0]
; CHECK-NEXT: adds r0, #3
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 3
  %0 = bitcast i8* %x to <4 x float>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = load <4 x float>, <4 x float>* %0, align 4
  %2 = bitcast i8* %y to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked store of a <4 x float> vector (loaded from %x) to %y, with the lane
; predicate computed as (*%m != 0). Returns %y + 2.
; The expected assembly keeps the predicated vstrwt.32 un-indexed and updates
; r0 with a separate adds, i.e. no post-increment store is formed here.
; NOTE(review): presumably because 2 is not a multiple of the 4-byte element
; size -- confirm against the MVE VSTRW encoding.
define i8* @strwf32_2(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-LABEL: strwf32_2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
; CHECK-NEXT: vstrwt.32 q0, [r0]
; CHECK-NEXT: adds r0, #2
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 2
  %0 = bitcast i8* %x to <4 x float>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = load <4 x float>, <4 x float>* %0, align 4
  %2 = bitcast i8* %y to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked store of a <4 x float> vector (loaded from %x) to %y, with the lane
; predicate computed as (*%m != 0). Returns %y + 508.
; The expected assembly folds the pointer update into the predicated store
; as a post-increment: vstrwt.32 q0, [r0], #508.
; NOTE(review): 508 looks like the largest positive word-store post-increment
; exercised by this file -- confirm it is the encoding limit.
define i8* @strwf32_508(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-LABEL: strwf32_508:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
; CHECK-NEXT: vstrwt.32 q0, [r0], #508
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 508
  %0 = bitcast i8* %x to <4 x float>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = load <4 x float>, <4 x float>* %0, align 4
  %2 = bitcast i8* %y to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked store of a <4 x float> vector (loaded from %x) to %y, with the lane
; predicate computed as (*%m != 0). Returns %y + 512.
; The expected assembly keeps the predicated vstrwt.32 un-indexed and updates
; r0 with a separate add.w, i.e. no post-increment store is formed here.
; NOTE(review): presumably 512 is just outside the word-store post-increment
; immediate range -- confirm against the MVE VSTRW encoding.
define i8* @strwf32_512(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-LABEL: strwf32_512:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
; CHECK-NEXT: vstrwt.32 q0, [r0]
; CHECK-NEXT: add.w r0, r0, #512
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 512
  %0 = bitcast i8* %x to <4 x float>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = load <4 x float>, <4 x float>* %0, align 4
  %2 = bitcast i8* %y to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked store of a <4 x float> vector (loaded from %x) to %y, with the lane
; predicate computed as (*%m != 0). Returns %y - 508.
; The expected assembly folds the pointer update into the predicated store
; as a negative post-increment: vstrwt.32 q0, [r0], #-508.
define i8* @strwf32_m508(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-LABEL: strwf32_m508:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
; CHECK-NEXT: vstrwt.32 q0, [r0], #-508
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 -508
  %0 = bitcast i8* %x to <4 x float>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = load <4 x float>, <4 x float>* %0, align 4
  %2 = bitcast i8* %y to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked store of a <4 x float> vector (loaded from %x) to %y, with the lane
; predicate computed as (*%m != 0). Returns %y - 512.
; The expected assembly keeps the predicated vstrwt.32 un-indexed and updates
; r0 with a separate sub.w, i.e. no post-increment store is formed here.
; NOTE(review): presumably -512 is just outside the word-store post-increment
; immediate range -- confirm against the MVE VSTRW encoding.
define i8* @strwf32_m512(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-LABEL: strwf32_m512:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
; CHECK-NEXT: vstrwt.32 q0, [r0]
; CHECK-NEXT: sub.w r0, r0, #512
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 -512
  %0 = bitcast i8* %x to <4 x float>*
  %mask = load <4 x i32>, <4 x i32>* %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %1 = load <4 x float>, <4 x float>* %0, align 4
  %2 = bitcast i8* %y to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked store of an <8 x half> vector (loaded from %x) to %y, with the lane
; predicate computed as (*%m != 0). Returns %y + 4.
; The expected assembly folds the pointer update into the predicated store
; as a post-increment: vstrht.16 q0, [r0], #4.
define i8* @strhf16_4(i8* %y, i8* %x, <8 x i16> *%m) {
; CHECK-LABEL: strhf16_4:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r1]
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vpt.i16 ne, q1, zr
; CHECK-NEXT: vstrht.16 q0, [r0], #4
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 4
  %0 = bitcast i8* %x to <8 x half>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = load <8 x half>, <8 x half>* %0, align 2
  %2 = bitcast i8* %y to <8 x half>*
  call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked store of an <8 x half> vector (loaded from %x) to %y, with the lane
; predicate computed as (*%m != 0). Returns %y + 3.
; The expected assembly keeps the predicated vstrht.16 un-indexed and updates
; r0 with a separate adds, i.e. no post-increment store is formed here.
; NOTE(review): presumably because 3 is not a multiple of the 2-byte element
; size -- confirm against the MVE VSTRH encoding.
define i8* @strhf16_3(i8* %y, i8* %x, <8 x i16> *%m) {
; CHECK-LABEL: strhf16_3:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r1]
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vpt.i16 ne, q1, zr
; CHECK-NEXT: vstrht.16 q0, [r0]
; CHECK-NEXT: adds r0, #3
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 3
  %0 = bitcast i8* %x to <8 x half>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = load <8 x half>, <8 x half>* %0, align 2
  %2 = bitcast i8* %y to <8 x half>*
  call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked store of an <8 x half> vector (loaded from %x) to %y, with the lane
; predicate computed as (*%m != 0). Returns %y + 2.
; The expected assembly folds the pointer update into the predicated store
; as a post-increment: vstrht.16 q0, [r0], #2.
define i8* @strhf16_2(i8* %y, i8* %x, <8 x i16> *%m) {
; CHECK-LABEL: strhf16_2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r1]
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vpt.i16 ne, q1, zr
; CHECK-NEXT: vstrht.16 q0, [r0], #2
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 2
  %0 = bitcast i8* %x to <8 x half>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = load <8 x half>, <8 x half>* %0, align 2
  %2 = bitcast i8* %y to <8 x half>*
  call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked store of an <8 x half> vector (loaded from %x) to %y, with the lane
; predicate computed as (*%m != 0). Returns %y + 254.
; The expected assembly folds the pointer update into the predicated store
; as a post-increment: vstrht.16 q0, [r0], #254.
; NOTE(review): 254 looks like the largest positive halfword-store
; post-increment exercised by this file -- confirm it is the encoding limit.
define i8* @strhf16_254(i8* %y, i8* %x, <8 x i16> *%m) {
; CHECK-LABEL: strhf16_254:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r1]
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vpt.i16 ne, q1, zr
; CHECK-NEXT: vstrht.16 q0, [r0], #254
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 254
  %0 = bitcast i8* %x to <8 x half>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = load <8 x half>, <8 x half>* %0, align 2
  %2 = bitcast i8* %y to <8 x half>*
  call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked store of an <8 x half> vector (loaded from %x) to %y, with the lane
; predicate computed as (*%m != 0). Returns %y + 256.
; The expected assembly keeps the predicated vstrht.16 un-indexed and updates
; r0 with a separate add.w, i.e. no post-increment store is formed here.
; NOTE(review): presumably 256 is just outside the halfword-store
; post-increment immediate range -- confirm against the MVE VSTRH encoding.
define i8* @strhf16_256(i8* %y, i8* %x, <8 x i16> *%m) {
; CHECK-LABEL: strhf16_256:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r1]
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vpt.i16 ne, q1, zr
; CHECK-NEXT: vstrht.16 q0, [r0]
; CHECK-NEXT: add.w r0, r0, #256
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 256
  %0 = bitcast i8* %x to <8 x half>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = load <8 x half>, <8 x half>* %0, align 2
  %2 = bitcast i8* %y to <8 x half>*
  call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked store of an <8 x half> vector (loaded from %x) to %y, with the lane
; predicate computed as (*%m != 0). Returns %y - 254.
; The expected assembly folds the pointer update into the predicated store
; as a negative post-increment: vstrht.16 q0, [r0], #-254.
define i8* @strhf16_m254(i8* %y, i8* %x, <8 x i16> *%m) {
; CHECK-LABEL: strhf16_m254:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r1]
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vpt.i16 ne, q1, zr
; CHECK-NEXT: vstrht.16 q0, [r0], #-254
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 -254
  %0 = bitcast i8* %x to <8 x half>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = load <8 x half>, <8 x half>* %0, align 2
  %2 = bitcast i8* %y to <8 x half>*
  call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked store of an <8 x half> vector (loaded from %x) to %y, with the lane
; predicate computed as (*%m != 0). Returns %y - 256.
; The expected assembly keeps the predicated vstrht.16 un-indexed and updates
; r0 with a separate sub.w, i.e. no post-increment store is formed here.
; NOTE(review): presumably -256 is just outside the halfword-store
; post-increment immediate range -- confirm against the MVE VSTRH encoding.
define i8* @strhf16_m256(i8* %y, i8* %x, <8 x i16> *%m) {
; CHECK-LABEL: strhf16_m256:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r1]
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vpt.i16 ne, q1, zr
; CHECK-NEXT: vstrht.16 q0, [r0]
; CHECK-NEXT: sub.w r0, r0, #256
; CHECK-NEXT: bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 -256
  %0 = bitcast i8* %x to <8 x half>*
  %mask = load <8 x i16>, <8 x i16>* %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %1 = load <8 x half>, <8 x half>* %0, align 2
  %2 = bitcast i8* %y to <8 x half>*
  call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c)
  ret i8* %z
}
|
||
|
|
||
|
; Masked-load intrinsic declarations.
; Operands: (pointer, alignment, lane mask, pass-through value).
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32, <4 x i1>, <4 x i16>)
declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>)
declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32, <4 x i1>, <4 x i8>)
declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32, <8 x i1>, <8 x i8>)
declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>)
declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
declare <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>*, i32, <8 x i1>, <8 x half>)

; Masked-store intrinsic declarations.
; Operands: (value, pointer, alignment, lane mask).
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i1>)
declare void @llvm.masked.store.v4i16.p0v4i16(<4 x i16>, <4 x i16>*, i32, <4 x i1>)
declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32, <16 x i1>)
declare void @llvm.masked.store.v8i8.p0v8i8(<8 x i8>, <8 x i8>*, i32, <8 x i1>)
declare void @llvm.masked.store.v4i8.p0v4i8(<4 x i8>, <4 x i8>*, i32, <4 x i1>)
declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>)
declare void @llvm.masked.store.v8f16.p0v8f16(<8 x half>, <8 x half>*, i32, <8 x i1>)
|