; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X64

; Tests lowering of the SSE4.2 packed-compare string intrinsics
; (pcmpestri/pcmpistri/pcmpestrm) on both 32-bit (X86) and 64-bit (X64)
; targets, for register and memory operands, i8 and i16 element widths,
; and for combined uses of the flag / index / mask results.
;
; Immediate control bytes used below (per Intel's PCMPxSTRx encoding):
;   $24 (0b011000) = unsigned byte elements, "equal each" compare, negated polarity
;   $25 (0b011001) = unsigned word elements, "equal each" compare, negated polarity
; The *ic128 intrinsics return the carry flag, so `icmp eq i32 %c, 0`
; lowers to `setae` (CF == 0) and `icmp ne`-style flag uses lower to `setb`.

; Intrinsic declarations for the SSE4.2 string-compare instructions.
declare i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32, <16 x i8>, i32, i8)
declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32, <16 x i8>, i32, i8)
declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32, <16 x i8>, i32, i8)
declare i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8>, i8)
declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8>, i8)
declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8>, i8)

; ---- pcmpestri (explicit-length), i8 elements, register operands ----

define i1 @pcmpestri_reg_eq_i8(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len) nounwind {
; X86-LABEL: pcmpestri_reg_eq_i8:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: pcmpestri $24, %xmm1, %xmm0
; X86-NEXT: setae %al
; X86-NEXT: retl
;
; X64-LABEL: pcmpestri_reg_eq_i8:
; X64: # %bb.0: # %entry
; X64-NEXT: movl %esi, %edx
; X64-NEXT: movl %edi, %eax
; X64-NEXT: pcmpestri $24, %xmm1, %xmm0
; X64-NEXT: setae %al
; X64-NEXT: retq
entry:
  %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
  %result = icmp eq i32 %c, 0
  ret i1 %result
}

define i32 @pcmpestri_reg_idx_i8(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len) nounwind {
; X86-LABEL: pcmpestri_reg_idx_i8:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: pcmpestri $24, %xmm1, %xmm0
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: pcmpestri_reg_idx_i8:
; X64: # %bb.0: # %entry
; X64-NEXT: movl %esi, %edx
; X64-NEXT: movl %edi, %eax
; X64-NEXT: pcmpestri $24, %xmm1, %xmm0
; X64-NEXT: movl %ecx, %eax
; X64-NEXT: retq
entry:
  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
  ret i32 %idx
}

; Uses the returned index to extract and subtract the differing elements;
; index 16 means "no difference found".
define i32 @pcmpestri_reg_diff_i8(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len) nounwind {
; X86-LABEL: pcmpestri_reg_diff_i8:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-16, %esp
; X86-NEXT: subl $48, %esp
; X86-NEXT: movl 8(%ebp), %eax
; X86-NEXT: movl 12(%ebp), %edx
; X86-NEXT: pcmpestri $24, %xmm1, %xmm0
; X86-NEXT: cmpl $16, %ecx
; X86-NEXT: jne .LBB2_2
; X86-NEXT: # %bb.1:
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: jmp .LBB2_3
; X86-NEXT: .LBB2_2: # %compare
; X86-NEXT: movdqa %xmm0, (%esp)
; X86-NEXT: andl $15, %ecx
; X86-NEXT: movb (%esp,%ecx), %al
; X86-NEXT: movdqa %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT: subb 16(%esp,%ecx), %al
; X86-NEXT: .LBB2_3: # %exit
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-LABEL: pcmpestri_reg_diff_i8:
; X64: # %bb.0: # %entry
; X64-NEXT: movl %esi, %edx
; X64-NEXT: movl %edi, %eax
; X64-NEXT: pcmpestri $24, %xmm1, %xmm0
; X64-NEXT: # kill: def $ecx killed $ecx def $rcx
; X64-NEXT: cmpl $16, %ecx
; X64-NEXT: jne .LBB2_2
; X64-NEXT: # %bb.1:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: retq
; X64-NEXT: .LBB2_2: # %compare
; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: andl $15, %ecx
; X64-NEXT: movb -24(%rsp,%rcx), %al
; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT: subb -40(%rsp,%rcx), %al
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: retq
entry:
  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
  %eq = icmp eq i32 %idx, 16
  br i1 %eq, label %exit, label %compare

compare:
  %lhs_c = extractelement <16 x i8> %lhs, i32 %idx
  %rhs_c = extractelement <16 x i8> %rhs, i32 %idx
  %sub = sub i8 %lhs_c, %rhs_c
  br label %exit

exit:
  %result = phi i8 [ 0, %entry ], [ %sub, %compare ]
  %result_ext = zext i8 %result to i32
  ret i32 %result_ext
}

; ---- pcmpestri, i8 elements, memory operands (unaligned loads fold) ----

define i1 @pcmpestri_mem_eq_i8(i8* %lhs_ptr, i32 %lhs_len, i8* %rhs_ptr, i32 %rhs_len) nounwind {
; X86-LABEL: pcmpestri_mem_eq_i8:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movdqu (%esi), %xmm0
; X86-NEXT: pcmpestri $24, (%ecx), %xmm0
; X86-NEXT: setae %al
; X86-NEXT: popl %esi
; X86-NEXT: retl
;
; X64-LABEL: pcmpestri_mem_eq_i8:
; X64: # %bb.0: # %entry
; X64-NEXT: movq %rdx, %r8
; X64-NEXT: movl %esi, %eax
; X64-NEXT: movdqu (%rdi), %xmm0
; X64-NEXT: movl %ecx, %edx
; X64-NEXT: pcmpestri $24, (%r8), %xmm0
; X64-NEXT: setae %al
; X64-NEXT: retq
entry:
  %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>*
  %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1
  %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>*
  %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1
  %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
  %result = icmp eq i32 %c, 0
  ret i1 %result
}

define i32 @pcmpestri_mem_idx_i8(i8* %lhs_ptr, i32 %lhs_len, i8* %rhs_ptr, i32 %rhs_len) nounwind {
; X86-LABEL: pcmpestri_mem_idx_i8:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movdqu (%esi), %xmm0
; X86-NEXT: pcmpestri $24, (%ecx), %xmm0
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: popl %esi
; X86-NEXT: retl
;
; X64-LABEL: pcmpestri_mem_idx_i8:
; X64: # %bb.0: # %entry
; X64-NEXT: movq %rdx, %r8
; X64-NEXT: movl %esi, %eax
; X64-NEXT: movdqu (%rdi), %xmm0
; X64-NEXT: movl %ecx, %edx
; X64-NEXT: pcmpestri $24, (%r8), %xmm0
; X64-NEXT: movl %ecx, %eax
; X64-NEXT: retq
entry:
  %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>*
  %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1
  %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>*
  %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1
  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
  ret i32 %idx
}

define i32 @pcmpestri_mem_diff_i8(i8* %lhs_ptr, i32 %lhs_len, i8* %rhs_ptr, i32 %rhs_len) nounwind {
; X86-LABEL: pcmpestri_mem_diff_i8:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: pushl %esi
; X86-NEXT: andl $-16, %esp
; X86-NEXT: subl $48, %esp
; X86-NEXT: movl 12(%ebp), %eax
; X86-NEXT: movl 20(%ebp), %edx
; X86-NEXT: movl 16(%ebp), %ecx
; X86-NEXT: movl 8(%ebp), %esi
; X86-NEXT: movdqu (%esi), %xmm1
; X86-NEXT: movdqu (%ecx), %xmm0
; X86-NEXT: pcmpestri $24, %xmm0, %xmm1
; X86-NEXT: cmpl $16, %ecx
; X86-NEXT: jne .LBB5_2
; X86-NEXT: # %bb.1:
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: jmp .LBB5_3
; X86-NEXT: .LBB5_2: # %compare
; X86-NEXT: movdqa %xmm1, (%esp)
; X86-NEXT: andl $15, %ecx
; X86-NEXT: movb (%esp,%ecx), %al
; X86-NEXT: movdqa %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: subb 16(%esp,%ecx), %al
; X86-NEXT: .LBB5_3: # %exit
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: leal -4(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-LABEL: pcmpestri_mem_diff_i8:
; X64: # %bb.0: # %entry
; X64-NEXT: movl %esi, %eax
; X64-NEXT: movdqu (%rdi), %xmm1
; X64-NEXT: movdqu (%rdx), %xmm0
; X64-NEXT: movl %ecx, %edx
; X64-NEXT: pcmpestri $24, %xmm0, %xmm1
; X64-NEXT: # kill: def $ecx killed $ecx def $rcx
; X64-NEXT: cmpl $16, %ecx
; X64-NEXT: jne .LBB5_2
; X64-NEXT: # %bb.1:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: retq
; X64-NEXT: .LBB5_2: # %compare
; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT: andl $15, %ecx
; X64-NEXT: movb -24(%rsp,%rcx), %al
; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: subb -40(%rsp,%rcx), %al
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: retq
entry:
  %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>*
  %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1
  %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>*
  %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1
  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
  %eq = icmp eq i32 %idx, 16
  br i1 %eq, label %exit, label %compare

compare:
  %lhs_c = extractelement <16 x i8> %lhs, i32 %idx
  %rhs_c = extractelement <16 x i8> %rhs, i32 %idx
  %sub = sub i8 %lhs_c, %rhs_c
  br label %exit

exit:
  %result = phi i8 [ 0, %entry ], [ %sub, %compare ]
  %result_ext = zext i8 %result to i32
  ret i32 %result_ext
}

; ---- pcmpestri, i16 elements (vectors bitcast to <16 x i8> for the intrinsic) ----

define i1 @pcmpestri_reg_eq_i16(<8 x i16> %lhs, i32 %lhs_len, <8 x i16> %rhs, i32 %rhs_len) nounwind {
; X86-LABEL: pcmpestri_reg_eq_i16:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: pcmpestri $24, %xmm1, %xmm0
; X86-NEXT: setae %al
; X86-NEXT: retl
;
; X64-LABEL: pcmpestri_reg_eq_i16:
; X64: # %bb.0: # %entry
; X64-NEXT: movl %esi, %edx
; X64-NEXT: movl %edi, %eax
; X64-NEXT: pcmpestri $24, %xmm1, %xmm0
; X64-NEXT: setae %al
; X64-NEXT: retq
entry:
  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
  %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 24)
  %result = icmp eq i32 %c, 0
  ret i1 %result
}

define i32 @pcmpestri_reg_idx_i16(<8 x i16> %lhs, i32 %lhs_len, <8 x i16> %rhs, i32 %rhs_len) nounwind {
; X86-LABEL: pcmpestri_reg_idx_i16:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: pcmpestri $24, %xmm1, %xmm0
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: pcmpestri_reg_idx_i16:
; X64: # %bb.0: # %entry
; X64-NEXT: movl %esi, %edx
; X64-NEXT: movl %edi, %eax
; X64-NEXT: pcmpestri $24, %xmm1, %xmm0
; X64-NEXT: movl %ecx, %eax
; X64-NEXT: retq
entry:
  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 24)
  ret i32 %idx
}

; NOTE(review): the imm8 here ($24) selects byte-element compare while the IR
; extracts i16 elements — this mirrors the original test as written.
define i32 @pcmpestri_reg_diff_i16(<8 x i16> %lhs, i32 %lhs_len, <8 x i16> %rhs, i32 %rhs_len) nounwind {
; X86-LABEL: pcmpestri_reg_diff_i16:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-16, %esp
; X86-NEXT: subl $48, %esp
; X86-NEXT: movl 8(%ebp), %eax
; X86-NEXT: movl 12(%ebp), %edx
; X86-NEXT: pcmpestri $24, %xmm1, %xmm0
; X86-NEXT: cmpl $16, %ecx
; X86-NEXT: jne .LBB8_2
; X86-NEXT: # %bb.1:
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: jmp .LBB8_3
; X86-NEXT: .LBB8_2: # %compare
; X86-NEXT: movdqa %xmm0, (%esp)
; X86-NEXT: addl %ecx, %ecx
; X86-NEXT: andl $14, %ecx
; X86-NEXT: movzwl (%esp,%ecx), %eax
; X86-NEXT: movdqa %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT: subw 16(%esp,%ecx), %ax
; X86-NEXT: .LBB8_3: # %exit
; X86-NEXT: movzwl %ax, %eax
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-LABEL: pcmpestri_reg_diff_i16:
; X64: # %bb.0: # %entry
; X64-NEXT: movl %esi, %edx
; X64-NEXT: movl %edi, %eax
; X64-NEXT: pcmpestri $24, %xmm1, %xmm0
; X64-NEXT: # kill: def $ecx killed $ecx def $rcx
; X64-NEXT: cmpl $16, %ecx
; X64-NEXT: jne .LBB8_2
; X64-NEXT: # %bb.1:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: movzwl %ax, %eax
; X64-NEXT: retq
; X64-NEXT: .LBB8_2: # %compare
; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: andl $7, %ecx
; X64-NEXT: movzwl -24(%rsp,%rcx,2), %eax
; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT: subw -40(%rsp,%rcx,2), %ax
; X64-NEXT: movzwl %ax, %eax
; X64-NEXT: retq
entry:
  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 24)
  %eq = icmp eq i32 %idx, 16
  br i1 %eq, label %exit, label %compare

compare:
  %lhs_c = extractelement <8 x i16> %lhs, i32 %idx
  %rhs_c = extractelement <8 x i16> %rhs, i32 %idx
  %sub = sub i16 %lhs_c, %rhs_c
  br label %exit

exit:
  %result = phi i16 [ 0, %entry ], [ %sub, %compare ]
  %result_ext = zext i16 %result to i32
  ret i32 %result_ext
}

; ---- pcmpestri, i16 elements, memory operands (imm $25 = word elements) ----

define i1 @pcmpestri_mem_eq_i16(i16* %lhs_ptr, i32 %lhs_len, i16* %rhs_ptr, i32 %rhs_len) nounwind {
; X86-LABEL: pcmpestri_mem_eq_i16:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movdqu (%esi), %xmm0
; X86-NEXT: pcmpestri $25, (%ecx), %xmm0
; X86-NEXT: setae %al
; X86-NEXT: popl %esi
; X86-NEXT: retl
;
; X64-LABEL: pcmpestri_mem_eq_i16:
; X64: # %bb.0: # %entry
; X64-NEXT: movq %rdx, %r8
; X64-NEXT: movl %esi, %eax
; X64-NEXT: movdqu (%rdi), %xmm0
; X64-NEXT: movl %ecx, %edx
; X64-NEXT: pcmpestri $25, (%r8), %xmm0
; X64-NEXT: setae %al
; X64-NEXT: retq
entry:
  %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>*
  %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1
  %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>*
  %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1
  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
  %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 25)
  %result = icmp eq i32 %c, 0
  ret i1 %result
}

define i32 @pcmpestri_mem_idx_i16(i16* %lhs_ptr, i32 %lhs_len, i16* %rhs_ptr, i32 %rhs_len) nounwind {
; X86-LABEL: pcmpestri_mem_idx_i16:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movdqu (%esi), %xmm0
; X86-NEXT: pcmpestri $25, (%ecx), %xmm0
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: popl %esi
; X86-NEXT: retl
;
; X64-LABEL: pcmpestri_mem_idx_i16:
; X64: # %bb.0: # %entry
; X64-NEXT: movq %rdx, %r8
; X64-NEXT: movl %esi, %eax
; X64-NEXT: movdqu (%rdi), %xmm0
; X64-NEXT: movl %ecx, %edx
; X64-NEXT: pcmpestri $25, (%r8), %xmm0
; X64-NEXT: movl %ecx, %eax
; X64-NEXT: retq
entry:
  %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>*
  %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1
  %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>*
  %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1
  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 25)
  ret i32 %idx
}

define i32 @pcmpestri_mem_diff_i16(i16* %lhs_ptr, i32 %lhs_len, i16* %rhs_ptr, i32 %rhs_len) nounwind {
; X86-LABEL: pcmpestri_mem_diff_i16:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: pushl %esi
; X86-NEXT: andl $-16, %esp
; X86-NEXT: subl $48, %esp
; X86-NEXT: movl 12(%ebp), %eax
; X86-NEXT: movl 20(%ebp), %edx
; X86-NEXT: movl 16(%ebp), %ecx
; X86-NEXT: movl 8(%ebp), %esi
; X86-NEXT: movdqu (%esi), %xmm1
; X86-NEXT: movdqu (%ecx), %xmm0
; X86-NEXT: pcmpestri $25, %xmm0, %xmm1
; X86-NEXT: cmpl $8, %ecx
; X86-NEXT: jne .LBB11_2
; X86-NEXT: # %bb.1:
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: jmp .LBB11_3
; X86-NEXT: .LBB11_2: # %compare
; X86-NEXT: movdqa %xmm1, (%esp)
; X86-NEXT: addl %ecx, %ecx
; X86-NEXT: andl $14, %ecx
; X86-NEXT: movzwl (%esp,%ecx), %eax
; X86-NEXT: movdqa %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: subw 16(%esp,%ecx), %ax
; X86-NEXT: .LBB11_3: # %exit
; X86-NEXT: movzwl %ax, %eax
; X86-NEXT: leal -4(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-LABEL: pcmpestri_mem_diff_i16:
; X64: # %bb.0: # %entry
; X64-NEXT: movl %esi, %eax
; X64-NEXT: movdqu (%rdi), %xmm1
; X64-NEXT: movdqu (%rdx), %xmm0
; X64-NEXT: movl %ecx, %edx
; X64-NEXT: pcmpestri $25, %xmm0, %xmm1
; X64-NEXT: # kill: def $ecx killed $ecx def $rcx
; X64-NEXT: cmpl $8, %ecx
; X64-NEXT: jne .LBB11_2
; X64-NEXT: # %bb.1:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: movzwl %ax, %eax
; X64-NEXT: retq
; X64-NEXT: .LBB11_2: # %compare
; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT: andl $7, %ecx
; X64-NEXT: movzwl -24(%rsp,%rcx,2), %eax
; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: subw -40(%rsp,%rcx,2), %ax
; X64-NEXT: movzwl %ax, %eax
; X64-NEXT: retq
entry:
  %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>*
  %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1
  %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>*
  %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1
  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 25)
  %eq = icmp eq i32 %idx, 8
  br i1 %eq, label %exit, label %compare

compare:
  %lhs_c = extractelement <8 x i16> %lhs, i32 %idx
  %rhs_c = extractelement <8 x i16> %rhs, i32 %idx
  %sub = sub i16 %lhs_c, %rhs_c
  br label %exit

exit:
  %result = phi i16 [ 0, %entry ], [ %sub, %compare ]
  %result_ext = zext i16 %result to i32
  ret i32 %result_ext
}

; ---- pcmpistri (implicit-length, NUL-terminated), i8 elements ----

define i1 @pcmpistri_reg_eq_i8(<16 x i8> %lhs, <16 x i8> %rhs) nounwind {
; X86-LABEL: pcmpistri_reg_eq_i8:
; X86: # %bb.0: # %entry
; X86-NEXT: pcmpistri $24, %xmm1, %xmm0
; X86-NEXT: setae %al
; X86-NEXT: retl
;
; X64-LABEL: pcmpistri_reg_eq_i8:
; X64: # %bb.0: # %entry
; X64-NEXT: pcmpistri $24, %xmm1, %xmm0
; X64-NEXT: setae %al
; X64-NEXT: retq
entry:
  %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
  %result = icmp eq i32 %c, 0
  ret i1 %result
}

define i32 @pcmpistri_reg_idx_i8(<16 x i8> %lhs, <16 x i8> %rhs) nounwind {
; X86-LABEL: pcmpistri_reg_idx_i8:
; X86: # %bb.0: # %entry
; X86-NEXT: pcmpistri $24, %xmm1, %xmm0
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: pcmpistri_reg_idx_i8:
; X64: # %bb.0: # %entry
; X64-NEXT: pcmpistri $24, %xmm1, %xmm0
; X64-NEXT: movl %ecx, %eax
; X64-NEXT: retq
entry:
  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
  ret i32 %idx
}

define i32 @pcmpistri_reg_diff_i8(<16 x i8> %lhs, <16 x i8> %rhs) nounwind {
; X86-LABEL: pcmpistri_reg_diff_i8:
; X86: # %bb.0: # %entry
; X86-NEXT: pcmpistri $24, %xmm1, %xmm0
; X86-NEXT: cmpl $16, %ecx
; X86-NEXT: jne .LBB14_2
; X86-NEXT: # %bb.1:
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: retl
; X86-NEXT: .LBB14_2: # %compare
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-16, %esp
; X86-NEXT: subl $48, %esp
; X86-NEXT: movdqa %xmm0, (%esp)
; X86-NEXT: andl $15, %ecx
; X86-NEXT: movb (%esp,%ecx), %al
; X86-NEXT: movdqa %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT: subb 16(%esp,%ecx), %al
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: retl
;
; X64-LABEL: pcmpistri_reg_diff_i8:
; X64: # %bb.0: # %entry
; X64-NEXT: pcmpistri $24, %xmm1, %xmm0
; X64-NEXT: # kill: def $ecx killed $ecx def $rcx
; X64-NEXT: cmpl $16, %ecx
; X64-NEXT: jne .LBB14_2
; X64-NEXT: # %bb.1:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: retq
; X64-NEXT: .LBB14_2: # %compare
; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: andl $15, %ecx
; X64-NEXT: movb -24(%rsp,%rcx), %al
; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT: subb -40(%rsp,%rcx), %al
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: retq
entry:
  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
  %eq = icmp eq i32 %idx, 16
  br i1 %eq, label %exit, label %compare

compare:
  %lhs_c = extractelement <16 x i8> %lhs, i32 %idx
  %rhs_c = extractelement <16 x i8> %rhs, i32 %idx
  %sub = sub i8 %lhs_c, %rhs_c
  br label %exit

exit:
  %result = phi i8 [ 0, %entry ], [ %sub, %compare ]
  %result_ext = zext i8 %result to i32
  ret i32 %result_ext
}

define i1 @pcmpistri_mem_eq_i8(i8* %lhs_ptr, i8* %rhs_ptr) nounwind {
; X86-LABEL: pcmpistri_mem_eq_i8:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movdqu (%ecx), %xmm0
; X86-NEXT: pcmpistri $24, (%eax), %xmm0
; X86-NEXT: setae %al
; X86-NEXT: retl
;
; X64-LABEL: pcmpistri_mem_eq_i8:
; X64: # %bb.0: # %entry
; X64-NEXT: movdqu (%rdi), %xmm0
; X64-NEXT: pcmpistri $24, (%rsi), %xmm0
; X64-NEXT: setae %al
; X64-NEXT: retq
entry:
  %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>*
  %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1
  %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>*
  %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1
  %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
  %result = icmp eq i32 %c, 0
  ret i1 %result
}

define i32 @pcmpistri_mem_idx_i8(i8* %lhs_ptr, i8* %rhs_ptr) nounwind {
; X86-LABEL: pcmpistri_mem_idx_i8:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movdqu (%ecx), %xmm0
; X86-NEXT: pcmpistri $24, (%eax), %xmm0
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: pcmpistri_mem_idx_i8:
; X64: # %bb.0: # %entry
; X64-NEXT: movdqu (%rdi), %xmm0
; X64-NEXT: pcmpistri $24, (%rsi), %xmm0
; X64-NEXT: movl %ecx, %eax
; X64-NEXT: retq
entry:
  %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>*
  %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1
  %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>*
  %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1
  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
  ret i32 %idx
}

define i32 @pcmpistri_mem_diff_i8(i8* %lhs_ptr, i8* %rhs_ptr) nounwind {
; X86-LABEL: pcmpistri_mem_diff_i8:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-16, %esp
; X86-NEXT: subl $48, %esp
; X86-NEXT: movl 12(%ebp), %eax
; X86-NEXT: movl 8(%ebp), %ecx
; X86-NEXT: movdqu (%ecx), %xmm1
; X86-NEXT: movdqu (%eax), %xmm0
; X86-NEXT: pcmpistri $24, %xmm0, %xmm1
; X86-NEXT: cmpl $16, %ecx
; X86-NEXT: jne .LBB17_2
; X86-NEXT: # %bb.1:
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: jmp .LBB17_3
; X86-NEXT: .LBB17_2: # %compare
; X86-NEXT: movdqa %xmm1, (%esp)
; X86-NEXT: andl $15, %ecx
; X86-NEXT: movb (%esp,%ecx), %al
; X86-NEXT: movdqa %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: subb 16(%esp,%ecx), %al
; X86-NEXT: .LBB17_3: # %exit
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-LABEL: pcmpistri_mem_diff_i8:
; X64: # %bb.0: # %entry
; X64-NEXT: movdqu (%rdi), %xmm1
; X64-NEXT: movdqu (%rsi), %xmm0
; X64-NEXT: pcmpistri $24, %xmm0, %xmm1
; X64-NEXT: # kill: def $ecx killed $ecx def $rcx
; X64-NEXT: cmpl $16, %ecx
; X64-NEXT: jne .LBB17_2
; X64-NEXT: # %bb.1:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: retq
; X64-NEXT: .LBB17_2: # %compare
; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT: andl $15, %ecx
; X64-NEXT: movb -24(%rsp,%rcx), %al
; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: subb -40(%rsp,%rcx), %al
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: retq
entry:
  %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>*
  %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1
  %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>*
  %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1
  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
  %eq = icmp eq i32 %idx, 16
  br i1 %eq, label %exit, label %compare

compare:
  %lhs_c = extractelement <16 x i8> %lhs, i32 %idx
  %rhs_c = extractelement <16 x i8> %rhs, i32 %idx
  %sub = sub i8 %lhs_c, %rhs_c
  br label %exit

exit:
  %result = phi i8 [ 0, %entry ], [ %sub, %compare ]
  %result_ext = zext i8 %result to i32
  ret i32 %result_ext
}

; ---- pcmpistri, i16 elements ----

define i1 @pcmpistri_reg_eq_i16(<8 x i16> %lhs, <8 x i16> %rhs) nounwind {
; X86-LABEL: pcmpistri_reg_eq_i16:
; X86: # %bb.0: # %entry
; X86-NEXT: pcmpistri $24, %xmm1, %xmm0
; X86-NEXT: setae %al
; X86-NEXT: retl
;
; X64-LABEL: pcmpistri_reg_eq_i16:
; X64: # %bb.0: # %entry
; X64-NEXT: pcmpistri $24, %xmm1, %xmm0
; X64-NEXT: setae %al
; X64-NEXT: retq
entry:
  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
  %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 24)
  %result = icmp eq i32 %c, 0
  ret i1 %result
}

define i32 @pcmpistri_reg_idx_i16(<8 x i16> %lhs, <8 x i16> %rhs) nounwind {
; X86-LABEL: pcmpistri_reg_idx_i16:
; X86: # %bb.0: # %entry
; X86-NEXT: pcmpistri $24, %xmm1, %xmm0
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: pcmpistri_reg_idx_i16:
; X64: # %bb.0: # %entry
; X64-NEXT: pcmpistri $24, %xmm1, %xmm0
; X64-NEXT: movl %ecx, %eax
; X64-NEXT: retq
entry:
  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 24)
  ret i32 %idx
}

define i32 @pcmpistri_reg_diff_i16(<8 x i16> %lhs, <8 x i16> %rhs) nounwind {
; X86-LABEL: pcmpistri_reg_diff_i16:
; X86: # %bb.0: # %entry
; X86-NEXT: pcmpistri $24, %xmm1, %xmm0
; X86-NEXT: cmpl $16, %ecx
; X86-NEXT: jne .LBB20_2
; X86-NEXT: # %bb.1:
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: movzwl %ax, %eax
; X86-NEXT: retl
; X86-NEXT: .LBB20_2: # %compare
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-16, %esp
; X86-NEXT: subl $48, %esp
; X86-NEXT: movdqa %xmm0, (%esp)
; X86-NEXT: addl %ecx, %ecx
; X86-NEXT: andl $14, %ecx
; X86-NEXT: movzwl (%esp,%ecx), %eax
; X86-NEXT: movdqa %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT: subw 16(%esp,%ecx), %ax
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: movzwl %ax, %eax
; X86-NEXT: retl
;
; X64-LABEL: pcmpistri_reg_diff_i16:
; X64: # %bb.0: # %entry
; X64-NEXT: pcmpistri $24, %xmm1, %xmm0
; X64-NEXT: # kill: def $ecx killed $ecx def $rcx
; X64-NEXT: cmpl $16, %ecx
; X64-NEXT: jne .LBB20_2
; X64-NEXT: # %bb.1:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: movzwl %ax, %eax
; X64-NEXT: retq
; X64-NEXT: .LBB20_2: # %compare
; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: andl $7, %ecx
; X64-NEXT: movzwl -24(%rsp,%rcx,2), %eax
; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT: subw -40(%rsp,%rcx,2), %ax
; X64-NEXT: movzwl %ax, %eax
; X64-NEXT: retq
entry:
  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 24)
  %eq = icmp eq i32 %idx, 16
  br i1 %eq, label %exit, label %compare

compare:
  %lhs_c = extractelement <8 x i16> %lhs, i32 %idx
  %rhs_c = extractelement <8 x i16> %rhs, i32 %idx
  %sub = sub i16 %lhs_c, %rhs_c
  br label %exit

exit:
  %result = phi i16 [ 0, %entry ], [ %sub, %compare ]
  %result_ext = zext i16 %result to i32
  ret i32 %result_ext
}

define i1 @pcmpistri_mem_eq_i16(i16* %lhs_ptr, i16* %rhs_ptr) nounwind {
; X86-LABEL: pcmpistri_mem_eq_i16:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movdqu (%ecx), %xmm0
; X86-NEXT: pcmpistri $25, (%eax), %xmm0
; X86-NEXT: setae %al
; X86-NEXT: retl
;
; X64-LABEL: pcmpistri_mem_eq_i16:
; X64: # %bb.0: # %entry
; X64-NEXT: movdqu (%rdi), %xmm0
; X64-NEXT: pcmpistri $25, (%rsi), %xmm0
; X64-NEXT: setae %al
; X64-NEXT: retq
entry:
  %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>*
  %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1
  %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>*
  %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1
  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
  %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 25)
  %result = icmp eq i32 %c, 0
  ret i1 %result
}

define i32 @pcmpistri_mem_idx_i16(i16* %lhs_ptr, i16* %rhs_ptr) nounwind {
; X86-LABEL: pcmpistri_mem_idx_i16:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movdqu (%ecx), %xmm0
; X86-NEXT: pcmpistri $25, (%eax), %xmm0
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: pcmpistri_mem_idx_i16:
; X64: # %bb.0: # %entry
; X64-NEXT: movdqu (%rdi), %xmm0
; X64-NEXT: pcmpistri $25, (%rsi), %xmm0
; X64-NEXT: movl %ecx, %eax
; X64-NEXT: retq
entry:
  %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>*
  %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1
  %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>*
  %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1
  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 25)
  ret i32 %idx
}

define i32 @pcmpistri_mem_diff_i16(i16* %lhs_ptr, i16* %rhs_ptr) nounwind {
; X86-LABEL: pcmpistri_mem_diff_i16:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-16, %esp
; X86-NEXT: subl $48, %esp
; X86-NEXT: movl 12(%ebp), %eax
; X86-NEXT: movl 8(%ebp), %ecx
; X86-NEXT: movdqu (%ecx), %xmm1
; X86-NEXT: movdqu (%eax), %xmm0
; X86-NEXT: pcmpistri $25, %xmm0, %xmm1
; X86-NEXT: cmpl $8, %ecx
; X86-NEXT: jne .LBB23_2
; X86-NEXT: # %bb.1:
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: jmp .LBB23_3
; X86-NEXT: .LBB23_2: # %compare
; X86-NEXT: movdqa %xmm1, (%esp)
; X86-NEXT: addl %ecx, %ecx
; X86-NEXT: andl $14, %ecx
; X86-NEXT: movzwl (%esp,%ecx), %eax
; X86-NEXT: movdqa %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: subw 16(%esp,%ecx), %ax
; X86-NEXT: .LBB23_3: # %exit
; X86-NEXT: movzwl %ax, %eax
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-LABEL: pcmpistri_mem_diff_i16:
; X64: # %bb.0: # %entry
; X64-NEXT: movdqu (%rdi), %xmm1
; X64-NEXT: movdqu (%rsi), %xmm0
; X64-NEXT: pcmpistri $25, %xmm0, %xmm1
; X64-NEXT: # kill: def $ecx killed $ecx def $rcx
; X64-NEXT: cmpl $8, %ecx
; X64-NEXT: jne .LBB23_2
; X64-NEXT: # %bb.1:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: movzwl %ax, %eax
; X64-NEXT: retq
; X64-NEXT: .LBB23_2: # %compare
; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT: andl $7, %ecx
; X64-NEXT: movzwl -24(%rsp,%rcx,2), %eax
; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: subw -40(%rsp,%rcx,2), %ax
; X64-NEXT: movzwl %ax, %eax
; X64-NEXT: retq
entry:
  %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>*
  %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1
  %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>*
  %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1
  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 25)
  %eq = icmp eq i32 %idx, 8
  br i1 %eq, label %exit, label %compare

compare:
  %lhs_c = extractelement <8 x i16> %lhs, i32 %idx
  %rhs_c = extractelement <8 x i16> %rhs, i32 %idx
  %sub = sub i16 %lhs_c, %rhs_c
  br label %exit

exit:
  %result = phi i16 [ 0, %entry ], [ %sub, %compare ]
  %result_ext = zext i16 %result to i32
  ret i32 %result_ext
}

; ---- Multiple results from one compare: flag + index + mask combinations ----

define void @pcmpestr_index_flag(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i32* %iptr, i32* %fptr) nounwind {
; X86-LABEL: pcmpestr_index_flag:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: xorl %ebx, %ebx
; X86-NEXT: pcmpestri $24, %xmm1, %xmm0
; X86-NEXT: setb %bl
; X86-NEXT: movl %ecx, (%edi)
; X86-NEXT: movl %ebx, (%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
; X86-NEXT: retl
;
; X64-LABEL: pcmpestr_index_flag:
; X64: # %bb.0: # %entry
; X64-NEXT: movq %rcx, %r8
; X64-NEXT: movq %rdx, %r9
; X64-NEXT: movl %esi, %edx
; X64-NEXT: movl %edi, %eax
; X64-NEXT: xorl %esi, %esi
; X64-NEXT: pcmpestri $24, %xmm1, %xmm0
; X64-NEXT: setb %sil
; X64-NEXT: movl %ecx, (%r9)
; X64-NEXT: movl %esi, (%r8)
; X64-NEXT: retq
entry:
  %flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
  %index = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
  store i32 %index, i32* %iptr
  store i32 %flag, i32* %fptr
  ret void
}

define void @pcmpestr_mask_flag(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, <16 x i8>* %mptr, i32* %fptr) nounwind {
; X86-LABEL: pcmpestr_mask_flag:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: xorl %ebx, %ebx
; X86-NEXT: pcmpestrm $24, %xmm1, %xmm0
; X86-NEXT: setb %bl
; X86-NEXT: movdqa %xmm0, (%esi)
; X86-NEXT: movl %ebx, (%ecx)
; X86-NEXT: popl %esi
; X86-NEXT: popl %ebx
; X86-NEXT: retl
;
; X64-LABEL: pcmpestr_mask_flag:
; X64: # %bb.0: # %entry
; X64-NEXT: movq %rdx, %r8
; X64-NEXT: movl %esi, %edx
; X64-NEXT: movl %edi, %eax
; X64-NEXT: xorl %esi, %esi
; X64-NEXT: pcmpestrm $24, %xmm1, %xmm0
; X64-NEXT: setb %sil
; X64-NEXT: movdqa %xmm0, (%r8)
; X64-NEXT: movl %esi, (%rcx)
; X64-NEXT: retq
entry:
  %flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
  %mask = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
  store <16 x i8> %mask, <16 x i8>* %mptr
  store i32 %flag, i32* %fptr
  ret void
}

define void @pcmpestr_mask_index(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, <16 x i8>* %mptr, i32* %iptr) nounwind {
; X86-LABEL: pcmpestr_mask_index:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movdqa %xmm0, %xmm2
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: pcmpestrm $24, %xmm1, %xmm0
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: pcmpestri $24, %xmm1, %xmm2
; X86-NEXT: movdqa %xmm0, (%edi)
; X86-NEXT: movl %ecx, (%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: retl
;
; X64-LABEL: pcmpestr_mask_index:
; X64: # %bb.0: # %entry
; X64-NEXT: movq %rcx, %r8
; X64-NEXT: movq %rdx, %r9
; X64-NEXT: movl %esi, %edx
; X64-NEXT: movl %edi, %eax
; X64-NEXT: movdqa %xmm0, %xmm2
; X64-NEXT: pcmpestrm $24, %xmm1, %xmm0
; X64-NEXT: pcmpestri $24, %xmm1, %xmm2
; X64-NEXT: movdqa %xmm0, (%r9)
; X64-NEXT: movl %ecx, (%r8)
; X64-NEXT: retq
entry:
  %index = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
  %mask = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
  store <16 x i8> %mask, <16 x i8>* %mptr
  store i32 %index, i32* %iptr
  ret void
}

; NOTE(review): the following definition is truncated in this chunk of the
; file; its body continues beyond the visible source and is kept as-is.
define void @pcmpestr_mask_index_flag(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, <16 x i8>* %mptr, i32* %iptr, i32* %fptr) nounwind {
; X86-LABEL: pcmpestr_mask_index_flag:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movdqa %xmm0, %xmm2
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: pcmpestrm $24, %xmm1, %xmm0
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: xorl %ebx, %ebx
; X86-NEXT: pcmpestri $24, %xmm1, %xmm2
; X86-NEXT: setb %bl
; X86-NEXT: movdqa %xmm0, (%ebp)
; X86-NEXT: movl %ecx, (%edi)
; X86-NEXT: movl %ebx, (%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-LABEL: pcmpestr_mask_index_flag:
; X64: # %bb.0: # %entry
; X64-NEXT: movq %rcx, %r9
; X64-NEXT: movq %rdx, %r10
; X64-NEXT: movl %esi, %edx
; X64-NEXT: movl %edi, %eax
; X64-NEXT: movdqa %xmm0, %xmm2
; X64-NEXT: pcmpestrm $24, %xmm1, %xmm0
; X64-NEXT: xorl %esi, %esi
; X64-NEXT: pcmpestri $24, %xmm1, %xmm2
; X64-NEXT: setb %sil
; X64-NEXT: movdqa %xmm0, (%r10)
; X64-NEXT: movl %ecx, (%r9)
; X64-NEXT: movl %esi, (%r8)
; X64-NEXT: retq
entry:
  %index = call i32 @llvm.x86.sse42.pcmpestri128(<16 x
i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) %mask = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) %flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) store <16 x i8> %mask, <16 x i8>* %mptr store i32 %index, i32* %iptr store i32 %flag, i32* %fptr ret void } define void @pcmpistr_index_flag(<16 x i8> %lhs, <16 x i8> %rhs, i32* %iptr, i32* %fptr) nounwind { ; X86-LABEL: pcmpistr_index_flag: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: xorl %ebx, %ebx ; X86-NEXT: pcmpistri $24, %xmm1, %xmm0 ; X86-NEXT: setb %bl ; X86-NEXT: movl %ecx, (%edx) ; X86-NEXT: movl %ebx, (%eax) ; X86-NEXT: popl %ebx ; X86-NEXT: retl ; ; X64-LABEL: pcmpistr_index_flag: ; X64: # %bb.0: # %entry ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: pcmpistri $24, %xmm1, %xmm0 ; X64-NEXT: setb %al ; X64-NEXT: movl %ecx, (%rdi) ; X64-NEXT: movl %eax, (%rsi) ; X64-NEXT: retq entry: %flag = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) %index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) store i32 %index, i32* %iptr store i32 %flag, i32* %fptr ret void } define void @pcmpistr_mask_flag(<16 x i8> %lhs, <16 x i8> %rhs, <16 x i8>* %mptr, i32* %fptr) nounwind { ; X86-LABEL: pcmpistr_mask_flag: ; X86: # %bb.0: # %entry ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: pcmpistrm $24, %xmm1, %xmm0 ; X86-NEXT: setb %dl ; X86-NEXT: movdqa %xmm0, (%ecx) ; X86-NEXT: movl %edx, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: pcmpistr_mask_flag: ; X64: # %bb.0: # %entry ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: pcmpistrm $24, %xmm1, %xmm0 ; X64-NEXT: setb %al ; X64-NEXT: movdqa %xmm0, (%rdi) ; X64-NEXT: movl %eax, (%rsi) ; X64-NEXT: retq entry: %flag = call i32 
@llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) %mask = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) store <16 x i8> %mask, <16 x i8>* %mptr store i32 %flag, i32* %fptr ret void } define void @pcmpistr_mask_index(<16 x i8> %lhs, <16 x i8> %rhs, <16 x i8>* %mptr, i32* %iptr) nounwind { ; X86-LABEL: pcmpistr_mask_index: ; X86: # %bb.0: # %entry ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: pcmpistri $24, %xmm1, %xmm0 ; X86-NEXT: pcmpistrm $24, %xmm1, %xmm0 ; X86-NEXT: movdqa %xmm0, (%edx) ; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: pcmpistr_mask_index: ; X64: # %bb.0: # %entry ; X64-NEXT: pcmpistri $24, %xmm1, %xmm0 ; X64-NEXT: pcmpistrm $24, %xmm1, %xmm0 ; X64-NEXT: movdqa %xmm0, (%rdi) ; X64-NEXT: movl %ecx, (%rsi) ; X64-NEXT: retq entry: %index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) %mask = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) store <16 x i8> %mask, <16 x i8>* %mptr store i32 %index, i32* %iptr ret void } define void @pcmpistr_mask_index_flag(<16 x i8> %lhs, <16 x i8> %rhs, <16 x i8>* %mptr, i32* %iptr, i32* %fptr) nounwind { ; X86-LABEL: pcmpistr_mask_index_flag: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %esi ; X86-NEXT: movdqa %xmm0, %xmm2 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: pcmpistrm $24, %xmm1, %xmm0 ; X86-NEXT: xorl %ebx, %ebx ; X86-NEXT: pcmpistri $24, %xmm1, %xmm2 ; X86-NEXT: setb %bl ; X86-NEXT: movdqa %xmm0, (%esi) ; X86-NEXT: movl %ecx, (%edx) ; X86-NEXT: movl %ebx, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %ebx ; X86-NEXT: retl ; ; X64-LABEL: pcmpistr_mask_index_flag: ; X64: # %bb.0: # %entry ; X64-NEXT: movdqa %xmm0, %xmm2 ; X64-NEXT: pcmpistrm $24, %xmm1, %xmm0 ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: pcmpistri $24, %xmm1, 
%xmm2 ; X64-NEXT: setb %al ; X64-NEXT: movdqa %xmm0, (%rdi) ; X64-NEXT: movl %ecx, (%rsi) ; X64-NEXT: movl %eax, (%rdx) ; X64-NEXT: retq entry: %index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) %mask = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) %flag = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) store <16 x i8> %mask, <16 x i8>* %mptr store i32 %index, i32* %iptr store i32 %flag, i32* %fptr ret void } ; Make sure we don't fold loads when we need to emit pcmpistrm and pcmpistri. define void @pcmpistr_mask_index_flag_load(<16 x i8> %lhs, <16 x i8>* %rhsptr, <16 x i8>* %mptr, i32* %iptr, i32* %fptr) nounwind { ; X86-LABEL: pcmpistr_mask_index_flag_load: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %esi ; X86-NEXT: movdqa %xmm0, %xmm1 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movdqu (%ecx), %xmm2 ; X86-NEXT: pcmpistrm $24, %xmm2, %xmm0 ; X86-NEXT: xorl %ebx, %ebx ; X86-NEXT: pcmpistri $24, %xmm2, %xmm1 ; X86-NEXT: setb %bl ; X86-NEXT: movdqa %xmm0, (%esi) ; X86-NEXT: movl %ecx, (%edx) ; X86-NEXT: movl %ebx, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %ebx ; X86-NEXT: retl ; ; X64-LABEL: pcmpistr_mask_index_flag_load: ; X64: # %bb.0: # %entry ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: movdqa %xmm0, %xmm1 ; X64-NEXT: movdqu (%rdi), %xmm2 ; X64-NEXT: pcmpistrm $24, %xmm2, %xmm0 ; X64-NEXT: xorl %edi, %edi ; X64-NEXT: pcmpistri $24, %xmm2, %xmm1 ; X64-NEXT: setb %dil ; X64-NEXT: movdqa %xmm0, (%rsi) ; X64-NEXT: movl %ecx, (%rdx) ; X64-NEXT: movl %edi, (%rax) ; X64-NEXT: retq entry: %rhs = load <16 x i8>, <16 x i8>* %rhsptr, align 1 %index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) %mask = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) %flag = call 
i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) store <16 x i8> %mask, <16 x i8>* %mptr store i32 %index, i32* %iptr store i32 %flag, i32* %fptr ret void } ; Make sure we don't fold nontemporal loads. define i32 @pcmpestri_nontemporal(<16 x i8> %lhs, i32 %lhs_len, <16 x i8>* %rhsptr, i32 %rhs_len) nounwind { ; X86-LABEL: pcmpestri_nontemporal: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movntdqa (%ecx), %xmm1 ; X86-NEXT: xorl %ebx, %ebx ; X86-NEXT: pcmpestri $24, %xmm1, %xmm0 ; X86-NEXT: setb %bl ; X86-NEXT: movl %ebx, %eax ; X86-NEXT: popl %ebx ; X86-NEXT: retl ; ; X64-LABEL: pcmpestri_nontemporal: ; X64: # %bb.0: # %entry ; X64-NEXT: movl %edi, %eax ; X64-NEXT: movntdqa (%rsi), %xmm1 ; X64-NEXT: xorl %esi, %esi ; X64-NEXT: pcmpestri $24, %xmm1, %xmm0 ; X64-NEXT: setb %sil ; X64-NEXT: movl %esi, %eax ; X64-NEXT: retq entry: %rhs = load <16 x i8>, <16 x i8>* %rhsptr, align 16, !nontemporal !0 %flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) ret i32 %flag } !0 = !{ i32 1 }