; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s

declare <4 x float> @foo(<4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>)

; Due to a bug in X86RegisterInfo::getLargestLegalSuperClass this test case
; was trying to use XMM16 and spill it without VLX support for the necessary
; store instruction. We briefly implemented the spill using VEXTRACTF32X4,
; but the bug in getLargestLegalSuperClass has now been fixed so we no
; longer use XMM16.

define <4 x float> @bar(<4 x float>* %a1p, <4 x float>* %a2p, <4 x float> %a3, <4 x float> %a4, <16 x float>%c1, <16 x float>%c2) {
; CHECK-LABEL: bar:
; CHECK:       # %bb.0:
; CHECK-NEXT:    subq $72, %rsp
; CHECK-NEXT:    .cfi_def_cfa_offset 80
; CHECK-NEXT:    vmovaps %xmm1, %xmm9
; CHECK-NEXT:    vmovaps {{.*#+}} xmm14 = [4,22,1,17]
; CHECK-NEXT:    vpermi2ps %zmm3, %zmm2, %zmm14
; CHECK-NEXT:    vmovaps {{.*#+}} xmm10 = [4,30,1,22]
; CHECK-NEXT:    vpermi2ps %zmm3, %zmm2, %zmm10
; CHECK-NEXT:    vmovaps {{.*#+}} xmm8 = [4,28,1,29]
; CHECK-NEXT:    vpermi2ps %zmm3, %zmm2, %zmm8
; CHECK-NEXT:    vmovaps {{.*#+}} xmm7 = <5,20,u,u>
; CHECK-NEXT:    vpermi2ps %zmm3, %zmm2, %zmm7
; CHECK-NEXT:    vmovaps {{.*#+}} xmm4 = [4,21,1,7]
; CHECK-NEXT:    vpermi2ps %zmm3, %zmm2, %zmm4
; CHECK-NEXT:    vextractf128 $1, %ymm3, %xmm5
; CHECK-NEXT:    vextractf128 $1, %ymm2, %xmm6
; CHECK-NEXT:    vunpcklps {{.*#+}} xmm11 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm1 = xmm11[0,1],xmm2[1],xmm11[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm13 = xmm1[0,1,2],xmm3[1]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm6 = xmm4[0,1,2],xmm3[1]
; CHECK-NEXT:    vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    vextractf32x4 $2, %zmm3, %xmm4
; CHECK-NEXT:    vblendps {{.*#+}} xmm4 = xmm1[0,1,2],xmm4[3]
; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = xmm2[3,3,3,3]
; CHECK-NEXT:    vunpcklps {{.*#+}} xmm5 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
; CHECK-NEXT:    vshufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[1,3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm3[1]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm7[0,1],xmm2[1],xmm7[3]
; CHECK-NEXT:    vblendps {{.*#+}} xmm7 = xmm0[0,1,2],xmm3[3]
; CHECK-NEXT:    vblendps {{.*#+}} xmm12 = xmm1[0,1,2],xmm3[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm1 = xmm8[0,1,2],xmm3[1]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[1]
; CHECK-NEXT:    vaddps %xmm1, %xmm0, %xmm8
; CHECK-NEXT:    vshufps {{.*#+}} xmm2 = xmm11[0,1],xmm2[3,3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[2]
; CHECK-NEXT:    vaddps %xmm2, %xmm14, %xmm2
; CHECK-NEXT:    vmovaps %xmm13, %xmm1
; CHECK-NEXT:    vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    vaddps %xmm10, %xmm13, %xmm10
; CHECK-NEXT:    vaddps %xmm13, %xmm13, %xmm3
; CHECK-NEXT:    vaddps %xmm12, %xmm14, %xmm0
; CHECK-NEXT:    vaddps %xmm0, %xmm8, %xmm0
; CHECK-NEXT:    vaddps %xmm0, %xmm13, %xmm0
; CHECK-NEXT:    vmovaps %xmm3, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %xmm10, (%rsp)
; CHECK-NEXT:    vmovaps %xmm9, %xmm3
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    callq foo
; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; CHECK-NEXT:    vaddps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; CHECK-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    addq $72, %rsp
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %a1 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32>
  %a2 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32>
  %a5 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32>
  %a6 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32>
  %a7 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32>
  %a8 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32>
  %a9 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32>
  %a10 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32>
  %ax2 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32>
  %ax5 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32>
  %ax6 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32>
  %ax7 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32>
  %ax8 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32>
  %ax9 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32>
  %ax10 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32>
  %ay2 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32>
  %ay5 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32>
  %ay6 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32>
  %ay7 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32>
  %ay8 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32>
  %ay9 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32>
  %ay10 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32>
  %r1 = fadd <4 x float> %ay10, %ay9
  %r2 = fadd <4 x float> %ay8, %ay7
  %r3 = fadd <4 x float> %ay6, %ay5
  %r4 = fadd <4 x float> %ay2, %ax10
  %r5 = fadd <4 x float> %ay9, %ax8
  %r6 = fadd <4 x float> %r5, %r3
  %r7 = fadd <4 x float> %a9, %r6
  %a11 = call <4 x float> @foo(<4 x float> %r7, <4 x float> %a10, <4 x float> %r1, <4 x float> %a4, <4 x float> %a5, <4 x float> %a6, <4 x float> %a7, <4 x float> %a8, <4 x float> %r2, <4 x float> %r4)
  %a12 = fadd <4 x float> %a2, %a1
  %a13 = fadd <4 x float> %a12, %a11
  ret <4 x float> %a13
}
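
; Illustrative note (not generated by the update script; the concrete mask
; constants of the shufflevectors above were lost, so the mask below is an
; assumed example): when a shufflevector mask picks lanes from both 16-wide
; sources, the AVX-512 lowering materializes the mask as an index vector and
; uses VPERMI2PS. In shufflevector (and VPERMI2PS) numbering, indices 0-15
; select lanes of the first source (%c1, passed in zmm2) and indices 16-31
; select lanes of the second (%c2, in zmm3); e.g. for
;   %t = shufflevector <16 x float> %c1, <16 x float> %c2, <4 x i32> <i32 4, i32 22, i32 1, i32 17>
; the index constant [4,22,1,17] seen in the assertions means lanes 4 and 1
; of %c1 plus lanes 6 (= 22 - 16) and 1 (= 17 - 16) of %c2.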