; RUN: llc -march=hexagon < %s | FileCheck %s ; ; Several shufflevector instructions have masks that are shorter than the ; source vectors. They "gather" a subset of the input elements into a single ; vector. Make sure that they are not expanded into a sequence of extract/ ; insert operations. ; ; The C source: ; ; void fred(int *a, int *b, int n) { ; for (int i = 0; i != n; i += 2) { ; a[i] += b[i+1]; ; a[i+1] += b[i]; ; } ; } ; ; Command line: ; clang -target hexagon -mcpu=hexagonv60 -fvectorize -fno-unroll-loops -O2 \ ; -mhvx -mhvx-length=128b -S inp.c ; ; CHECK-NOT: vinsert target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" target triple = "hexagon" ; Function Attrs: norecurse nounwind define void @f0(i32* nocapture %a0, i32* nocapture readonly %a1, i32 %a2) #0 { b0: %v0 = icmp eq i32 %a2, 0 br i1 %v0, label %b7, label %b1 b1: ; preds = %b0 %v1 = add i32 %a2, -2 %v2 = lshr i32 %v1, 1 %v3 = add nuw i32 %v2, 1 %v4 = icmp ult i32 %v3, 32 br i1 %v4, label %b2, label %b3 b2: ; preds = %b6, %b3, %b1 %v5 = phi i32 [ 0, %b3 ], [ 0, %b1 ], [ %v13, %b6 ] br label %b8 b3: ; preds = %b1 %v6 = and i32 %a2, -2 %v7 = getelementptr i32, i32* %a0, i32 %v6 %v8 = getelementptr i32, i32* %a1, i32 %v6 %v9 = icmp ugt i32* %v8, %a0 %v10 = icmp ugt i32* %v7, %a1 %v11 = and i1 %v9, %v10 br i1 %v11, label %b2, label %b4 b4: ; preds = %b3 %v12 = and i32 %v3, -32 %v13 = shl i32 %v12, 1 br label %b5 b5: ; preds = %b5, %b4 %v14 = phi i32 [ 0, %b4 ], [ %v34, %b5 ] %v15 = shl i32 %v14, 1 %v16 = or i32 %v15, 1 %v17 = getelementptr inbounds i32, i32* %a1, i32 -1 %v18 = getelementptr inbounds i32, i32* %v17, i32 %v16 %v19 = bitcast i32* %v18 to <64 x i32>* %v20 = load <64 x i32>, <64 x i32>* %v19, align 4, !tbaa !1 %v21 = shufflevector <64 x i32> %v20, <64 x i32> undef, <32 x i32> %v22 = shufflevector <64 x i32> %v20, <64 x i32> undef, <32 x i32> %v23 = getelementptr inbounds i32, i32* %a0, i32 %v15 %v24 = bitcast i32* %v23 to <64 x i32>* %v25 = load <64 x i32>, <64 x i32>* %v24, align 4, !tbaa !1 %v26 = shufflevector <64 x i32> %v25, <64 x i32> undef, <32 x i32> %v27 = shufflevector <64 x i32> %v25, <64 x i32> undef, <32 x i32> %v28 = add nsw <32 x i32> %v26, %v22 %v29 = getelementptr inbounds i32, i32* %a0, i32 -1 %v30 = add nsw <32 x i32> %v27, %v21 %v31 = getelementptr inbounds i32, i32* %v29, i32 %v16 %v32 = bitcast i32* %v31 to <64 x i32>* %v33 = shufflevector <32 x i32> %v28, <32 x i32> %v30, <64 x i32> store <64 x i32> %v33, <64 x i32>* %v32, align 4, !tbaa !1 %v34 = add i32 %v14, 32 %v35 = icmp eq i32 %v34, %v12 br i1 %v35, label %b6, label %b5, !llvm.loop !5 b6: ; preds = %b5 %v36 = icmp eq i32 %v3, %v12 br i1 %v36, label %b7, label %b2 b7: ; preds = %b8, %b6, %b0 ret void b8: ; preds = %b8, %b2 %v37 = phi i32 [ %v49, %b8 ], [ %v5, %b2 ] %v38 = or i32 %v37, 1 %v39 = getelementptr inbounds i32, i32* %a1, i32 %v38 %v40 = load i32, i32* %v39, align 4, !tbaa !1 %v41 = getelementptr inbounds i32, i32* %a0, i32 %v37 %v42 = load i32, i32* %v41, align 4, !tbaa !1 %v43 = add nsw i32 %v42, %v40 store i32 %v43, i32* %v41, align 4, !tbaa !1 %v44 = getelementptr inbounds i32, i32* %a1, i32 %v37 %v45 = load i32, i32* %v44, align 4, !tbaa !1 %v46 = getelementptr inbounds i32, i32* %a0, i32 %v38 %v47 = load i32, i32* %v46, align 4, !tbaa !1 %v48 = add nsw i32 %v47, %v45 store i32 %v48, i32* %v46, align 4, !tbaa !1 %v49 = add nuw nsw i32 %v37, 2 %v50 = icmp eq i32 %v49, %a2 br i1 %v50, label %b7, label %b8, !llvm.loop !7 } attributes #0 = { norecurse nounwind "target-cpu"="hexagonv60" "target-features"="+hvx-length128b,+hvxv60" } !llvm.module.flags = !{!0} !0 = !{i32 1, !"wchar_size", i32 4} !1 = !{!2, !2, i64 0} !2 = !{!"int", !3, i64 0} !3 = !{!"omnipotent char", !4, i64 0} !4 = !{!"Simple C/C++ TBAA"} !5 = distinct !{!5, !6} !6 = !{!"llvm.loop.isvectorized", i32 1} !7 = distinct !{!7, !6}