; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -O3 -mcpu=knl -S | FileCheck %s -check-prefix=AVX512 ; RUN: opt < %s -O3 -mcpu=knl -force-vector-width=2 -S | FileCheck %s -check-prefix=FVW2 ; With a force-vector-width, it is sometimes more profitable to generate ; scalarized and predicated stores instead of masked scatter. target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-pc_linux" ; The source code: ; ;void foo1(float * __restrict__ in, float * __restrict__ out, int * __restrict__ trigger, int * __restrict__ index) { ; ; for (int i=0; i < SIZE; ++i) { ; if (trigger[i] > 0) { ; out[i] = in[index[i]] + (float) 0.5; ; } ; } ;} ; Function Attrs: nounwind uwtable define void @foo1(float* noalias %in, float* noalias %out, i32* noalias %trigger, i32* noalias %index) { ; AVX512-LABEL: @foo1( ; AVX512-NEXT: iter.check: ; AVX512-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX512: vector.body: ; AVX512-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ITER_CHECK:%.*]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY]] ] ; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDEX6]] ; AVX512-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <16 x i32>* ; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP1]], align 4 ; AVX512-NEXT: [[TMP2:%.*]] = icmp sgt <16 x i32> [[WIDE_LOAD]], zeroinitializer ; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[INDEX:%.*]], i64 [[INDEX6]] ; AVX512-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <16 x i32>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP4]], i32 4, <16 x i1> [[TMP2]], <16 x i32> poison) ; AVX512-NEXT: [[TMP5:%.*]] = sext <16 x i32> [[WIDE_MASKED_LOAD]] to <16 x i64> ; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[IN:%.*]], <16 x i64> [[TMP5]] ; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x float> 
@llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP6]], i32 4, <16 x i1> [[TMP2]], <16 x float> undef) ; AVX512-NEXT: [[TMP7:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER]], ; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[INDEX6]] ; AVX512-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <16 x float>* ; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP7]], <16 x float>* [[TMP9]], i32 4, <16 x i1> [[TMP2]]) ; AVX512-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX6]], 16 ; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT]] ; AVX512-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <16 x i32>* ; AVX512-NEXT: [[WIDE_LOAD_1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP11]], align 4 ; AVX512-NEXT: [[TMP12:%.*]] = icmp sgt <16 x i32> [[WIDE_LOAD_1]], zeroinitializer ; AVX512-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDEX_NEXT]] ; AVX512-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <16 x i32>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP14]], i32 4, <16 x i1> [[TMP12]], <16 x i32> poison) ; AVX512-NEXT: [[TMP15:%.*]] = sext <16 x i32> [[WIDE_MASKED_LOAD_1]] to <16 x i64> ; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[IN]], <16 x i64> [[TMP15]] ; AVX512-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP16]], i32 4, <16 x i1> [[TMP12]], <16 x float> undef) ; AVX512-NEXT: [[TMP17:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER_1]], ; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDEX_NEXT]] ; AVX512-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP18]] to <16 x float>* ; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP17]], <16 x float>* [[TMP19]], i32 4, <16 x i1> [[TMP12]]) ; AVX512-NEXT: [[INDEX_NEXT_1:%.*]] = or i64 [[INDEX6]], 
32 ; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT_1]] ; AVX512-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <16 x i32>* ; AVX512-NEXT: [[WIDE_LOAD_2:%.*]] = load <16 x i32>, <16 x i32>* [[TMP21]], align 4 ; AVX512-NEXT: [[TMP22:%.*]] = icmp sgt <16 x i32> [[WIDE_LOAD_2]], zeroinitializer ; AVX512-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDEX_NEXT_1]] ; AVX512-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <16 x i32>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD_2:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP24]], i32 4, <16 x i1> [[TMP22]], <16 x i32> poison) ; AVX512-NEXT: [[TMP25:%.*]] = sext <16 x i32> [[WIDE_MASKED_LOAD_2]] to <16 x i64> ; AVX512-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[IN]], <16 x i64> [[TMP25]] ; AVX512-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP26]], i32 4, <16 x i1> [[TMP22]], <16 x float> undef) ; AVX512-NEXT: [[TMP27:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER_2]], ; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDEX_NEXT_1]] ; AVX512-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to <16 x float>* ; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP27]], <16 x float>* [[TMP29]], i32 4, <16 x i1> [[TMP22]]) ; AVX512-NEXT: [[INDEX_NEXT_2:%.*]] = or i64 [[INDEX6]], 48 ; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT_2]] ; AVX512-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <16 x i32>* ; AVX512-NEXT: [[WIDE_LOAD_3:%.*]] = load <16 x i32>, <16 x i32>* [[TMP31]], align 4 ; AVX512-NEXT: [[TMP32:%.*]] = icmp sgt <16 x i32> [[WIDE_LOAD_3]], zeroinitializer ; AVX512-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDEX_NEXT_2]] ; AVX512-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <16 x i32>* ; AVX512-NEXT: 
[[WIDE_MASKED_LOAD_3:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP34]], i32 4, <16 x i1> [[TMP32]], <16 x i32> poison) ; AVX512-NEXT: [[TMP35:%.*]] = sext <16 x i32> [[WIDE_MASKED_LOAD_3]] to <16 x i64> ; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[IN]], <16 x i64> [[TMP35]] ; AVX512-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP36]], i32 4, <16 x i1> [[TMP32]], <16 x float> undef) ; AVX512-NEXT: [[TMP37:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER_3]], ; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDEX_NEXT_2]] ; AVX512-NEXT: [[TMP39:%.*]] = bitcast float* [[TMP38]] to <16 x float>* ; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP37]], <16 x float>* [[TMP39]], i32 4, <16 x i1> [[TMP32]]) ; AVX512-NEXT: [[INDEX_NEXT_3]] = add nuw nsw i64 [[INDEX6]], 64 ; AVX512-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT_3]], 4096 ; AVX512-NEXT: br i1 [[TMP40]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] ; AVX512: for.end: ; AVX512-NEXT: ret void ; ; FVW2-LABEL: @foo1( ; FVW2-NEXT: entry: ; FVW2-NEXT: br label [[VECTOR_BODY:%.*]] ; FVW2: vector.body: ; FVW2-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ] ; FVW2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDEX6]] ; FVW2-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <2 x i32>* ; FVW2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4 ; FVW2-NEXT: [[TMP2:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], zeroinitializer ; FVW2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[INDEX:%.*]], i64 [[INDEX6]] ; FVW2-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <2 x i32>* ; FVW2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP4]], i32 4, <2 x i1> [[TMP2]], <2 x i32> poison) 
; FVW2-NEXT: [[TMP5:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD]] to <2 x i64> ; FVW2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[IN:%.*]], <2 x i64> [[TMP5]] ; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP6]], i32 4, <2 x i1> [[TMP2]], <2 x float> undef) ; FVW2-NEXT: [[TMP7:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], ; FVW2-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[INDEX6]] ; FVW2-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <2 x float>* ; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP7]], <2 x float>* [[TMP9]], i32 4, <2 x i1> [[TMP2]]) ; FVW2-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX6]], 2 ; FVW2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT]] ; FVW2-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <2 x i32>* ; FVW2-NEXT: [[WIDE_LOAD_1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP11]], align 4 ; FVW2-NEXT: [[TMP12:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD_1]], zeroinitializer ; FVW2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDEX_NEXT]] ; FVW2-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <2 x i32>* ; FVW2-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* nonnull [[TMP14]], i32 4, <2 x i1> [[TMP12]], <2 x i32> poison) ; FVW2-NEXT: [[TMP15:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD_1]] to <2 x i64> ; FVW2-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[IN]], <2 x i64> [[TMP15]] ; FVW2-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP16]], i32 4, <2 x i1> [[TMP12]], <2 x float> undef) ; FVW2-NEXT: [[TMP17:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER_1]], ; FVW2-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDEX_NEXT]] ; FVW2-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP18]] to <2 x float>* ; FVW2-NEXT: call void 
@llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP17]], <2 x float>* [[TMP19]], i32 4, <2 x i1> [[TMP12]]) ; FVW2-NEXT: [[INDEX_NEXT_1]] = add nuw nsw i64 [[INDEX6]], 4 ; FVW2-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT_1]], 4096 ; FVW2-NEXT: br i1 [[TMP20]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] ; FVW2: for.end: ; FVW2-NEXT: ret void ; entry: %in.addr = alloca float*, align 8 %out.addr = alloca float*, align 8 %trigger.addr = alloca i32*, align 8 %index.addr = alloca i32*, align 8 %i = alloca i32, align 4 store float* %in, float** %in.addr, align 8 store float* %out, float** %out.addr, align 8 store i32* %trigger, i32** %trigger.addr, align 8 store i32* %index, i32** %index.addr, align 8 store i32 0, i32* %i, align 4 br label %for.cond for.cond: ; preds = %for.inc, %entry %0 = load i32, i32* %i, align 4 %cmp = icmp slt i32 %0, 4096 br i1 %cmp, label %for.body, label %for.end for.body: ; preds = %for.cond %1 = load i32, i32* %i, align 4 %idxprom = sext i32 %1 to i64 %2 = load i32*, i32** %trigger.addr, align 8 %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom %3 = load i32, i32* %arrayidx, align 4 %cmp1 = icmp sgt i32 %3, 0 br i1 %cmp1, label %if.then, label %if.end if.then: ; preds = %for.body %4 = load i32, i32* %i, align 4 %idxprom2 = sext i32 %4 to i64 %5 = load i32*, i32** %index.addr, align 8 %arrayidx3 = getelementptr inbounds i32, i32* %5, i64 %idxprom2 %6 = load i32, i32* %arrayidx3, align 4 %idxprom4 = sext i32 %6 to i64 %7 = load float*, float** %in.addr, align 8 %arrayidx5 = getelementptr inbounds float, float* %7, i64 %idxprom4 %8 = load float, float* %arrayidx5, align 4 %add = fadd float %8, 5.000000e-01 %9 = load i32, i32* %i, align 4 %idxprom6 = sext i32 %9 to i64 %10 = load float*, float** %out.addr, align 8 %arrayidx7 = getelementptr inbounds float, float* %10, i64 %idxprom6 store float %add, float* %arrayidx7, align 4 br label %if.end if.end: ; preds = %if.then, %for.body br label %for.inc for.inc: 
; preds = %if.end %11 = load i32, i32* %i, align 4 %inc = add nsw i32 %11, 1 store i32 %inc, i32* %i, align 4 br label %for.cond for.end: ; preds = %for.cond ret void } ; The source code ;void foo2 (In * __restrict__ in, float * __restrict__ out, int * __restrict__ trigger) { ; ; for (int i=0; i < SIZE; i += 16) { ; if (trigger[i] > 0) { ; out[i] = in[i].b + (float) 0.5; ; } ; } ;} %struct.In = type { float, float } define void @foo2(%struct.In* noalias %in, float* noalias %out, i32* noalias %trigger, i32* noalias %index) #0 { ; AVX512-LABEL: @foo2( ; AVX512-NEXT: entry: ; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer ; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef) ; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7]], ; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP3]], <16 x float*> [[TMP4]], i32 4, <16 x i1> [[TMP1]]) ; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP5]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP6:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_1]], zeroinitializer ; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_1:%.*]] = call <16 x float> 
@llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP7]], i32 4, <16 x i1> [[TMP6]], <16 x float> undef) ; AVX512-NEXT: [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_1]], ; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP8]], <16 x float*> [[TMP9]], i32 4, <16 x i1> [[TMP6]]) ; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP10]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP11:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_2]], zeroinitializer ; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP12]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef) ; AVX512-NEXT: [[TMP13:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_2]], ; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP13]], <16 x float*> [[TMP14]], i32 4, <16 x i1> [[TMP11]]) ; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP15]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP16:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_3]], zeroinitializer ; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP17]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef) ; AVX512-NEXT: [[TMP18:%.*]] = fadd <16 x float> 
[[WIDE_MASKED_GATHER7_3]], ; AVX512-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP18]], <16 x float*> [[TMP19]], i32 4, <16 x i1> [[TMP16]]) ; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_4:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP20]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP21:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_4]], zeroinitializer ; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_4:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP22]], i32 4, <16 x i1> [[TMP21]], <16 x float> undef) ; AVX512-NEXT: [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_4]], ; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP23]], <16 x float*> [[TMP24]], i32 4, <16 x i1> [[TMP21]]) ; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_5:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP25]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP26:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_5]], zeroinitializer ; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP27]], i32 4, <16 x i1> [[TMP26]], <16 x float> undef) ; AVX512-NEXT: [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_5]], ; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void 
@llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP28]], <16 x float*> [[TMP29]], i32 4, <16 x i1> [[TMP26]]) ; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_6:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP30]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP31:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_6]], zeroinitializer ; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP32]], i32 4, <16 x i1> [[TMP31]], <16 x float> undef) ; AVX512-NEXT: [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_6]], ; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP33]], <16 x float*> [[TMP34]], i32 4, <16 x i1> [[TMP31]]) ; AVX512-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_7:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP35]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP36:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_7]], zeroinitializer ; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP37]], i32 4, <16 x i1> [[TMP36]], <16 x float> undef) ; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_7]], ; AVX512-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP38]], <16 x float*> [[TMP39]], i32 4, <16 x i1> [[TMP36]]) ; AVX512-NEXT: [[TMP40:%.*]] = 
getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_8:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP40]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP41:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_8]], zeroinitializer ; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP42]], i32 4, <16 x i1> [[TMP41]], <16 x float> undef) ; AVX512-NEXT: [[TMP43:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_8]], ; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP43]], <16 x float*> [[TMP44]], i32 4, <16 x i1> [[TMP41]]) ; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_9:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP45]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP46:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_9]], zeroinitializer ; AVX512-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_9:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP47]], i32 4, <16 x i1> [[TMP46]], <16 x float> undef) ; AVX512-NEXT: [[TMP48:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_9]], ; AVX512-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP48]], <16 x float*> [[TMP49]], i32 4, <16 x i1> [[TMP46]]) ; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_10:%.*]] = call <16 x i32> 
@llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP50]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_10]], zeroinitializer ; AVX512-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_10:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP52]], i32 4, <16 x i1> [[TMP51]], <16 x float> undef) ; AVX512-NEXT: [[TMP53:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_10]], ; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP53]], <16 x float*> [[TMP54]], i32 4, <16 x i1> [[TMP51]]) ; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_11:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP55]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP56:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_11]], zeroinitializer ; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_11:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP57]], i32 4, <16 x i1> [[TMP56]], <16 x float> undef) ; AVX512-NEXT: [[TMP58:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_11]], ; AVX512-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP58]], <16 x float*> [[TMP59]], i32 4, <16 x i1> [[TMP56]]) ; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_12:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP60]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP61:%.*]] = icmp sgt <16 x 
i32> [[WIDE_MASKED_GATHER_12]], zeroinitializer ; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_12:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP62]], i32 4, <16 x i1> [[TMP61]], <16 x float> undef) ; AVX512-NEXT: [[TMP63:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_12]], ; AVX512-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP63]], <16 x float*> [[TMP64]], i32 4, <16 x i1> [[TMP61]]) ; AVX512-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_13:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP65]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP66:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_13]], zeroinitializer ; AVX512-NEXT: [[TMP67:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_13:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP67]], i32 4, <16 x i1> [[TMP66]], <16 x float> undef) ; AVX512-NEXT: [[TMP68:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_13]], ; AVX512-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP68]], <16 x float*> [[TMP69]], i32 4, <16 x i1> [[TMP66]]) ; AVX512-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_14:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP70]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP71:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_14]], zeroinitializer ; AVX512-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x 
i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_14:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP72]], i32 4, <16 x i1> [[TMP71]], <16 x float> undef) ; AVX512-NEXT: [[TMP73:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_14]], ; AVX512-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP73]], <16 x float*> [[TMP74]], i32 4, <16 x i1> [[TMP71]]) ; AVX512-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_15:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP75]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP76:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_15]], zeroinitializer ; AVX512-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_15:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP77]], i32 4, <16 x i1> [[TMP76]], <16 x float> undef) ; AVX512-NEXT: [[TMP78:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_15]], ; AVX512-NEXT: [[TMP79:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP78]], <16 x float*> [[TMP79]], i32 4, <16 x i1> [[TMP76]]) ; AVX512-NEXT: ret void ; ; FVW2-LABEL: @foo2( ; FVW2-NEXT: entry: ; FVW2-NEXT: br label [[VECTOR_BODY:%.*]] ; FVW2: vector.body: ; FVW2-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE30:%.*]] ] ; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE30]] ] ; FVW2-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], ; FVW2-NEXT: [[STEP_ADD7:%.*]] = add <2 x i64> [[VEC_IND]], ; FVW2-NEXT: [[STEP_ADD8:%.*]] = add <2 x i64> [[VEC_IND]], ; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 
[[INDEX6]], 4 ; FVW2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <2 x i64> [[VEC_IND]] ; FVW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD]] ; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD7]] ; FVW2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD8]] ; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP0]], i32 4, <2 x i1> , <2 x i32> undef) ; FVW2-NEXT: [[WIDE_MASKED_GATHER10:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP1]], i32 4, <2 x i1> , <2 x i32> undef) ; FVW2-NEXT: [[WIDE_MASKED_GATHER11:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP2]], i32 4, <2 x i1> , <2 x i32> undef) ; FVW2-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP3]], i32 4, <2 x i1> , <2 x i32> undef) ; FVW2-NEXT: [[TMP4:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer ; FVW2-NEXT: [[TMP5:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER10]], zeroinitializer ; FVW2-NEXT: [[TMP6:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER11]], zeroinitializer ; FVW2-NEXT: [[TMP7:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER12]], zeroinitializer ; FVW2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1 ; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD]], i32 1 ; FVW2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD7]], i32 1 ; FVW2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD8]], i32 1 ; FVW2-NEXT: [[WIDE_MASKED_GATHER13:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP8]], i32 4, <2 x i1> [[TMP4]], <2 x float> undef) ; 
FVW2-NEXT: [[WIDE_MASKED_GATHER14:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP9]], i32 4, <2 x i1> [[TMP5]], <2 x float> undef) ; FVW2-NEXT: [[WIDE_MASKED_GATHER15:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP10]], i32 4, <2 x i1> [[TMP6]], <2 x float> undef) ; FVW2-NEXT: [[WIDE_MASKED_GATHER16:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP11]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef) ; FVW2-NEXT: [[TMP12:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER13]], ; FVW2-NEXT: [[TMP13:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER14]], ; FVW2-NEXT: [[TMP14:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER15]], ; FVW2-NEXT: [[TMP15:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER16]], ; FVW2-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 ; FVW2-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; FVW2: pred.store.if: ; FVW2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[OFFSET_IDX]] ; FVW2-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP12]], i32 0 ; FVW2-NEXT: store float [[TMP18]], float* [[TMP17]], align 4 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]] ; FVW2: pred.store.continue: ; FVW2-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 ; FVW2-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]] ; FVW2: pred.store.if17: ; FVW2-NEXT: [[TMP20:%.*]] = or i64 [[OFFSET_IDX]], 16 ; FVW2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP20]] ; FVW2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[TMP12]], i32 1 ; FVW2-NEXT: store float [[TMP22]], float* [[TMP21]], align 4 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE18]] ; FVW2: pred.store.continue18: ; FVW2-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 ; FVW2-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]] ; FVW2: 
pred.store.if19: ; FVW2-NEXT: [[TMP24:%.*]] = or i64 [[OFFSET_IDX]], 32 ; FVW2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP24]] ; FVW2-NEXT: [[TMP26:%.*]] = extractelement <2 x float> [[TMP13]], i32 0 ; FVW2-NEXT: store float [[TMP26]], float* [[TMP25]], align 4 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE20]] ; FVW2: pred.store.continue20: ; FVW2-NEXT: [[TMP27:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 ; FVW2-NEXT: br i1 [[TMP27]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]] ; FVW2: pred.store.if21: ; FVW2-NEXT: [[TMP28:%.*]] = or i64 [[OFFSET_IDX]], 48 ; FVW2-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP28]] ; FVW2-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[TMP13]], i32 1 ; FVW2-NEXT: store float [[TMP30]], float* [[TMP29]], align 4 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE22]] ; FVW2: pred.store.continue22: ; FVW2-NEXT: [[TMP31:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0 ; FVW2-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]] ; FVW2: pred.store.if23: ; FVW2-NEXT: [[TMP32:%.*]] = or i64 [[OFFSET_IDX]], 64 ; FVW2-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP32]] ; FVW2-NEXT: [[TMP34:%.*]] = extractelement <2 x float> [[TMP14]], i32 0 ; FVW2-NEXT: store float [[TMP34]], float* [[TMP33]], align 4 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE24]] ; FVW2: pred.store.continue24: ; FVW2-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1 ; FVW2-NEXT: br i1 [[TMP35]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]] ; FVW2: pred.store.if25: ; FVW2-NEXT: [[TMP36:%.*]] = or i64 [[OFFSET_IDX]], 80 ; FVW2-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP36]] ; FVW2-NEXT: [[TMP38:%.*]] = extractelement <2 x float> [[TMP14]], i32 1 ; FVW2-NEXT: store float [[TMP38]], float* [[TMP37]], align 4 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE26]] ; FVW2: 
pred.store.continue26: ; FVW2-NEXT: [[TMP39:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 ; FVW2-NEXT: br i1 [[TMP39]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]] ; FVW2: pred.store.if27: ; FVW2-NEXT: [[TMP40:%.*]] = or i64 [[OFFSET_IDX]], 96 ; FVW2-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP40]] ; FVW2-NEXT: [[TMP42:%.*]] = extractelement <2 x float> [[TMP15]], i32 0 ; FVW2-NEXT: store float [[TMP42]], float* [[TMP41]], align 4 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE28]] ; FVW2: pred.store.continue28: ; FVW2-NEXT: [[TMP43:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 ; FVW2-NEXT: br i1 [[TMP43]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30]] ; FVW2: pred.store.if29: ; FVW2-NEXT: [[TMP44:%.*]] = or i64 [[OFFSET_IDX]], 112 ; FVW2-NEXT: [[TMP45:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP44]] ; FVW2-NEXT: [[TMP46:%.*]] = extractelement <2 x float> [[TMP15]], i32 1 ; FVW2-NEXT: store float [[TMP46]], float* [[TMP45]], align 4 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE30]] ; FVW2: pred.store.continue30: ; FVW2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX6]], 8 ; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], ; FVW2-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 ; FVW2-NEXT: br i1 [[TMP47]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], [[LOOP2:!llvm.loop !.*]] ; FVW2: for.end: ; FVW2-NEXT: ret void ; entry: %in.addr = alloca %struct.In*, align 8 %out.addr = alloca float*, align 8 %trigger.addr = alloca i32*, align 8 %index.addr = alloca i32*, align 8 %i = alloca i32, align 4 store %struct.In* %in, %struct.In** %in.addr, align 8 store float* %out, float** %out.addr, align 8 store i32* %trigger, i32** %trigger.addr, align 8 store i32* %index, i32** %index.addr, align 8 store i32 0, i32* %i, align 4 br label %for.cond for.cond: ; preds = %for.inc, %entry %0 = load i32, i32* %i, align 4 %cmp = icmp slt i32 %0, 4096 br i1 %cmp, label %for.body, label %for.end 
for.body: ; preds = %for.cond %1 = load i32, i32* %i, align 4 %idxprom = sext i32 %1 to i64 %2 = load i32*, i32** %trigger.addr, align 8 %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom %3 = load i32, i32* %arrayidx, align 4 %cmp1 = icmp sgt i32 %3, 0 br i1 %cmp1, label %if.then, label %if.end if.then: ; preds = %for.body %4 = load i32, i32* %i, align 4 %idxprom2 = sext i32 %4 to i64 %5 = load %struct.In*, %struct.In** %in.addr, align 8 %arrayidx3 = getelementptr inbounds %struct.In, %struct.In* %5, i64 %idxprom2 %b = getelementptr inbounds %struct.In, %struct.In* %arrayidx3, i32 0, i32 1 %6 = load float, float* %b, align 4 %add = fadd float %6, 5.000000e-01 %7 = load i32, i32* %i, align 4 %idxprom4 = sext i32 %7 to i64 %8 = load float*, float** %out.addr, align 8 %arrayidx5 = getelementptr inbounds float, float* %8, i64 %idxprom4 store float %add, float* %arrayidx5, align 4 br label %if.end if.end: ; preds = %if.then, %for.body br label %for.inc for.inc: ; preds = %if.end %9 = load i32, i32* %i, align 4 %inc = add nsw i32 %9, 16 store i32 %inc, i32* %i, align 4 br label %for.cond for.end: ; preds = %for.cond ret void } ; The source code ;struct Out { ; float a; ; float b; ;}; ;void foo3 (In * __restrict__ in, Out * __restrict__ out, int * __restrict__ trigger) { ; ; for (int i=0; i<SIZE; i+=16) { ; if (trigger[i] > 0) { ; out[i].b = in[i].b + (float) 0.5; ; } ; } ;} %struct.Out = type { float, float } define void @foo3(%struct.In* noalias %in, %struct.Out* noalias %out, i32* noalias %trigger) { ; AVX512-LABEL: @foo3( ; AVX512-NEXT: entry: ; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer ; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <16 x i64> , i32 1 ; 
AVX512-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef) ; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6]], ; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_OUT:%.*]], %struct.Out* [[OUT:%.*]], <16 x i64> , i32 1 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP3]], <16 x float*> [[TMP4]], i32 4, <16 x i1> [[TMP1]]) ; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP5]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP6:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_1]], zeroinitializer ; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER6_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP7]], i32 4, <16 x i1> [[TMP6]], <16 x float> undef) ; AVX512-NEXT: [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_1]], ; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP8]], <16 x float*> [[TMP9]], i32 4, <16 x i1> [[TMP6]]) ; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP10]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP11:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_2]], zeroinitializer ; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER6_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> 
[[TMP12]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef) ; AVX512-NEXT: [[TMP13:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_2]], ; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP13]], <16 x float*> [[TMP14]], i32 4, <16 x i1> [[TMP11]]) ; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP15]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP16:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_3]], zeroinitializer ; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER6_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP17]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef) ; AVX512-NEXT: [[TMP18:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_3]], ; AVX512-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP18]], <16 x float*> [[TMP19]], i32 4, <16 x i1> [[TMP16]]) ; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_4:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP20]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP21:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_4]], zeroinitializer ; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER6_4:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP22]], i32 4, <16 x i1> [[TMP21]], <16 x float> undef) ; AVX512-NEXT: [[TMP23:%.*]] = fadd <16 x 
float> [[WIDE_MASKED_GATHER6_4]], ; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP23]], <16 x float*> [[TMP24]], i32 4, <16 x i1> [[TMP21]]) ; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_5:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP25]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP26:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_5]], zeroinitializer ; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER6_5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP27]], i32 4, <16 x i1> [[TMP26]], <16 x float> undef) ; AVX512-NEXT: [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_5]], ; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP28]], <16 x float*> [[TMP29]], i32 4, <16 x i1> [[TMP26]]) ; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_6:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP30]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP31:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_6]], zeroinitializer ; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER6_6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP32]], i32 4, <16 x i1> [[TMP31]], <16 x float> undef) ; AVX512-NEXT: [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_6]], ; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT_OUT]], 
%struct.Out* [[OUT]], <16 x i64> , i32 1 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP33]], <16 x float*> [[TMP34]], i32 4, <16 x i1> [[TMP31]]) ; AVX512-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_7:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP35]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP36:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_7]], zeroinitializer ; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER6_7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP37]], i32 4, <16 x i1> [[TMP36]], <16 x float> undef) ; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_7]], ; AVX512-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP38]], <16 x float*> [[TMP39]], i32 4, <16 x i1> [[TMP36]]) ; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_8:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP40]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP41:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_8]], zeroinitializer ; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER6_8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP42]], i32 4, <16 x i1> [[TMP41]], <16 x float> undef) ; AVX512-NEXT: [[TMP43:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_8]], ; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 ; AVX512-NEXT: call void 
@llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP43]], <16 x float*> [[TMP44]], i32 4, <16 x i1> [[TMP41]]) ; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_9:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP45]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP46:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_9]], zeroinitializer ; AVX512-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER6_9:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP47]], i32 4, <16 x i1> [[TMP46]], <16 x float> undef) ; AVX512-NEXT: [[TMP48:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_9]], ; AVX512-NEXT: [[TMP49:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP48]], <16 x float*> [[TMP49]], i32 4, <16 x i1> [[TMP46]]) ; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_10:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP50]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_10]], zeroinitializer ; AVX512-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER6_10:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP52]], i32 4, <16 x i1> [[TMP51]], <16 x float> undef) ; AVX512-NEXT: [[TMP53:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_10]], ; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP53]], <16 x float*> [[TMP54]], i32 4, <16 x 
i1> [[TMP51]]) ; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_11:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP55]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP56:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_11]], zeroinitializer ; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER6_11:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP57]], i32 4, <16 x i1> [[TMP56]], <16 x float> undef) ; AVX512-NEXT: [[TMP58:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_11]], ; AVX512-NEXT: [[TMP59:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP58]], <16 x float*> [[TMP59]], i32 4, <16 x i1> [[TMP56]]) ; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_12:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP60]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP61:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_12]], zeroinitializer ; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER6_12:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP62]], i32 4, <16 x i1> [[TMP61]], <16 x float> undef) ; AVX512-NEXT: [[TMP63:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_12]], ; AVX512-NEXT: [[TMP64:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP63]], <16 x float*> [[TMP64]], i32 4, <16 x i1> [[TMP61]]) ; AVX512-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x 
i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_13:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP65]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP66:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_13]], zeroinitializer ; AVX512-NEXT: [[TMP67:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER6_13:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP67]], i32 4, <16 x i1> [[TMP66]], <16 x float> undef) ; AVX512-NEXT: [[TMP68:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_13]], ; AVX512-NEXT: [[TMP69:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP68]], <16 x float*> [[TMP69]], i32 4, <16 x i1> [[TMP66]]) ; AVX512-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_14:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP70]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP71:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_14]], zeroinitializer ; AVX512-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER6_14:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP72]], i32 4, <16 x i1> [[TMP71]], <16 x float> undef) ; AVX512-NEXT: [[TMP73:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_14]], ; AVX512-NEXT: [[TMP74:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP73]], <16 x float*> [[TMP74]], i32 4, <16 x i1> [[TMP71]]) ; AVX512-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_15:%.*]] = call <16 x i32> 
@llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP75]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP76:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_15]], zeroinitializer ; AVX512-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER6_15:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP77]], i32 4, <16 x i1> [[TMP76]], <16 x float> undef) ; AVX512-NEXT: [[TMP78:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_15]], ; AVX512-NEXT: [[TMP79:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP78]], <16 x float*> [[TMP79]], i32 4, <16 x i1> [[TMP76]]) ; AVX512-NEXT: ret void ; ; FVW2-LABEL: @foo3( ; FVW2-NEXT: entry: ; FVW2-NEXT: br label [[VECTOR_BODY:%.*]] ; FVW2: vector.body: ; FVW2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE29:%.*]] ] ; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE29]] ] ; FVW2-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], ; FVW2-NEXT: [[STEP_ADD6:%.*]] = add <2 x i64> [[VEC_IND]], ; FVW2-NEXT: [[STEP_ADD7:%.*]] = add <2 x i64> [[VEC_IND]], ; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 4 ; FVW2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <2 x i64> [[VEC_IND]] ; FVW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD]] ; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD6]] ; FVW2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD7]] ; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP0]], i32 4, <2 x i1> , <2 x i32> undef) ; FVW2-NEXT: [[WIDE_MASKED_GATHER9:%.*]] = call <2 x i32> 
@llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP1]], i32 4, <2 x i1> , <2 x i32> undef) ; FVW2-NEXT: [[WIDE_MASKED_GATHER10:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP2]], i32 4, <2 x i1> , <2 x i32> undef) ; FVW2-NEXT: [[WIDE_MASKED_GATHER11:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP3]], i32 4, <2 x i1> , <2 x i32> undef) ; FVW2-NEXT: [[TMP4:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer ; FVW2-NEXT: [[TMP5:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER9]], zeroinitializer ; FVW2-NEXT: [[TMP6:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER10]], zeroinitializer ; FVW2-NEXT: [[TMP7:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER11]], zeroinitializer ; FVW2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1 ; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD]], i32 1 ; FVW2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD6]], i32 1 ; FVW2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD7]], i32 1 ; FVW2-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP8]], i32 4, <2 x i1> [[TMP4]], <2 x float> undef) ; FVW2-NEXT: [[WIDE_MASKED_GATHER13:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP9]], i32 4, <2 x i1> [[TMP5]], <2 x float> undef) ; FVW2-NEXT: [[WIDE_MASKED_GATHER14:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP10]], i32 4, <2 x i1> [[TMP6]], <2 x float> undef) ; FVW2-NEXT: [[WIDE_MASKED_GATHER15:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP11]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef) ; FVW2-NEXT: [[TMP12:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER12]], ; FVW2-NEXT: [[TMP13:%.*]] = fadd <2 x float> 
[[WIDE_MASKED_GATHER13]], ; FVW2-NEXT: [[TMP14:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER14]], ; FVW2-NEXT: [[TMP15:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER15]], ; FVW2-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 ; FVW2-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; FVW2: pred.store.if: ; FVW2-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_OUT:%.*]], %struct.Out* [[OUT:%.*]], i64 [[OFFSET_IDX]], i32 1 ; FVW2-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP12]], i32 0 ; FVW2-NEXT: store float [[TMP18]], float* [[TMP17]], align 4 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]] ; FVW2: pred.store.continue: ; FVW2-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 ; FVW2-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF16:%.*]], label [[PRED_STORE_CONTINUE17:%.*]] ; FVW2: pred.store.if16: ; FVW2-NEXT: [[TMP20:%.*]] = or i64 [[OFFSET_IDX]], 16 ; FVW2-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP20]], i32 1 ; FVW2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[TMP12]], i32 1 ; FVW2-NEXT: store float [[TMP22]], float* [[TMP21]], align 4 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE17]] ; FVW2: pred.store.continue17: ; FVW2-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 ; FVW2-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF18:%.*]], label [[PRED_STORE_CONTINUE19:%.*]] ; FVW2: pred.store.if18: ; FVW2-NEXT: [[TMP24:%.*]] = or i64 [[OFFSET_IDX]], 32 ; FVW2-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP24]], i32 1 ; FVW2-NEXT: [[TMP26:%.*]] = extractelement <2 x float> [[TMP13]], i32 0 ; FVW2-NEXT: store float [[TMP26]], float* [[TMP25]], align 4 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE19]] ; FVW2: pred.store.continue19: ; FVW2-NEXT: [[TMP27:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 ; FVW2-NEXT: br i1 [[TMP27]], label [[PRED_STORE_IF20:%.*]], label [[PRED_STORE_CONTINUE21:%.*]] ; 
FVW2: pred.store.if20: ; FVW2-NEXT: [[TMP28:%.*]] = or i64 [[OFFSET_IDX]], 48 ; FVW2-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP28]], i32 1 ; FVW2-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[TMP13]], i32 1 ; FVW2-NEXT: store float [[TMP30]], float* [[TMP29]], align 4 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE21]] ; FVW2: pred.store.continue21: ; FVW2-NEXT: [[TMP31:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0 ; FVW2-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF22:%.*]], label [[PRED_STORE_CONTINUE23:%.*]] ; FVW2: pred.store.if22: ; FVW2-NEXT: [[TMP32:%.*]] = or i64 [[OFFSET_IDX]], 64 ; FVW2-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP32]], i32 1 ; FVW2-NEXT: [[TMP34:%.*]] = extractelement <2 x float> [[TMP14]], i32 0 ; FVW2-NEXT: store float [[TMP34]], float* [[TMP33]], align 4 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE23]] ; FVW2: pred.store.continue23: ; FVW2-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1 ; FVW2-NEXT: br i1 [[TMP35]], label [[PRED_STORE_IF24:%.*]], label [[PRED_STORE_CONTINUE25:%.*]] ; FVW2: pred.store.if24: ; FVW2-NEXT: [[TMP36:%.*]] = or i64 [[OFFSET_IDX]], 80 ; FVW2-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP36]], i32 1 ; FVW2-NEXT: [[TMP38:%.*]] = extractelement <2 x float> [[TMP14]], i32 1 ; FVW2-NEXT: store float [[TMP38]], float* [[TMP37]], align 4 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE25]] ; FVW2: pred.store.continue25: ; FVW2-NEXT: [[TMP39:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 ; FVW2-NEXT: br i1 [[TMP39]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE27:%.*]] ; FVW2: pred.store.if26: ; FVW2-NEXT: [[TMP40:%.*]] = or i64 [[OFFSET_IDX]], 96 ; FVW2-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP40]], i32 1 ; FVW2-NEXT: [[TMP42:%.*]] = extractelement <2 x float> [[TMP15]], i32 0 ; FVW2-NEXT: store float 
[[TMP42]], float* [[TMP41]], align 4 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE27]] ; FVW2: pred.store.continue27: ; FVW2-NEXT: [[TMP43:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 ; FVW2-NEXT: br i1 [[TMP43]], label [[PRED_STORE_IF28:%.*]], label [[PRED_STORE_CONTINUE29]] ; FVW2: pred.store.if28: ; FVW2-NEXT: [[TMP44:%.*]] = or i64 [[OFFSET_IDX]], 112 ; FVW2-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP44]], i32 1 ; FVW2-NEXT: [[TMP46:%.*]] = extractelement <2 x float> [[TMP15]], i32 1 ; FVW2-NEXT: store float [[TMP46]], float* [[TMP45]], align 4 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE29]] ; FVW2: pred.store.continue29: ; FVW2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 ; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], ; FVW2-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 ; FVW2-NEXT: br i1 [[TMP47]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], [[LOOP3:!llvm.loop !.*]] ; FVW2: for.end: ; FVW2-NEXT: ret void ; entry: %in.addr = alloca %struct.In*, align 8 %out.addr = alloca %struct.Out*, align 8 %trigger.addr = alloca i32*, align 8 %i = alloca i32, align 4 store %struct.In* %in, %struct.In** %in.addr, align 8 store %struct.Out* %out, %struct.Out** %out.addr, align 8 store i32* %trigger, i32** %trigger.addr, align 8 store i32 0, i32* %i, align 4 br label %for.cond for.cond: ; preds = %for.inc, %entry %0 = load i32, i32* %i, align 4 %cmp = icmp slt i32 %0, 4096 br i1 %cmp, label %for.body, label %for.end for.body: ; preds = %for.cond %1 = load i32, i32* %i, align 4 %idxprom = sext i32 %1 to i64 %2 = load i32*, i32** %trigger.addr, align 8 %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom %3 = load i32, i32* %arrayidx, align 4 %cmp1 = icmp sgt i32 %3, 0 br i1 %cmp1, label %if.then, label %if.end if.then: ; preds = %for.body %4 = load i32, i32* %i, align 4 %idxprom2 = sext i32 %4 to i64 %5 = load %struct.In*, %struct.In** %in.addr, align 8 %arrayidx3 = getelementptr inbounds 
%struct.In, %struct.In* %5, i64 %idxprom2 %b = getelementptr inbounds %struct.In, %struct.In* %arrayidx3, i32 0, i32 1 %6 = load float, float* %b, align 4 %add = fadd float %6, 5.000000e-01 %7 = load i32, i32* %i, align 4 %idxprom4 = sext i32 %7 to i64 %8 = load %struct.Out*, %struct.Out** %out.addr, align 8 %arrayidx5 = getelementptr inbounds %struct.Out, %struct.Out* %8, i64 %idxprom4 %b6 = getelementptr inbounds %struct.Out, %struct.Out* %arrayidx5, i32 0, i32 1 store float %add, float* %b6, align 4 br label %if.end if.end: ; preds = %if.then, %for.body br label %for.inc for.inc: ; preds = %if.end %9 = load i32, i32* %i, align 4 %inc = add nsw i32 %9, 16 store i32 %inc, i32* %i, align 4 br label %for.cond for.end: ; preds = %for.cond ret void } declare void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float>, <16 x float*>, i32, <16 x i1>) ; The same as @foo2 but the scatter/gather argument is a vector of pointers in address space 1 define void @foo2_addrspace(%struct.In addrspace(1)* noalias %in, float addrspace(1)* noalias %out, i32* noalias %trigger, i32* noalias %index) #0 { ; AVX512-LABEL: @foo2_addrspace( ; AVX512-NEXT: entry: ; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer ; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef) ; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7]], ; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], <16 x i64> ; AVX512-NEXT: call void 
@llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP3]], <16 x float addrspace(1)*> [[TMP4]], i32 4, <16 x i1> [[TMP1]]) ; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP5]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP6:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_1]], zeroinitializer ; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP7]], i32 4, <16 x i1> [[TMP6]], <16 x float> undef) ; AVX512-NEXT: [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_1]], ; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP8]], <16 x float addrspace(1)*> [[TMP9]], i32 4, <16 x i1> [[TMP6]]) ; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP10]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP11:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_2]], zeroinitializer ; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP12]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef) ; AVX512-NEXT: [[TMP13:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_2]], ; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> 
[[TMP13]], <16 x float addrspace(1)*> [[TMP14]], i32 4, <16 x i1> [[TMP11]]) ; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP15]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP16:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_3]], zeroinitializer ; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP17]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef) ; AVX512-NEXT: [[TMP18:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_3]], ; AVX512-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP18]], <16 x float addrspace(1)*> [[TMP19]], i32 4, <16 x i1> [[TMP16]]) ; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_4:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP20]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP21:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_4]], zeroinitializer ; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_4:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP22]], i32 4, <16 x i1> [[TMP21]], <16 x float> undef) ; AVX512-NEXT: [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_4]], ; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP23]], <16 x float addrspace(1)*> 
[[TMP24]], i32 4, <16 x i1> [[TMP21]]) ; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_5:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP25]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP26:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_5]], zeroinitializer ; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP27]], i32 4, <16 x i1> [[TMP26]], <16 x float> undef) ; AVX512-NEXT: [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_5]], ; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP28]], <16 x float addrspace(1)*> [[TMP29]], i32 4, <16 x i1> [[TMP26]]) ; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_6:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP30]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP31:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_6]], zeroinitializer ; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP32]], i32 4, <16 x i1> [[TMP31]], <16 x float> undef) ; AVX512-NEXT: [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_6]], ; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP33]], <16 x float addrspace(1)*> [[TMP34]], i32 4, <16 x i1> [[TMP31]]) ; 
AVX512-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_7:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP35]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP36:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_7]], zeroinitializer ; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP37]], i32 4, <16 x i1> [[TMP36]], <16 x float> undef) ; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_7]], ; AVX512-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP38]], <16 x float addrspace(1)*> [[TMP39]], i32 4, <16 x i1> [[TMP36]]) ; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_8:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP40]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP41:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_8]], zeroinitializer ; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP42]], i32 4, <16 x i1> [[TMP41]], <16 x float> undef) ; AVX512-NEXT: [[TMP43:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_8]], ; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP43]], <16 x float addrspace(1)*> [[TMP44]], i32 4, <16 x i1> [[TMP41]]) ; AVX512-NEXT: [[TMP45:%.*]] = 
getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_9:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP45]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP46:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_9]], zeroinitializer ; AVX512-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_9:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP47]], i32 4, <16 x i1> [[TMP46]], <16 x float> undef) ; AVX512-NEXT: [[TMP48:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_9]], ; AVX512-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP48]], <16 x float addrspace(1)*> [[TMP49]], i32 4, <16 x i1> [[TMP46]]) ; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_10:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP50]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_10]], zeroinitializer ; AVX512-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_10:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP52]], i32 4, <16 x i1> [[TMP51]], <16 x float> undef) ; AVX512-NEXT: [[TMP53:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_10]], ; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP53]], <16 x float addrspace(1)*> [[TMP54]], i32 4, <16 x i1> [[TMP51]]) ; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32* 
[[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_11:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP55]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP56:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_11]], zeroinitializer ; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_11:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP57]], i32 4, <16 x i1> [[TMP56]], <16 x float> undef) ; AVX512-NEXT: [[TMP58:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_11]], ; AVX512-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP58]], <16 x float addrspace(1)*> [[TMP59]], i32 4, <16 x i1> [[TMP56]]) ; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_12:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP60]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP61:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_12]], zeroinitializer ; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_12:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP62]], i32 4, <16 x i1> [[TMP61]], <16 x float> undef) ; AVX512-NEXT: [[TMP63:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_12]], ; AVX512-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP63]], <16 x float addrspace(1)*> [[TMP64]], i32 4, <16 x i1> [[TMP61]]) ; AVX512-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; 
AVX512-NEXT: [[WIDE_MASKED_GATHER_13:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP65]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP66:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_13]], zeroinitializer ; AVX512-NEXT: [[TMP67:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_13:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP67]], i32 4, <16 x i1> [[TMP66]], <16 x float> undef) ; AVX512-NEXT: [[TMP68:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_13]], ; AVX512-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP68]], <16 x float addrspace(1)*> [[TMP69]], i32 4, <16 x i1> [[TMP66]]) ; AVX512-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_14:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP70]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP71:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_14]], zeroinitializer ; AVX512-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_14:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP72]], i32 4, <16 x i1> [[TMP71]], <16 x float> undef) ; AVX512-NEXT: [[TMP73:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_14]], ; AVX512-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP73]], <16 x float addrspace(1)*> [[TMP74]], i32 4, <16 x i1> [[TMP71]]) ; AVX512-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: 
[[WIDE_MASKED_GATHER_15:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP75]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP76:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_15]], zeroinitializer ; AVX512-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_15:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP77]], i32 4, <16 x i1> [[TMP76]], <16 x float> undef) ; AVX512-NEXT: [[TMP78:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_15]], ; AVX512-NEXT: [[TMP79:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP78]], <16 x float addrspace(1)*> [[TMP79]], i32 4, <16 x i1> [[TMP76]]) ; AVX512-NEXT: ret void ; ; FVW2-LABEL: @foo2_addrspace( ; FVW2-NEXT: entry: ; FVW2-NEXT: br label [[VECTOR_BODY:%.*]] ; FVW2: vector.body: ; FVW2-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE30:%.*]] ] ; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE30]] ] ; FVW2-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], ; FVW2-NEXT: [[STEP_ADD7:%.*]] = add <2 x i64> [[VEC_IND]], ; FVW2-NEXT: [[STEP_ADD8:%.*]] = add <2 x i64> [[VEC_IND]], ; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX6]], 4 ; FVW2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <2 x i64> [[VEC_IND]] ; FVW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD]] ; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD7]] ; FVW2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD8]] ; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP0]], i32 4, <2 x 
i1> , <2 x i32> undef) ; FVW2-NEXT: [[WIDE_MASKED_GATHER10:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP1]], i32 4, <2 x i1> , <2 x i32> undef) ; FVW2-NEXT: [[WIDE_MASKED_GATHER11:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP2]], i32 4, <2 x i1> , <2 x i32> undef) ; FVW2-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP3]], i32 4, <2 x i1> , <2 x i32> undef) ; FVW2-NEXT: [[TMP4:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer ; FVW2-NEXT: [[TMP5:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER10]], zeroinitializer ; FVW2-NEXT: [[TMP6:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER11]], zeroinitializer ; FVW2-NEXT: [[TMP7:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER12]], zeroinitializer ; FVW2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1 ; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <2 x i64> [[STEP_ADD]], i32 1 ; FVW2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <2 x i64> [[STEP_ADD7]], i32 1 ; FVW2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <2 x i64> [[STEP_ADD8]], i32 1 ; FVW2-NEXT: [[WIDE_MASKED_GATHER13:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP8]], i32 4, <2 x i1> [[TMP4]], <2 x float> undef) ; FVW2-NEXT: [[WIDE_MASKED_GATHER14:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP9]], i32 4, <2 x i1> [[TMP5]], <2 x float> undef) ; FVW2-NEXT: [[WIDE_MASKED_GATHER15:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP10]], i32 4, <2 x i1> [[TMP6]], <2 x float> undef) ; FVW2-NEXT: [[WIDE_MASKED_GATHER16:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float 
addrspace(1)*> [[TMP11]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef) ; FVW2-NEXT: [[TMP12:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER13]], ; FVW2-NEXT: [[TMP13:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER14]], ; FVW2-NEXT: [[TMP14:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER15]], ; FVW2-NEXT: [[TMP15:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER16]], ; FVW2-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 ; FVW2-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; FVW2: pred.store.if: ; FVW2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], i64 [[OFFSET_IDX]] ; FVW2-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP12]], i32 0 ; FVW2-NEXT: store float [[TMP18]], float addrspace(1)* [[TMP17]], align 4 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]] ; FVW2: pred.store.continue: ; FVW2-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 ; FVW2-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]] ; FVW2: pred.store.if17: ; FVW2-NEXT: [[TMP20:%.*]] = or i64 [[OFFSET_IDX]], 16 ; FVW2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP20]] ; FVW2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[TMP12]], i32 1 ; FVW2-NEXT: store float [[TMP22]], float addrspace(1)* [[TMP21]], align 4 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE18]] ; FVW2: pred.store.continue18: ; FVW2-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 ; FVW2-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]] ; FVW2: pred.store.if19: ; FVW2-NEXT: [[TMP24:%.*]] = or i64 [[OFFSET_IDX]], 32 ; FVW2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP24]] ; FVW2-NEXT: [[TMP26:%.*]] = extractelement <2 x float> [[TMP13]], i32 0 ; FVW2-NEXT: store float [[TMP26]], float addrspace(1)* [[TMP25]], align 4 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE20]] ; 
FVW2: pred.store.continue20: ; FVW2-NEXT: [[TMP27:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 ; FVW2-NEXT: br i1 [[TMP27]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]] ; FVW2: pred.store.if21: ; FVW2-NEXT: [[TMP28:%.*]] = or i64 [[OFFSET_IDX]], 48 ; FVW2-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP28]] ; FVW2-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[TMP13]], i32 1 ; FVW2-NEXT: store float [[TMP30]], float addrspace(1)* [[TMP29]], align 4 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE22]] ; FVW2: pred.store.continue22: ; FVW2-NEXT: [[TMP31:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0 ; FVW2-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]] ; FVW2: pred.store.if23: ; FVW2-NEXT: [[TMP32:%.*]] = or i64 [[OFFSET_IDX]], 64 ; FVW2-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP32]] ; FVW2-NEXT: [[TMP34:%.*]] = extractelement <2 x float> [[TMP14]], i32 0 ; FVW2-NEXT: store float [[TMP34]], float addrspace(1)* [[TMP33]], align 4 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE24]] ; FVW2: pred.store.continue24: ; FVW2-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1 ; FVW2-NEXT: br i1 [[TMP35]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]] ; FVW2: pred.store.if25: ; FVW2-NEXT: [[TMP36:%.*]] = or i64 [[OFFSET_IDX]], 80 ; FVW2-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP36]] ; FVW2-NEXT: [[TMP38:%.*]] = extractelement <2 x float> [[TMP14]], i32 1 ; FVW2-NEXT: store float [[TMP38]], float addrspace(1)* [[TMP37]], align 4 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE26]] ; FVW2: pred.store.continue26: ; FVW2-NEXT: [[TMP39:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 ; FVW2-NEXT: br i1 [[TMP39]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]] ; FVW2: pred.store.if27: ; FVW2-NEXT: [[TMP40:%.*]] = or i64 [[OFFSET_IDX]], 96 
; FVW2-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP40]] ; FVW2-NEXT: [[TMP42:%.*]] = extractelement <2 x float> [[TMP15]], i32 0 ; FVW2-NEXT: store float [[TMP42]], float addrspace(1)* [[TMP41]], align 4 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE28]] ; FVW2: pred.store.continue28: ; FVW2-NEXT: [[TMP43:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 ; FVW2-NEXT: br i1 [[TMP43]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30]] ; FVW2: pred.store.if29: ; FVW2-NEXT: [[TMP44:%.*]] = or i64 [[OFFSET_IDX]], 112 ; FVW2-NEXT: [[TMP45:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP44]] ; FVW2-NEXT: [[TMP46:%.*]] = extractelement <2 x float> [[TMP15]], i32 1 ; FVW2-NEXT: store float [[TMP46]], float addrspace(1)* [[TMP45]], align 4 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE30]] ; FVW2: pred.store.continue30: ; FVW2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX6]], 8 ; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], ; FVW2-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 ; FVW2-NEXT: br i1 [[TMP47]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]] ; FVW2: for.end: ; FVW2-NEXT: ret void ; entry: %in.addr = alloca %struct.In addrspace(1)*, align 8 %out.addr = alloca float addrspace(1)*, align 8 %trigger.addr = alloca i32*, align 8 %index.addr = alloca i32*, align 8 %i = alloca i32, align 4 store %struct.In addrspace(1)* %in, %struct.In addrspace(1)** %in.addr, align 8 store float addrspace(1)* %out, float addrspace(1)** %out.addr, align 8 store i32* %trigger, i32** %trigger.addr, align 8 store i32* %index, i32** %index.addr, align 8 store i32 0, i32* %i, align 4 br label %for.cond for.cond: ; preds = %for.inc, %entry %0 = load i32, i32* %i, align 4 %cmp = icmp slt i32 %0, 4096 br i1 %cmp, label %for.body, label %for.end for.body: ; preds = %for.cond %1 = load i32, i32* %i, align 4 %idxprom = sext i32 %1 to i64 %2 = load i32*, i32** %trigger.addr, align 8 
%arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom %3 = load i32, i32* %arrayidx, align 4 %cmp1 = icmp sgt i32 %3, 0 br i1 %cmp1, label %if.then, label %if.end if.then: ; preds = %for.body %4 = load i32, i32* %i, align 4 %idxprom2 = sext i32 %4 to i64 %5 = load %struct.In addrspace(1)*, %struct.In addrspace(1)** %in.addr, align 8 %arrayidx3 = getelementptr inbounds %struct.In, %struct.In addrspace(1)* %5, i64 %idxprom2 %b = getelementptr inbounds %struct.In, %struct.In addrspace(1)* %arrayidx3, i32 0, i32 1 %6 = load float, float addrspace(1)* %b, align 4 %add = fadd float %6, 5.000000e-01 %7 = load i32, i32* %i, align 4 %idxprom4 = sext i32 %7 to i64 %8 = load float addrspace(1)*, float addrspace(1)** %out.addr, align 8 %arrayidx5 = getelementptr inbounds float, float addrspace(1)* %8, i64 %idxprom4 store float %add, float addrspace(1)* %arrayidx5, align 4 br label %if.end if.end: ; preds = %if.then, %for.body br label %for.inc for.inc: ; preds = %if.end %9 = load i32, i32* %i, align 4 %inc = add nsw i32 %9, 16 store i32 %inc, i32* %i, align 4 br label %for.cond for.end: ; preds = %for.cond ret void } ; Same as foo2_addrspace but here only the input has the non-default address space. 
define void @foo2_addrspace2(%struct.In addrspace(1)* noalias %in, float addrspace(0)* noalias %out, i32* noalias %trigger, i32* noalias %index) { ; AVX512-LABEL: @foo2_addrspace2( ; AVX512-NEXT: entry: ; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer ; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef) ; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7]], ; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP3]], <16 x float*> [[TMP4]], i32 4, <16 x i1> [[TMP1]]) ; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP5]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP6:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_1]], zeroinitializer ; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP7]], i32 4, <16 x i1> [[TMP6]], <16 x float> undef) ; AVX512-NEXT: [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_1]], ; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void 
@llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP8]], <16 x float*> [[TMP9]], i32 4, <16 x i1> [[TMP6]]) ; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP10]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP11:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_2]], zeroinitializer ; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP12]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef) ; AVX512-NEXT: [[TMP13:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_2]], ; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP13]], <16 x float*> [[TMP14]], i32 4, <16 x i1> [[TMP11]]) ; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP15]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP16:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_3]], zeroinitializer ; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP17]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef) ; AVX512-NEXT: [[TMP18:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_3]], ; AVX512-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP18]], <16 x float*> [[TMP19]], i32 4, <16 
x i1> [[TMP16]]) ; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_4:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP20]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP21:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_4]], zeroinitializer ; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_4:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP22]], i32 4, <16 x i1> [[TMP21]], <16 x float> undef) ; AVX512-NEXT: [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_4]], ; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP23]], <16 x float*> [[TMP24]], i32 4, <16 x i1> [[TMP21]]) ; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_5:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP25]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP26:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_5]], zeroinitializer ; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP27]], i32 4, <16 x i1> [[TMP26]], <16 x float> undef) ; AVX512-NEXT: [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_5]], ; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP28]], <16 x float*> [[TMP29]], i32 4, <16 x i1> [[TMP26]]) ; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], 
<16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_6:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP30]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP31:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_6]], zeroinitializer ; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP32]], i32 4, <16 x i1> [[TMP31]], <16 x float> undef) ; AVX512-NEXT: [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_6]], ; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP33]], <16 x float*> [[TMP34]], i32 4, <16 x i1> [[TMP31]]) ; AVX512-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_7:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP35]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP36:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_7]], zeroinitializer ; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP37]], i32 4, <16 x i1> [[TMP36]], <16 x float> undef) ; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_7]], ; AVX512-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP38]], <16 x float*> [[TMP39]], i32 4, <16 x i1> [[TMP36]]) ; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_8:%.*]] = call <16 x i32> 
@llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP40]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP41:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_8]], zeroinitializer ; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP42]], i32 4, <16 x i1> [[TMP41]], <16 x float> undef) ; AVX512-NEXT: [[TMP43:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_8]], ; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP43]], <16 x float*> [[TMP44]], i32 4, <16 x i1> [[TMP41]]) ; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_9:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP45]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP46:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_9]], zeroinitializer ; AVX512-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_9:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP47]], i32 4, <16 x i1> [[TMP46]], <16 x float> undef) ; AVX512-NEXT: [[TMP48:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_9]], ; AVX512-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP48]], <16 x float*> [[TMP49]], i32 4, <16 x i1> [[TMP46]]) ; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_10:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP50]], i32 4, <16 x i1> , <16 x i32> 
undef) ; AVX512-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_10]], zeroinitializer ; AVX512-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_10:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP52]], i32 4, <16 x i1> [[TMP51]], <16 x float> undef) ; AVX512-NEXT: [[TMP53:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_10]], ; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP53]], <16 x float*> [[TMP54]], i32 4, <16 x i1> [[TMP51]]) ; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_11:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP55]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP56:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_11]], zeroinitializer ; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_11:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP57]], i32 4, <16 x i1> [[TMP56]], <16 x float> undef) ; AVX512-NEXT: [[TMP58:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_11]], ; AVX512-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP58]], <16 x float*> [[TMP59]], i32 4, <16 x i1> [[TMP56]]) ; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_12:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP60]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP61:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_12]], 
zeroinitializer ; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_12:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP62]], i32 4, <16 x i1> [[TMP61]], <16 x float> undef) ; AVX512-NEXT: [[TMP63:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_12]], ; AVX512-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP63]], <16 x float*> [[TMP64]], i32 4, <16 x i1> [[TMP61]]) ; AVX512-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_13:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP65]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP66:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_13]], zeroinitializer ; AVX512-NEXT: [[TMP67:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_13:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP67]], i32 4, <16 x i1> [[TMP66]], <16 x float> undef) ; AVX512-NEXT: [[TMP68:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_13]], ; AVX512-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP68]], <16 x float*> [[TMP69]], i32 4, <16 x i1> [[TMP66]]) ; AVX512-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_14:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP70]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP71:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_14]], zeroinitializer ; AVX512-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT_IN]], 
[[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_14:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP72]], i32 4, <16 x i1> [[TMP71]], <16 x float> undef) ; AVX512-NEXT: [[TMP73:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_14]], ; AVX512-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP73]], <16 x float*> [[TMP74]], i32 4, <16 x i1> [[TMP71]]) ; AVX512-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_15:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP75]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP76:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_15]], zeroinitializer ; AVX512-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_15:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP77]], i32 4, <16 x i1> [[TMP76]], <16 x float> undef) ; AVX512-NEXT: [[TMP78:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_15]], ; AVX512-NEXT: [[TMP79:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP78]], <16 x float*> [[TMP79]], i32 4, <16 x i1> [[TMP76]]) ; AVX512-NEXT: ret void ; ; FVW2-LABEL: @foo2_addrspace2( ; FVW2-NEXT: entry: ; FVW2-NEXT: br label [[VECTOR_BODY:%.*]] ; FVW2: vector.body: ; FVW2-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE30:%.*]] ] ; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE30]] ] ; FVW2-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], ; FVW2-NEXT: [[STEP_ADD7:%.*]] = add <2 x i64> [[VEC_IND]], ; 
FVW2-NEXT: [[STEP_ADD8:%.*]] = add <2 x i64> [[VEC_IND]], ; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX6]], 4 ; FVW2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <2 x i64> [[VEC_IND]] ; FVW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD]] ; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD7]] ; FVW2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD8]] ; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP0]], i32 4, <2 x i1> , <2 x i32> undef) ; FVW2-NEXT: [[WIDE_MASKED_GATHER10:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP1]], i32 4, <2 x i1> , <2 x i32> undef) ; FVW2-NEXT: [[WIDE_MASKED_GATHER11:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP2]], i32 4, <2 x i1> , <2 x i32> undef) ; FVW2-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP3]], i32 4, <2 x i1> , <2 x i32> undef) ; FVW2-NEXT: [[TMP4:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer ; FVW2-NEXT: [[TMP5:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER10]], zeroinitializer ; FVW2-NEXT: [[TMP6:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER11]], zeroinitializer ; FVW2-NEXT: [[TMP7:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER12]], zeroinitializer ; FVW2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1 ; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <2 x i64> [[STEP_ADD]], i32 1 ; FVW2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <2 x i64> [[STEP_ADD7]], i32 1 ; FVW2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <2 x i64> [[STEP_ADD8]], i32 1 ; 
FVW2-NEXT: [[WIDE_MASKED_GATHER13:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP8]], i32 4, <2 x i1> [[TMP4]], <2 x float> undef) ; FVW2-NEXT: [[WIDE_MASKED_GATHER14:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP9]], i32 4, <2 x i1> [[TMP5]], <2 x float> undef) ; FVW2-NEXT: [[WIDE_MASKED_GATHER15:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP10]], i32 4, <2 x i1> [[TMP6]], <2 x float> undef) ; FVW2-NEXT: [[WIDE_MASKED_GATHER16:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP11]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef) ; FVW2-NEXT: [[TMP12:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER13]], ; FVW2-NEXT: [[TMP13:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER14]], ; FVW2-NEXT: [[TMP14:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER15]], ; FVW2-NEXT: [[TMP15:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER16]], ; FVW2-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 ; FVW2-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; FVW2: pred.store.if: ; FVW2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[OFFSET_IDX]] ; FVW2-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP12]], i32 0 ; FVW2-NEXT: store float [[TMP18]], float* [[TMP17]], align 4 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]] ; FVW2: pred.store.continue: ; FVW2-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 ; FVW2-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]] ; FVW2: pred.store.if17: ; FVW2-NEXT: [[TMP20:%.*]] = or i64 [[OFFSET_IDX]], 16 ; FVW2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP20]] ; FVW2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[TMP12]], i32 1 ; FVW2-NEXT: store float [[TMP22]], float* [[TMP21]], align 4 ; FVW2-NEXT: br label 
[[PRED_STORE_CONTINUE18]] ; FVW2: pred.store.continue18: ; FVW2-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 ; FVW2-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]] ; FVW2: pred.store.if19: ; FVW2-NEXT: [[TMP24:%.*]] = or i64 [[OFFSET_IDX]], 32 ; FVW2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP24]] ; FVW2-NEXT: [[TMP26:%.*]] = extractelement <2 x float> [[TMP13]], i32 0 ; FVW2-NEXT: store float [[TMP26]], float* [[TMP25]], align 4 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE20]] ; FVW2: pred.store.continue20: ; FVW2-NEXT: [[TMP27:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 ; FVW2-NEXT: br i1 [[TMP27]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]] ; FVW2: pred.store.if21: ; FVW2-NEXT: [[TMP28:%.*]] = or i64 [[OFFSET_IDX]], 48 ; FVW2-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP28]] ; FVW2-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[TMP13]], i32 1 ; FVW2-NEXT: store float [[TMP30]], float* [[TMP29]], align 4 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE22]] ; FVW2: pred.store.continue22: ; FVW2-NEXT: [[TMP31:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0 ; FVW2-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]] ; FVW2: pred.store.if23: ; FVW2-NEXT: [[TMP32:%.*]] = or i64 [[OFFSET_IDX]], 64 ; FVW2-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP32]] ; FVW2-NEXT: [[TMP34:%.*]] = extractelement <2 x float> [[TMP14]], i32 0 ; FVW2-NEXT: store float [[TMP34]], float* [[TMP33]], align 4 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE24]] ; FVW2: pred.store.continue24: ; FVW2-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1 ; FVW2-NEXT: br i1 [[TMP35]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]] ; FVW2: pred.store.if25: ; FVW2-NEXT: [[TMP36:%.*]] = or i64 [[OFFSET_IDX]], 80 ; FVW2-NEXT: [[TMP37:%.*]] = getelementptr 
inbounds float, float* [[OUT]], i64 [[TMP36]] ; FVW2-NEXT: [[TMP38:%.*]] = extractelement <2 x float> [[TMP14]], i32 1 ; FVW2-NEXT: store float [[TMP38]], float* [[TMP37]], align 4 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE26]] ; FVW2: pred.store.continue26: ; FVW2-NEXT: [[TMP39:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 ; FVW2-NEXT: br i1 [[TMP39]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]] ; FVW2: pred.store.if27: ; FVW2-NEXT: [[TMP40:%.*]] = or i64 [[OFFSET_IDX]], 96 ; FVW2-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP40]] ; FVW2-NEXT: [[TMP42:%.*]] = extractelement <2 x float> [[TMP15]], i32 0 ; FVW2-NEXT: store float [[TMP42]], float* [[TMP41]], align 4 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE28]] ; FVW2: pred.store.continue28: ; FVW2-NEXT: [[TMP43:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 ; FVW2-NEXT: br i1 [[TMP43]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30]] ; FVW2: pred.store.if29: ; FVW2-NEXT: [[TMP44:%.*]] = or i64 [[OFFSET_IDX]], 112 ; FVW2-NEXT: [[TMP45:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP44]] ; FVW2-NEXT: [[TMP46:%.*]] = extractelement <2 x float> [[TMP15]], i32 1 ; FVW2-NEXT: store float [[TMP46]], float* [[TMP45]], align 4 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE30]] ; FVW2: pred.store.continue30: ; FVW2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX6]], 8 ; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], ; FVW2-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 ; FVW2-NEXT: br i1 [[TMP47]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], [[LOOP5:!llvm.loop !.*]] ; FVW2: for.end: ; FVW2-NEXT: ret void ; entry: %in.addr = alloca %struct.In addrspace(1)*, align 8 %out.addr = alloca float addrspace(0)*, align 8 %trigger.addr = alloca i32*, align 8 %index.addr = alloca i32*, align 8 %i = alloca i32, align 4 store %struct.In addrspace(1)* %in, %struct.In addrspace(1)** %in.addr, align 8 store float addrspace(0)* %out, float 
addrspace(0)** %out.addr, align 8 store i32* %trigger, i32** %trigger.addr, align 8 store i32* %index, i32** %index.addr, align 8 store i32 0, i32* %i, align 4 br label %for.cond for.cond: ; preds = %for.inc, %entry %0 = load i32, i32* %i, align 4 %cmp = icmp slt i32 %0, 4096 br i1 %cmp, label %for.body, label %for.end for.body: ; preds = %for.cond %1 = load i32, i32* %i, align 4 %idxprom = sext i32 %1 to i64 %2 = load i32*, i32** %trigger.addr, align 8 %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom %3 = load i32, i32* %arrayidx, align 4 %cmp1 = icmp sgt i32 %3, 0 br i1 %cmp1, label %if.then, label %if.end if.then: ; preds = %for.body %4 = load i32, i32* %i, align 4 %idxprom2 = sext i32 %4 to i64 %5 = load %struct.In addrspace(1)*, %struct.In addrspace(1)** %in.addr, align 8 %arrayidx3 = getelementptr inbounds %struct.In, %struct.In addrspace(1)* %5, i64 %idxprom2 %b = getelementptr inbounds %struct.In, %struct.In addrspace(1)* %arrayidx3, i32 0, i32 1 %6 = load float, float addrspace(1)* %b, align 4 %add = fadd float %6, 5.000000e-01 %7 = load i32, i32* %i, align 4 %idxprom4 = sext i32 %7 to i64 %8 = load float addrspace(0)*, float addrspace(0)** %out.addr, align 8 %arrayidx5 = getelementptr inbounds float, float addrspace(0)* %8, i64 %idxprom4 store float %add, float addrspace(0)* %arrayidx5, align 4 br label %if.end if.end: ; preds = %if.then, %for.body br label %for.inc for.inc: ; preds = %if.end %9 = load i32, i32* %i, align 4 %inc = add nsw i32 %9, 16 store i32 %inc, i32* %i, align 4 br label %for.cond for.end: ; preds = %for.cond ret void } ; Same as foo2_addrspace but here only the output has the non-default address space. 
define void @foo2_addrspace3(%struct.In addrspace(0)* noalias %in, float addrspace(1)* noalias %out, i32* noalias %trigger, i32* noalias %index) { ; AVX512-LABEL: @foo2_addrspace3( ; AVX512-NEXT: entry: ; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer ; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef) ; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7]], ; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP3]], <16 x float addrspace(1)*> [[TMP4]], i32 4, <16 x i1> [[TMP1]]) ; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP5]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP6:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_1]], zeroinitializer ; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP7]], i32 4, <16 x i1> [[TMP6]], <16 x float> undef) ; AVX512-NEXT: [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_1]], ; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void 
@llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP8]], <16 x float addrspace(1)*> [[TMP9]], i32 4, <16 x i1> [[TMP6]]) ; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP10]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP11:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_2]], zeroinitializer ; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP12]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef) ; AVX512-NEXT: [[TMP13:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_2]], ; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP13]], <16 x float addrspace(1)*> [[TMP14]], i32 4, <16 x i1> [[TMP11]]) ; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP15]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP16:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_3]], zeroinitializer ; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP17]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef) ; AVX512-NEXT: [[TMP18:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_3]], ; AVX512-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP18]], <16 x float addrspace(1)*> [[TMP19]], i32 
4, <16 x i1> [[TMP16]]) ; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_4:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP20]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP21:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_4]], zeroinitializer ; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_4:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP22]], i32 4, <16 x i1> [[TMP21]], <16 x float> undef) ; AVX512-NEXT: [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_4]], ; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP23]], <16 x float addrspace(1)*> [[TMP24]], i32 4, <16 x i1> [[TMP21]]) ; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_5:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP25]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP26:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_5]], zeroinitializer ; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP27]], i32 4, <16 x i1> [[TMP26]], <16 x float> undef) ; AVX512-NEXT: [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_5]], ; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP28]], <16 x float addrspace(1)*> [[TMP29]], i32 4, <16 x i1> [[TMP26]]) ; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* 
[[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_6:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP30]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP31:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_6]], zeroinitializer ; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP32]], i32 4, <16 x i1> [[TMP31]], <16 x float> undef) ; AVX512-NEXT: [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_6]], ; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP33]], <16 x float addrspace(1)*> [[TMP34]], i32 4, <16 x i1> [[TMP31]]) ; AVX512-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_7:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP35]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP36:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_7]], zeroinitializer ; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP37]], i32 4, <16 x i1> [[TMP36]], <16 x float> undef) ; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_7]], ; AVX512-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP38]], <16 x float addrspace(1)*> [[TMP39]], i32 4, <16 x i1> [[TMP36]]) ; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_8:%.*]] = call <16 x i32> 
@llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP40]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP41:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_8]], zeroinitializer ; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP42]], i32 4, <16 x i1> [[TMP41]], <16 x float> undef) ; AVX512-NEXT: [[TMP43:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_8]], ; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP43]], <16 x float addrspace(1)*> [[TMP44]], i32 4, <16 x i1> [[TMP41]]) ; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_9:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP45]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP46:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_9]], zeroinitializer ; AVX512-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_9:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP47]], i32 4, <16 x i1> [[TMP46]], <16 x float> undef) ; AVX512-NEXT: [[TMP48:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_9]], ; AVX512-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP48]], <16 x float addrspace(1)*> [[TMP49]], i32 4, <16 x i1> [[TMP46]]) ; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_10:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP50]], i32 4, <16 x i1> , <16 x i32> undef) ; 
AVX512-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_10]], zeroinitializer ; AVX512-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_10:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP52]], i32 4, <16 x i1> [[TMP51]], <16 x float> undef) ; AVX512-NEXT: [[TMP53:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_10]], ; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP53]], <16 x float addrspace(1)*> [[TMP54]], i32 4, <16 x i1> [[TMP51]]) ; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_11:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP55]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP56:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_11]], zeroinitializer ; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_11:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP57]], i32 4, <16 x i1> [[TMP56]], <16 x float> undef) ; AVX512-NEXT: [[TMP58:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_11]], ; AVX512-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP58]], <16 x float addrspace(1)*> [[TMP59]], i32 4, <16 x i1> [[TMP56]]) ; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_12:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP60]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP61:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_12]], zeroinitializer 
; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_12:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP62]], i32 4, <16 x i1> [[TMP61]], <16 x float> undef) ; AVX512-NEXT: [[TMP63:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_12]], ; AVX512-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP63]], <16 x float addrspace(1)*> [[TMP64]], i32 4, <16 x i1> [[TMP61]]) ; AVX512-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_13:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP65]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP66:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_13]], zeroinitializer ; AVX512-NEXT: [[TMP67:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_13:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP67]], i32 4, <16 x i1> [[TMP66]], <16 x float> undef) ; AVX512-NEXT: [[TMP68:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_13]], ; AVX512-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP68]], <16 x float addrspace(1)*> [[TMP69]], i32 4, <16 x i1> [[TMP66]]) ; AVX512-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_14:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP70]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP71:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_14]], zeroinitializer ; AVX512-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], 
<16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_14:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP72]], i32 4, <16 x i1> [[TMP71]], <16 x float> undef) ; AVX512-NEXT: [[TMP73:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_14]], ; AVX512-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP73]], <16 x float addrspace(1)*> [[TMP74]], i32 4, <16 x i1> [[TMP71]]) ; AVX512-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_15:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP75]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP76:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_15]], zeroinitializer ; AVX512-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_15:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP77]], i32 4, <16 x i1> [[TMP76]], <16 x float> undef) ; AVX512-NEXT: [[TMP78:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_15]], ; AVX512-NEXT: [[TMP79:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP78]], <16 x float addrspace(1)*> [[TMP79]], i32 4, <16 x i1> [[TMP76]]) ; AVX512-NEXT: ret void ; ; FVW2-LABEL: @foo2_addrspace3( ; FVW2-NEXT: entry: ; FVW2-NEXT: br label [[VECTOR_BODY:%.*]] ; FVW2: vector.body: ; FVW2-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE30:%.*]] ] ; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE30]] ] ; FVW2-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], ; FVW2-NEXT: [[STEP_ADD7:%.*]] = add <2 x i64> [[VEC_IND]], ; FVW2-NEXT: [[STEP_ADD8:%.*]] = 
add <2 x i64> [[VEC_IND]], ; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX6]], 4 ; FVW2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <2 x i64> [[VEC_IND]] ; FVW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD]] ; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD7]] ; FVW2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD8]] ; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP0]], i32 4, <2 x i1> , <2 x i32> undef) ; FVW2-NEXT: [[WIDE_MASKED_GATHER10:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP1]], i32 4, <2 x i1> , <2 x i32> undef) ; FVW2-NEXT: [[WIDE_MASKED_GATHER11:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP2]], i32 4, <2 x i1> , <2 x i32> undef) ; FVW2-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP3]], i32 4, <2 x i1> , <2 x i32> undef) ; FVW2-NEXT: [[TMP4:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer ; FVW2-NEXT: [[TMP5:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER10]], zeroinitializer ; FVW2-NEXT: [[TMP6:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER11]], zeroinitializer ; FVW2-NEXT: [[TMP7:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER12]], zeroinitializer ; FVW2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1 ; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD]], i32 1 ; FVW2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD7]], i32 1 ; FVW2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD8]], i32 1 ; FVW2-NEXT: [[WIDE_MASKED_GATHER13:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x 
float*> [[TMP8]], i32 4, <2 x i1> [[TMP4]], <2 x float> undef) ; FVW2-NEXT: [[WIDE_MASKED_GATHER14:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP9]], i32 4, <2 x i1> [[TMP5]], <2 x float> undef) ; FVW2-NEXT: [[WIDE_MASKED_GATHER15:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP10]], i32 4, <2 x i1> [[TMP6]], <2 x float> undef) ; FVW2-NEXT: [[WIDE_MASKED_GATHER16:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP11]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef) ; FVW2-NEXT: [[TMP12:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER13]], ; FVW2-NEXT: [[TMP13:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER14]], ; FVW2-NEXT: [[TMP14:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER15]], ; FVW2-NEXT: [[TMP15:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER16]], ; FVW2-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 ; FVW2-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; FVW2: pred.store.if: ; FVW2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], i64 [[OFFSET_IDX]] ; FVW2-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP12]], i32 0 ; FVW2-NEXT: store float [[TMP18]], float addrspace(1)* [[TMP17]], align 4 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]] ; FVW2: pred.store.continue: ; FVW2-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 ; FVW2-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]] ; FVW2: pred.store.if17: ; FVW2-NEXT: [[TMP20:%.*]] = or i64 [[OFFSET_IDX]], 16 ; FVW2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP20]] ; FVW2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[TMP12]], i32 1 ; FVW2-NEXT: store float [[TMP22]], float addrspace(1)* [[TMP21]], align 4 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE18]] ; FVW2: pred.store.continue18: ; FVW2-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP5]], 
i32 0 ; FVW2-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]] ; FVW2: pred.store.if19: ; FVW2-NEXT: [[TMP24:%.*]] = or i64 [[OFFSET_IDX]], 32 ; FVW2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP24]] ; FVW2-NEXT: [[TMP26:%.*]] = extractelement <2 x float> [[TMP13]], i32 0 ; FVW2-NEXT: store float [[TMP26]], float addrspace(1)* [[TMP25]], align 4 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE20]] ; FVW2: pred.store.continue20: ; FVW2-NEXT: [[TMP27:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 ; FVW2-NEXT: br i1 [[TMP27]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]] ; FVW2: pred.store.if21: ; FVW2-NEXT: [[TMP28:%.*]] = or i64 [[OFFSET_IDX]], 48 ; FVW2-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP28]] ; FVW2-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[TMP13]], i32 1 ; FVW2-NEXT: store float [[TMP30]], float addrspace(1)* [[TMP29]], align 4 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE22]] ; FVW2: pred.store.continue22: ; FVW2-NEXT: [[TMP31:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0 ; FVW2-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]] ; FVW2: pred.store.if23: ; FVW2-NEXT: [[TMP32:%.*]] = or i64 [[OFFSET_IDX]], 64 ; FVW2-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP32]] ; FVW2-NEXT: [[TMP34:%.*]] = extractelement <2 x float> [[TMP14]], i32 0 ; FVW2-NEXT: store float [[TMP34]], float addrspace(1)* [[TMP33]], align 4 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE24]] ; FVW2: pred.store.continue24: ; FVW2-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1 ; FVW2-NEXT: br i1 [[TMP35]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]] ; FVW2: pred.store.if25: ; FVW2-NEXT: [[TMP36:%.*]] = or i64 [[OFFSET_IDX]], 80 ; FVW2-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 
[[TMP36]] ; FVW2-NEXT: [[TMP38:%.*]] = extractelement <2 x float> [[TMP14]], i32 1 ; FVW2-NEXT: store float [[TMP38]], float addrspace(1)* [[TMP37]], align 4 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE26]] ; FVW2: pred.store.continue26: ; FVW2-NEXT: [[TMP39:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 ; FVW2-NEXT: br i1 [[TMP39]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]] ; FVW2: pred.store.if27: ; FVW2-NEXT: [[TMP40:%.*]] = or i64 [[OFFSET_IDX]], 96 ; FVW2-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP40]] ; FVW2-NEXT: [[TMP42:%.*]] = extractelement <2 x float> [[TMP15]], i32 0 ; FVW2-NEXT: store float [[TMP42]], float addrspace(1)* [[TMP41]], align 4 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE28]] ; FVW2: pred.store.continue28: ; FVW2-NEXT: [[TMP43:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 ; FVW2-NEXT: br i1 [[TMP43]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30]] ; FVW2: pred.store.if29: ; FVW2-NEXT: [[TMP44:%.*]] = or i64 [[OFFSET_IDX]], 112 ; FVW2-NEXT: [[TMP45:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP44]] ; FVW2-NEXT: [[TMP46:%.*]] = extractelement <2 x float> [[TMP15]], i32 1 ; FVW2-NEXT: store float [[TMP46]], float addrspace(1)* [[TMP45]], align 4 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE30]] ; FVW2: pred.store.continue30: ; FVW2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX6]], 8 ; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], ; FVW2-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 ; FVW2-NEXT: br i1 [[TMP47]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]] ; FVW2: for.end: ; FVW2-NEXT: ret void ; entry: %in.addr = alloca %struct.In addrspace(0)*, align 8 %out.addr = alloca float addrspace(1)*, align 8 %trigger.addr = alloca i32*, align 8 %index.addr = alloca i32*, align 8 %i = alloca i32, align 4 store %struct.In addrspace(0)* %in, %struct.In addrspace(0)** %in.addr, align 8 store float 
addrspace(1)* %out, float addrspace(1)** %out.addr, align 8 store i32* %trigger, i32** %trigger.addr, align 8 store i32* %index, i32** %index.addr, align 8 store i32 0, i32* %i, align 4 br label %for.cond for.cond: ; preds = %for.inc, %entry %0 = load i32, i32* %i, align 4 %cmp = icmp slt i32 %0, 4096 br i1 %cmp, label %for.body, label %for.end for.body: ; preds = %for.cond %1 = load i32, i32* %i, align 4 %idxprom = sext i32 %1 to i64 %2 = load i32*, i32** %trigger.addr, align 8 %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom %3 = load i32, i32* %arrayidx, align 4 %cmp1 = icmp sgt i32 %3, 0 br i1 %cmp1, label %if.then, label %if.end if.then: ; preds = %for.body %4 = load i32, i32* %i, align 4 %idxprom2 = sext i32 %4 to i64 %5 = load %struct.In addrspace(0)*, %struct.In addrspace(0)** %in.addr, align 8 %arrayidx3 = getelementptr inbounds %struct.In, %struct.In addrspace(0)* %5, i64 %idxprom2 %b = getelementptr inbounds %struct.In, %struct.In addrspace(0)* %arrayidx3, i32 0, i32 1 %6 = load float, float addrspace(0)* %b, align 4 %add = fadd float %6, 5.000000e-01 %7 = load i32, i32* %i, align 4 %idxprom4 = sext i32 %7 to i64 %8 = load float addrspace(1)*, float addrspace(1)** %out.addr, align 8 %arrayidx5 = getelementptr inbounds float, float addrspace(1)* %8, i64 %idxprom4 store float %add, float addrspace(1)* %arrayidx5, align 4 br label %if.end if.end: ; preds = %if.then, %for.body br label %for.inc for.inc: ; preds = %if.end %9 = load i32, i32* %i, align 4 %inc = add nsw i32 %9, 16 store i32 %inc, i32* %i, align 4 br label %for.cond for.end: ; preds = %for.cond ret void } ; Using gathers is not profitable for this function. PR48429. 
define void @test_gather_not_profitable_pr48429(i32 %d, float* readonly %ptr, float* nocapture %dest) { ; AVX512-LABEL: @test_gather_not_profitable_pr48429( ; AVX512-NEXT: entry: ; AVX512-NEXT: [[IDX_EXT:%.*]] = sext i32 [[D:%.*]] to i64 ; AVX512-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds float, float* [[PTR:%.*]], i64 [[IDX_EXT]] ; AVX512-NEXT: [[CMP_NOT10:%.*]] = icmp eq i32 [[D]], 0 ; AVX512-NEXT: br i1 [[CMP_NOT10]], label [[FOR_END:%.*]], label [[FOR_BODY_LR_PH:%.*]] ; AVX512: for.body.lr.ph: ; AVX512-NEXT: [[MUL:%.*]] = sub nsw i32 0, [[D]] ; AVX512-NEXT: [[IDXPROM:%.*]] = sext i32 [[MUL]] to i64 ; AVX512-NEXT: [[TMP0:%.*]] = shl nsw i64 [[IDX_EXT]], 2 ; AVX512-NEXT: [[TMP1:%.*]] = add nsw i64 [[TMP0]], -4 ; AVX512-NEXT: [[TMP2:%.*]] = lshr exact i64 [[TMP1]], 2 ; AVX512-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 ; AVX512-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 60 ; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; AVX512: vector.memcheck: ; AVX512-NEXT: [[TMP4:%.*]] = shl nsw i64 [[IDX_EXT]], 2 ; AVX512-NEXT: [[TMP5:%.*]] = add nsw i64 [[TMP4]], -4 ; AVX512-NEXT: [[TMP6:%.*]] = lshr exact i64 [[TMP5]], 2 ; AVX512-NEXT: [[TMP7:%.*]] = shl nsw i64 [[TMP5]], 2 ; AVX512-NEXT: [[TMP8:%.*]] = or i64 [[TMP7]], 2 ; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[DEST:%.*]], i64 [[TMP8]] ; AVX512-NEXT: [[TMP9:%.*]] = add nuw nsw i64 [[TMP6]], 1 ; AVX512-NEXT: [[SCEVGEP4:%.*]] = getelementptr float, float* [[PTR]], i64 [[TMP9]] ; AVX512-NEXT: [[SCEVGEP6:%.*]] = getelementptr float, float* [[PTR]], i64 [[IDXPROM]] ; AVX512-NEXT: [[TMP10:%.*]] = add nuw nsw i64 [[TMP6]], 1 ; AVX512-NEXT: [[TMP11:%.*]] = sub i64 [[TMP10]], [[IDX_EXT]] ; AVX512-NEXT: [[SCEVGEP8:%.*]] = getelementptr float, float* [[PTR]], i64 [[TMP11]] ; AVX512-NEXT: [[BOUND0:%.*]] = icmp ugt float* [[SCEVGEP4]], [[DEST]] ; AVX512-NEXT: [[BOUND1:%.*]] = icmp ugt float* [[SCEVGEP]], [[PTR]] ; AVX512-NEXT: 
[[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; AVX512-NEXT: [[BOUND010:%.*]] = icmp ugt float* [[SCEVGEP8]], [[DEST]] ; AVX512-NEXT: [[BOUND111:%.*]] = icmp ult float* [[SCEVGEP6]], [[SCEVGEP]] ; AVX512-NEXT: [[FOUND_CONFLICT12:%.*]] = and i1 [[BOUND010]], [[BOUND111]] ; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT12]] ; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_PH:%.*]] ; AVX512: vector.ph: ; AVX512-NEXT: [[N_VEC:%.*]] = and i64 [[TMP3]], 9223372036854775792 ; AVX512-NEXT: [[IND_END:%.*]] = getelementptr float, float* [[PTR]], i64 [[N_VEC]] ; AVX512-NEXT: [[TMP12:%.*]] = shl i64 [[N_VEC]], 4 ; AVX512-NEXT: [[IND_END14:%.*]] = getelementptr float, float* [[DEST]], i64 [[TMP12]] ; AVX512-NEXT: [[TMP13:%.*]] = add nsw i64 [[N_VEC]], -16 ; AVX512-NEXT: [[TMP14:%.*]] = lshr exact i64 [[TMP13]], 4 ; AVX512-NEXT: [[TMP15:%.*]] = add nuw nsw i64 [[TMP14]], 1 ; AVX512-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP15]], 7 ; AVX512-NEXT: [[TMP16:%.*]] = icmp ult i64 [[TMP13]], 112 ; AVX512-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK_UNR_LCSSA:%.*]], label [[VECTOR_PH_NEW:%.*]] ; AVX512: vector.ph.new: ; AVX512-NEXT: [[UNROLL_ITER:%.*]] = and i64 [[TMP15]], 2305843009213693944 ; AVX512-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX512: vector.body: ; AVX512-NEXT: [[POINTER_PHI:%.*]] = phi float* [ [[DEST]], [[VECTOR_PH_NEW]] ], [ [[PTR_IND_7:%.*]], [[VECTOR_BODY]] ] ; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH_NEW]] ], [ [[INDEX_NEXT_7:%.*]], [[VECTOR_BODY]] ] ; AVX512-NEXT: [[NITER:%.*]] = phi i64 [ [[UNROLL_ITER]], [[VECTOR_PH_NEW]] ], [ [[NITER_NSUB_7:%.*]], [[VECTOR_BODY]] ] ; AVX512-NEXT: [[NEXT_GEP:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX]] ; AVX512-NEXT: [[TMP17:%.*]] = getelementptr float, float* [[POINTER_PHI]], <16 x i64> ; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP]], i64 [[IDXPROM]] ; AVX512-NEXT: [[TMP19:%.*]] = bitcast float* 
[[TMP18]] to <16 x float>* ; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x float>, <16 x float>* [[TMP19]], align 4, !alias.scope !2 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD]], <16 x float*> [[TMP17]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7 ; AVX512-NEXT: [[TMP20:%.*]] = bitcast float* [[NEXT_GEP]] to <16 x float>* ; AVX512-NEXT: [[WIDE_LOAD15:%.*]] = load <16 x float>, <16 x float>* [[TMP20]], align 4, !alias.scope !9 ; AVX512-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, <16 x float*> [[TMP17]], i64 1 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD15]], <16 x float*> [[TMP21]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7 ; AVX512-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 16 ; AVX512-NEXT: [[PTR_IND:%.*]] = getelementptr float, float* [[POINTER_PHI]], i64 256 ; AVX512-NEXT: [[NEXT_GEP_1:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX_NEXT]] ; AVX512-NEXT: [[TMP22:%.*]] = getelementptr float, float* [[PTR_IND]], <16 x i64> ; AVX512-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP_1]], i64 [[IDXPROM]] ; AVX512-NEXT: [[TMP24:%.*]] = bitcast float* [[TMP23]] to <16 x float>* ; AVX512-NEXT: [[WIDE_LOAD_1:%.*]] = load <16 x float>, <16 x float>* [[TMP24]], align 4, !alias.scope !2 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD_1]], <16 x float*> [[TMP22]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7 ; AVX512-NEXT: [[TMP25:%.*]] = bitcast float* [[NEXT_GEP_1]] to <16 x float>* ; AVX512-NEXT: [[WIDE_LOAD15_1:%.*]] = load <16 x float>, <16 x float>* [[TMP25]], align 4, !alias.scope !9 ; AVX512-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, <16 x float*> [[TMP22]], i64 1 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD15_1]], <16 x float*> [[TMP26]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7 ; AVX512-NEXT: [[INDEX_NEXT_1:%.*]] = or i64 [[INDEX]], 
32 ; AVX512-NEXT: [[PTR_IND_1:%.*]] = getelementptr float, float* [[POINTER_PHI]], i64 512 ; AVX512-NEXT: [[NEXT_GEP_2:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX_NEXT_1]] ; AVX512-NEXT: [[TMP27:%.*]] = getelementptr float, float* [[PTR_IND_1]], <16 x i64> ; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP_2]], i64 [[IDXPROM]] ; AVX512-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to <16 x float>* ; AVX512-NEXT: [[WIDE_LOAD_2:%.*]] = load <16 x float>, <16 x float>* [[TMP29]], align 4, !alias.scope !2 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD_2]], <16 x float*> [[TMP27]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7 ; AVX512-NEXT: [[TMP30:%.*]] = bitcast float* [[NEXT_GEP_2]] to <16 x float>* ; AVX512-NEXT: [[WIDE_LOAD15_2:%.*]] = load <16 x float>, <16 x float>* [[TMP30]], align 4, !alias.scope !9 ; AVX512-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, <16 x float*> [[TMP27]], i64 1 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD15_2]], <16 x float*> [[TMP31]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7 ; AVX512-NEXT: [[INDEX_NEXT_2:%.*]] = or i64 [[INDEX]], 48 ; AVX512-NEXT: [[PTR_IND_2:%.*]] = getelementptr float, float* [[POINTER_PHI]], i64 768 ; AVX512-NEXT: [[NEXT_GEP_3:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX_NEXT_2]] ; AVX512-NEXT: [[TMP32:%.*]] = getelementptr float, float* [[PTR_IND_2]], <16 x i64> ; AVX512-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP_3]], i64 [[IDXPROM]] ; AVX512-NEXT: [[TMP34:%.*]] = bitcast float* [[TMP33]] to <16 x float>* ; AVX512-NEXT: [[WIDE_LOAD_3:%.*]] = load <16 x float>, <16 x float>* [[TMP34]], align 4, !alias.scope !2 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD_3]], <16 x float*> [[TMP32]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7 ; AVX512-NEXT: [[TMP35:%.*]] = bitcast float* [[NEXT_GEP_3]] to <16 
x float>* ; AVX512-NEXT: [[WIDE_LOAD15_3:%.*]] = load <16 x float>, <16 x float>* [[TMP35]], align 4, !alias.scope !9 ; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, <16 x float*> [[TMP32]], i64 1 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD15_3]], <16 x float*> [[TMP36]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7 ; AVX512-NEXT: [[INDEX_NEXT_3:%.*]] = or i64 [[INDEX]], 64 ; AVX512-NEXT: [[PTR_IND_3:%.*]] = getelementptr float, float* [[POINTER_PHI]], i64 1024 ; AVX512-NEXT: [[NEXT_GEP_4:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX_NEXT_3]] ; AVX512-NEXT: [[TMP37:%.*]] = getelementptr float, float* [[PTR_IND_3]], <16 x i64> ; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP_4]], i64 [[IDXPROM]] ; AVX512-NEXT: [[TMP39:%.*]] = bitcast float* [[TMP38]] to <16 x float>* ; AVX512-NEXT: [[WIDE_LOAD_4:%.*]] = load <16 x float>, <16 x float>* [[TMP39]], align 4, !alias.scope !2 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD_4]], <16 x float*> [[TMP37]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7 ; AVX512-NEXT: [[TMP40:%.*]] = bitcast float* [[NEXT_GEP_4]] to <16 x float>* ; AVX512-NEXT: [[WIDE_LOAD15_4:%.*]] = load <16 x float>, <16 x float>* [[TMP40]], align 4, !alias.scope !9 ; AVX512-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, <16 x float*> [[TMP37]], i64 1 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD15_4]], <16 x float*> [[TMP41]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7 ; AVX512-NEXT: [[INDEX_NEXT_4:%.*]] = or i64 [[INDEX]], 80 ; AVX512-NEXT: [[PTR_IND_4:%.*]] = getelementptr float, float* [[POINTER_PHI]], i64 1280 ; AVX512-NEXT: [[NEXT_GEP_5:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX_NEXT_4]] ; AVX512-NEXT: [[TMP42:%.*]] = getelementptr float, float* [[PTR_IND_4]], <16 x i64> ; AVX512-NEXT: [[TMP43:%.*]] = getelementptr inbounds float, float* 
[[NEXT_GEP_5]], i64 [[IDXPROM]] ; AVX512-NEXT: [[TMP44:%.*]] = bitcast float* [[TMP43]] to <16 x float>* ; AVX512-NEXT: [[WIDE_LOAD_5:%.*]] = load <16 x float>, <16 x float>* [[TMP44]], align 4, !alias.scope !2 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD_5]], <16 x float*> [[TMP42]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7 ; AVX512-NEXT: [[TMP45:%.*]] = bitcast float* [[NEXT_GEP_5]] to <16 x float>* ; AVX512-NEXT: [[WIDE_LOAD15_5:%.*]] = load <16 x float>, <16 x float>* [[TMP45]], align 4, !alias.scope !9 ; AVX512-NEXT: [[TMP46:%.*]] = getelementptr inbounds float, <16 x float*> [[TMP42]], i64 1 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD15_5]], <16 x float*> [[TMP46]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7 ; AVX512-NEXT: [[INDEX_NEXT_5:%.*]] = or i64 [[INDEX]], 96 ; AVX512-NEXT: [[PTR_IND_5:%.*]] = getelementptr float, float* [[POINTER_PHI]], i64 1536 ; AVX512-NEXT: [[NEXT_GEP_6:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX_NEXT_5]] ; AVX512-NEXT: [[TMP47:%.*]] = getelementptr float, float* [[PTR_IND_5]], <16 x i64> ; AVX512-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP_6]], i64 [[IDXPROM]] ; AVX512-NEXT: [[TMP49:%.*]] = bitcast float* [[TMP48]] to <16 x float>* ; AVX512-NEXT: [[WIDE_LOAD_6:%.*]] = load <16 x float>, <16 x float>* [[TMP49]], align 4, !alias.scope !2 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD_6]], <16 x float*> [[TMP47]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7 ; AVX512-NEXT: [[TMP50:%.*]] = bitcast float* [[NEXT_GEP_6]] to <16 x float>* ; AVX512-NEXT: [[WIDE_LOAD15_6:%.*]] = load <16 x float>, <16 x float>* [[TMP50]], align 4, !alias.scope !9 ; AVX512-NEXT: [[TMP51:%.*]] = getelementptr inbounds float, <16 x float*> [[TMP47]], i64 1 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD15_6]], <16 x float*> [[TMP51]], i32 4, 
<16 x i1> ), !alias.scope !5, !noalias !7 ; AVX512-NEXT: [[INDEX_NEXT_6:%.*]] = or i64 [[INDEX]], 112 ; AVX512-NEXT: [[PTR_IND_6:%.*]] = getelementptr float, float* [[POINTER_PHI]], i64 1792 ; AVX512-NEXT: [[NEXT_GEP_7:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX_NEXT_6]] ; AVX512-NEXT: [[TMP52:%.*]] = getelementptr float, float* [[PTR_IND_6]], <16 x i64> ; AVX512-NEXT: [[TMP53:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP_7]], i64 [[IDXPROM]] ; AVX512-NEXT: [[TMP54:%.*]] = bitcast float* [[TMP53]] to <16 x float>* ; AVX512-NEXT: [[WIDE_LOAD_7:%.*]] = load <16 x float>, <16 x float>* [[TMP54]], align 4, !alias.scope !2 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD_7]], <16 x float*> [[TMP52]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7 ; AVX512-NEXT: [[TMP55:%.*]] = bitcast float* [[NEXT_GEP_7]] to <16 x float>* ; AVX512-NEXT: [[WIDE_LOAD15_7:%.*]] = load <16 x float>, <16 x float>* [[TMP55]], align 4, !alias.scope !9 ; AVX512-NEXT: [[TMP56:%.*]] = getelementptr inbounds float, <16 x float*> [[TMP52]], i64 1 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD15_7]], <16 x float*> [[TMP56]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7 ; AVX512-NEXT: [[INDEX_NEXT_7]] = add i64 [[INDEX]], 128 ; AVX512-NEXT: [[PTR_IND_7]] = getelementptr float, float* [[POINTER_PHI]], i64 2048 ; AVX512-NEXT: [[NITER_NSUB_7]] = add i64 [[NITER]], -8 ; AVX512-NEXT: [[NITER_NCMP_7:%.*]] = icmp eq i64 [[NITER_NSUB_7]], 0 ; AVX512-NEXT: br i1 [[NITER_NCMP_7]], label [[MIDDLE_BLOCK_UNR_LCSSA]], label [[VECTOR_BODY]], [[LOOP10:!llvm.loop !.*]] ; AVX512: middle.block.unr-lcssa: ; AVX512-NEXT: [[POINTER_PHI_UNR:%.*]] = phi float* [ [[DEST]], [[VECTOR_PH]] ], [ [[PTR_IND_7]], [[VECTOR_BODY]] ] ; AVX512-NEXT: [[INDEX_UNR:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT_7]], [[VECTOR_BODY]] ] ; AVX512-NEXT: [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 0 ; AVX512-NEXT: br i1 
[[LCMP_MOD_NOT]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY_EPIL:%.*]] ; AVX512: vector.body.epil: ; AVX512-NEXT: [[POINTER_PHI_EPIL:%.*]] = phi float* [ [[PTR_IND_EPIL:%.*]], [[VECTOR_BODY_EPIL]] ], [ [[POINTER_PHI_UNR]], [[MIDDLE_BLOCK_UNR_LCSSA]] ] ; AVX512-NEXT: [[INDEX_EPIL:%.*]] = phi i64 [ [[INDEX_NEXT_EPIL:%.*]], [[VECTOR_BODY_EPIL]] ], [ [[INDEX_UNR]], [[MIDDLE_BLOCK_UNR_LCSSA]] ] ; AVX512-NEXT: [[EPIL_ITER:%.*]] = phi i64 [ [[EPIL_ITER_SUB:%.*]], [[VECTOR_BODY_EPIL]] ], [ [[XTRAITER]], [[MIDDLE_BLOCK_UNR_LCSSA]] ] ; AVX512-NEXT: [[NEXT_GEP_EPIL:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX_EPIL]] ; AVX512-NEXT: [[TMP57:%.*]] = getelementptr float, float* [[POINTER_PHI_EPIL]], <16 x i64> ; AVX512-NEXT: [[TMP58:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP_EPIL]], i64 [[IDXPROM]] ; AVX512-NEXT: [[TMP59:%.*]] = bitcast float* [[TMP58]] to <16 x float>* ; AVX512-NEXT: [[WIDE_LOAD_EPIL:%.*]] = load <16 x float>, <16 x float>* [[TMP59]], align 4, !alias.scope !2 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD_EPIL]], <16 x float*> [[TMP57]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7 ; AVX512-NEXT: [[TMP60:%.*]] = bitcast float* [[NEXT_GEP_EPIL]] to <16 x float>* ; AVX512-NEXT: [[WIDE_LOAD15_EPIL:%.*]] = load <16 x float>, <16 x float>* [[TMP60]], align 4, !alias.scope !9 ; AVX512-NEXT: [[TMP61:%.*]] = getelementptr inbounds float, <16 x float*> [[TMP57]], i64 1 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD15_EPIL]], <16 x float*> [[TMP61]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7 ; AVX512-NEXT: [[INDEX_NEXT_EPIL]] = add i64 [[INDEX_EPIL]], 16 ; AVX512-NEXT: [[PTR_IND_EPIL]] = getelementptr float, float* [[POINTER_PHI_EPIL]], i64 256 ; AVX512-NEXT: [[EPIL_ITER_SUB]] = add i64 [[EPIL_ITER]], -1 ; AVX512-NEXT: [[EPIL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[EPIL_ITER_SUB]], 0 ; AVX512-NEXT: br i1 [[EPIL_ITER_CMP_NOT]], label [[MIDDLE_BLOCK]], label 
[[VECTOR_BODY_EPIL]], [[LOOP11:!llvm.loop !.*]] ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER]] ; AVX512: for.body.preheader: ; AVX512-NEXT: [[PTR_ADDR_012_PH:%.*]] = phi float* [ [[PTR]], [[VECTOR_MEMCHECK]] ], [ [[PTR]], [[FOR_BODY_LR_PH]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ] ; AVX512-NEXT: [[DEST_ADDR_011_PH:%.*]] = phi float* [ [[DEST]], [[VECTOR_MEMCHECK]] ], [ [[DEST]], [[FOR_BODY_LR_PH]] ], [ [[IND_END14]], [[MIDDLE_BLOCK]] ] ; AVX512-NEXT: br label [[FOR_BODY:%.*]] ; AVX512: for.body: ; AVX512-NEXT: [[PTR_ADDR_012:%.*]] = phi float* [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[PTR_ADDR_012_PH]], [[FOR_BODY_PREHEADER]] ] ; AVX512-NEXT: [[DEST_ADDR_011:%.*]] = phi float* [ [[ADD_PTR6:%.*]], [[FOR_BODY]] ], [ [[DEST_ADDR_011_PH]], [[FOR_BODY_PREHEADER]] ] ; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[PTR_ADDR_012]], i64 [[IDXPROM]] ; AVX512-NEXT: [[TMP62:%.*]] = load float, float* [[ARRAYIDX]], align 4 ; AVX512-NEXT: store float [[TMP62]], float* [[DEST_ADDR_011]], align 4 ; AVX512-NEXT: [[TMP63:%.*]] = load float, float* [[PTR_ADDR_012]], align 4 ; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[DEST_ADDR_011]], i64 1 ; AVX512-NEXT: store float [[TMP63]], float* [[ARRAYIDX5]], align 4 ; AVX512-NEXT: [[INCDEC_PTR]] = getelementptr inbounds float, float* [[PTR_ADDR_012]], i64 1 ; AVX512-NEXT: [[ADD_PTR6]] = getelementptr inbounds float, float* [[DEST_ADDR_011]], i64 16 ; AVX512-NEXT: [[CMP_NOT:%.*]] = icmp eq float* [[INCDEC_PTR]], [[ADD_PTR]] ; AVX512-NEXT: br i1 [[CMP_NOT]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP13:!llvm.loop !.*]] ; AVX512: for.end: ; AVX512-NEXT: ret void ; ; FVW2-LABEL: @test_gather_not_profitable_pr48429( ; FVW2-NEXT: entry: ; FVW2-NEXT: [[IDX_EXT:%.*]] = sext i32 [[D:%.*]] to i64 ; FVW2-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds float, float* [[PTR:%.*]], i64 
[[IDX_EXT]] ; FVW2-NEXT: [[CMP_NOT10:%.*]] = icmp eq i32 [[D]], 0 ; FVW2-NEXT: br i1 [[CMP_NOT10]], label [[FOR_END:%.*]], label [[FOR_BODY_LR_PH:%.*]] ; FVW2: for.body.lr.ph: ; FVW2-NEXT: [[MUL:%.*]] = sub nsw i32 0, [[D]] ; FVW2-NEXT: [[IDXPROM:%.*]] = sext i32 [[MUL]] to i64 ; FVW2-NEXT: [[TMP0:%.*]] = shl nsw i64 [[IDX_EXT]], 2 ; FVW2-NEXT: [[TMP1:%.*]] = add nsw i64 [[TMP0]], -4 ; FVW2-NEXT: [[TMP2:%.*]] = lshr exact i64 [[TMP1]], 2 ; FVW2-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 ; FVW2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 12 ; FVW2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; FVW2: vector.memcheck: ; FVW2-NEXT: [[TMP4:%.*]] = shl nsw i64 [[IDX_EXT]], 2 ; FVW2-NEXT: [[TMP5:%.*]] = add nsw i64 [[TMP4]], -4 ; FVW2-NEXT: [[TMP6:%.*]] = lshr exact i64 [[TMP5]], 2 ; FVW2-NEXT: [[TMP7:%.*]] = shl nsw i64 [[TMP5]], 2 ; FVW2-NEXT: [[TMP8:%.*]] = or i64 [[TMP7]], 2 ; FVW2-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[DEST:%.*]], i64 [[TMP8]] ; FVW2-NEXT: [[TMP9:%.*]] = add nuw nsw i64 [[TMP6]], 1 ; FVW2-NEXT: [[SCEVGEP4:%.*]] = getelementptr float, float* [[PTR]], i64 [[TMP9]] ; FVW2-NEXT: [[SCEVGEP6:%.*]] = getelementptr float, float* [[PTR]], i64 [[IDXPROM]] ; FVW2-NEXT: [[TMP10:%.*]] = add nuw nsw i64 [[TMP6]], 1 ; FVW2-NEXT: [[TMP11:%.*]] = sub i64 [[TMP10]], [[IDX_EXT]] ; FVW2-NEXT: [[SCEVGEP8:%.*]] = getelementptr float, float* [[PTR]], i64 [[TMP11]] ; FVW2-NEXT: [[BOUND0:%.*]] = icmp ugt float* [[SCEVGEP4]], [[DEST]] ; FVW2-NEXT: [[BOUND1:%.*]] = icmp ugt float* [[SCEVGEP]], [[PTR]] ; FVW2-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; FVW2-NEXT: [[BOUND010:%.*]] = icmp ugt float* [[SCEVGEP8]], [[DEST]] ; FVW2-NEXT: [[BOUND111:%.*]] = icmp ult float* [[SCEVGEP6]], [[SCEVGEP]] ; FVW2-NEXT: [[FOUND_CONFLICT12:%.*]] = and i1 [[BOUND010]], [[BOUND111]] ; FVW2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT12]] ; FVW2-NEXT: br i1 
[[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_PH:%.*]] ; FVW2: vector.ph: ; FVW2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP3]], 9223372036854775804 ; FVW2-NEXT: [[IND_END:%.*]] = getelementptr float, float* [[PTR]], i64 [[N_VEC]] ; FVW2-NEXT: [[TMP12:%.*]] = shl i64 [[N_VEC]], 4 ; FVW2-NEXT: [[IND_END14:%.*]] = getelementptr float, float* [[DEST]], i64 [[TMP12]] ; FVW2-NEXT: [[TMP13:%.*]] = add nsw i64 [[N_VEC]], -4 ; FVW2-NEXT: [[TMP14:%.*]] = lshr exact i64 [[TMP13]], 2 ; FVW2-NEXT: [[TMP15:%.*]] = add nuw nsw i64 [[TMP14]], 1 ; FVW2-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP15]], 3 ; FVW2-NEXT: [[TMP16:%.*]] = icmp ult i64 [[TMP13]], 12 ; FVW2-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK_UNR_LCSSA:%.*]], label [[VECTOR_PH_NEW:%.*]] ; FVW2: vector.ph.new: ; FVW2-NEXT: [[UNROLL_ITER:%.*]] = and i64 [[TMP15]], 9223372036854775804 ; FVW2-NEXT: br label [[VECTOR_BODY:%.*]] ; FVW2: vector.body: ; FVW2-NEXT: [[POINTER_PHI:%.*]] = phi float* [ [[DEST]], [[VECTOR_PH_NEW]] ], [ [[PTR_IND_3:%.*]], [[VECTOR_BODY]] ] ; FVW2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH_NEW]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY]] ] ; FVW2-NEXT: [[NITER:%.*]] = phi i64 [ [[UNROLL_ITER]], [[VECTOR_PH_NEW]] ], [ [[NITER_NSUB_3:%.*]], [[VECTOR_BODY]] ] ; FVW2-NEXT: [[NEXT_GEP:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX]] ; FVW2-NEXT: [[TMP17:%.*]] = getelementptr float, float* [[POINTER_PHI]], <2 x i64> ; FVW2-NEXT: [[TMP18:%.*]] = getelementptr float, float* [[POINTER_PHI]], <2 x i64> ; FVW2-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP]], i64 [[IDXPROM]] ; FVW2-NEXT: [[TMP20:%.*]] = bitcast float* [[TMP19]] to <2 x float>* ; FVW2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP20]], align 4, !alias.scope !7 ; FVW2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[TMP19]], i64 2 ; FVW2-NEXT: [[TMP22:%.*]] = bitcast float* [[TMP21]] to <2 x float>* ; FVW2-NEXT: [[WIDE_LOAD16:%.*]] = load <2 x float>, <2 x float>* [[TMP22]], 
align 4, !alias.scope !7 ; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD]], <2 x float*> [[TMP17]], i32 4, <2 x i1> ), !alias.scope !10, !noalias !12 ; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD16]], <2 x float*> [[TMP18]], i32 4, <2 x i1> ), !alias.scope !10, !noalias !12 ; FVW2-NEXT: [[TMP23:%.*]] = bitcast float* [[NEXT_GEP]] to <2 x float>* ; FVW2-NEXT: [[WIDE_LOAD17:%.*]] = load <2 x float>, <2 x float>* [[TMP23]], align 4, !alias.scope !14 ; FVW2-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[NEXT_GEP]], i64 2 ; FVW2-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to <2 x float>* ; FVW2-NEXT: [[WIDE_LOAD18:%.*]] = load <2 x float>, <2 x float>* [[TMP25]], align 4, !alias.scope !14 ; FVW2-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, <2 x float*> [[TMP17]], i64 1 ; FVW2-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, <2 x float*> [[TMP18]], i64 1 ; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD17]], <2 x float*> [[TMP26]], i32 4, <2 x i1> ), !alias.scope !10, !noalias !12 ; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD18]], <2 x float*> [[TMP27]], i32 4, <2 x i1> ), !alias.scope !10, !noalias !12 ; FVW2-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 4 ; FVW2-NEXT: [[PTR_IND:%.*]] = getelementptr float, float* [[POINTER_PHI]], i64 64 ; FVW2-NEXT: [[NEXT_GEP_1:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX_NEXT]] ; FVW2-NEXT: [[TMP28:%.*]] = getelementptr float, float* [[PTR_IND]], <2 x i64> ; FVW2-NEXT: [[TMP29:%.*]] = getelementptr float, float* [[PTR_IND]], <2 x i64> ; FVW2-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP_1]], i64 [[IDXPROM]] ; FVW2-NEXT: [[TMP31:%.*]] = bitcast float* [[TMP30]] to <2 x float>* ; FVW2-NEXT: [[WIDE_LOAD_1:%.*]] = load <2 x float>, <2 x float>* [[TMP31]], align 4, !alias.scope !7 ; FVW2-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* 
[[TMP30]], i64 2 ; FVW2-NEXT: [[TMP33:%.*]] = bitcast float* [[TMP32]] to <2 x float>* ; FVW2-NEXT: [[WIDE_LOAD16_1:%.*]] = load <2 x float>, <2 x float>* [[TMP33]], align 4, !alias.scope !7 ; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD_1]], <2 x float*> [[TMP28]], i32 4, <2 x i1> ), !alias.scope !10, !noalias !12 ; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD16_1]], <2 x float*> [[TMP29]], i32 4, <2 x i1> ), !alias.scope !10, !noalias !12 ; FVW2-NEXT: [[TMP34:%.*]] = bitcast float* [[NEXT_GEP_1]] to <2 x float>* ; FVW2-NEXT: [[WIDE_LOAD17_1:%.*]] = load <2 x float>, <2 x float>* [[TMP34]], align 4, !alias.scope !14 ; FVW2-NEXT: [[TMP35:%.*]] = getelementptr float, float* [[NEXT_GEP_1]], i64 2 ; FVW2-NEXT: [[TMP36:%.*]] = bitcast float* [[TMP35]] to <2 x float>* ; FVW2-NEXT: [[WIDE_LOAD18_1:%.*]] = load <2 x float>, <2 x float>* [[TMP36]], align 4, !alias.scope !14 ; FVW2-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, <2 x float*> [[TMP28]], i64 1 ; FVW2-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, <2 x float*> [[TMP29]], i64 1 ; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD17_1]], <2 x float*> [[TMP37]], i32 4, <2 x i1> ), !alias.scope !10, !noalias !12 ; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD18_1]], <2 x float*> [[TMP38]], i32 4, <2 x i1> ), !alias.scope !10, !noalias !12 ; FVW2-NEXT: [[INDEX_NEXT_1:%.*]] = or i64 [[INDEX]], 8 ; FVW2-NEXT: [[PTR_IND_1:%.*]] = getelementptr float, float* [[POINTER_PHI]], i64 128 ; FVW2-NEXT: [[NEXT_GEP_2:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX_NEXT_1]] ; FVW2-NEXT: [[TMP39:%.*]] = getelementptr float, float* [[PTR_IND_1]], <2 x i64> ; FVW2-NEXT: [[TMP40:%.*]] = getelementptr float, float* [[PTR_IND_1]], <2 x i64> ; FVW2-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP_2]], i64 [[IDXPROM]] ; FVW2-NEXT: [[TMP42:%.*]] = bitcast float* 
[[TMP41]] to <2 x float>* ; FVW2-NEXT: [[WIDE_LOAD_2:%.*]] = load <2 x float>, <2 x float>* [[TMP42]], align 4, !alias.scope !7 ; FVW2-NEXT: [[TMP43:%.*]] = getelementptr inbounds float, float* [[TMP41]], i64 2 ; FVW2-NEXT: [[TMP44:%.*]] = bitcast float* [[TMP43]] to <2 x float>* ; FVW2-NEXT: [[WIDE_LOAD16_2:%.*]] = load <2 x float>, <2 x float>* [[TMP44]], align 4, !alias.scope !7 ; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD_2]], <2 x float*> [[TMP39]], i32 4, <2 x i1> ), !alias.scope !10, !noalias !12 ; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD16_2]], <2 x float*> [[TMP40]], i32 4, <2 x i1> ), !alias.scope !10, !noalias !12 ; FVW2-NEXT: [[TMP45:%.*]] = bitcast float* [[NEXT_GEP_2]] to <2 x float>* ; FVW2-NEXT: [[WIDE_LOAD17_2:%.*]] = load <2 x float>, <2 x float>* [[TMP45]], align 4, !alias.scope !14 ; FVW2-NEXT: [[TMP46:%.*]] = getelementptr float, float* [[NEXT_GEP_2]], i64 2 ; FVW2-NEXT: [[TMP47:%.*]] = bitcast float* [[TMP46]] to <2 x float>* ; FVW2-NEXT: [[WIDE_LOAD18_2:%.*]] = load <2 x float>, <2 x float>* [[TMP47]], align 4, !alias.scope !14 ; FVW2-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, <2 x float*> [[TMP39]], i64 1 ; FVW2-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, <2 x float*> [[TMP40]], i64 1 ; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD17_2]], <2 x float*> [[TMP48]], i32 4, <2 x i1> ), !alias.scope !10, !noalias !12 ; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD18_2]], <2 x float*> [[TMP49]], i32 4, <2 x i1> ), !alias.scope !10, !noalias !12 ; FVW2-NEXT: [[INDEX_NEXT_2:%.*]] = or i64 [[INDEX]], 12 ; FVW2-NEXT: [[PTR_IND_2:%.*]] = getelementptr float, float* [[POINTER_PHI]], i64 192 ; FVW2-NEXT: [[NEXT_GEP_3:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX_NEXT_2]] ; FVW2-NEXT: [[TMP50:%.*]] = getelementptr float, float* [[PTR_IND_2]], <2 x i64> ; FVW2-NEXT: [[TMP51:%.*]] = 
getelementptr float, float* [[PTR_IND_2]], <2 x i64> ; FVW2-NEXT: [[TMP52:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP_3]], i64 [[IDXPROM]] ; FVW2-NEXT: [[TMP53:%.*]] = bitcast float* [[TMP52]] to <2 x float>* ; FVW2-NEXT: [[WIDE_LOAD_3:%.*]] = load <2 x float>, <2 x float>* [[TMP53]], align 4, !alias.scope !7 ; FVW2-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float* [[TMP52]], i64 2 ; FVW2-NEXT: [[TMP55:%.*]] = bitcast float* [[TMP54]] to <2 x float>* ; FVW2-NEXT: [[WIDE_LOAD16_3:%.*]] = load <2 x float>, <2 x float>* [[TMP55]], align 4, !alias.scope !7 ; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD_3]], <2 x float*> [[TMP50]], i32 4, <2 x i1> ), !alias.scope !10, !noalias !12 ; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD16_3]], <2 x float*> [[TMP51]], i32 4, <2 x i1> ), !alias.scope !10, !noalias !12 ; FVW2-NEXT: [[TMP56:%.*]] = bitcast float* [[NEXT_GEP_3]] to <2 x float>* ; FVW2-NEXT: [[WIDE_LOAD17_3:%.*]] = load <2 x float>, <2 x float>* [[TMP56]], align 4, !alias.scope !14 ; FVW2-NEXT: [[TMP57:%.*]] = getelementptr float, float* [[NEXT_GEP_3]], i64 2 ; FVW2-NEXT: [[TMP58:%.*]] = bitcast float* [[TMP57]] to <2 x float>* ; FVW2-NEXT: [[WIDE_LOAD18_3:%.*]] = load <2 x float>, <2 x float>* [[TMP58]], align 4, !alias.scope !14 ; FVW2-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, <2 x float*> [[TMP50]], i64 1 ; FVW2-NEXT: [[TMP60:%.*]] = getelementptr inbounds float, <2 x float*> [[TMP51]], i64 1 ; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD17_3]], <2 x float*> [[TMP59]], i32 4, <2 x i1> ), !alias.scope !10, !noalias !12 ; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD18_3]], <2 x float*> [[TMP60]], i32 4, <2 x i1> ), !alias.scope !10, !noalias !12 ; FVW2-NEXT: [[INDEX_NEXT_3]] = add i64 [[INDEX]], 16 ; FVW2-NEXT: [[PTR_IND_3]] = getelementptr float, float* [[POINTER_PHI]], i64 256 ; FVW2-NEXT: 
[[NITER_NSUB_3]] = add i64 [[NITER]], -4 ; FVW2-NEXT: [[NITER_NCMP_3:%.*]] = icmp eq i64 [[NITER_NSUB_3]], 0 ; FVW2-NEXT: br i1 [[NITER_NCMP_3]], label [[MIDDLE_BLOCK_UNR_LCSSA]], label [[VECTOR_BODY]], [[LOOP15:!llvm.loop !.*]] ; FVW2: middle.block.unr-lcssa: ; FVW2-NEXT: [[POINTER_PHI_UNR:%.*]] = phi float* [ [[DEST]], [[VECTOR_PH]] ], [ [[PTR_IND_3]], [[VECTOR_BODY]] ] ; FVW2-NEXT: [[INDEX_UNR:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT_3]], [[VECTOR_BODY]] ] ; FVW2-NEXT: [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 0 ; FVW2-NEXT: br i1 [[LCMP_MOD_NOT]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY_EPIL:%.*]] ; FVW2: vector.body.epil: ; FVW2-NEXT: [[POINTER_PHI_EPIL:%.*]] = phi float* [ [[PTR_IND_EPIL:%.*]], [[VECTOR_BODY_EPIL]] ], [ [[POINTER_PHI_UNR]], [[MIDDLE_BLOCK_UNR_LCSSA]] ] ; FVW2-NEXT: [[INDEX_EPIL:%.*]] = phi i64 [ [[INDEX_NEXT_EPIL:%.*]], [[VECTOR_BODY_EPIL]] ], [ [[INDEX_UNR]], [[MIDDLE_BLOCK_UNR_LCSSA]] ] ; FVW2-NEXT: [[EPIL_ITER:%.*]] = phi i64 [ [[EPIL_ITER_SUB:%.*]], [[VECTOR_BODY_EPIL]] ], [ [[XTRAITER]], [[MIDDLE_BLOCK_UNR_LCSSA]] ] ; FVW2-NEXT: [[NEXT_GEP_EPIL:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX_EPIL]] ; FVW2-NEXT: [[TMP61:%.*]] = getelementptr float, float* [[POINTER_PHI_EPIL]], <2 x i64> ; FVW2-NEXT: [[TMP62:%.*]] = getelementptr float, float* [[POINTER_PHI_EPIL]], <2 x i64> ; FVW2-NEXT: [[TMP63:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP_EPIL]], i64 [[IDXPROM]] ; FVW2-NEXT: [[TMP64:%.*]] = bitcast float* [[TMP63]] to <2 x float>* ; FVW2-NEXT: [[WIDE_LOAD_EPIL:%.*]] = load <2 x float>, <2 x float>* [[TMP64]], align 4, !alias.scope !7 ; FVW2-NEXT: [[TMP65:%.*]] = getelementptr inbounds float, float* [[TMP63]], i64 2 ; FVW2-NEXT: [[TMP66:%.*]] = bitcast float* [[TMP65]] to <2 x float>* ; FVW2-NEXT: [[WIDE_LOAD16_EPIL:%.*]] = load <2 x float>, <2 x float>* [[TMP66]], align 4, !alias.scope !7 ; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD_EPIL]], <2 x float*> 
[[TMP61]], i32 4, <2 x i1> ), !alias.scope !10, !noalias !12 ; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD16_EPIL]], <2 x float*> [[TMP62]], i32 4, <2 x i1> ), !alias.scope !10, !noalias !12 ; FVW2-NEXT: [[TMP67:%.*]] = bitcast float* [[NEXT_GEP_EPIL]] to <2 x float>* ; FVW2-NEXT: [[WIDE_LOAD17_EPIL:%.*]] = load <2 x float>, <2 x float>* [[TMP67]], align 4, !alias.scope !14 ; FVW2-NEXT: [[TMP68:%.*]] = getelementptr float, float* [[NEXT_GEP_EPIL]], i64 2 ; FVW2-NEXT: [[TMP69:%.*]] = bitcast float* [[TMP68]] to <2 x float>* ; FVW2-NEXT: [[WIDE_LOAD18_EPIL:%.*]] = load <2 x float>, <2 x float>* [[TMP69]], align 4, !alias.scope !14 ; FVW2-NEXT: [[TMP70:%.*]] = getelementptr inbounds float, <2 x float*> [[TMP61]], i64 1 ; FVW2-NEXT: [[TMP71:%.*]] = getelementptr inbounds float, <2 x float*> [[TMP62]], i64 1 ; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD17_EPIL]], <2 x float*> [[TMP70]], i32 4, <2 x i1> ), !alias.scope !10, !noalias !12 ; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD18_EPIL]], <2 x float*> [[TMP71]], i32 4, <2 x i1> ), !alias.scope !10, !noalias !12 ; FVW2-NEXT: [[INDEX_NEXT_EPIL]] = add i64 [[INDEX_EPIL]], 4 ; FVW2-NEXT: [[PTR_IND_EPIL]] = getelementptr float, float* [[POINTER_PHI_EPIL]], i64 64 ; FVW2-NEXT: [[EPIL_ITER_SUB]] = add i64 [[EPIL_ITER]], -1 ; FVW2-NEXT: [[EPIL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[EPIL_ITER_SUB]], 0 ; FVW2-NEXT: br i1 [[EPIL_ITER_CMP_NOT]], label [[MIDDLE_BLOCK]], label [[VECTOR_BODY_EPIL]], [[LOOP16:!llvm.loop !.*]] ; FVW2: middle.block: ; FVW2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] ; FVW2-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER]] ; FVW2: for.body.preheader: ; FVW2-NEXT: [[PTR_ADDR_012_PH:%.*]] = phi float* [ [[PTR]], [[VECTOR_MEMCHECK]] ], [ [[PTR]], [[FOR_BODY_LR_PH]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ] ; FVW2-NEXT: [[DEST_ADDR_011_PH:%.*]] = phi float* [ [[DEST]], 
[[VECTOR_MEMCHECK]] ], [ [[DEST]], [[FOR_BODY_LR_PH]] ], [ [[IND_END14]], [[MIDDLE_BLOCK]] ] ; FVW2-NEXT: br label [[FOR_BODY:%.*]] ; FVW2: for.body: ; FVW2-NEXT: [[PTR_ADDR_012:%.*]] = phi float* [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[PTR_ADDR_012_PH]], [[FOR_BODY_PREHEADER]] ] ; FVW2-NEXT: [[DEST_ADDR_011:%.*]] = phi float* [ [[ADD_PTR6:%.*]], [[FOR_BODY]] ], [ [[DEST_ADDR_011_PH]], [[FOR_BODY_PREHEADER]] ] ; FVW2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[PTR_ADDR_012]], i64 [[IDXPROM]] ; FVW2-NEXT: [[TMP72:%.*]] = load float, float* [[ARRAYIDX]], align 4 ; FVW2-NEXT: store float [[TMP72]], float* [[DEST_ADDR_011]], align 4 ; FVW2-NEXT: [[TMP73:%.*]] = load float, float* [[PTR_ADDR_012]], align 4 ; FVW2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[DEST_ADDR_011]], i64 1 ; FVW2-NEXT: store float [[TMP73]], float* [[ARRAYIDX5]], align 4 ; FVW2-NEXT: [[INCDEC_PTR]] = getelementptr inbounds float, float* [[PTR_ADDR_012]], i64 1 ; FVW2-NEXT: [[ADD_PTR6]] = getelementptr inbounds float, float* [[DEST_ADDR_011]], i64 16 ; FVW2-NEXT: [[CMP_NOT:%.*]] = icmp eq float* [[INCDEC_PTR]], [[ADD_PTR]] ; FVW2-NEXT: br i1 [[CMP_NOT]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP18:!llvm.loop !.*]] ; FVW2: for.end: ; FVW2-NEXT: ret void ; entry: %idx.ext = sext i32 %d to i64 %add.ptr = getelementptr inbounds float, float* %ptr, i64 %idx.ext %cmp.not10 = icmp eq i32 %d, 0 br i1 %cmp.not10, label %for.end, label %for.body.lr.ph for.body.lr.ph: ; preds = %entry %mul = sub nsw i32 0, %d %idxprom = sext i32 %mul to i64 br label %for.body for.body: ; preds = %for.body.lr.ph, %for.body %ptr.addr.012 = phi float* [ %ptr, %for.body.lr.ph ], [ %incdec.ptr, %for.body ] %dest.addr.011 = phi float* [ %dest, %for.body.lr.ph ], [ %add.ptr6, %for.body ] %arrayidx = getelementptr inbounds float, float* %ptr.addr.012, i64 %idxprom %0 = load float, float* %arrayidx, align 4 store float %0, float* %dest.addr.011, align 4 %1 = load float, float* 
%ptr.addr.012, align 4 %arrayidx5 = getelementptr inbounds float, float* %dest.addr.011, i64 1 store float %1, float* %arrayidx5, align 4 %incdec.ptr = getelementptr inbounds float, float* %ptr.addr.012, i64 1 %add.ptr6 = getelementptr inbounds float, float* %dest.addr.011, i64 16 %cmp.not = icmp eq float* %incdec.ptr, %add.ptr br i1 %cmp.not, label %for.end.loopexit, label %for.body for.end.loopexit: ; preds = %for.body br label %for.end for.end: ; preds = %for.end.loopexit, %entry ret void }