; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=SSE2
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=SLM
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+avx512bw -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX

;
; vXi8
;

; Loads 2 adjacent i8 values starting at %p0, zero-extends each to i64 and
; assembles them into a <2 x i64> with insertelement. The SSE2 and AVX check
; blocks expect one <2 x i8> load followed by a vector zext; the SLM check
; block expects the scalar loads and zexts to be left as written.
define <2 x i64> @loadext_2i8_to_2i64(i8* %p0) {
; SSE2-LABEL: @loadext_2i8_to_2i64(
; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
; SSE2-NEXT:    [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
; SSE2-NEXT:    [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64>
; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
; SSE2-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
; SSE2-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
; SSE2-NEXT:    ret <2 x i64> [[V1]]
;
; SLM-LABEL: @loadext_2i8_to_2i64(
; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
; SLM-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
; SLM-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
; SLM-NEXT:    [[X0:%.*]] = zext i8 [[I0]] to i64
; SLM-NEXT:    [[X1:%.*]] = zext i8 [[I1]] to i64
; SLM-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
; SLM-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
; SLM-NEXT:    ret <2 x i64> [[V1]]
;
; AVX-LABEL: @loadext_2i8_to_2i64(
; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
; AVX-NEXT:    [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
; AVX-NEXT:    [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64>
; AVX-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
; AVX-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
; AVX-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
; AVX-NEXT:    ret <2 x i64> [[V1]]
;
  %p1 = getelementptr inbounds i8, i8* %p0, i64 1
  %i0 = load i8, i8* %p0, align 1
  %i1 = load i8, i8* %p1, align 1
  %x0 = zext i8 %i0 to i64
  %x1 = zext i8 %i1 to i64
  %v0 = insertelement <2 x i64> undef, i64 %x0, i32 0
  %v1 = insertelement <2 x i64> %v0, i64 %x1, i32 1
  ret <2 x i64> %v1
}

; Loads 4 adjacent i8 values starting at %p0, zero-extends each to i32 and
; assembles them into a <4 x i32>. The SSE2 and AVX check blocks expect one
; <4 x i8> load followed by a vector zext; the SLM check block expects the
; scalar sequence to be left as written.
define <4 x i32> @loadext_4i8_to_4i32(i8* %p0) {
; SSE2-LABEL: @loadext_4i8_to_4i32(
; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
; SSE2-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32>
; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
; SSE2-NEXT:    ret <4 x i32> [[V3]]
;
; SLM-LABEL: @loadext_4i8_to_4i32(
; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
; SLM-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
; SLM-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
; SLM-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
; SLM-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
; SLM-NEXT:    [[X0:%.*]] = zext i8 [[I0]] to i32
; SLM-NEXT:    [[X1:%.*]] = zext i8 [[I1]] to i32
; SLM-NEXT:    [[X2:%.*]] = zext i8 [[I2]] to i32
; SLM-NEXT:    [[X3:%.*]] = zext i8 [[I3]] to i32
; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[X0]], i32 0
; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[X1]], i32 1
; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[X2]], i32 2
; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[X3]], i32 3
; SLM-NEXT:    ret <4 x i32> [[V3]]
;
; AVX-LABEL: @loadext_4i8_to_4i32(
; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
; AVX-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
; AVX-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32>
; AVX-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
; AVX-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
; AVX-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
; AVX-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
; AVX-NEXT:    ret <4 x i32> [[V3]]
;
  %p1 = getelementptr inbounds i8, i8* %p0, i64 1
  %p2 = getelementptr inbounds i8, i8* %p0, i64 2
  %p3 = getelementptr inbounds i8, i8* %p0, i64 3
  %i0 = load i8, i8* %p0, align 1
  %i1 = load i8, i8* %p1, align 1
  %i2 = load i8, i8* %p2, align 1
  %i3 = load i8, i8* %p3, align 1
  %x0 = zext i8 %i0 to i32
  %x1 = zext i8 %i1 to i32
  %x2 = zext i8 %i2 to i32
  %x3 = zext i8 %i3 to i32
  %v0 = insertelement <4 x i32> undef, i32 %x0, i32 0
  %v1 = insertelement <4 x i32> %v0, i32 %x1, i32 1
  %v2 = insertelement <4 x i32> %v1, i32 %x2, i32 2
  %v3 = insertelement <4 x i32> %v2, i32 %x3, i32 3
  ret <4 x i32> %v3
}

; Loads 4 adjacent i8 values starting at %p0, zero-extends each to i64 and
; assembles them into a <4 x i64>. The SSE2 and AVX check blocks expect one
; <4 x i8> load followed by a vector zext; the SLM check block expects the
; scalar sequence to be left as written.
define <4 x i64> @loadext_4i8_to_4i64(i8* %p0) {
; SSE2-LABEL: @loadext_4i8_to_4i64(
; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
; SSE2-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
; SSE2-NEXT:    ret <4 x i64> [[V3]]
;
; SLM-LABEL: @loadext_4i8_to_4i64(
; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
; SLM-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
; SLM-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
; SLM-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
; SLM-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
; SLM-NEXT:    [[X0:%.*]] = zext i8 [[I0]] to i64
; SLM-NEXT:    [[X1:%.*]] = zext i8 [[I1]] to i64
; SLM-NEXT:    [[X2:%.*]] = zext i8 [[I2]] to i64
; SLM-NEXT:    [[X3:%.*]] = zext i8 [[I3]] to i64
; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
; SLM-NEXT:    ret <4 x i64> [[V3]]
;
; AVX-LABEL: @loadext_4i8_to_4i64(
; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
; AVX-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
; AVX-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
; AVX-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
; AVX-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
; AVX-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
; AVX-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
; AVX-NEXT:    ret <4 x i64> [[V3]]
;
  %p1 = getelementptr inbounds i8, i8* %p0, i64 1
  %p2 = getelementptr inbounds i8, i8* %p0, i64 2
  %p3 = getelementptr inbounds i8, i8* %p0, i64 3
  %i0 = load i8, i8* %p0, align 1
  %i1 = load i8, i8* %p1, align 1
  %i2 = load i8, i8* %p2, align 1
  %i3 = load i8, i8* %p3, align 1
  %x0 = zext i8 %i0 to i64
  %x1 = zext i8 %i1 to i64
  %x2 = zext i8 %i2 to i64
  %x3 = zext i8 %i3 to i64
  %v0 = insertelement <4 x i64> undef, i64 %x0, i32 0
  %v1 = insertelement <4 x i64> %v0, i64 %x1, i32 1
  %v2 = insertelement <4 x i64> %v1, i64 %x2, i32 2
  %v3 = insertelement <4 x i64> %v2, i64 %x3, i32 3
  ret <4 x i64> %v3
}

; Loads 8 adjacent i8 values starting at %p0, zero-extends each to i16 and
; assembles them into an <8 x i16>. The SSE2 and AVX check blocks expect one
; <8 x i8> load followed by a vector zext; the SLM check block expects the
; scalar sequence to be left as written.
define <8 x i16> @loadext_8i8_to_8i16(i8* %p0) {
; SSE2-LABEL: @loadext_8i8_to_8i16(
; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
; SSE2-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
; SSE2-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
; SSE2-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
; SSE2-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
; SSE2-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
; SSE2-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i16>
; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
; SSE2-NEXT:    [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP4]], i32 0
; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1
; SSE2-NEXT:    [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[TMP5]], i32 1
; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2
; SSE2-NEXT:    [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[TMP6]], i32 2
; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
; SSE2-NEXT:    [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[TMP7]], i32 3
; SSE2-NEXT:    [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4
; SSE2-NEXT:    [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[TMP8]], i32 4
; SSE2-NEXT:    [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5
; SSE2-NEXT:    [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[TMP9]], i32 5
; SSE2-NEXT:    [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6
; SSE2-NEXT:    [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[TMP10]], i32 6
; SSE2-NEXT:    [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7
; SSE2-NEXT:    [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[TMP11]], i32 7
; SSE2-NEXT:    ret <8 x i16> [[V7]]
;
; SLM-LABEL: @loadext_8i8_to_8i16(
; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
; SLM-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
; SLM-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
; SLM-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
; SLM-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
; SLM-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
; SLM-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
; SLM-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
; SLM-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
; SLM-NEXT:    [[I4:%.*]] = load i8, i8* [[P4]], align 1
; SLM-NEXT:    [[I5:%.*]] = load i8, i8* [[P5]], align 1
; SLM-NEXT:    [[I6:%.*]] = load i8, i8* [[P6]], align 1
; SLM-NEXT:    [[I7:%.*]] = load i8, i8* [[P7]], align 1
; SLM-NEXT:    [[X0:%.*]] = zext i8 [[I0]] to i16
; SLM-NEXT:    [[X1:%.*]] = zext i8 [[I1]] to i16
; SLM-NEXT:    [[X2:%.*]] = zext i8 [[I2]] to i16
; SLM-NEXT:    [[X3:%.*]] = zext i8 [[I3]] to i16
; SLM-NEXT:    [[X4:%.*]] = zext i8 [[I4]] to i16
; SLM-NEXT:    [[X5:%.*]] = zext i8 [[I5]] to i16
; SLM-NEXT:    [[X6:%.*]] = zext i8 [[I6]] to i16
; SLM-NEXT:    [[X7:%.*]] = zext i8 [[I7]] to i16
; SLM-NEXT:    [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[X0]], i32 0
; SLM-NEXT:    [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[X1]], i32 1
; SLM-NEXT:    [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[X2]], i32 2
; SLM-NEXT:    [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[X3]], i32 3
; SLM-NEXT:    [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[X4]], i32 4
; SLM-NEXT:    [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[X5]], i32 5
; SLM-NEXT:    [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[X6]], i32 6
; SLM-NEXT:    [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[X7]], i32 7
; SLM-NEXT:    ret <8 x i16> [[V7]]
;
; AVX-LABEL: @loadext_8i8_to_8i16(
; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
; AVX-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
; AVX-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
; AVX-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
; AVX-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
; AVX-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
; AVX-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i16>
; AVX-NEXT:    [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
; AVX-NEXT:    [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP4]], i32 0
; AVX-NEXT:    [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1
; AVX-NEXT:    [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[TMP5]], i32 1
; AVX-NEXT:    [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2
; AVX-NEXT:    [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[TMP6]], i32 2
; AVX-NEXT:    [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
; AVX-NEXT:    [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[TMP7]], i32 3
; AVX-NEXT:    [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4
; AVX-NEXT:    [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[TMP8]], i32 4
; AVX-NEXT:    [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5
; AVX-NEXT:    [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[TMP9]], i32 5
; AVX-NEXT:    [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6
; AVX-NEXT:    [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[TMP10]], i32 6
; AVX-NEXT:    [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7
; AVX-NEXT:    [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[TMP11]], i32 7
; AVX-NEXT:    ret <8 x i16> [[V7]]
;
  %p1 = getelementptr inbounds i8, i8* %p0, i64 1
  %p2 = getelementptr inbounds i8, i8* %p0, i64 2
  %p3 = getelementptr inbounds i8, i8* %p0, i64 3
  %p4 = getelementptr inbounds i8, i8* %p0, i64 4
  %p5 = getelementptr inbounds i8, i8* %p0, i64 5
  %p6 = getelementptr inbounds i8, i8* %p0, i64 6
  %p7 = getelementptr inbounds i8, i8* %p0, i64 7
  %i0 = load i8, i8* %p0, align 1
  %i1 = load i8, i8* %p1, align 1
  %i2 = load i8, i8* %p2, align 1
  %i3 = load i8, i8* %p3, align 1
  %i4 = load i8, i8* %p4, align 1
  %i5 = load i8, i8* %p5, align 1
  %i6 = load i8, i8* %p6, align 1
  %i7 = load i8, i8* %p7, align 1
  %x0 = zext i8 %i0 to i16
  %x1 = zext i8 %i1 to i16
  %x2 = zext i8 %i2 to i16
  %x3 = zext i8 %i3 to i16
  %x4 = zext i8 %i4 to i16
  %x5 = zext i8 %i5 to i16
  %x6 = zext i8 %i6 to i16
  %x7 = zext i8 %i7 to i16
  %v0 = insertelement <8 x i16> undef, i16 %x0, i32 0
  %v1 = insertelement <8 x i16> %v0, i16 %x1, i32 1
  %v2 = insertelement <8 x i16> %v1, i16 %x2, i32 2
  %v3 = insertelement <8 x i16> %v2, i16 %x3, i32 3
  %v4 = insertelement <8 x i16> %v3, i16 %x4, i32 4
  %v5 = insertelement <8 x i16> %v4, i16 %x5, i32 5
  %v6 = insertelement <8 x i16> %v5, i16 %x6, i32 6
  %v7 = insertelement <8 x i16> %v6, i16 %x7, i32 7
  ret <8 x i16> %v7
}

; Loads 8 adjacent i8 values starting at %p0, zero-extends each to i32 and
; assembles them into an <8 x i32>. The SSE2 and AVX check blocks expect one
; <8 x i8> load followed by a vector zext; the SLM check block expects the
; scalar sequence to be left as written.
define <8 x i32> @loadext_8i8_to_8i32(i8* %p0) {
; SSE2-LABEL: @loadext_8i8_to_8i32(
; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
; SSE2-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
; SSE2-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
; SSE2-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
; SSE2-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
; SSE2-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
; SSE2-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i32>
; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
; SSE2-NEXT:    [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
; SSE2-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
; SSE2-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
; SSE2-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
; SSE2-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
; SSE2-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
; SSE2-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
; SSE2-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
; SSE2-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
; SSE2-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
; SSE2-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
; SSE2-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
; SSE2-NEXT:    ret <8 x i32> [[V7]]
;
; SLM-LABEL: @loadext_8i8_to_8i32(
; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
; SLM-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
; SLM-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
; SLM-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
; SLM-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
; SLM-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
; SLM-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
; SLM-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
; SLM-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
; SLM-NEXT:    [[I4:%.*]] = load i8, i8* [[P4]], align 1
; SLM-NEXT:    [[I5:%.*]] = load i8, i8* [[P5]], align 1
; SLM-NEXT:    [[I6:%.*]] = load i8, i8* [[P6]], align 1
; SLM-NEXT:    [[I7:%.*]] = load i8, i8* [[P7]], align 1
; SLM-NEXT:    [[X0:%.*]] = zext i8 [[I0]] to i32
; SLM-NEXT:    [[X1:%.*]] = zext i8 [[I1]] to i32
; SLM-NEXT:    [[X2:%.*]] = zext i8 [[I2]] to i32
; SLM-NEXT:    [[X3:%.*]] = zext i8 [[I3]] to i32
; SLM-NEXT:    [[X4:%.*]] = zext i8 [[I4]] to i32
; SLM-NEXT:    [[X5:%.*]] = zext i8 [[I5]] to i32
; SLM-NEXT:    [[X6:%.*]] = zext i8 [[I6]] to i32
; SLM-NEXT:    [[X7:%.*]] = zext i8 [[I7]] to i32
; SLM-NEXT:    [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[X0]], i32 0
; SLM-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[X1]], i32 1
; SLM-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[X2]], i32 2
; SLM-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[X3]], i32 3
; SLM-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[X4]], i32 4
; SLM-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[X5]], i32 5
; SLM-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[X6]], i32 6
; SLM-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[X7]], i32 7
; SLM-NEXT:    ret <8 x i32> [[V7]]
;
; AVX-LABEL: @loadext_8i8_to_8i32(
; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
; AVX-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
; AVX-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
; AVX-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
; AVX-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
; AVX-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
; AVX-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i32>
; AVX-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
; AVX-NEXT:    [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
; AVX-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
; AVX-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
; AVX-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
; AVX-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
; AVX-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
; AVX-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
; AVX-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
; AVX-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
; AVX-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
; AVX-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
; AVX-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
; AVX-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
; AVX-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
; AVX-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
; AVX-NEXT:    ret <8 x i32> [[V7]]
;
  %p1 = getelementptr inbounds i8, i8* %p0, i64 1
  %p2 = getelementptr inbounds i8, i8* %p0, i64 2
  %p3 = getelementptr inbounds i8, i8* %p0, i64 3
  %p4 = getelementptr inbounds i8, i8* %p0, i64 4
  %p5 = getelementptr inbounds i8, i8* %p0, i64 5
  %p6 = getelementptr inbounds i8, i8* %p0, i64 6
  %p7 = getelementptr inbounds i8, i8* %p0, i64 7
  %i0 = load i8, i8* %p0, align 1
  %i1 = load i8, i8* %p1, align 1
  %i2 = load i8, i8* %p2, align 1
  %i3 = load i8, i8* %p3, align 1
  %i4 = load i8, i8* %p4, align 1
  %i5 = load i8, i8* %p5, align 1
  %i6 = load i8, i8* %p6, align 1
  %i7 = load i8, i8* %p7, align 1
  %x0 = zext i8 %i0 to i32
  %x1 = zext i8 %i1 to i32
  %x2 = zext i8 %i2 to i32
  %x3 = zext i8 %i3 to i32
  %x4 = zext i8 %i4 to i32
  %x5 = zext i8 %i5 to i32
  %x6 = zext i8 %i6 to i32
  %x7 = zext i8 %i7 to i32
  %v0 = insertelement <8 x i32> undef, i32 %x0, i32 0
  %v1 = insertelement <8 x i32> %v0, i32 %x1, i32 1
  %v2 = insertelement <8 x i32> %v1, i32 %x2, i32 2
  %v3 = insertelement <8 x i32> %v2, i32 %x3, i32 3
  %v4 = insertelement <8 x i32> %v3, i32 %x4, i32 4
  %v5 = insertelement <8 x i32> %v4, i32 %x5, i32 5
  %v6 = insertelement <8 x i32> %v5, i32 %x6, i32 6
  %v7 = insertelement <8 x i32> %v6, i32 %x7, i32 7
  ret <8 x i32> %v7
}

define <16 x i16> @loadext_16i8_to_16i16(i8* %p0) { ; SSE2-LABEL: @loadext_16i8_to_16i16( ; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 ; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 ; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 ; SSE2-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4 ; SSE2-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 ; SSE2-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 ; SSE2-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 ; SSE2-NEXT: [[P8:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 8 ; SSE2-NEXT: [[P9:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 9 ; SSE2-NEXT: [[P10:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 10 ; SSE2-NEXT: [[P11:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 11 ; SSE2-NEXT: [[P12:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 12 ; SSE2-NEXT: [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13 ; SSE2-NEXT: [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14 ; SSE2-NEXT: [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15 ; SSE2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>* ; SSE2-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1 ; SSE2-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[TMP2]] to <16 
x i16> ; SSE2-NEXT: [[TMP4:%.*]] = extractelement <16 x i16> [[TMP3]], i32 0 ; SSE2-NEXT: [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[TMP4]], i32 0 ; SSE2-NEXT: [[TMP5:%.*]] = extractelement <16 x i16> [[TMP3]], i32 1 ; SSE2-NEXT: [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[TMP5]], i32 1 ; SSE2-NEXT: [[TMP6:%.*]] = extractelement <16 x i16> [[TMP3]], i32 2 ; SSE2-NEXT: [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[TMP6]], i32 2 ; SSE2-NEXT: [[TMP7:%.*]] = extractelement <16 x i16> [[TMP3]], i32 3 ; SSE2-NEXT: [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[TMP7]], i32 3 ; SSE2-NEXT: [[TMP8:%.*]] = extractelement <16 x i16> [[TMP3]], i32 4 ; SSE2-NEXT: [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[TMP8]], i32 4 ; SSE2-NEXT: [[TMP9:%.*]] = extractelement <16 x i16> [[TMP3]], i32 5 ; SSE2-NEXT: [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[TMP9]], i32 5 ; SSE2-NEXT: [[TMP10:%.*]] = extractelement <16 x i16> [[TMP3]], i32 6 ; SSE2-NEXT: [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[TMP10]], i32 6 ; SSE2-NEXT: [[TMP11:%.*]] = extractelement <16 x i16> [[TMP3]], i32 7 ; SSE2-NEXT: [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[TMP11]], i32 7 ; SSE2-NEXT: [[TMP12:%.*]] = extractelement <16 x i16> [[TMP3]], i32 8 ; SSE2-NEXT: [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[TMP12]], i32 8 ; SSE2-NEXT: [[TMP13:%.*]] = extractelement <16 x i16> [[TMP3]], i32 9 ; SSE2-NEXT: [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[TMP13]], i32 9 ; SSE2-NEXT: [[TMP14:%.*]] = extractelement <16 x i16> [[TMP3]], i32 10 ; SSE2-NEXT: [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[TMP14]], i32 10 ; SSE2-NEXT: [[TMP15:%.*]] = extractelement <16 x i16> [[TMP3]], i32 11 ; SSE2-NEXT: [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[TMP15]], i32 11 ; SSE2-NEXT: [[TMP16:%.*]] = extractelement <16 x i16> [[TMP3]], i32 12 ; SSE2-NEXT: [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[TMP16]], i32 12 ; SSE2-NEXT: [[TMP17:%.*]] = 
extractelement <16 x i16> [[TMP3]], i32 13 ; SSE2-NEXT: [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[TMP17]], i32 13 ; SSE2-NEXT: [[TMP18:%.*]] = extractelement <16 x i16> [[TMP3]], i32 14 ; SSE2-NEXT: [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[TMP18]], i32 14 ; SSE2-NEXT: [[TMP19:%.*]] = extractelement <16 x i16> [[TMP3]], i32 15 ; SSE2-NEXT: [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[TMP19]], i32 15 ; SSE2-NEXT: ret <16 x i16> [[V15]] ; ; SLM-LABEL: @loadext_16i8_to_16i16( ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 ; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 ; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 ; SLM-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4 ; SLM-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 ; SLM-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 ; SLM-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 ; SLM-NEXT: [[P8:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 8 ; SLM-NEXT: [[P9:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 9 ; SLM-NEXT: [[P10:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 10 ; SLM-NEXT: [[P11:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 11 ; SLM-NEXT: [[P12:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 12 ; SLM-NEXT: [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13 ; SLM-NEXT: [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14 ; SLM-NEXT: [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15 ; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 ; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 ; SLM-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 ; SLM-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 ; SLM-NEXT: [[I4:%.*]] = load i8, i8* [[P4]], align 1 ; SLM-NEXT: [[I5:%.*]] = load i8, i8* [[P5]], align 1 ; SLM-NEXT: [[I6:%.*]] = load i8, i8* [[P6]], align 1 ; SLM-NEXT: [[I7:%.*]] = load i8, i8* 
[[P7]], align 1 ; SLM-NEXT: [[I8:%.*]] = load i8, i8* [[P8]], align 1 ; SLM-NEXT: [[I9:%.*]] = load i8, i8* [[P9]], align 1 ; SLM-NEXT: [[I10:%.*]] = load i8, i8* [[P10]], align 1 ; SLM-NEXT: [[I11:%.*]] = load i8, i8* [[P11]], align 1 ; SLM-NEXT: [[I12:%.*]] = load i8, i8* [[P12]], align 1 ; SLM-NEXT: [[I13:%.*]] = load i8, i8* [[P13]], align 1 ; SLM-NEXT: [[I14:%.*]] = load i8, i8* [[P14]], align 1 ; SLM-NEXT: [[I15:%.*]] = load i8, i8* [[P15]], align 1 ; SLM-NEXT: [[X0:%.*]] = zext i8 [[I0]] to i16 ; SLM-NEXT: [[X1:%.*]] = zext i8 [[I1]] to i16 ; SLM-NEXT: [[X2:%.*]] = zext i8 [[I2]] to i16 ; SLM-NEXT: [[X3:%.*]] = zext i8 [[I3]] to i16 ; SLM-NEXT: [[X4:%.*]] = zext i8 [[I4]] to i16 ; SLM-NEXT: [[X5:%.*]] = zext i8 [[I5]] to i16 ; SLM-NEXT: [[X6:%.*]] = zext i8 [[I6]] to i16 ; SLM-NEXT: [[X7:%.*]] = zext i8 [[I7]] to i16 ; SLM-NEXT: [[X8:%.*]] = zext i8 [[I8]] to i16 ; SLM-NEXT: [[X9:%.*]] = zext i8 [[I9]] to i16 ; SLM-NEXT: [[X10:%.*]] = zext i8 [[I10]] to i16 ; SLM-NEXT: [[X11:%.*]] = zext i8 [[I11]] to i16 ; SLM-NEXT: [[X12:%.*]] = zext i8 [[I12]] to i16 ; SLM-NEXT: [[X13:%.*]] = zext i8 [[I13]] to i16 ; SLM-NEXT: [[X14:%.*]] = zext i8 [[I14]] to i16 ; SLM-NEXT: [[X15:%.*]] = zext i8 [[I15]] to i16 ; SLM-NEXT: [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[X0]], i32 0 ; SLM-NEXT: [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[X1]], i32 1 ; SLM-NEXT: [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[X2]], i32 2 ; SLM-NEXT: [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[X3]], i32 3 ; SLM-NEXT: [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[X4]], i32 4 ; SLM-NEXT: [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[X5]], i32 5 ; SLM-NEXT: [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[X6]], i32 6 ; SLM-NEXT: [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[X7]], i32 7 ; SLM-NEXT: [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[X8]], i32 8 ; SLM-NEXT: [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[X9]], i32 9 ; SLM-NEXT: 
[[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[X10]], i32 10 ; SLM-NEXT: [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[X11]], i32 11 ; SLM-NEXT: [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[X12]], i32 12 ; SLM-NEXT: [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[X13]], i32 13 ; SLM-NEXT: [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[X14]], i32 14 ; SLM-NEXT: [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[X15]], i32 15 ; SLM-NEXT: ret <16 x i16> [[V15]] ; ; AVX-LABEL: @loadext_16i8_to_16i16( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 ; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 ; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 ; AVX-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4 ; AVX-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 ; AVX-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 ; AVX-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 ; AVX-NEXT: [[P8:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 8 ; AVX-NEXT: [[P9:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 9 ; AVX-NEXT: [[P10:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 10 ; AVX-NEXT: [[P11:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 11 ; AVX-NEXT: [[P12:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 12 ; AVX-NEXT: [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13 ; AVX-NEXT: [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14 ; AVX-NEXT: [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15 ; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>* ; AVX-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[TMP2]] to <16 x i16> ; AVX-NEXT: [[TMP4:%.*]] = extractelement <16 x i16> [[TMP3]], i32 0 ; AVX-NEXT: [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[TMP4]], i32 0 ; AVX-NEXT: [[TMP5:%.*]] = extractelement <16 x 
i16> [[TMP3]], i32 1 ; AVX-NEXT: [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[TMP5]], i32 1 ; AVX-NEXT: [[TMP6:%.*]] = extractelement <16 x i16> [[TMP3]], i32 2 ; AVX-NEXT: [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[TMP6]], i32 2 ; AVX-NEXT: [[TMP7:%.*]] = extractelement <16 x i16> [[TMP3]], i32 3 ; AVX-NEXT: [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[TMP7]], i32 3 ; AVX-NEXT: [[TMP8:%.*]] = extractelement <16 x i16> [[TMP3]], i32 4 ; AVX-NEXT: [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[TMP8]], i32 4 ; AVX-NEXT: [[TMP9:%.*]] = extractelement <16 x i16> [[TMP3]], i32 5 ; AVX-NEXT: [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[TMP9]], i32 5 ; AVX-NEXT: [[TMP10:%.*]] = extractelement <16 x i16> [[TMP3]], i32 6 ; AVX-NEXT: [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[TMP10]], i32 6 ; AVX-NEXT: [[TMP11:%.*]] = extractelement <16 x i16> [[TMP3]], i32 7 ; AVX-NEXT: [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[TMP11]], i32 7 ; AVX-NEXT: [[TMP12:%.*]] = extractelement <16 x i16> [[TMP3]], i32 8 ; AVX-NEXT: [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[TMP12]], i32 8 ; AVX-NEXT: [[TMP13:%.*]] = extractelement <16 x i16> [[TMP3]], i32 9 ; AVX-NEXT: [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[TMP13]], i32 9 ; AVX-NEXT: [[TMP14:%.*]] = extractelement <16 x i16> [[TMP3]], i32 10 ; AVX-NEXT: [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[TMP14]], i32 10 ; AVX-NEXT: [[TMP15:%.*]] = extractelement <16 x i16> [[TMP3]], i32 11 ; AVX-NEXT: [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[TMP15]], i32 11 ; AVX-NEXT: [[TMP16:%.*]] = extractelement <16 x i16> [[TMP3]], i32 12 ; AVX-NEXT: [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[TMP16]], i32 12 ; AVX-NEXT: [[TMP17:%.*]] = extractelement <16 x i16> [[TMP3]], i32 13 ; AVX-NEXT: [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[TMP17]], i32 13 ; AVX-NEXT: [[TMP18:%.*]] = extractelement <16 x i16> [[TMP3]], i32 14 ; AVX-NEXT: [[V14:%.*]] = 
insertelement <16 x i16> [[V13]], i16 [[TMP18]], i32 14 ; AVX-NEXT: [[TMP19:%.*]] = extractelement <16 x i16> [[TMP3]], i32 15 ; AVX-NEXT: [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[TMP19]], i32 15 ; AVX-NEXT: ret <16 x i16> [[V15]] ; %p1 = getelementptr inbounds i8, i8* %p0, i64 1 %p2 = getelementptr inbounds i8, i8* %p0, i64 2 %p3 = getelementptr inbounds i8, i8* %p0, i64 3 %p4 = getelementptr inbounds i8, i8* %p0, i64 4 %p5 = getelementptr inbounds i8, i8* %p0, i64 5 %p6 = getelementptr inbounds i8, i8* %p0, i64 6 %p7 = getelementptr inbounds i8, i8* %p0, i64 7 %p8 = getelementptr inbounds i8, i8* %p0, i64 8 %p9 = getelementptr inbounds i8, i8* %p0, i64 9 %p10 = getelementptr inbounds i8, i8* %p0, i64 10 %p11 = getelementptr inbounds i8, i8* %p0, i64 11 %p12 = getelementptr inbounds i8, i8* %p0, i64 12 %p13 = getelementptr inbounds i8, i8* %p0, i64 13 %p14 = getelementptr inbounds i8, i8* %p0, i64 14 %p15 = getelementptr inbounds i8, i8* %p0, i64 15 %i0 = load i8, i8* %p0, align 1 %i1 = load i8, i8* %p1, align 1 %i2 = load i8, i8* %p2, align 1 %i3 = load i8, i8* %p3, align 1 %i4 = load i8, i8* %p4, align 1 %i5 = load i8, i8* %p5, align 1 %i6 = load i8, i8* %p6, align 1 %i7 = load i8, i8* %p7, align 1 %i8 = load i8, i8* %p8, align 1 %i9 = load i8, i8* %p9, align 1 %i10 = load i8, i8* %p10, align 1 %i11 = load i8, i8* %p11, align 1 %i12 = load i8, i8* %p12, align 1 %i13 = load i8, i8* %p13, align 1 %i14 = load i8, i8* %p14, align 1 %i15 = load i8, i8* %p15, align 1 %x0 = zext i8 %i0 to i16 %x1 = zext i8 %i1 to i16 %x2 = zext i8 %i2 to i16 %x3 = zext i8 %i3 to i16 %x4 = zext i8 %i4 to i16 %x5 = zext i8 %i5 to i16 %x6 = zext i8 %i6 to i16 %x7 = zext i8 %i7 to i16 %x8 = zext i8 %i8 to i16 %x9 = zext i8 %i9 to i16 %x10 = zext i8 %i10 to i16 %x11 = zext i8 %i11 to i16 %x12 = zext i8 %i12 to i16 %x13 = zext i8 %i13 to i16 %x14 = zext i8 %i14 to i16 %x15 = zext i8 %i15 to i16 %v0 = insertelement <16 x i16> undef, i16 %x0, i32 0 %v1 = insertelement <16 x i16> 
%v0, i16 %x1, i32 1 %v2 = insertelement <16 x i16> %v1, i16 %x2, i32 2 %v3 = insertelement <16 x i16> %v2, i16 %x3, i32 3 %v4 = insertelement <16 x i16> %v3, i16 %x4, i32 4 %v5 = insertelement <16 x i16> %v4, i16 %x5, i32 5 %v6 = insertelement <16 x i16> %v5, i16 %x6, i32 6 %v7 = insertelement <16 x i16> %v6, i16 %x7, i32 7 %v8 = insertelement <16 x i16> %v7, i16 %x8, i32 8 %v9 = insertelement <16 x i16> %v8, i16 %x9, i32 9 %v10 = insertelement <16 x i16> %v9, i16 %x10, i32 10 %v11 = insertelement <16 x i16> %v10, i16 %x11, i32 11 %v12 = insertelement <16 x i16> %v11, i16 %x12, i32 12 %v13 = insertelement <16 x i16> %v12, i16 %x13, i32 13 %v14 = insertelement <16 x i16> %v13, i16 %x14, i32 14 %v15 = insertelement <16 x i16> %v14, i16 %x15, i32 15 ret <16 x i16> %v15 } ; ; vXi16 ; define <2 x i64> @loadext_2i16_to_2i64(i16* %p0) { ; Two adjacent i16 loads (align 1) are zero-extended to i64 and packed into a <2 x i64> with insertelement. The SSE2 and AVX checks expect one <2 x i16> load plus a vector zext whose lanes are extracted back into the insertelement chain, while the SLM checks expect the scalar sequence to be left as written. ; SSE2-LABEL: @loadext_2i16_to_2i64( ; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 ; SSE2-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>* ; SSE2-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1 ; SSE2-NEXT: [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64> ; SSE2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 ; SSE2-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0 ; SSE2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 ; SSE2-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1 ; SSE2-NEXT: ret <2 x i64> [[V1]] ; ; SLM-LABEL: @loadext_2i16_to_2i64( ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 ; SLM-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1 ; SLM-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1 ; SLM-NEXT: [[X0:%.*]] = zext i16 [[I0]] to i64 ; SLM-NEXT: [[X1:%.*]] = zext i16 [[I1]] to i64 ; SLM-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0 ; SLM-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1 ; SLM-NEXT: ret <2 x i64> [[V1]] ; ; 
AVX-LABEL: @loadext_2i16_to_2i64( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 ; AVX-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>* ; AVX-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64> ; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 ; AVX-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0 ; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 ; AVX-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1 ; AVX-NEXT: ret <2 x i64> [[V1]] ; %p1 = getelementptr inbounds i16, i16* %p0, i64 1 %i0 = load i16, i16* %p0, align 1 %i1 = load i16, i16* %p1, align 1 %x0 = zext i16 %i0 to i64 %x1 = zext i16 %i1 to i64 %v0 = insertelement <2 x i64> undef, i64 %x0, i32 0 %v1 = insertelement <2 x i64> %v0, i64 %x1, i32 1 ret <2 x i64> %v1 } define <4 x i32> @loadext_4i16_to_4i32(i16* %p0) { ; Four adjacent i16 loads (align 1) are zero-extended to i32 and packed into a <4 x i32> with insertelement. The SSE2 and AVX checks expect one <4 x i16> load plus a vector zext whose lanes are extracted back into the insertelement chain, while the SLM checks expect the scalar sequence to be left as written. ; SSE2-LABEL: @loadext_4i16_to_4i32( ; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 ; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 ; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 ; SSE2-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* ; SSE2-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 ; SSE2-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32> ; SSE2-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 ; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0 ; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 ; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1 ; SSE2-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 ; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2 ; SSE2-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 ; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i32> 
[[V2]], i32 [[TMP7]], i32 3 ; SSE2-NEXT: ret <4 x i32> [[V3]] ; ; SLM-LABEL: @loadext_4i16_to_4i32( ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 ; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 ; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 ; SLM-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1 ; SLM-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1 ; SLM-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1 ; SLM-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1 ; SLM-NEXT: [[X0:%.*]] = zext i16 [[I0]] to i32 ; SLM-NEXT: [[X1:%.*]] = zext i16 [[I1]] to i32 ; SLM-NEXT: [[X2:%.*]] = zext i16 [[I2]] to i32 ; SLM-NEXT: [[X3:%.*]] = zext i16 [[I3]] to i32 ; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[X0]], i32 0 ; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[X1]], i32 1 ; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[X2]], i32 2 ; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[X3]], i32 3 ; SLM-NEXT: ret <4 x i32> [[V3]] ; ; AVX-LABEL: @loadext_4i16_to_4i32( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 ; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 ; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 ; AVX-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* ; AVX-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32> ; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 ; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0 ; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 ; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1 ; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 ; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2 ; AVX-NEXT: [[TMP7:%.*]] = extractelement 
<4 x i32> [[TMP3]], i32 3 ; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3 ; AVX-NEXT: ret <4 x i32> [[V3]] ; %p1 = getelementptr inbounds i16, i16* %p0, i64 1 %p2 = getelementptr inbounds i16, i16* %p0, i64 2 %p3 = getelementptr inbounds i16, i16* %p0, i64 3 %i0 = load i16, i16* %p0, align 1 %i1 = load i16, i16* %p1, align 1 %i2 = load i16, i16* %p2, align 1 %i3 = load i16, i16* %p3, align 1 %x0 = zext i16 %i0 to i32 %x1 = zext i16 %i1 to i32 %x2 = zext i16 %i2 to i32 %x3 = zext i16 %i3 to i32 %v0 = insertelement <4 x i32> undef, i32 %x0, i32 0 %v1 = insertelement <4 x i32> %v0, i32 %x1, i32 1 %v2 = insertelement <4 x i32> %v1, i32 %x2, i32 2 %v3 = insertelement <4 x i32> %v2, i32 %x3, i32 3 ret <4 x i32> %v3 } define <4 x i64> @loadext_4i16_to_4i64(i16* %p0) { ; Four adjacent i16 loads (align 1) are zero-extended to i64 and packed into a <4 x i64> with insertelement. The SSE2 and AVX checks expect one <4 x i16> load plus a vector zext whose lanes are extracted back into the insertelement chain, while the SLM checks expect the scalar sequence to be left as written. ; SSE2-LABEL: @loadext_4i16_to_4i64( ; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 ; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 ; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 ; SSE2-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* ; SSE2-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 ; SSE2-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64> ; SSE2-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 ; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 ; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 ; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 ; SSE2-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 ; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 ; SSE2-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 ; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 ; SSE2-NEXT: ret <4 x i64> [[V3]] ; ; SLM-LABEL: @loadext_4i16_to_4i64( ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 ; 
SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 ; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 ; SLM-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1 ; SLM-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1 ; SLM-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1 ; SLM-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1 ; SLM-NEXT: [[X0:%.*]] = zext i16 [[I0]] to i64 ; SLM-NEXT: [[X1:%.*]] = zext i16 [[I1]] to i64 ; SLM-NEXT: [[X2:%.*]] = zext i16 [[I2]] to i64 ; SLM-NEXT: [[X3:%.*]] = zext i16 [[I3]] to i64 ; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0 ; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1 ; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 ; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 ; SLM-NEXT: ret <4 x i64> [[V3]] ; ; AVX-LABEL: @loadext_4i16_to_4i64( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 ; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 ; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 ; AVX-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>* ; AVX-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64> ; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 ; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 ; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 ; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 ; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 ; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 ; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 ; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 ; AVX-NEXT: ret <4 x i64> [[V3]] ; %p1 = getelementptr inbounds i16, i16* 
%p0, i64 1 %p2 = getelementptr inbounds i16, i16* %p0, i64 2 %p3 = getelementptr inbounds i16, i16* %p0, i64 3 %i0 = load i16, i16* %p0, align 1 %i1 = load i16, i16* %p1, align 1 %i2 = load i16, i16* %p2, align 1 %i3 = load i16, i16* %p3, align 1 %x0 = zext i16 %i0 to i64 %x1 = zext i16 %i1 to i64 %x2 = zext i16 %i2 to i64 %x3 = zext i16 %i3 to i64 %v0 = insertelement <4 x i64> undef, i64 %x0, i32 0 %v1 = insertelement <4 x i64> %v0, i64 %x1, i32 1 %v2 = insertelement <4 x i64> %v1, i64 %x2, i32 2 %v3 = insertelement <4 x i64> %v2, i64 %x3, i32 3 ret <4 x i64> %v3 } define <8 x i32> @loadext_8i16_to_8i32(i16* %p0) { ; Eight adjacent i16 loads (align 1) are zero-extended to i32 and packed into an <8 x i32> with insertelement. The SSE2 and AVX checks expect one <8 x i16> load plus a vector zext whose lanes are extracted back into the insertelement chain, while the SLM checks expect the scalar sequence to be left as written. ; SSE2-LABEL: @loadext_8i16_to_8i32( ; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 ; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 ; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 ; SSE2-NEXT: [[P4:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 4 ; SSE2-NEXT: [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5 ; SSE2-NEXT: [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6 ; SSE2-NEXT: [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7 ; SSE2-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>* ; SSE2-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1 ; SSE2-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32> ; SSE2-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0 ; SSE2-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0 ; SSE2-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1 ; SSE2-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1 ; SSE2-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2 ; SSE2-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2 ; SSE2-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3 ; SSE2-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3 ; SSE2-NEXT: [[TMP8:%.*]] = 
extractelement <8 x i32> [[TMP3]], i32 4 ; SSE2-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4 ; SSE2-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5 ; SSE2-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5 ; SSE2-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6 ; SSE2-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6 ; SSE2-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7 ; SSE2-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7 ; SSE2-NEXT: ret <8 x i32> [[V7]] ; ; SLM-LABEL: @loadext_8i16_to_8i32( ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 ; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 ; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 ; SLM-NEXT: [[P4:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 4 ; SLM-NEXT: [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5 ; SLM-NEXT: [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6 ; SLM-NEXT: [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7 ; SLM-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1 ; SLM-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1 ; SLM-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1 ; SLM-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1 ; SLM-NEXT: [[I4:%.*]] = load i16, i16* [[P4]], align 1 ; SLM-NEXT: [[I5:%.*]] = load i16, i16* [[P5]], align 1 ; SLM-NEXT: [[I6:%.*]] = load i16, i16* [[P6]], align 1 ; SLM-NEXT: [[I7:%.*]] = load i16, i16* [[P7]], align 1 ; SLM-NEXT: [[X0:%.*]] = zext i16 [[I0]] to i32 ; SLM-NEXT: [[X1:%.*]] = zext i16 [[I1]] to i32 ; SLM-NEXT: [[X2:%.*]] = zext i16 [[I2]] to i32 ; SLM-NEXT: [[X3:%.*]] = zext i16 [[I3]] to i32 ; SLM-NEXT: [[X4:%.*]] = zext i16 [[I4]] to i32 ; SLM-NEXT: [[X5:%.*]] = zext i16 [[I5]] to i32 ; SLM-NEXT: [[X6:%.*]] = zext i16 [[I6]] to i32 ; SLM-NEXT: [[X7:%.*]] = zext i16 [[I7]] to i32 ; SLM-NEXT: [[V0:%.*]] 
= insertelement <8 x i32> undef, i32 [[X0]], i32 0 ; SLM-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[X1]], i32 1 ; SLM-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[X2]], i32 2 ; SLM-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[X3]], i32 3 ; SLM-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[X4]], i32 4 ; SLM-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[X5]], i32 5 ; SLM-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[X6]], i32 6 ; SLM-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[X7]], i32 7 ; SLM-NEXT: ret <8 x i32> [[V7]] ; ; AVX-LABEL: @loadext_8i16_to_8i32( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 ; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 ; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 ; AVX-NEXT: [[P4:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 4 ; AVX-NEXT: [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5 ; AVX-NEXT: [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6 ; AVX-NEXT: [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7 ; AVX-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>* ; AVX-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32> ; AVX-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0 ; AVX-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0 ; AVX-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1 ; AVX-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1 ; AVX-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2 ; AVX-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2 ; AVX-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3 ; AVX-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3 ; AVX-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4 
; AVX-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4 ; AVX-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5 ; AVX-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5 ; AVX-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6 ; AVX-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6 ; AVX-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7 ; AVX-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7 ; AVX-NEXT: ret <8 x i32> [[V7]] ; %p1 = getelementptr inbounds i16, i16* %p0, i64 1 %p2 = getelementptr inbounds i16, i16* %p0, i64 2 %p3 = getelementptr inbounds i16, i16* %p0, i64 3 %p4 = getelementptr inbounds i16, i16* %p0, i64 4 %p5 = getelementptr inbounds i16, i16* %p0, i64 5 %p6 = getelementptr inbounds i16, i16* %p0, i64 6 %p7 = getelementptr inbounds i16, i16* %p0, i64 7 %i0 = load i16, i16* %p0, align 1 %i1 = load i16, i16* %p1, align 1 %i2 = load i16, i16* %p2, align 1 %i3 = load i16, i16* %p3, align 1 %i4 = load i16, i16* %p4, align 1 %i5 = load i16, i16* %p5, align 1 %i6 = load i16, i16* %p6, align 1 %i7 = load i16, i16* %p7, align 1 %x0 = zext i16 %i0 to i32 %x1 = zext i16 %i1 to i32 %x2 = zext i16 %i2 to i32 %x3 = zext i16 %i3 to i32 %x4 = zext i16 %i4 to i32 %x5 = zext i16 %i5 to i32 %x6 = zext i16 %i6 to i32 %x7 = zext i16 %i7 to i32 %v0 = insertelement <8 x i32> undef, i32 %x0, i32 0 %v1 = insertelement <8 x i32> %v0, i32 %x1, i32 1 %v2 = insertelement <8 x i32> %v1, i32 %x2, i32 2 %v3 = insertelement <8 x i32> %v2, i32 %x3, i32 3 %v4 = insertelement <8 x i32> %v3, i32 %x4, i32 4 %v5 = insertelement <8 x i32> %v4, i32 %x5, i32 5 %v6 = insertelement <8 x i32> %v5, i32 %x6, i32 6 %v7 = insertelement <8 x i32> %v6, i32 %x7, i32 7 ret <8 x i32> %v7 } ; ; vXi32 ; define <2 x i64> @loadext_2i32_to_2i64(i32* %p0) { ; Two adjacent i32 loads (align 1) are zero-extended to i64 and packed into a <2 x i64> with insertelement. The SSE2 and AVX checks expect one <2 x i32> load plus a vector zext whose lanes are extracted back into the insertelement chain, while the SLM checks expect the scalar sequence to be left as written. ; SSE2-LABEL: @loadext_2i32_to_2i64( ; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 ; SSE2-NEXT: 
[[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>* ; SSE2-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1 ; SSE2-NEXT: [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64> ; SSE2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 ; SSE2-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0 ; SSE2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 ; SSE2-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1 ; SSE2-NEXT: ret <2 x i64> [[V1]] ; ; SLM-LABEL: @loadext_2i32_to_2i64( ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 ; SLM-NEXT: [[I0:%.*]] = load i32, i32* [[P0]], align 1 ; SLM-NEXT: [[I1:%.*]] = load i32, i32* [[P1]], align 1 ; SLM-NEXT: [[X0:%.*]] = zext i32 [[I0]] to i64 ; SLM-NEXT: [[X1:%.*]] = zext i32 [[I1]] to i64 ; SLM-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0 ; SLM-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1 ; SLM-NEXT: ret <2 x i64> [[V1]] ; ; AVX-LABEL: @loadext_2i32_to_2i64( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 ; AVX-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>* ; AVX-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64> ; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 ; AVX-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0 ; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 ; AVX-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1 ; AVX-NEXT: ret <2 x i64> [[V1]] ; %p1 = getelementptr inbounds i32, i32* %p0, i64 1 %i0 = load i32, i32* %p0, align 1 %i1 = load i32, i32* %p1, align 1 %x0 = zext i32 %i0 to i64 %x1 = zext i32 %i1 to i64 %v0 = insertelement <2 x i64> undef, i64 %x0, i32 0 %v1 = insertelement <2 x i64> %v0, i64 %x1, i32 1 ret <2 x i64> %v1 } define <4 x i64> @loadext_4i32_to_4i64(i32* %p0) { ; Four adjacent i32 loads (align 1) are zero-extended to i64 and packed into a <4 x i64> with insertelement. The SSE2 and AVX checks expect one <4 x i32> load plus a vector zext whose lanes are extracted back into the insertelement chain, while the SLM checks expect the scalar sequence to be left as written. ; 
SSE2-LABEL: @loadext_4i32_to_4i64( ; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 ; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2 ; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 ; SSE2-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>* ; SSE2-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1 ; SSE2-NEXT: [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64> ; SSE2-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 ; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 ; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 ; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 ; SSE2-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 ; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 ; SSE2-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 ; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 ; SSE2-NEXT: ret <4 x i64> [[V3]] ; ; SLM-LABEL: @loadext_4i32_to_4i64( ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 ; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2 ; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 ; SLM-NEXT: [[I0:%.*]] = load i32, i32* [[P0]], align 1 ; SLM-NEXT: [[I1:%.*]] = load i32, i32* [[P1]], align 1 ; SLM-NEXT: [[I2:%.*]] = load i32, i32* [[P2]], align 1 ; SLM-NEXT: [[I3:%.*]] = load i32, i32* [[P3]], align 1 ; SLM-NEXT: [[X0:%.*]] = zext i32 [[I0]] to i64 ; SLM-NEXT: [[X1:%.*]] = zext i32 [[I1]] to i64 ; SLM-NEXT: [[X2:%.*]] = zext i32 [[I2]] to i64 ; SLM-NEXT: [[X3:%.*]] = zext i32 [[I3]] to i64 ; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0 ; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1 ; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 ; SLM-NEXT: [[V3:%.*]] 
= insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 ; SLM-NEXT: ret <4 x i64> [[V3]] ; ; AVX-LABEL: @loadext_4i32_to_4i64( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 ; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2 ; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 ; AVX-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>* ; AVX-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1 ; AVX-NEXT: [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64> ; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 ; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 ; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 ; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 ; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 ; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2 ; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 ; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3 ; AVX-NEXT: ret <4 x i64> [[V3]] ; %p1 = getelementptr inbounds i32, i32* %p0, i64 1 %p2 = getelementptr inbounds i32, i32* %p0, i64 2 %p3 = getelementptr inbounds i32, i32* %p0, i64 3 %i0 = load i32, i32* %p0, align 1 %i1 = load i32, i32* %p1, align 1 %i2 = load i32, i32* %p2, align 1 %i3 = load i32, i32* %p3, align 1 %x0 = zext i32 %i0 to i64 %x1 = zext i32 %i1 to i64 %x2 = zext i32 %i2 to i64 %x3 = zext i32 %i3 to i64 %v0 = insertelement <4 x i64> undef, i64 %x0, i32 0 %v1 = insertelement <4 x i64> %v0, i64 %x1, i32 1 %v2 = insertelement <4 x i64> %v1, i64 %x2, i32 2 %v3 = insertelement <4 x i64> %v2, i64 %x3, i32 3 ret <4 x i64> %v3 }