; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; Each RUN line compiles this file with GlobalISel for one subtarget and
; verifies the emitted assembly with FileCheck.
; -mtriple is used instead of -march so the whole target triple is pinned;
; -march would only replace the architecture component of the host's default
; triple and leave the OS/environment host-dependent.
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx801 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s

; i16 multiply in an amdgpu_ps function with both operands marked inreg.
; The expected code is a scalar s_mul_i32; on gfx801 and gfx900 both operands
; are first masked with 0xffff.
define amdgpu_ps i16 @s_mul_i16(i16 inreg %num, i16 inreg %den) {
; GFX7-LABEL: s_mul_i16:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_mul_i32 s0, s0, s1
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_mov_b32 s2, 0xffff
; GFX8-NEXT:    s_and_b32 s0, s0, s2
; GFX8-NEXT:    s_and_b32 s1, s1, s2
; GFX8-NEXT:    s_mul_i32 s0, s0, s1
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s2, 0xffff
; GFX9-NEXT:    s_and_b32 s0, s0, s2
; GFX9-NEXT:    s_and_b32 s1, s1, s2
; GFX9-NEXT:    s_mul_i32 s0, s0, s1
; GFX9-NEXT:    ; return to shader part epilog
  %result = mul i16 %num, %den
  ret i16 %result
}

; i16 multiply with the default calling convention.
; The expected code masks both operands and uses v_mul_u32_u24 on gfx700, and
; a single v_mul_lo_u16 on gfx801 and gfx900.
define i16 @v_mul_i16(i16 %num, i16 %den) {
; GFX7-LABEL: v_mul_i16:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_mov_b32 s4, 0xffff
; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
; GFX7-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %result = mul i16 %num, %den
  ret i16 %result
}

define amdgpu_ps zeroext i16 @s_mul_i16_zeroext(i16 inreg zeroext %num, i16 inreg zeroext %den) {
; GFX7-LABEL: s_mul_i16_zeroext:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_mul_i32
s0, s0, s1 ; GFX7-NEXT: s_and_b32 s0, s0, 0xffff ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_mul_i16_zeroext: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_and_b32 s0, s0, s2 ; GFX8-NEXT: s_and_b32 s1, s1, s2 ; GFX8-NEXT: s_mul_i32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s0, s0, s2 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_mul_i16_zeroext: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NEXT: s_and_b32 s0, s0, s2 ; GFX9-NEXT: s_and_b32 s1, s1, s2 ; GFX9-NEXT: s_mul_i32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s0, s0, s2 ; GFX9-NEXT: ; return to shader part epilog %result = mul i16 %num, %den ret i16 %result } define zeroext i16 @v_mul_i16_zeroext(i16 zeroext %num, i16 zeroext %den) { ; GFX7-LABEL: v_mul_i16_zeroext: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, 0xffff ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX7-NEXT: v_mul_u32_u24_e32 v0, v0, v1 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_mul_i16_zeroext: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u16_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_mul_i16_zeroext: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u16_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = mul i16 %num, %den ret i16 %result } define amdgpu_ps signext i16 @s_mul_i16_signext(i16 inreg signext %num, i16 inreg signext %den) { ; GFX7-LABEL: s_mul_i16_signext: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_mul_i32 s0, s0, s1 ; GFX7-NEXT: s_sext_i32_i16 s0, s0 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_mul_i16_signext: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_and_b32 s0, s0, s2 ; GFX8-NEXT: s_and_b32 s1, s1, s2 ; GFX8-NEXT: s_mul_i32 s0, s0, s1 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 ; GFX8-NEXT: ; return to shader 
part epilog ; ; GFX9-LABEL: s_mul_i16_signext: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NEXT: s_and_b32 s0, s0, s2 ; GFX9-NEXT: s_and_b32 s1, s1, s2 ; GFX9-NEXT: s_mul_i32 s0, s0, s1 ; GFX9-NEXT: s_sext_i32_i16 s0, s0 ; GFX9-NEXT: ; return to shader part epilog %result = mul i16 %num, %den ret i16 %result } define signext i16 @v_mul_i16_signext(i16 signext %num, i16 signext %den) { ; GFX7-LABEL: v_mul_i16_signext: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, 0xffff ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX7-NEXT: v_mul_u32_u24_e32 v0, v0, v1 ; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_mul_i16_signext: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u16_e32 v0, v0, v1 ; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_mul_i16_signext: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u16_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = mul i16 %num, %den ret i16 %result } define amdgpu_ps i32 @s_mul_i32(i32 inreg %num, i32 inreg %den) { ; GCN-LABEL: s_mul_i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_mul_i32 s0, s0, s1 ; GCN-NEXT: ; return to shader part epilog %result = mul i32 %num, %den ret i32 %result } define i32 @v_mul_i32(i32 %num, i32 %den) { ; GCN-LABEL: v_mul_i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = mul i32 %num, %den ret i32 %result } define amdgpu_ps <2 x i32> @s_mul_v2i32(<2 x i32> inreg %num, <2 x i32> inreg %den) { ; GCN-LABEL: s_mul_v2i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_mul_i32 s0, s0, s2 ; GCN-NEXT: s_mul_i32 s1, s1, s3 ; GCN-NEXT: ; return to shader part epilog %result = mul <2 x i32> %num, %den ret <2 x i32> %result } define <2 x 
i32> @v_mul_v2i32(<2 x i32> %num, <2 x i32> %den) { ; GCN-LABEL: v_mul_v2i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v0, v0, v2 ; GCN-NEXT: v_mul_lo_u32 v1, v1, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = mul <2 x i32> %num, %den ret <2 x i32> %result } define amdgpu_ps i64 @s_mul_i64(i64 inreg %num, i64 inreg %den) { ; GFX7-LABEL: s_mul_i64: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: v_mul_hi_u32 v0, s0, v0 ; GFX7-NEXT: s_mul_i32 s4, s0, s2 ; GFX7-NEXT: s_mul_i32 s1, s1, s2 ; GFX7-NEXT: s_mul_i32 s0, s0, s3 ; GFX7-NEXT: s_add_i32 s1, s1, s0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, s1, v0 ; GFX7-NEXT: v_readfirstlane_b32 s1, v0 ; GFX7-NEXT: s_mov_b32 s0, s4 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_mul_i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 ; GFX8-NEXT: s_mul_i32 s4, s0, s2 ; GFX8-NEXT: s_mul_i32 s1, s1, s2 ; GFX8-NEXT: s_mul_i32 s0, s0, s3 ; GFX8-NEXT: s_add_i32 s1, s1, s0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s1, v0 ; GFX8-NEXT: v_readfirstlane_b32 s1, v0 ; GFX8-NEXT: s_mov_b32 s0, s4 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_mul_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_mul_i32 s1, s1, s2 ; GFX9-NEXT: s_mul_i32 s3, s0, s3 ; GFX9-NEXT: s_mul_i32 s4, s0, s2 ; GFX9-NEXT: s_mul_hi_u32 s0, s0, s2 ; GFX9-NEXT: s_add_i32 s1, s1, s3 ; GFX9-NEXT: s_add_i32 s1, s1, s0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: ; return to shader part epilog %result = mul i64 %num, %den ret i64 %result } define i64 @v_mul_i64(i64 %num, i64 %den) { ; GFX7-LABEL: v_mul_i64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mul_lo_u32 v4, v0, v3 ; GFX7-NEXT: v_mul_lo_u32 v1, v1, v2 ; GFX7-NEXT: v_mul_lo_u32 v3, v0, v2 ; GFX7-NEXT: v_mul_hi_u32 v0, v0, v2 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v4 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 
s[30:31] ; ; GFX8-LABEL: v_mul_i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v4, v0, v3 ; GFX8-NEXT: v_mul_lo_u32 v1, v1, v2 ; GFX8-NEXT: v_mul_lo_u32 v3, v0, v2 ; GFX8-NEXT: v_mul_hi_u32 v0, v0, v2 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v4 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_mul_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v1, v1, v2 ; GFX9-NEXT: v_mul_lo_u32 v3, v0, v3 ; GFX9-NEXT: v_mul_hi_u32 v4, v0, v2 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, v2 ; GFX9-NEXT: v_add3_u32 v1, v1, v3, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = mul i64 %num, %den ret i64 %result } define amdgpu_ps <3 x i32> @s_mul_i96(i96 inreg %num, i96 inreg %den) { ; GFX7-LABEL: s_mul_i96: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_mov_b32_e32 v0, s3 ; GFX7-NEXT: v_mul_hi_u32 v0, s0, v0 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-NEXT: s_mul_i32 s7, s1, s3 ; GFX7-NEXT: s_mul_i32 s8, s0, s4 ; GFX7-NEXT: s_add_u32 s7, s7, s8 ; GFX7-NEXT: v_mov_b32_e32 v3, s4 ; GFX7-NEXT: v_mul_hi_u32 v2, v2, s3 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, s7, v0 ; GFX7-NEXT: s_mul_i32 s7, s1, s4 ; GFX7-NEXT: s_mul_i32 s2, s2, s3 ; GFX7-NEXT: v_mul_hi_u32 v3, s0, v3 ; GFX7-NEXT: s_cselect_b32 s8, 1, 0 ; GFX7-NEXT: s_mul_i32 s6, s0, s3 ; GFX7-NEXT: s_mul_i32 s5, s0, s5 ; GFX7-NEXT: s_add_i32 s0, s2, s7 ; GFX7-NEXT: s_add_i32 s0, s0, s5 ; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v2, vcc, s0, v2 ; GFX7-NEXT: s_and_b32 s8, s8, 1 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, s8, v1 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GFX7-NEXT: v_readfirstlane_b32 s1, v0 ; GFX7-NEXT: v_readfirstlane_b32 s2, v1 ; GFX7-NEXT: s_mov_b32 s0, s6 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_mul_i96: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_mov_b32_e32 v0, s3 ; GFX8-NEXT: v_mul_hi_u32 v0, 
s0, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: s_mul_i32 s7, s1, s3 ; GFX8-NEXT: s_mul_i32 s8, s0, s4 ; GFX8-NEXT: s_add_u32 s7, s7, s8 ; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: v_mul_hi_u32 v2, v2, s3 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s7, v0 ; GFX8-NEXT: s_mul_i32 s7, s1, s4 ; GFX8-NEXT: s_mul_i32 s2, s2, s3 ; GFX8-NEXT: v_mul_hi_u32 v3, s0, v3 ; GFX8-NEXT: s_cselect_b32 s8, 1, 0 ; GFX8-NEXT: s_mul_i32 s6, s0, s3 ; GFX8-NEXT: s_mul_i32 s5, s0, s5 ; GFX8-NEXT: s_add_i32 s0, s2, s7 ; GFX8-NEXT: s_add_i32 s0, s0, s5 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; GFX8-NEXT: s_and_b32 s8, s8, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, s8, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 ; GFX8-NEXT: v_readfirstlane_b32 s1, v0 ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: s_mov_b32 s0, s6 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_mul_i96: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_mul_i32 s7, s1, s3 ; GFX9-NEXT: s_mul_i32 s8, s0, s4 ; GFX9-NEXT: s_add_u32 s7, s7, s8 ; GFX9-NEXT: s_cselect_b32 s8, 1, 0 ; GFX9-NEXT: s_mul_hi_u32 s9, s0, s3 ; GFX9-NEXT: s_and_b32 s8, s8, 1 ; GFX9-NEXT: s_add_u32 s7, s7, s9 ; GFX9-NEXT: s_cselect_b32 s9, 1, 0 ; GFX9-NEXT: s_and_b32 s9, s9, 1 ; GFX9-NEXT: s_add_i32 s8, s8, s9 ; GFX9-NEXT: s_mul_i32 s9, s1, s4 ; GFX9-NEXT: s_mul_i32 s2, s2, s3 ; GFX9-NEXT: s_mul_i32 s5, s0, s5 ; GFX9-NEXT: s_add_i32 s2, s2, s9 ; GFX9-NEXT: s_mul_hi_u32 s1, s1, s3 ; GFX9-NEXT: s_add_i32 s2, s2, s5 ; GFX9-NEXT: s_mul_i32 s6, s0, s3 ; GFX9-NEXT: s_mul_hi_u32 s0, s0, s4 ; GFX9-NEXT: s_add_i32 s1, s2, s1 ; GFX9-NEXT: s_add_i32 s0, s1, s0 ; GFX9-NEXT: s_add_i32 s2, s0, s8 ; GFX9-NEXT: s_mov_b32 s0, s6 ; GFX9-NEXT: s_mov_b32 s1, s7 ; GFX9-NEXT: ; return to shader part epilog %result = mul i96 %num, %den %cast = bitcast i96 %result to <3 x i32> ret <3 x i32> %cast } define i96 @v_mul_i96(i96 %num, i96 %den) { ; GFX7-LABEL: v_mul_i96: ; GFX7: ; %bb.0: ; 
GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mul_lo_u32 v7, v1, v3 ; GFX7-NEXT: v_mul_lo_u32 v8, v0, v4 ; GFX7-NEXT: v_mul_hi_u32 v9, v0, v3 ; GFX7-NEXT: v_mul_lo_u32 v2, v2, v3 ; GFX7-NEXT: v_mul_lo_u32 v5, v0, v5 ; GFX7-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GFX7-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; GFX7-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; GFX7-NEXT: v_mul_lo_u32 v9, v1, v4 ; GFX7-NEXT: v_mul_hi_u32 v1, v1, v3 ; GFX7-NEXT: v_mul_lo_u32 v6, v0, v3 ; GFX7-NEXT: v_mul_hi_u32 v0, v0, v4 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, v2, v9 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, v0, v8 ; GFX7-NEXT: v_mov_b32_e32 v0, v6 ; GFX7-NEXT: v_mov_b32_e32 v1, v7 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_mul_i96: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v7, v1, v3 ; GFX8-NEXT: v_mul_lo_u32 v8, v0, v4 ; GFX8-NEXT: v_mul_hi_u32 v9, v0, v3 ; GFX8-NEXT: v_mul_lo_u32 v2, v2, v3 ; GFX8-NEXT: v_mul_lo_u32 v5, v0, v5 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v9 ; GFX8-NEXT: v_mul_lo_u32 v9, v1, v4 ; GFX8-NEXT: v_mul_hi_u32 v1, v1, v3 ; GFX8-NEXT: v_mul_lo_u32 v6, v0, v3 ; GFX8-NEXT: v_mul_hi_u32 v0, v0, v4 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v9 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v8 ; GFX8-NEXT: v_mov_b32_e32 v0, v6 ; GFX8-NEXT: v_mov_b32_e32 v1, v7 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_mul_i96: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: 
v_mul_lo_u32 v7, v1, v3 ; GFX9-NEXT: v_mul_lo_u32 v8, v0, v4 ; GFX9-NEXT: v_mul_hi_u32 v9, v0, v3 ; GFX9-NEXT: v_mul_lo_u32 v10, v1, v4 ; GFX9-NEXT: v_mul_lo_u32 v2, v2, v3 ; GFX9-NEXT: v_mul_lo_u32 v5, v0, v5 ; GFX9-NEXT: v_mul_hi_u32 v1, v1, v3 ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8 ; GFX9-NEXT: v_mul_lo_u32 v6, v0, v3 ; GFX9-NEXT: v_mul_hi_u32 v0, v0, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v2, v2, v10 ; GFX9-NEXT: v_add_u32_e32 v3, v8, v9 ; GFX9-NEXT: v_add3_u32 v1, v2, v5, v1 ; GFX9-NEXT: v_add3_u32 v2, v1, v0, v3 ; GFX9-NEXT: v_mov_b32_e32 v0, v6 ; GFX9-NEXT: v_mov_b32_e32 v1, v7 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = mul i96 %num, %den ret i96 %result } define amdgpu_ps <4 x i32> @s_mul_i128(i128 inreg %num, i128 inreg %den) { ; GFX7-LABEL: s_mul_i128: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mul_hi_u32 v0, s0, v0 ; GFX7-NEXT: s_mul_i32 s9, s1, s4 ; GFX7-NEXT: s_mul_i32 s10, s0, s5 ; GFX7-NEXT: s_add_u32 s9, s9, s10 ; GFX7-NEXT: s_cselect_b32 s10, 1, 0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, s9, v0 ; GFX7-NEXT: s_and_b32 s10, s10, 1 ; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v1, vcc, s10, v1 ; GFX7-NEXT: s_mul_i32 s9, s2, s4 ; GFX7-NEXT: s_mul_i32 s10, s1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-NEXT: s_add_u32 s9, s9, s10 ; GFX7-NEXT: s_cselect_b32 s10, 1, 0 ; GFX7-NEXT: v_mul_hi_u32 v2, v2, s4 ; GFX7-NEXT: s_mul_i32 s11, s0, s6 ; GFX7-NEXT: s_and_b32 s10, s10, 1 ; GFX7-NEXT: s_add_u32 s9, s9, s11 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: s_cselect_b32 s11, 1, 0 ; GFX7-NEXT: v_mul_hi_u32 v4, s0, v3 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, s9, v2 ; GFX7-NEXT: s_and_b32 s11, s11, 1 ; GFX7-NEXT: s_add_i32 s10, s10, s11 ; GFX7-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v5, vcc, s10, v5 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX7-NEXT: 
v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; GFX7-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-NEXT: v_mov_b32_e32 v5, s6 ; GFX7-NEXT: s_mul_i32 s5, s2, s5 ; GFX7-NEXT: s_mul_i32 s3, s3, s4 ; GFX7-NEXT: v_mul_hi_u32 v4, v4, s4 ; GFX7-NEXT: s_mul_i32 s8, s0, s4 ; GFX7-NEXT: s_mul_i32 s9, s1, s6 ; GFX7-NEXT: v_mul_hi_u32 v3, s1, v3 ; GFX7-NEXT: s_mul_i32 s7, s0, s7 ; GFX7-NEXT: v_mul_hi_u32 v5, s0, v5 ; GFX7-NEXT: s_add_i32 s0, s3, s5 ; GFX7-NEXT: s_add_i32 s0, s0, s9 ; GFX7-NEXT: s_add_i32 s0, s0, s7 ; GFX7-NEXT: v_add_i32_e32 v4, vcc, s0, v4 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX7-NEXT: v_readfirstlane_b32 s1, v0 ; GFX7-NEXT: v_readfirstlane_b32 s2, v1 ; GFX7-NEXT: v_readfirstlane_b32 s3, v2 ; GFX7-NEXT: s_mov_b32 s0, s8 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_mul_i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 ; GFX8-NEXT: s_mul_i32 s9, s1, s4 ; GFX8-NEXT: s_mul_i32 s10, s0, s5 ; GFX8-NEXT: s_add_u32 s9, s9, s10 ; GFX8-NEXT: s_cselect_b32 s10, 1, 0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s9, v0 ; GFX8-NEXT: s_and_b32 s10, s10, 1 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v1, vcc, s10, v1 ; GFX8-NEXT: s_mul_i32 s9, s2, s4 ; GFX8-NEXT: s_mul_i32 s10, s1, s5 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: s_add_u32 s9, s9, s10 ; GFX8-NEXT: s_cselect_b32 s10, 1, 0 ; GFX8-NEXT: v_mul_hi_u32 v2, v2, s4 ; GFX8-NEXT: s_mul_i32 s11, s0, s6 ; GFX8-NEXT: s_and_b32 s10, s10, 1 ; GFX8-NEXT: s_add_u32 s9, s9, s11 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: s_cselect_b32 s11, 1, 0 ; GFX8-NEXT: v_mul_hi_u32 v4, s0, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s9, v2 ; GFX8-NEXT: s_and_b32 s11, s11, 1 ; GFX8-NEXT: s_add_i32 s10, s10, s11 ; GFX8-NEXT: 
v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s10, v5 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v4, v2 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: s_mul_i32 s5, s2, s5 ; GFX8-NEXT: s_mul_i32 s3, s3, s4 ; GFX8-NEXT: v_mul_hi_u32 v4, v4, s4 ; GFX8-NEXT: s_mul_i32 s8, s0, s4 ; GFX8-NEXT: s_mul_i32 s9, s1, s6 ; GFX8-NEXT: v_mul_hi_u32 v3, s1, v3 ; GFX8-NEXT: s_mul_i32 s7, s0, s7 ; GFX8-NEXT: v_mul_hi_u32 v5, s0, v5 ; GFX8-NEXT: s_add_i32 s0, s3, s5 ; GFX8-NEXT: s_add_i32 s0, s0, s9 ; GFX8-NEXT: s_add_i32 s0, s0, s7 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, s0, v4 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: v_readfirstlane_b32 s1, v0 ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: v_readfirstlane_b32 s3, v2 ; GFX8-NEXT: s_mov_b32 s0, s8 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_mul_i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_mul_i32 s9, s1, s4 ; GFX9-NEXT: s_mul_i32 s10, s0, s5 ; GFX9-NEXT: s_add_u32 s9, s9, s10 ; GFX9-NEXT: s_cselect_b32 s10, 1, 0 ; GFX9-NEXT: s_mul_hi_u32 s11, s0, s4 ; GFX9-NEXT: s_and_b32 s10, s10, 1 ; GFX9-NEXT: s_add_u32 s9, s9, s11 ; GFX9-NEXT: s_cselect_b32 s11, 1, 0 ; GFX9-NEXT: s_and_b32 s11, s11, 1 ; GFX9-NEXT: s_add_i32 s10, s10, s11 ; GFX9-NEXT: s_mul_i32 s11, s2, s4 ; GFX9-NEXT: s_mul_i32 s12, s1, s5 ; GFX9-NEXT: s_add_u32 s11, s11, s12 ; GFX9-NEXT: s_cselect_b32 s12, 1, 0 ; GFX9-NEXT: s_mul_i32 s13, s0, s6 ; GFX9-NEXT: s_and_b32 s12, s12, 1 ; GFX9-NEXT: s_add_u32 s11, s11, s13 ; GFX9-NEXT: s_cselect_b32 s13, 1, 0 ; GFX9-NEXT: s_and_b32 s13, s13, 1 ; GFX9-NEXT: s_mul_hi_u32 s14, s1, s4 ; GFX9-NEXT: s_add_i32 s12, s12, s13 ; GFX9-NEXT: s_add_u32 s11, s11, s14 ; 
GFX9-NEXT: s_cselect_b32 s13, 1, 0 ; GFX9-NEXT: s_and_b32 s13, s13, 1 ; GFX9-NEXT: s_mul_hi_u32 s15, s0, s5 ; GFX9-NEXT: s_add_i32 s12, s12, s13 ; GFX9-NEXT: s_add_u32 s11, s11, s15 ; GFX9-NEXT: s_cselect_b32 s13, 1, 0 ; GFX9-NEXT: s_and_b32 s13, s13, 1 ; GFX9-NEXT: s_add_i32 s12, s12, s13 ; GFX9-NEXT: s_add_u32 s10, s11, s10 ; GFX9-NEXT: s_cselect_b32 s11, 1, 0 ; GFX9-NEXT: s_and_b32 s11, s11, 1 ; GFX9-NEXT: s_add_i32 s12, s12, s11 ; GFX9-NEXT: s_mul_i32 s11, s2, s5 ; GFX9-NEXT: s_mul_i32 s3, s3, s4 ; GFX9-NEXT: s_mul_i32 s13, s1, s6 ; GFX9-NEXT: s_add_i32 s3, s3, s11 ; GFX9-NEXT: s_mul_i32 s7, s0, s7 ; GFX9-NEXT: s_add_i32 s3, s3, s13 ; GFX9-NEXT: s_mul_hi_u32 s2, s2, s4 ; GFX9-NEXT: s_add_i32 s3, s3, s7 ; GFX9-NEXT: s_mul_hi_u32 s1, s1, s5 ; GFX9-NEXT: s_add_i32 s2, s3, s2 ; GFX9-NEXT: s_mul_i32 s8, s0, s4 ; GFX9-NEXT: s_add_i32 s1, s2, s1 ; GFX9-NEXT: s_mul_hi_u32 s0, s0, s6 ; GFX9-NEXT: s_add_i32 s0, s1, s0 ; GFX9-NEXT: s_add_i32 s3, s0, s12 ; GFX9-NEXT: s_mov_b32 s0, s8 ; GFX9-NEXT: s_mov_b32 s1, s9 ; GFX9-NEXT: s_mov_b32 s2, s10 ; GFX9-NEXT: ; return to shader part epilog %result = mul i128 %num, %den %cast = bitcast i128 %result to <4 x i32> ret <4 x i32> %cast } define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX7-LABEL: v_mul_i128: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mul_lo_u32 v9, v1, v4 ; GFX7-NEXT: v_mul_lo_u32 v10, v0, v5 ; GFX7-NEXT: v_mul_hi_u32 v11, v0, v4 ; GFX7-NEXT: v_mul_lo_u32 v12, v1, v5 ; GFX7-NEXT: v_mul_lo_u32 v13, v0, v6 ; GFX7-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; GFX7-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; GFX7-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; GFX7-NEXT: v_mul_lo_u32 v11, v2, v4 ; GFX7-NEXT: v_mul_hi_u32 v14, v1, v4 ; GFX7-NEXT: v_mul_hi_u32 v15, v0, v5 ; GFX7-NEXT: v_mul_lo_u32 v3, v3, v4 ; GFX7-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; GFX7-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; 
GFX7-NEXT: v_add_i32_e32 v11, vcc, v11, v13 ; GFX7-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v12, vcc, v12, v13 ; GFX7-NEXT: v_add_i32_e32 v11, vcc, v11, v14 ; GFX7-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v12, vcc, v12, v13 ; GFX7-NEXT: v_add_i32_e32 v11, vcc, v11, v15 ; GFX7-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v12, vcc, v12, v13 ; GFX7-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GFX7-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v11, vcc, v12, v11 ; GFX7-NEXT: v_mul_lo_u32 v12, v2, v5 ; GFX7-NEXT: v_mul_lo_u32 v13, v1, v6 ; GFX7-NEXT: v_mul_lo_u32 v7, v0, v7 ; GFX7-NEXT: v_mul_hi_u32 v2, v2, v4 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, v3, v12 ; GFX7-NEXT: v_mul_hi_u32 v1, v1, v5 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, v3, v13 ; GFX7-NEXT: v_mul_lo_u32 v8, v0, v4 ; GFX7-NEXT: v_mul_hi_u32 v0, v0, v6 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, v3, v7 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, v0, v11 ; GFX7-NEXT: v_mov_b32_e32 v0, v8 ; GFX7-NEXT: v_mov_b32_e32 v1, v9 ; GFX7-NEXT: v_mov_b32_e32 v2, v10 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_mul_i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v9, v1, v4 ; GFX8-NEXT: v_mul_lo_u32 v10, v0, v5 ; GFX8-NEXT: v_mul_hi_u32 v11, v0, v4 ; GFX8-NEXT: v_mul_lo_u32 v12, v1, v5 ; GFX8-NEXT: v_mul_lo_u32 v13, v0, v6 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v10 ; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v11 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v11 ; GFX8-NEXT: v_mul_lo_u32 v11, v2, v4 ; GFX8-NEXT: v_mul_hi_u32 v14, v1, v4 ; GFX8-NEXT: v_mul_hi_u32 v15, v0, v5 ; GFX8-NEXT: v_mul_lo_u32 v3, v3, v4 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v12 ; GFX8-NEXT: v_cndmask_b32_e64 v12, 
0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v13 ; GFX8-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v13 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v14 ; GFX8-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v13 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v15 ; GFX8-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v13 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, v11, v10 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v11, vcc, v12, v11 ; GFX8-NEXT: v_mul_lo_u32 v12, v2, v5 ; GFX8-NEXT: v_mul_lo_u32 v13, v1, v6 ; GFX8-NEXT: v_mul_lo_u32 v7, v0, v7 ; GFX8-NEXT: v_mul_hi_u32 v2, v2, v4 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v12 ; GFX8-NEXT: v_mul_hi_u32 v1, v1, v5 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v13 ; GFX8-NEXT: v_mul_lo_u32 v8, v0, v4 ; GFX8-NEXT: v_mul_hi_u32 v0, v0, v6 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v7 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v0, v11 ; GFX8-NEXT: v_mov_b32_e32 v0, v8 ; GFX8-NEXT: v_mov_b32_e32 v1, v9 ; GFX8-NEXT: v_mov_b32_e32 v2, v10 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_mul_i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v9, v1, v4 ; GFX9-NEXT: v_mul_lo_u32 v10, v0, v5 ; GFX9-NEXT: v_mul_hi_u32 v11, v0, v4 ; GFX9-NEXT: v_mul_lo_u32 v12, v1, v5 ; GFX9-NEXT: v_mul_lo_u32 v13, v0, v6 ; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v9, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v9, v11 ; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v10, v10, v11 ; GFX9-NEXT: v_mul_lo_u32 v11, v2, v4 ; GFX9-NEXT: v_mul_hi_u32 v14, v1, v4 ; GFX9-NEXT: v_mul_hi_u32 v15, v0, v5 ; GFX9-NEXT: v_mul_lo_u32 v3, v3, v4 ; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v11, v12 ; GFX9-NEXT: 
v_cndmask_b32_e64 v12, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v11, v13 ; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v11, v14 ; GFX9-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v11, v15 ; GFX9-NEXT: v_add3_u32 v12, v12, v13, v14 ; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GFX9-NEXT: v_add3_u32 v11, v12, v13, v11 ; GFX9-NEXT: v_mul_lo_u32 v12, v2, v5 ; GFX9-NEXT: v_mul_lo_u32 v13, v1, v6 ; GFX9-NEXT: v_mul_lo_u32 v7, v0, v7 ; GFX9-NEXT: v_mul_hi_u32 v2, v2, v4 ; GFX9-NEXT: v_mul_hi_u32 v1, v1, v5 ; GFX9-NEXT: v_mul_lo_u32 v8, v0, v4 ; GFX9-NEXT: v_mul_hi_u32 v0, v0, v6 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v12 ; GFX9-NEXT: v_add3_u32 v3, v3, v13, v7 ; GFX9-NEXT: v_add3_u32 v1, v3, v2, v1 ; GFX9-NEXT: v_add3_u32 v3, v1, v0, v11 ; GFX9-NEXT: v_mov_b32_e32 v0, v8 ; GFX9-NEXT: v_mov_b32_e32 v1, v9 ; GFX9-NEXT: v_mov_b32_e32 v2, v10 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = mul i128 %num, %den ret i128 %result } define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX7-LABEL: s_mul_i256: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_mov_b32_e32 v0, s8 ; GFX7-NEXT: v_mul_hi_u32 v0, s0, v0 ; GFX7-NEXT: s_mul_i32 s17, s1, s8 ; GFX7-NEXT: s_mul_i32 s18, s0, s9 ; GFX7-NEXT: s_add_u32 s17, s17, s18 ; GFX7-NEXT: s_cselect_b32 s18, 1, 0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, s17, v0 ; GFX7-NEXT: s_and_b32 s18, s18, 1 ; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v1, vcc, s18, v1 ; GFX7-NEXT: s_mul_i32 s17, s2, s8 ; GFX7-NEXT: s_mul_i32 s18, s1, s9 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-NEXT: s_add_u32 s17, s17, s18 ; GFX7-NEXT: s_cselect_b32 s18, 1, 0 ; GFX7-NEXT: v_mul_hi_u32 v2, v2, s8 ; GFX7-NEXT: s_mul_i32 s19, s0, s10 ; GFX7-NEXT: s_and_b32 s18, s18, 1 ; GFX7-NEXT: s_add_u32 s17, s17, s19 ; GFX7-NEXT: v_mov_b32_e32 v3, s9 ; GFX7-NEXT: s_cselect_b32 s19, 1, 0 
; GFX7-NEXT: v_mul_hi_u32 v4, s0, v3 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, s17, v2 ; GFX7-NEXT: s_and_b32 s19, s19, 1 ; GFX7-NEXT: s_add_i32 s18, s18, s19 ; GFX7-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v5, vcc, s18, v5 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX7-NEXT: s_mul_i32 s17, s3, s8 ; GFX7-NEXT: s_mul_i32 s18, s2, s9 ; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX7-NEXT: s_add_u32 s17, s17, s18 ; GFX7-NEXT: s_cselect_b32 s18, 1, 0 ; GFX7-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GFX7-NEXT: s_mul_i32 s19, s1, s10 ; GFX7-NEXT: s_and_b32 s18, s18, 1 ; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX7-NEXT: s_add_u32 s17, s17, s19 ; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; GFX7-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-NEXT: v_mul_hi_u32 v5, v4, s8 ; GFX7-NEXT: s_and_b32 s19, s19, 1 ; GFX7-NEXT: s_mul_i32 s20, s0, s11 ; GFX7-NEXT: s_add_i32 s18, s18, s19 ; GFX7-NEXT: s_add_u32 s17, s17, s20 ; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: v_mul_hi_u32 v3, s1, v3 ; GFX7-NEXT: v_add_i32_e32 v5, vcc, s17, v5 ; GFX7-NEXT: s_and_b32 s19, s19, 1 ; GFX7-NEXT: v_mov_b32_e32 v6, s10 ; GFX7-NEXT: s_add_i32 s18, s18, s19 ; GFX7-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v8, vcc, s18, v8 ; GFX7-NEXT: v_mul_hi_u32 v7, s0, v6 ; GFX7-NEXT: s_mul_i32 s17, s4, s8 ; GFX7-NEXT: s_mul_i32 s18, s3, s9 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GFX7-NEXT: s_add_u32 s17, s17, s18 ; GFX7-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX7-NEXT: s_cselect_b32 s18, 1, 0 ; GFX7-NEXT: v_add_i32_e32 v5, vcc, v8, v5 ; GFX7-NEXT: s_mul_i32 s19, s2, s10 ; GFX7-NEXT: s_and_b32 s18, s18, 1 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, v3, v7 ; GFX7-NEXT: s_add_u32 s17, s17, s19 ; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GFX7-NEXT: s_and_b32 s19, s19, 1 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; 
GFX7-NEXT: s_mul_i32 s20, s1, s11 ; GFX7-NEXT: s_add_i32 s18, s18, s19 ; GFX7-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX7-NEXT: s_add_u32 s17, s17, s20 ; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GFX7-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-NEXT: s_and_b32 s19, s19, 1 ; GFX7-NEXT: v_mul_hi_u32 v7, v5, s8 ; GFX7-NEXT: s_mul_i32 s21, s0, s12 ; GFX7-NEXT: s_add_i32 s18, s18, s19 ; GFX7-NEXT: s_add_u32 s17, s17, s21 ; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: v_add_i32_e32 v7, vcc, s17, v7 ; GFX7-NEXT: s_and_b32 s19, s19, 1 ; GFX7-NEXT: v_mul_hi_u32 v4, v4, s9 ; GFX7-NEXT: s_add_i32 s18, s18, s19 ; GFX7-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v11, vcc, s18, v11 ; GFX7-NEXT: s_mul_i32 s17, s5, s8 ; GFX7-NEXT: s_mul_i32 s18, s4, s9 ; GFX7-NEXT: s_add_u32 s17, s17, s18 ; GFX7-NEXT: v_mul_hi_u32 v8, s1, v6 ; GFX7-NEXT: s_cselect_b32 s18, 1, 0 ; GFX7-NEXT: v_add_i32_e32 v4, vcc, v7, v4 ; GFX7-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX7-NEXT: v_mov_b32_e32 v9, s11 ; GFX7-NEXT: s_mul_i32 s19, s3, s10 ; GFX7-NEXT: s_and_b32 s18, s18, 1 ; GFX7-NEXT: v_add_i32_e32 v7, vcc, v11, v7 ; GFX7-NEXT: s_add_u32 s17, s17, s19 ; GFX7-NEXT: v_mul_hi_u32 v10, s0, v9 ; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; GFX7-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX7-NEXT: s_and_b32 s19, s19, 1 ; GFX7-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GFX7-NEXT: s_mul_i32 s20, s2, s11 ; GFX7-NEXT: s_add_i32 s18, s18, s19 ; GFX7-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; GFX7-NEXT: s_add_u32 s17, s17, s20 ; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GFX7-NEXT: s_and_b32 s19, s19, 1 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GFX7-NEXT: s_mul_i32 s21, s1, s12 ; GFX7-NEXT: s_add_i32 s18, s18, s19 ; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX7-NEXT: s_add_u32 s17, s17, s21 ; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: 
v_add_i32_e32 v4, vcc, v7, v4 ; GFX7-NEXT: v_mov_b32_e32 v7, s4 ; GFX7-NEXT: s_and_b32 s19, s19, 1 ; GFX7-NEXT: v_mul_hi_u32 v8, v7, s8 ; GFX7-NEXT: s_mul_i32 s22, s0, s13 ; GFX7-NEXT: s_add_i32 s18, s18, s19 ; GFX7-NEXT: s_add_u32 s17, s17, s22 ; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: v_add_i32_e32 v8, vcc, s17, v8 ; GFX7-NEXT: s_and_b32 s19, s19, 1 ; GFX7-NEXT: v_mul_hi_u32 v10, v5, s9 ; GFX7-NEXT: s_add_i32 s18, s18, s19 ; GFX7-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v14, vcc, s18, v14 ; GFX7-NEXT: s_mul_i32 s17, s6, s8 ; GFX7-NEXT: s_mul_i32 s18, s5, s9 ; GFX7-NEXT: s_add_u32 s17, s17, s18 ; GFX7-NEXT: s_cselect_b32 s18, 1, 0 ; GFX7-NEXT: v_mul_hi_u32 v6, s2, v6 ; GFX7-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GFX7-NEXT: s_mul_i32 s19, s4, s10 ; GFX7-NEXT: s_and_b32 s18, s18, 1 ; GFX7-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GFX7-NEXT: s_add_u32 s17, s17, s19 ; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: v_add_i32_e32 v10, vcc, v14, v10 ; GFX7-NEXT: v_mul_hi_u32 v11, s1, v9 ; GFX7-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; GFX7-NEXT: s_and_b32 s19, s19, 1 ; GFX7-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX7-NEXT: v_mov_b32_e32 v12, s12 ; GFX7-NEXT: s_mul_i32 s20, s3, s11 ; GFX7-NEXT: s_add_i32 s18, s18, s19 ; GFX7-NEXT: v_add_i32_e32 v8, vcc, v10, v8 ; GFX7-NEXT: s_add_u32 s17, s17, s20 ; GFX7-NEXT: v_mul_hi_u32 v13, s0, v12 ; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: v_add_i32_e32 v6, vcc, v6, v11 ; GFX7-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GFX7-NEXT: s_and_b32 s19, s19, 1 ; GFX7-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GFX7-NEXT: s_mul_i32 s21, s2, s12 ; GFX7-NEXT: s_add_i32 s18, s18, s19 ; GFX7-NEXT: v_add_i32_e32 v6, vcc, v6, v13 ; GFX7-NEXT: s_add_u32 s17, s17, s21 ; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GFX7-NEXT: s_and_b32 s19, s19, 1 ; GFX7-NEXT: v_add_i32_e32 v4, vcc, v6, v4 ; GFX7-NEXT: s_mul_i32 s22, s1, s13 ; GFX7-NEXT: 
s_add_i32 s18, s18, s19 ; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX7-NEXT: s_add_u32 s17, s17, s22 ; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; GFX7-NEXT: v_mov_b32_e32 v8, s5 ; GFX7-NEXT: v_mul_hi_u32 v10, v8, s8 ; GFX7-NEXT: s_and_b32 s19, s19, 1 ; GFX7-NEXT: s_mul_i32 s23, s0, s14 ; GFX7-NEXT: s_add_i32 s18, s18, s19 ; GFX7-NEXT: s_add_u32 s17, s17, s23 ; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: v_mul_hi_u32 v11, v7, s9 ; GFX7-NEXT: v_add_i32_e32 v10, vcc, s17, v10 ; GFX7-NEXT: s_and_b32 s19, s19, 1 ; GFX7-NEXT: s_add_i32 s18, s18, s19 ; GFX7-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v17, vcc, s18, v17 ; GFX7-NEXT: v_mul_hi_u32 v5, v5, s10 ; GFX7-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; GFX7-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GFX7-NEXT: v_mul_hi_u32 v13, s2, v9 ; GFX7-NEXT: v_add_i32_e32 v11, vcc, v17, v11 ; GFX7-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; GFX7-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GFX7-NEXT: v_mul_hi_u32 v14, s1, v12 ; GFX7-NEXT: v_add_i32_e32 v5, vcc, v5, v13 ; GFX7-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GFX7-NEXT: v_mov_b32_e32 v15, s13 ; GFX7-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; GFX7-NEXT: v_mul_hi_u32 v16, s0, v15 ; GFX7-NEXT: v_add_i32_e32 v5, vcc, v5, v14 ; GFX7-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; GFX7-NEXT: v_add_i32_e32 v5, vcc, v5, v16 ; GFX7-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; GFX7-NEXT: v_mov_b32_e32 v13, s14 ; GFX7-NEXT: s_mul_i32 s7, s7, s8 ; GFX7-NEXT: s_mul_i32 s17, s6, s9 ; GFX7-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GFX7-NEXT: s_mul_i32 s16, s0, s8 ; GFX7-NEXT: s_mul_i32 s5, s5, s10 ; GFX7-NEXT: s_mul_i32 s15, s0, s15 ; GFX7-NEXT: v_mul_hi_u32 v13, s0, v13 ; GFX7-NEXT: s_add_i32 s0, s7, s17 ; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX7-NEXT: s_mul_i32 s4, s4, s11 ; GFX7-NEXT: s_add_i32 s0, s0, 
s5 ; GFX7-NEXT: v_add_i32_e32 v6, vcc, v10, v6 ; GFX7-NEXT: v_mov_b32_e32 v10, s6 ; GFX7-NEXT: s_mul_i32 s11, s3, s12 ; GFX7-NEXT: s_add_i32 s0, s0, s4 ; GFX7-NEXT: s_mul_i32 s12, s2, s13 ; GFX7-NEXT: s_add_i32 s0, s0, s11 ; GFX7-NEXT: v_mul_hi_u32 v10, v10, s8 ; GFX7-NEXT: s_mul_i32 s13, s1, s14 ; GFX7-NEXT: s_add_i32 s0, s0, s12 ; GFX7-NEXT: v_mul_hi_u32 v8, v8, s9 ; GFX7-NEXT: s_add_i32 s0, s0, s13 ; GFX7-NEXT: v_mul_hi_u32 v7, v7, s10 ; GFX7-NEXT: v_mul_hi_u32 v9, s3, v9 ; GFX7-NEXT: s_add_i32 s0, s0, s15 ; GFX7-NEXT: v_mul_hi_u32 v11, s2, v12 ; GFX7-NEXT: v_add_i32_e32 v10, vcc, s0, v10 ; GFX7-NEXT: v_mul_hi_u32 v12, s1, v15 ; GFX7-NEXT: v_add_i32_e32 v8, vcc, v10, v8 ; GFX7-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; GFX7-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; GFX7-NEXT: v_add_i32_e32 v7, vcc, v7, v11 ; GFX7-NEXT: v_add_i32_e32 v7, vcc, v7, v12 ; GFX7-NEXT: v_add_i32_e32 v7, vcc, v7, v13 ; GFX7-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; GFX7-NEXT: v_readfirstlane_b32 s1, v0 ; GFX7-NEXT: v_readfirstlane_b32 s2, v1 ; GFX7-NEXT: v_readfirstlane_b32 s3, v2 ; GFX7-NEXT: v_readfirstlane_b32 s4, v3 ; GFX7-NEXT: v_readfirstlane_b32 s5, v4 ; GFX7-NEXT: v_readfirstlane_b32 s6, v5 ; GFX7-NEXT: v_readfirstlane_b32 s7, v6 ; GFX7-NEXT: s_mov_b32 s0, s16 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_mul_i256: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_mov_b32_e32 v0, s8 ; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 ; GFX8-NEXT: s_mul_i32 s17, s1, s8 ; GFX8-NEXT: s_mul_i32 s18, s0, s9 ; GFX8-NEXT: s_add_u32 s17, s17, s18 ; GFX8-NEXT: s_cselect_b32 s18, 1, 0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s17, v0 ; GFX8-NEXT: s_and_b32 s18, s18, 1 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v1, vcc, s18, v1 ; GFX8-NEXT: s_mul_i32 s17, s2, s8 ; GFX8-NEXT: s_mul_i32 s18, s1, s9 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: s_add_u32 s17, s17, s18 ; GFX8-NEXT: s_cselect_b32 s18, 1, 0 ; GFX8-NEXT: v_mul_hi_u32 v2, v2, s8 ; GFX8-NEXT: s_mul_i32 s19, s0, s10 ; GFX8-NEXT: s_and_b32 
s18, s18, 1 ; GFX8-NEXT: s_add_u32 s17, s17, s19 ; GFX8-NEXT: v_mov_b32_e32 v3, s9 ; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: v_mul_hi_u32 v4, s0, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s17, v2 ; GFX8-NEXT: s_and_b32 s19, s19, 1 ; GFX8-NEXT: s_add_i32 s18, s18, s19 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s18, v5 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; GFX8-NEXT: s_mul_i32 s17, s3, s8 ; GFX8-NEXT: s_mul_i32 s18, s2, s9 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX8-NEXT: s_add_u32 s17, s17, s18 ; GFX8-NEXT: s_cselect_b32 s18, 1, 0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 ; GFX8-NEXT: s_mul_i32 s19, s1, s10 ; GFX8-NEXT: s_and_b32 s18, s18, 1 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: s_add_u32 s17, s17, s19 ; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v4, v2 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: v_mul_hi_u32 v5, v4, s8 ; GFX8-NEXT: s_and_b32 s19, s19, 1 ; GFX8-NEXT: s_mul_i32 s20, s0, s11 ; GFX8-NEXT: s_add_i32 s18, s18, s19 ; GFX8-NEXT: s_add_u32 s17, s17, s20 ; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: v_mul_hi_u32 v3, s1, v3 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s17, v5 ; GFX8-NEXT: s_and_b32 s19, s19, 1 ; GFX8-NEXT: v_mov_b32_e32 v6, s10 ; GFX8-NEXT: s_add_i32 s18, s18, s19 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v8, vcc, s18, v8 ; GFX8-NEXT: v_mul_hi_u32 v7, s0, v6 ; GFX8-NEXT: s_mul_i32 s17, s4, s8 ; GFX8-NEXT: s_mul_i32 s18, s3, s9 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3 ; GFX8-NEXT: s_add_u32 s17, s17, s18 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX8-NEXT: s_cselect_b32 s18, 1, 0 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v8, v5 ; GFX8-NEXT: s_mul_i32 s19, s2, s10 ; GFX8-NEXT: s_and_b32 s18, s18, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v7 ; GFX8-NEXT: s_add_u32 s17, s17, s19 ; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; 
GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v7 ; GFX8-NEXT: s_and_b32 s19, s19, 1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: s_mul_i32 s20, s1, s11 ; GFX8-NEXT: s_add_i32 s18, s18, s19 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: s_add_u32 s17, s17, s20 ; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3 ; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_and_b32 s19, s19, 1 ; GFX8-NEXT: v_mul_hi_u32 v7, v5, s8 ; GFX8-NEXT: s_mul_i32 s21, s0, s12 ; GFX8-NEXT: s_add_i32 s18, s18, s19 ; GFX8-NEXT: s_add_u32 s17, s17, s21 ; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, s17, v7 ; GFX8-NEXT: s_and_b32 s19, s19, 1 ; GFX8-NEXT: v_mul_hi_u32 v4, v4, s9 ; GFX8-NEXT: s_add_i32 s18, s18, s19 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v11, vcc, s18, v11 ; GFX8-NEXT: s_mul_i32 s17, s5, s8 ; GFX8-NEXT: s_mul_i32 s18, s4, s9 ; GFX8-NEXT: s_add_u32 s17, s17, s18 ; GFX8-NEXT: v_mul_hi_u32 v8, s1, v6 ; GFX8-NEXT: s_cselect_b32 s18, 1, 0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX8-NEXT: v_mov_b32_e32 v9, s11 ; GFX8-NEXT: s_mul_i32 s19, s3, s10 ; GFX8-NEXT: s_and_b32 s18, s18, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v11, v7 ; GFX8-NEXT: s_add_u32 s17, s17, s19 ; GFX8-NEXT: v_mul_hi_u32 v10, s0, v9 ; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX8-NEXT: s_and_b32 s19, s19, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 ; GFX8-NEXT: s_mul_i32 s20, s2, s11 ; GFX8-NEXT: s_add_i32 s18, s18, s19 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v10 ; GFX8-NEXT: s_add_u32 s17, s17, s20 ; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 ; GFX8-NEXT: s_and_b32 s19, s19, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 ; GFX8-NEXT: s_mul_i32 s21, s1, s12 ; GFX8-NEXT: s_add_i32 s18, s18, s19 ; GFX8-NEXT: 
v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX8-NEXT: s_add_u32 s17, s17, s21 ; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4 ; GFX8-NEXT: v_mov_b32_e32 v7, s4 ; GFX8-NEXT: s_and_b32 s19, s19, 1 ; GFX8-NEXT: v_mul_hi_u32 v8, v7, s8 ; GFX8-NEXT: s_mul_i32 s22, s0, s13 ; GFX8-NEXT: s_add_i32 s18, s18, s19 ; GFX8-NEXT: s_add_u32 s17, s17, s22 ; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, s17, v8 ; GFX8-NEXT: s_and_b32 s19, s19, 1 ; GFX8-NEXT: v_mul_hi_u32 v10, v5, s9 ; GFX8-NEXT: s_add_i32 s18, s18, s19 ; GFX8-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v14, vcc, s18, v14 ; GFX8-NEXT: s_mul_i32 s17, s6, s8 ; GFX8-NEXT: s_mul_i32 s18, s5, s9 ; GFX8-NEXT: s_add_u32 s17, s17, s18 ; GFX8-NEXT: s_cselect_b32 s18, 1, 0 ; GFX8-NEXT: v_mul_hi_u32 v6, s2, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v10 ; GFX8-NEXT: s_mul_i32 s19, s4, s10 ; GFX8-NEXT: s_and_b32 s18, s18, 1 ; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GFX8-NEXT: s_add_u32 s17, s17, s19 ; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, v14, v10 ; GFX8-NEXT: v_mul_hi_u32 v11, s1, v9 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 ; GFX8-NEXT: s_and_b32 s19, s19, 1 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX8-NEXT: v_mov_b32_e32 v12, s12 ; GFX8-NEXT: s_mul_i32 s20, s3, s11 ; GFX8-NEXT: s_add_i32 s18, s18, s19 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v10, v8 ; GFX8-NEXT: s_add_u32 s17, s17, s20 ; GFX8-NEXT: v_mul_hi_u32 v13, s0, v12 ; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v11 ; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GFX8-NEXT: s_and_b32 s19, s19, 1 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v10 ; GFX8-NEXT: s_mul_i32 s21, s2, s12 ; GFX8-NEXT: s_add_i32 s18, s18, s19 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v13 ; GFX8-NEXT: s_add_u32 s17, s17, s21 ; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v10 ; GFX8-NEXT: 
s_and_b32 s19, s19, 1 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v6, v4 ; GFX8-NEXT: s_mul_i32 s22, s1, s13 ; GFX8-NEXT: s_add_i32 s18, s18, s19 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX8-NEXT: s_add_u32 s17, s17, s22 ; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 ; GFX8-NEXT: v_mov_b32_e32 v8, s5 ; GFX8-NEXT: v_mul_hi_u32 v10, v8, s8 ; GFX8-NEXT: s_and_b32 s19, s19, 1 ; GFX8-NEXT: s_mul_i32 s23, s0, s14 ; GFX8-NEXT: s_add_i32 s18, s18, s19 ; GFX8-NEXT: s_add_u32 s17, s17, s23 ; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: v_mul_hi_u32 v11, v7, s9 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, s17, v10 ; GFX8-NEXT: s_and_b32 s19, s19, 1 ; GFX8-NEXT: s_add_i32 s18, s18, s19 ; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s18, v17 ; GFX8-NEXT: v_mul_hi_u32 v5, v5, s10 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v11 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GFX8-NEXT: v_mul_hi_u32 v13, s2, v9 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, v17, v11 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v10, v5 ; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v10, vcc, v11, v10 ; GFX8-NEXT: v_mul_hi_u32 v14, s1, v12 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v13 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GFX8-NEXT: v_mov_b32_e32 v15, s13 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v11 ; GFX8-NEXT: v_mul_hi_u32 v16, s0, v15 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v14 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v11 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v16 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v11 ; GFX8-NEXT: v_mov_b32_e32 v13, s14 ; GFX8-NEXT: s_mul_i32 s7, s7, s8 ; GFX8-NEXT: s_mul_i32 s17, s6, s9 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v6 ; GFX8-NEXT: s_mul_i32 s16, s0, s8 ; GFX8-NEXT: s_mul_i32 s5, s5, s10 ; GFX8-NEXT: s_mul_i32 s15, s0, s15 ; GFX8-NEXT: v_mul_hi_u32 v13, s0, v13 ; GFX8-NEXT: s_add_i32 s0, s7, 
s17 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX8-NEXT: s_mul_i32 s4, s4, s11 ; GFX8-NEXT: s_add_i32 s0, s0, s5 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v10, v6 ; GFX8-NEXT: v_mov_b32_e32 v10, s6 ; GFX8-NEXT: s_mul_i32 s11, s3, s12 ; GFX8-NEXT: s_add_i32 s0, s0, s4 ; GFX8-NEXT: s_mul_i32 s12, s2, s13 ; GFX8-NEXT: s_add_i32 s0, s0, s11 ; GFX8-NEXT: v_mul_hi_u32 v10, v10, s8 ; GFX8-NEXT: s_mul_i32 s13, s1, s14 ; GFX8-NEXT: s_add_i32 s0, s0, s12 ; GFX8-NEXT: v_mul_hi_u32 v8, v8, s9 ; GFX8-NEXT: s_add_i32 s0, s0, s13 ; GFX8-NEXT: v_mul_hi_u32 v7, v7, s10 ; GFX8-NEXT: v_mul_hi_u32 v9, s3, v9 ; GFX8-NEXT: s_add_i32 s0, s0, s15 ; GFX8-NEXT: v_mul_hi_u32 v11, s2, v12 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, s0, v10 ; GFX8-NEXT: v_mul_hi_u32 v12, s1, v15 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v10, v8 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v8, v7 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v11 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v12 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v13 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 ; GFX8-NEXT: v_readfirstlane_b32 s1, v0 ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: v_readfirstlane_b32 s3, v2 ; GFX8-NEXT: v_readfirstlane_b32 s4, v3 ; GFX8-NEXT: v_readfirstlane_b32 s5, v4 ; GFX8-NEXT: v_readfirstlane_b32 s6, v5 ; GFX8-NEXT: v_readfirstlane_b32 s7, v6 ; GFX8-NEXT: s_mov_b32 s0, s16 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_mul_i256: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_mul_i32 s17, s1, s8 ; GFX9-NEXT: s_mul_i32 s18, s0, s9 ; GFX9-NEXT: s_add_u32 s17, s17, s18 ; GFX9-NEXT: s_cselect_b32 s18, 1, 0 ; GFX9-NEXT: s_mul_hi_u32 s19, s0, s8 ; GFX9-NEXT: s_and_b32 s18, s18, 1 ; GFX9-NEXT: s_add_u32 s17, s17, s19 ; GFX9-NEXT: s_cselect_b32 s19, 1, 0 ; GFX9-NEXT: s_and_b32 s19, s19, 1 ; GFX9-NEXT: s_add_i32 s18, s18, s19 ; GFX9-NEXT: s_mul_i32 s19, s2, s8 ; GFX9-NEXT: s_mul_i32 s20, s1, s9 ; GFX9-NEXT: s_add_u32 s19, s19, s20 ; GFX9-NEXT: s_cselect_b32 s20, 1, 0 ; GFX9-NEXT: s_mul_i32 s21, s0, 
s10 ; GFX9-NEXT: s_and_b32 s20, s20, 1 ; GFX9-NEXT: s_add_u32 s19, s19, s21 ; GFX9-NEXT: s_cselect_b32 s21, 1, 0 ; GFX9-NEXT: s_and_b32 s21, s21, 1 ; GFX9-NEXT: s_mul_hi_u32 s22, s1, s8 ; GFX9-NEXT: s_add_i32 s20, s20, s21 ; GFX9-NEXT: s_add_u32 s19, s19, s22 ; GFX9-NEXT: s_cselect_b32 s21, 1, 0 ; GFX9-NEXT: s_and_b32 s21, s21, 1 ; GFX9-NEXT: s_mul_hi_u32 s23, s0, s9 ; GFX9-NEXT: s_add_i32 s20, s20, s21 ; GFX9-NEXT: s_add_u32 s19, s19, s23 ; GFX9-NEXT: s_cselect_b32 s21, 1, 0 ; GFX9-NEXT: s_and_b32 s21, s21, 1 ; GFX9-NEXT: s_add_i32 s20, s20, s21 ; GFX9-NEXT: s_add_u32 s18, s19, s18 ; GFX9-NEXT: s_cselect_b32 s19, 1, 0 ; GFX9-NEXT: s_and_b32 s19, s19, 1 ; GFX9-NEXT: s_add_i32 s20, s20, s19 ; GFX9-NEXT: s_mul_i32 s19, s3, s8 ; GFX9-NEXT: s_mul_i32 s21, s2, s9 ; GFX9-NEXT: s_add_u32 s19, s19, s21 ; GFX9-NEXT: s_cselect_b32 s21, 1, 0 ; GFX9-NEXT: s_mul_i32 s22, s1, s10 ; GFX9-NEXT: s_and_b32 s21, s21, 1 ; GFX9-NEXT: s_add_u32 s19, s19, s22 ; GFX9-NEXT: s_cselect_b32 s22, 1, 0 ; GFX9-NEXT: s_and_b32 s22, s22, 1 ; GFX9-NEXT: s_mul_i32 s23, s0, s11 ; GFX9-NEXT: s_add_i32 s21, s21, s22 ; GFX9-NEXT: s_add_u32 s19, s19, s23 ; GFX9-NEXT: s_cselect_b32 s22, 1, 0 ; GFX9-NEXT: s_and_b32 s22, s22, 1 ; GFX9-NEXT: s_mul_hi_u32 s24, s2, s8 ; GFX9-NEXT: s_add_i32 s21, s21, s22 ; GFX9-NEXT: s_add_u32 s19, s19, s24 ; GFX9-NEXT: s_cselect_b32 s22, 1, 0 ; GFX9-NEXT: s_and_b32 s22, s22, 1 ; GFX9-NEXT: s_mul_hi_u32 s25, s1, s9 ; GFX9-NEXT: s_add_i32 s21, s21, s22 ; GFX9-NEXT: s_add_u32 s19, s19, s25 ; GFX9-NEXT: s_cselect_b32 s22, 1, 0 ; GFX9-NEXT: s_and_b32 s22, s22, 1 ; GFX9-NEXT: s_mul_hi_u32 s26, s0, s10 ; GFX9-NEXT: s_add_i32 s21, s21, s22 ; GFX9-NEXT: s_add_u32 s19, s19, s26 ; GFX9-NEXT: s_cselect_b32 s22, 1, 0 ; GFX9-NEXT: s_and_b32 s22, s22, 1 ; GFX9-NEXT: s_add_i32 s21, s21, s22 ; GFX9-NEXT: s_add_u32 s19, s19, s20 ; GFX9-NEXT: s_cselect_b32 s20, 1, 0 ; GFX9-NEXT: s_and_b32 s20, s20, 1 ; GFX9-NEXT: s_add_i32 s21, s21, s20 ; GFX9-NEXT: s_mul_i32 s20, s4, s8 ; GFX9-NEXT: s_mul_i32 
s22, s3, s9 ; GFX9-NEXT: s_add_u32 s20, s20, s22 ; GFX9-NEXT: s_cselect_b32 s22, 1, 0 ; GFX9-NEXT: s_mul_i32 s23, s2, s10 ; GFX9-NEXT: s_and_b32 s22, s22, 1 ; GFX9-NEXT: s_add_u32 s20, s20, s23 ; GFX9-NEXT: s_cselect_b32 s23, 1, 0 ; GFX9-NEXT: s_and_b32 s23, s23, 1 ; GFX9-NEXT: s_mul_i32 s24, s1, s11 ; GFX9-NEXT: s_add_i32 s22, s22, s23 ; GFX9-NEXT: s_add_u32 s20, s20, s24 ; GFX9-NEXT: s_cselect_b32 s23, 1, 0 ; GFX9-NEXT: s_and_b32 s23, s23, 1 ; GFX9-NEXT: s_mul_i32 s25, s0, s12 ; GFX9-NEXT: s_add_i32 s22, s22, s23 ; GFX9-NEXT: s_add_u32 s20, s20, s25 ; GFX9-NEXT: s_cselect_b32 s23, 1, 0 ; GFX9-NEXT: s_and_b32 s23, s23, 1 ; GFX9-NEXT: s_mul_hi_u32 s26, s3, s8 ; GFX9-NEXT: s_add_i32 s22, s22, s23 ; GFX9-NEXT: s_add_u32 s20, s20, s26 ; GFX9-NEXT: s_cselect_b32 s23, 1, 0 ; GFX9-NEXT: s_and_b32 s23, s23, 1 ; GFX9-NEXT: s_mul_hi_u32 s27, s2, s9 ; GFX9-NEXT: s_add_i32 s22, s22, s23 ; GFX9-NEXT: s_add_u32 s20, s20, s27 ; GFX9-NEXT: s_cselect_b32 s23, 1, 0 ; GFX9-NEXT: s_and_b32 s23, s23, 1 ; GFX9-NEXT: s_mul_hi_u32 s28, s1, s10 ; GFX9-NEXT: s_add_i32 s22, s22, s23 ; GFX9-NEXT: s_add_u32 s20, s20, s28 ; GFX9-NEXT: s_cselect_b32 s23, 1, 0 ; GFX9-NEXT: s_and_b32 s23, s23, 1 ; GFX9-NEXT: s_mul_hi_u32 s29, s0, s11 ; GFX9-NEXT: s_add_i32 s22, s22, s23 ; GFX9-NEXT: s_add_u32 s20, s20, s29 ; GFX9-NEXT: s_cselect_b32 s23, 1, 0 ; GFX9-NEXT: s_and_b32 s23, s23, 1 ; GFX9-NEXT: s_add_i32 s22, s22, s23 ; GFX9-NEXT: s_add_u32 s20, s20, s21 ; GFX9-NEXT: s_cselect_b32 s21, 1, 0 ; GFX9-NEXT: s_and_b32 s21, s21, 1 ; GFX9-NEXT: s_add_i32 s22, s22, s21 ; GFX9-NEXT: s_mul_i32 s21, s5, s8 ; GFX9-NEXT: s_mul_i32 s23, s4, s9 ; GFX9-NEXT: s_add_u32 s21, s21, s23 ; GFX9-NEXT: s_cselect_b32 s23, 1, 0 ; GFX9-NEXT: s_mul_i32 s24, s3, s10 ; GFX9-NEXT: s_and_b32 s23, s23, 1 ; GFX9-NEXT: s_add_u32 s21, s21, s24 ; GFX9-NEXT: s_cselect_b32 s24, 1, 0 ; GFX9-NEXT: s_and_b32 s24, s24, 1 ; GFX9-NEXT: s_mul_i32 s25, s2, s11 ; GFX9-NEXT: s_add_i32 s23, s23, s24 ; GFX9-NEXT: s_add_u32 s21, s21, s25 ; GFX9-NEXT: 
s_cselect_b32 s24, 1, 0 ; GFX9-NEXT: s_and_b32 s24, s24, 1 ; GFX9-NEXT: s_mul_i32 s26, s1, s12 ; GFX9-NEXT: s_add_i32 s23, s23, s24 ; GFX9-NEXT: s_add_u32 s21, s21, s26 ; GFX9-NEXT: s_cselect_b32 s24, 1, 0 ; GFX9-NEXT: s_and_b32 s24, s24, 1 ; GFX9-NEXT: s_mul_i32 s27, s0, s13 ; GFX9-NEXT: s_add_i32 s23, s23, s24 ; GFX9-NEXT: s_add_u32 s21, s21, s27 ; GFX9-NEXT: s_cselect_b32 s24, 1, 0 ; GFX9-NEXT: s_and_b32 s24, s24, 1 ; GFX9-NEXT: s_mul_hi_u32 s28, s4, s8 ; GFX9-NEXT: s_add_i32 s23, s23, s24 ; GFX9-NEXT: s_add_u32 s21, s21, s28 ; GFX9-NEXT: s_cselect_b32 s24, 1, 0 ; GFX9-NEXT: s_and_b32 s24, s24, 1 ; GFX9-NEXT: s_mul_hi_u32 s29, s3, s9 ; GFX9-NEXT: s_add_i32 s23, s23, s24 ; GFX9-NEXT: s_add_u32 s21, s21, s29 ; GFX9-NEXT: s_cselect_b32 s24, 1, 0 ; GFX9-NEXT: s_and_b32 s24, s24, 1 ; GFX9-NEXT: s_mul_hi_u32 s30, s2, s10 ; GFX9-NEXT: s_add_i32 s23, s23, s24 ; GFX9-NEXT: s_add_u32 s21, s21, s30 ; GFX9-NEXT: s_cselect_b32 s24, 1, 0 ; GFX9-NEXT: s_and_b32 s24, s24, 1 ; GFX9-NEXT: s_mul_hi_u32 s31, s1, s11 ; GFX9-NEXT: s_add_i32 s23, s23, s24 ; GFX9-NEXT: s_add_u32 s21, s21, s31 ; GFX9-NEXT: s_cselect_b32 s24, 1, 0 ; GFX9-NEXT: s_and_b32 s24, s24, 1 ; GFX9-NEXT: s_mul_hi_u32 s33, s0, s12 ; GFX9-NEXT: s_add_i32 s23, s23, s24 ; GFX9-NEXT: s_add_u32 s21, s21, s33 ; GFX9-NEXT: s_cselect_b32 s24, 1, 0 ; GFX9-NEXT: s_and_b32 s24, s24, 1 ; GFX9-NEXT: s_add_i32 s23, s23, s24 ; GFX9-NEXT: s_add_u32 s21, s21, s22 ; GFX9-NEXT: s_cselect_b32 s22, 1, 0 ; GFX9-NEXT: s_and_b32 s22, s22, 1 ; GFX9-NEXT: s_add_i32 s23, s23, s22 ; GFX9-NEXT: s_mul_i32 s22, s6, s8 ; GFX9-NEXT: s_mul_i32 s24, s5, s9 ; GFX9-NEXT: s_add_u32 s22, s22, s24 ; GFX9-NEXT: s_cselect_b32 s24, 1, 0 ; GFX9-NEXT: s_mul_i32 s25, s4, s10 ; GFX9-NEXT: s_and_b32 s24, s24, 1 ; GFX9-NEXT: s_add_u32 s22, s22, s25 ; GFX9-NEXT: s_cselect_b32 s25, 1, 0 ; GFX9-NEXT: s_and_b32 s25, s25, 1 ; GFX9-NEXT: s_mul_i32 s26, s3, s11 ; GFX9-NEXT: s_add_i32 s24, s24, s25 ; GFX9-NEXT: s_add_u32 s22, s22, s26 ; GFX9-NEXT: s_cselect_b32 s25, 1, 0 
; GFX9-NEXT: s_and_b32 s25, s25, 1 ; GFX9-NEXT: s_mul_i32 s27, s2, s12 ; GFX9-NEXT: s_add_i32 s24, s24, s25 ; GFX9-NEXT: s_add_u32 s22, s22, s27 ; GFX9-NEXT: s_cselect_b32 s25, 1, 0 ; GFX9-NEXT: s_and_b32 s25, s25, 1 ; GFX9-NEXT: s_mul_i32 s28, s1, s13 ; GFX9-NEXT: s_add_i32 s24, s24, s25 ; GFX9-NEXT: s_add_u32 s22, s22, s28 ; GFX9-NEXT: s_cselect_b32 s25, 1, 0 ; GFX9-NEXT: s_and_b32 s25, s25, 1 ; GFX9-NEXT: s_mul_i32 s29, s0, s14 ; GFX9-NEXT: s_add_i32 s24, s24, s25 ; GFX9-NEXT: s_add_u32 s22, s22, s29 ; GFX9-NEXT: s_cselect_b32 s25, 1, 0 ; GFX9-NEXT: s_and_b32 s25, s25, 1 ; GFX9-NEXT: s_mul_hi_u32 s30, s5, s8 ; GFX9-NEXT: s_add_i32 s24, s24, s25 ; GFX9-NEXT: s_add_u32 s22, s22, s30 ; GFX9-NEXT: s_cselect_b32 s25, 1, 0 ; GFX9-NEXT: s_and_b32 s25, s25, 1 ; GFX9-NEXT: s_mul_hi_u32 s31, s4, s9 ; GFX9-NEXT: s_add_i32 s24, s24, s25 ; GFX9-NEXT: s_add_u32 s22, s22, s31 ; GFX9-NEXT: s_cselect_b32 s25, 1, 0 ; GFX9-NEXT: s_and_b32 s25, s25, 1 ; GFX9-NEXT: s_mul_hi_u32 s33, s3, s10 ; GFX9-NEXT: s_add_i32 s24, s24, s25 ; GFX9-NEXT: s_add_u32 s22, s22, s33 ; GFX9-NEXT: s_cselect_b32 s25, 1, 0 ; GFX9-NEXT: s_and_b32 s25, s25, 1 ; GFX9-NEXT: s_mul_hi_u32 s34, s2, s11 ; GFX9-NEXT: s_add_i32 s24, s24, s25 ; GFX9-NEXT: s_add_u32 s22, s22, s34 ; GFX9-NEXT: s_cselect_b32 s25, 1, 0 ; GFX9-NEXT: s_and_b32 s25, s25, 1 ; GFX9-NEXT: s_mul_hi_u32 s35, s1, s12 ; GFX9-NEXT: s_add_i32 s24, s24, s25 ; GFX9-NEXT: s_add_u32 s22, s22, s35 ; GFX9-NEXT: s_cselect_b32 s25, 1, 0 ; GFX9-NEXT: s_and_b32 s25, s25, 1 ; GFX9-NEXT: s_mul_hi_u32 s36, s0, s13 ; GFX9-NEXT: s_add_i32 s24, s24, s25 ; GFX9-NEXT: s_add_u32 s22, s22, s36 ; GFX9-NEXT: s_cselect_b32 s25, 1, 0 ; GFX9-NEXT: s_and_b32 s25, s25, 1 ; GFX9-NEXT: s_add_i32 s24, s24, s25 ; GFX9-NEXT: s_add_u32 s22, s22, s23 ; GFX9-NEXT: s_cselect_b32 s23, 1, 0 ; GFX9-NEXT: s_and_b32 s23, s23, 1 ; GFX9-NEXT: s_add_i32 s24, s24, s23 ; GFX9-NEXT: s_mul_i32 s23, s6, s9 ; GFX9-NEXT: s_mul_i32 s7, s7, s8 ; GFX9-NEXT: s_mul_i32 s25, s5, s10 ; GFX9-NEXT: s_add_i32 
s7, s7, s23 ; GFX9-NEXT: s_mul_i32 s26, s4, s11 ; GFX9-NEXT: s_add_i32 s7, s7, s25 ; GFX9-NEXT: s_mul_i32 s27, s3, s12 ; GFX9-NEXT: s_add_i32 s7, s7, s26 ; GFX9-NEXT: s_mul_i32 s28, s2, s13 ; GFX9-NEXT: s_add_i32 s7, s7, s27 ; GFX9-NEXT: s_mul_i32 s29, s1, s14 ; GFX9-NEXT: s_add_i32 s7, s7, s28 ; GFX9-NEXT: s_mul_i32 s15, s0, s15 ; GFX9-NEXT: s_add_i32 s7, s7, s29 ; GFX9-NEXT: s_mul_hi_u32 s6, s6, s8 ; GFX9-NEXT: s_add_i32 s7, s7, s15 ; GFX9-NEXT: s_mul_hi_u32 s5, s5, s9 ; GFX9-NEXT: s_add_i32 s6, s7, s6 ; GFX9-NEXT: s_add_i32 s5, s6, s5 ; GFX9-NEXT: s_mul_hi_u32 s4, s4, s10 ; GFX9-NEXT: s_add_i32 s4, s5, s4 ; GFX9-NEXT: s_mul_hi_u32 s3, s3, s11 ; GFX9-NEXT: s_add_i32 s3, s4, s3 ; GFX9-NEXT: s_mul_hi_u32 s2, s2, s12 ; GFX9-NEXT: s_add_i32 s2, s3, s2 ; GFX9-NEXT: s_mul_hi_u32 s1, s1, s13 ; GFX9-NEXT: s_mul_i32 s16, s0, s8 ; GFX9-NEXT: s_add_i32 s1, s2, s1 ; GFX9-NEXT: s_mul_hi_u32 s0, s0, s14 ; GFX9-NEXT: s_add_i32 s0, s1, s0 ; GFX9-NEXT: s_add_i32 s7, s0, s24 ; GFX9-NEXT: s_mov_b32 s0, s16 ; GFX9-NEXT: s_mov_b32 s1, s17 ; GFX9-NEXT: s_mov_b32 s2, s18 ; GFX9-NEXT: s_mov_b32 s3, s19 ; GFX9-NEXT: s_mov_b32 s4, s20 ; GFX9-NEXT: s_mov_b32 s5, s21 ; GFX9-NEXT: s_mov_b32 s6, s22 ; GFX9-NEXT: ; return to shader part epilog %result = mul i256 %num, %den %cast = bitcast i256 %result to <8 x i32> ret <8 x i32> %cast } define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX7-LABEL: v_mul_i256: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mul_lo_u32 v16, v1, v8 ; GFX7-NEXT: v_mul_lo_u32 v17, v0, v9 ; GFX7-NEXT: v_mul_hi_u32 v18, v0, v8 ; GFX7-NEXT: v_mul_lo_u32 v19, v2, v8 ; GFX7-NEXT: v_mul_lo_u32 v20, v1, v9 ; GFX7-NEXT: v_add_i32_e32 v16, vcc, v16, v17 ; GFX7-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v16, vcc, v16, v18 ; GFX7-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v17, vcc, v17, v18 ; GFX7-NEXT: v_mul_lo_u32 v18, v0, v10 ; GFX7-NEXT: v_add_i32_e32 v19, vcc, v19, v20 ; GFX7-NEXT: 
v_mul_hi_u32 v21, v1, v8 ; GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v18, vcc, v19, v18 ; GFX7-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v19, vcc, v20, v19 ; GFX7-NEXT: v_add_i32_e32 v18, vcc, v18, v21 ; GFX7-NEXT: v_mul_hi_u32 v21, v0, v9 ; GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v19, vcc, v19, v20 ; GFX7-NEXT: v_mul_lo_u32 v22, v0, v11 ; GFX7-NEXT: v_add_i32_e32 v18, vcc, v18, v21 ; GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v19, vcc, v19, v20 ; GFX7-NEXT: v_add_i32_e32 v17, vcc, v18, v17 ; GFX7-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; GFX7-NEXT: v_mul_lo_u32 v20, v3, v8 ; GFX7-NEXT: v_mul_lo_u32 v21, v2, v9 ; GFX7-NEXT: v_add_i32_e32 v18, vcc, v19, v18 ; GFX7-NEXT: v_mul_lo_u32 v19, v1, v10 ; GFX7-NEXT: v_mul_lo_u32 v23, v1, v11 ; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v21 ; GFX7-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v19, vcc, v20, v19 ; GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v20, vcc, v21, v20 ; GFX7-NEXT: v_add_i32_e32 v19, vcc, v19, v22 ; GFX7-NEXT: v_mul_hi_u32 v22, v2, v8 ; GFX7-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v21 ; GFX7-NEXT: v_mul_lo_u32 v7, v7, v8 ; GFX7-NEXT: v_add_i32_e32 v19, vcc, v19, v22 ; GFX7-NEXT: v_mul_hi_u32 v22, v1, v9 ; GFX7-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v21 ; GFX7-NEXT: v_mul_lo_u32 v15, v0, v15 ; GFX7-NEXT: v_add_i32_e32 v19, vcc, v19, v22 ; GFX7-NEXT: v_mul_hi_u32 v22, v0, v10 ; GFX7-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v21 ; GFX7-NEXT: v_add_i32_e32 v19, vcc, v19, v22 ; GFX7-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v21 ; GFX7-NEXT: v_add_i32_e32 v18, vcc, v19, v18 ; GFX7-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc ; GFX7-NEXT: v_mul_lo_u32 v21, v4, v8 ; GFX7-NEXT: v_mul_lo_u32 v22, v3, v9 ; 
GFX7-NEXT: v_add_i32_e32 v19, vcc, v20, v19 ; GFX7-NEXT: v_mul_lo_u32 v20, v2, v10 ; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v22 ; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v20, vcc, v21, v20 ; GFX7-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v21, vcc, v22, v21 ; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v23 ; GFX7-NEXT: v_mul_lo_u32 v23, v0, v12 ; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v22 ; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v23 ; GFX7-NEXT: v_mul_hi_u32 v23, v3, v8 ; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v22 ; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v23 ; GFX7-NEXT: v_mul_hi_u32 v23, v2, v9 ; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v22 ; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v23 ; GFX7-NEXT: v_mul_hi_u32 v23, v1, v10 ; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v22 ; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v23 ; GFX7-NEXT: v_mul_hi_u32 v23, v0, v11 ; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v22 ; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v23 ; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v22 ; GFX7-NEXT: v_add_i32_e32 v19, vcc, v20, v19 ; GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc ; GFX7-NEXT: v_mul_lo_u32 v22, v5, v8 ; GFX7-NEXT: v_mul_lo_u32 v23, v4, v9 ; GFX7-NEXT: v_add_i32_e32 v20, vcc, v21, v20 ; GFX7-NEXT: v_mul_lo_u32 v21, v3, v10 ; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23 ; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v21, vcc, v22, v21 ; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v22, vcc, v23, v22 ; GFX7-NEXT: v_mul_lo_u32 v23, v2, v11 ; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v23 ; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23 ; 
GFX7-NEXT: v_mul_lo_u32 v23, v1, v12 ; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v23 ; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23 ; GFX7-NEXT: v_mul_lo_u32 v23, v0, v13 ; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v23 ; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23 ; GFX7-NEXT: v_mul_hi_u32 v23, v4, v8 ; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v23 ; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23 ; GFX7-NEXT: v_mul_hi_u32 v23, v3, v9 ; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v23 ; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23 ; GFX7-NEXT: v_mul_hi_u32 v23, v2, v10 ; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v23 ; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23 ; GFX7-NEXT: v_mul_hi_u32 v23, v1, v11 ; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v23 ; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23 ; GFX7-NEXT: v_mul_hi_u32 v23, v0, v12 ; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v23 ; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23 ; GFX7-NEXT: v_add_i32_e32 v20, vcc, v21, v20 ; GFX7-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v21, vcc, v22, v21 ; GFX7-NEXT: v_mul_lo_u32 v22, v6, v8 ; GFX7-NEXT: v_mul_lo_u32 v23, v5, v9 ; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23 ; GFX7-NEXT: v_mul_lo_u32 v23, v4, v10 ; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23 ; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v23, vcc, v24, v23 ; GFX7-NEXT: v_mul_lo_u32 v24, v3, v11 ; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v24 ; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v24 ; GFX7-NEXT: v_mul_lo_u32 v24, v2, v12 ; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v24 ; GFX7-NEXT: 
v_cndmask_b32_e64 v24, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v24 ; GFX7-NEXT: v_mul_lo_u32 v24, v1, v13 ; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v24 ; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v24 ; GFX7-NEXT: v_mul_lo_u32 v24, v0, v14 ; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v24 ; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v24 ; GFX7-NEXT: v_mul_hi_u32 v24, v5, v8 ; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v24 ; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v24 ; GFX7-NEXT: v_mul_hi_u32 v24, v4, v9 ; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v24 ; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v24 ; GFX7-NEXT: v_mul_hi_u32 v24, v3, v10 ; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v24 ; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v24 ; GFX7-NEXT: v_mul_hi_u32 v24, v2, v11 ; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v24 ; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v24 ; GFX7-NEXT: v_mul_hi_u32 v24, v1, v12 ; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v24 ; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v24 ; GFX7-NEXT: v_mul_hi_u32 v24, v0, v13 ; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v24 ; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v24 ; GFX7-NEXT: v_add_i32_e32 v21, vcc, v22, v21 ; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v22 ; GFX7-NEXT: v_mul_lo_u32 v22, v0, v8 ; GFX7-NEXT: v_mul_hi_u32 v8, v6, v8 ; GFX7-NEXT: v_mul_lo_u32 v6, v6, v9 ; GFX7-NEXT: v_mul_hi_u32 v9, v5, v9 ; GFX7-NEXT: v_mul_lo_u32 v5, v5, v10 ; GFX7-NEXT: v_mul_hi_u32 v10, v4, v10 ; GFX7-NEXT: v_mul_lo_u32 v4, v4, v11 ; GFX7-NEXT: v_mul_hi_u32 v11, v3, v11 ; GFX7-NEXT: v_mul_lo_u32 v3, v3, v12 ; GFX7-NEXT: v_mul_hi_u32 
v12, v2, v12 ; GFX7-NEXT: v_mul_lo_u32 v2, v2, v13 ; GFX7-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; GFX7-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; GFX7-NEXT: v_mul_hi_u32 v13, v1, v13 ; GFX7-NEXT: v_mul_lo_u32 v1, v1, v14 ; GFX7-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v15 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v8 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v9 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v10 ; GFX7-NEXT: v_mul_hi_u32 v0, v0, v14 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v12 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v13 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX7-NEXT: v_add_i32_e32 v7, vcc, v0, v23 ; GFX7-NEXT: v_mov_b32_e32 v0, v22 ; GFX7-NEXT: v_mov_b32_e32 v1, v16 ; GFX7-NEXT: v_mov_b32_e32 v2, v17 ; GFX7-NEXT: v_mov_b32_e32 v3, v18 ; GFX7-NEXT: v_mov_b32_e32 v4, v19 ; GFX7-NEXT: v_mov_b32_e32 v5, v20 ; GFX7-NEXT: v_mov_b32_e32 v6, v21 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_mul_i256: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v16, v1, v8 ; GFX8-NEXT: v_mul_lo_u32 v17, v0, v9 ; GFX8-NEXT: v_mul_hi_u32 v18, v0, v8 ; GFX8-NEXT: v_mul_lo_u32 v19, v2, v8 ; GFX8-NEXT: v_mul_lo_u32 v20, v1, v9 ; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v17 ; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v18 ; GFX8-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v18 ; GFX8-NEXT: v_mul_lo_u32 v18, v0, v10 ; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v20 ; GFX8-NEXT: v_mul_hi_u32 v21, v1, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v18, vcc, v19, v18 ; GFX8-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v19, vcc, v20, v19 ; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v21 ; GFX8-NEXT: v_mul_hi_u32 v21, 
v0, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v20 ; GFX8-NEXT: v_mul_lo_u32 v22, v0, v11 ; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v21 ; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v20 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, v18, v17 ; GFX8-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; GFX8-NEXT: v_mul_lo_u32 v20, v3, v8 ; GFX8-NEXT: v_mul_lo_u32 v21, v2, v9 ; GFX8-NEXT: v_add_u32_e32 v18, vcc, v19, v18 ; GFX8-NEXT: v_mul_lo_u32 v19, v1, v10 ; GFX8-NEXT: v_mul_lo_u32 v23, v1, v11 ; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v21 ; GFX8-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v19, vcc, v20, v19 ; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v20, vcc, v21, v20 ; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v22 ; GFX8-NEXT: v_mul_hi_u32 v22, v2, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v21 ; GFX8-NEXT: v_mul_lo_u32 v7, v7, v8 ; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v22 ; GFX8-NEXT: v_mul_hi_u32 v22, v1, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v21 ; GFX8-NEXT: v_mul_lo_u32 v15, v0, v15 ; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v22 ; GFX8-NEXT: v_mul_hi_u32 v22, v0, v10 ; GFX8-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v21 ; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v22 ; GFX8-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v21 ; GFX8-NEXT: v_add_u32_e32 v18, vcc, v19, v18 ; GFX8-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc ; GFX8-NEXT: v_mul_lo_u32 v21, v4, v8 ; GFX8-NEXT: v_mul_lo_u32 v22, v3, v9 ; GFX8-NEXT: v_add_u32_e32 v19, vcc, v20, v19 ; GFX8-NEXT: v_mul_lo_u32 v20, v2, v10 ; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v22 ; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v20, vcc, v21, v20 ; GFX8-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc ; GFX8-NEXT: 
v_add_u32_e32 v21, vcc, v22, v21 ; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v23 ; GFX8-NEXT: v_mul_lo_u32 v23, v0, v12 ; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v22 ; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v23 ; GFX8-NEXT: v_mul_hi_u32 v23, v3, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v22 ; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v23 ; GFX8-NEXT: v_mul_hi_u32 v23, v2, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v22 ; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v23 ; GFX8-NEXT: v_mul_hi_u32 v23, v1, v10 ; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v22 ; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v23 ; GFX8-NEXT: v_mul_hi_u32 v23, v0, v11 ; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v22 ; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v23 ; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v22 ; GFX8-NEXT: v_add_u32_e32 v19, vcc, v20, v19 ; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc ; GFX8-NEXT: v_mul_lo_u32 v22, v5, v8 ; GFX8-NEXT: v_mul_lo_u32 v23, v4, v9 ; GFX8-NEXT: v_add_u32_e32 v20, vcc, v21, v20 ; GFX8-NEXT: v_mul_lo_u32 v21, v3, v10 ; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23 ; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v21, vcc, v22, v21 ; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v22, vcc, v23, v22 ; GFX8-NEXT: v_mul_lo_u32 v23, v2, v11 ; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v23 ; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23 ; GFX8-NEXT: v_mul_lo_u32 v23, v1, v12 ; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v23 ; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23 ; GFX8-NEXT: v_mul_lo_u32 v23, v0, v13 ; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v23 ; GFX8-NEXT: 
v_cndmask_b32_e64 v23, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23 ; GFX8-NEXT: v_mul_hi_u32 v23, v4, v8 ; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v23 ; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23 ; GFX8-NEXT: v_mul_hi_u32 v23, v3, v9 ; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v23 ; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23 ; GFX8-NEXT: v_mul_hi_u32 v23, v2, v10 ; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v23 ; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23 ; GFX8-NEXT: v_mul_hi_u32 v23, v1, v11 ; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v23 ; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23 ; GFX8-NEXT: v_mul_hi_u32 v23, v0, v12 ; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v23 ; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23 ; GFX8-NEXT: v_add_u32_e32 v20, vcc, v21, v20 ; GFX8-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v21, vcc, v22, v21 ; GFX8-NEXT: v_mul_lo_u32 v22, v6, v8 ; GFX8-NEXT: v_mul_lo_u32 v23, v5, v9 ; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23 ; GFX8-NEXT: v_mul_lo_u32 v23, v4, v10 ; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23 ; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v23, vcc, v24, v23 ; GFX8-NEXT: v_mul_lo_u32 v24, v3, v11 ; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v24 ; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v24 ; GFX8-NEXT: v_mul_lo_u32 v24, v2, v12 ; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v24 ; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v24 ; GFX8-NEXT: v_mul_lo_u32 v24, v1, v13 ; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v24 ; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v24 ; GFX8-NEXT: 
v_mul_lo_u32 v24, v0, v14 ; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v24 ; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v24 ; GFX8-NEXT: v_mul_hi_u32 v24, v5, v8 ; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v24 ; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v24 ; GFX8-NEXT: v_mul_hi_u32 v24, v4, v9 ; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v24 ; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v24 ; GFX8-NEXT: v_mul_hi_u32 v24, v3, v10 ; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v24 ; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v24 ; GFX8-NEXT: v_mul_hi_u32 v24, v2, v11 ; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v24 ; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v24 ; GFX8-NEXT: v_mul_hi_u32 v24, v1, v12 ; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v24 ; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v24 ; GFX8-NEXT: v_mul_hi_u32 v24, v0, v13 ; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v24 ; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v24 ; GFX8-NEXT: v_add_u32_e32 v21, vcc, v22, v21 ; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v22 ; GFX8-NEXT: v_mul_lo_u32 v22, v0, v8 ; GFX8-NEXT: v_mul_hi_u32 v8, v6, v8 ; GFX8-NEXT: v_mul_lo_u32 v6, v6, v9 ; GFX8-NEXT: v_mul_hi_u32 v9, v5, v9 ; GFX8-NEXT: v_mul_lo_u32 v5, v5, v10 ; GFX8-NEXT: v_mul_hi_u32 v10, v4, v10 ; GFX8-NEXT: v_mul_lo_u32 v4, v4, v11 ; GFX8-NEXT: v_mul_hi_u32 v11, v3, v11 ; GFX8-NEXT: v_mul_lo_u32 v3, v3, v12 ; GFX8-NEXT: v_mul_hi_u32 v12, v2, v12 ; GFX8-NEXT: v_mul_lo_u32 v2, v2, v13 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5 ; GFX8-NEXT: v_mul_hi_u32 v13, v1, v13 ; GFX8-NEXT: v_mul_lo_u32 v1, v1, v14 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 ; GFX8-NEXT: 
v_add_u32_e32 v3, vcc, v4, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v15 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v8 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v9 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v10 ; GFX8-NEXT: v_mul_hi_u32 v0, v0, v14 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v11 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v12 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v13 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v0, v23 ; GFX8-NEXT: v_mov_b32_e32 v0, v22 ; GFX8-NEXT: v_mov_b32_e32 v1, v16 ; GFX8-NEXT: v_mov_b32_e32 v2, v17 ; GFX8-NEXT: v_mov_b32_e32 v3, v18 ; GFX8-NEXT: v_mov_b32_e32 v4, v19 ; GFX8-NEXT: v_mov_b32_e32 v5, v20 ; GFX8-NEXT: v_mov_b32_e32 v6, v21 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_mul_i256: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v16, v2, v8 ; GFX9-NEXT: v_mul_lo_u32 v17, v1, v9 ; GFX9-NEXT: v_mul_lo_u32 v18, v0, v10 ; GFX9-NEXT: v_mul_hi_u32 v19, v1, v8 ; GFX9-NEXT: v_mul_lo_u32 v20, v1, v8 ; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, v16, v17 ; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, v16, v18 ; GFX9-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v19, vcc, v16, v19 ; GFX9-NEXT: v_mul_lo_u32 v21, v0, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; GFX9-NEXT: v_add3_u32 v18, v17, v18, v16 ; GFX9-NEXT: v_mul_hi_u32 v16, v0, v8 ; GFX9-NEXT: v_add_co_u32_e32 v17, vcc, v20, v21 ; GFX9-NEXT: v_mul_hi_u32 v21, v0, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, v17, v16 ; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v19, vcc, v19, v21 ; GFX9-NEXT: v_add_u32_e32 v17, v20, v17 ; GFX9-NEXT: v_mul_lo_u32 v21, v3, v8 ; GFX9-NEXT: v_mul_lo_u32 v22, v2, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v17, vcc, 
v19, v17 ; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc ; GFX9-NEXT: v_add3_u32 v18, v18, v20, v19 ; GFX9-NEXT: v_mul_lo_u32 v19, v1, v10 ; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, v21, v22 ; GFX9-NEXT: v_mul_lo_u32 v22, v0, v11 ; GFX9-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v19, vcc, v20, v19 ; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v19, vcc, v19, v22 ; GFX9-NEXT: v_mul_hi_u32 v23, v2, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc ; GFX9-NEXT: v_add3_u32 v20, v21, v20, v22 ; GFX9-NEXT: v_mul_hi_u32 v21, v1, v9 ; GFX9-NEXT: v_add_co_u32_e32 v19, vcc, v19, v23 ; GFX9-NEXT: v_mul_hi_u32 v23, v0, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v19, vcc, v19, v21 ; GFX9-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc ; GFX9-NEXT: v_add3_u32 v20, v20, v22, v21 ; GFX9-NEXT: v_add_co_u32_e32 v19, vcc, v19, v23 ; GFX9-NEXT: v_mul_lo_u32 v22, v4, v8 ; GFX9-NEXT: v_mul_lo_u32 v23, v3, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, v19, v18 ; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc ; GFX9-NEXT: v_add3_u32 v19, v20, v21, v19 ; GFX9-NEXT: v_mul_lo_u32 v20, v2, v10 ; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v22, v23 ; GFX9-NEXT: v_mul_lo_u32 v23, v1, v11 ; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, v21, v20 ; GFX9-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, v20, v23 ; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc ; GFX9-NEXT: v_add3_u32 v21, v22, v21, v23 ; GFX9-NEXT: v_mul_lo_u32 v22, v0, v12 ; GFX9-NEXT: v_mul_hi_u32 v23, v3, v8 ; GFX9-NEXT: v_mul_lo_u32 v7, v7, v8 ; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, v20, v22 ; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, v20, v23 ; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc ; GFX9-NEXT: v_add3_u32 v21, v21, v22, v23 ; GFX9-NEXT: v_mul_hi_u32 v22, v2, v9 ; GFX9-NEXT: v_mul_hi_u32 v23, v1, v10 ; 
GFX9-NEXT: v_add_co_u32_e32 v20, vcc, v20, v22 ; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, v20, v23 ; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc ; GFX9-NEXT: v_add3_u32 v21, v21, v22, v23 ; GFX9-NEXT: v_mul_hi_u32 v22, v0, v11 ; GFX9-NEXT: v_mul_lo_u32 v23, v3, v10 ; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, v20, v22 ; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v19, vcc, v20, v19 ; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc ; GFX9-NEXT: v_add3_u32 v20, v21, v22, v20 ; GFX9-NEXT: v_mul_lo_u32 v21, v5, v8 ; GFX9-NEXT: v_mul_lo_u32 v22, v4, v9 ; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v21, v22 ; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v21, v23 ; GFX9-NEXT: v_mul_lo_u32 v23, v2, v11 ; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v21, v23 ; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc ; GFX9-NEXT: v_add3_u32 v22, v22, v24, v23 ; GFX9-NEXT: v_mul_lo_u32 v23, v1, v12 ; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v21, v23 ; GFX9-NEXT: v_mul_lo_u32 v23, v0, v13 ; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v21, v23 ; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc ; GFX9-NEXT: v_add3_u32 v22, v22, v24, v23 ; GFX9-NEXT: v_mul_hi_u32 v23, v4, v8 ; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v21, v23 ; GFX9-NEXT: v_mul_hi_u32 v23, v3, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v21, v23 ; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc ; GFX9-NEXT: v_add3_u32 v22, v22, v24, v23 ; GFX9-NEXT: v_mul_hi_u32 v23, v2, v10 ; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v21, v23 ; GFX9-NEXT: v_mul_hi_u32 v23, v1, v11 ; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v21, v23 ; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc ; GFX9-NEXT: v_add3_u32 v22, v22, v24, v23 ; GFX9-NEXT: v_mul_hi_u32 v23, v0, v12 ; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v21, 
v23 ; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, v21, v20 ; GFX9-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc ; GFX9-NEXT: v_add3_u32 v21, v22, v23, v21 ; GFX9-NEXT: v_mul_lo_u32 v22, v6, v8 ; GFX9-NEXT: v_mul_lo_u32 v23, v5, v9 ; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v23 ; GFX9-NEXT: v_mul_lo_u32 v23, v4, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v23 ; GFX9-NEXT: v_mul_lo_u32 v23, v3, v11 ; GFX9-NEXT: v_cndmask_b32_e64 v25, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v23 ; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc ; GFX9-NEXT: v_add3_u32 v23, v24, v25, v23 ; GFX9-NEXT: v_mul_lo_u32 v24, v2, v12 ; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v24 ; GFX9-NEXT: v_mul_lo_u32 v24, v1, v13 ; GFX9-NEXT: v_cndmask_b32_e64 v25, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v24 ; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc ; GFX9-NEXT: v_add3_u32 v23, v23, v25, v24 ; GFX9-NEXT: v_mul_lo_u32 v24, v0, v14 ; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v24 ; GFX9-NEXT: v_mul_hi_u32 v24, v5, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v25, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v24 ; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc ; GFX9-NEXT: v_add3_u32 v23, v23, v25, v24 ; GFX9-NEXT: v_mul_hi_u32 v24, v4, v9 ; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v24 ; GFX9-NEXT: v_mul_hi_u32 v24, v3, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v25, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v24 ; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc ; GFX9-NEXT: v_add3_u32 v23, v23, v25, v24 ; GFX9-NEXT: v_mul_hi_u32 v24, v2, v11 ; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v24 ; GFX9-NEXT: v_mul_hi_u32 v24, v1, v12 ; GFX9-NEXT: v_cndmask_b32_e64 v25, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v24 ; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc ; GFX9-NEXT: v_add3_u32 v23, v23, v25, v24 ; GFX9-NEXT: v_mul_hi_u32 v24, v0, v13 ; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v24 
; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v22, v21 ; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc ; GFX9-NEXT: v_add3_u32 v22, v23, v24, v22 ; GFX9-NEXT: v_mul_lo_u32 v23, v6, v9 ; GFX9-NEXT: v_mul_lo_u32 v24, v4, v11 ; GFX9-NEXT: v_mul_hi_u32 v4, v4, v10 ; GFX9-NEXT: v_mul_hi_u32 v6, v6, v8 ; GFX9-NEXT: v_add_u32_e32 v7, v7, v23 ; GFX9-NEXT: v_mul_lo_u32 v23, v5, v10 ; GFX9-NEXT: v_mul_hi_u32 v5, v5, v9 ; GFX9-NEXT: v_mul_hi_u32 v9, v3, v11 ; GFX9-NEXT: v_mul_hi_u32 v10, v2, v12 ; GFX9-NEXT: v_mul_lo_u32 v3, v3, v12 ; GFX9-NEXT: v_mul_lo_u32 v2, v2, v13 ; GFX9-NEXT: v_mul_hi_u32 v11, v1, v13 ; GFX9-NEXT: v_mul_lo_u32 v12, v1, v14 ; GFX9-NEXT: v_mul_lo_u32 v13, v0, v15 ; GFX9-NEXT: v_add3_u32 v7, v7, v23, v24 ; GFX9-NEXT: v_add3_u32 v2, v7, v3, v2 ; GFX9-NEXT: v_mul_lo_u32 v1, v0, v8 ; GFX9-NEXT: v_add3_u32 v2, v2, v12, v13 ; GFX9-NEXT: v_mul_hi_u32 v0, v0, v14 ; GFX9-NEXT: v_add3_u32 v2, v2, v6, v5 ; GFX9-NEXT: v_add3_u32 v2, v2, v4, v9 ; GFX9-NEXT: v_add3_u32 v2, v2, v10, v11 ; GFX9-NEXT: v_add3_u32 v7, v2, v0, v22 ; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, v16 ; GFX9-NEXT: v_mov_b32_e32 v2, v17 ; GFX9-NEXT: v_mov_b32_e32 v3, v18 ; GFX9-NEXT: v_mov_b32_e32 v4, v19 ; GFX9-NEXT: v_mov_b32_e32 v5, v20 ; GFX9-NEXT: v_mov_b32_e32 v6, v21 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = mul i256 %num, %den ret i256 %result }