target/arm: Handle FPCR.AH in FMLSL (by element and vector)
Handle FPCR.AH's requirement to not negate the sign of a NaN in FMLSL by element and vector, using the usual trick of negating by XOR when AH=0 and by muladd flags when AH=1. Since we have the CPUARMState* in the helper anyway, we can look directly at env->vfp.fpcr and don't need toa pass in the FPCR.AH value via the SIMD data word. Signed-off-by: Richard Henderson <richard.henderson@linaro.org> Message-id: 20250129013857.135256-31-richard.henderson@linaro.org [PMM: commit message tweaked] Reviewed-by: Peter Maydell <peter.maydell@linaro.org> Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
This commit is contained in:
parent
0b5ca769cf
commit
0fa4b7afd4
@ -2124,27 +2124,24 @@ static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
|
||||
*/
|
||||
|
||||
static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
|
||||
uint32_t desc, bool fz16)
|
||||
uint64_t negx, int negf, uint32_t desc, bool fz16)
|
||||
{
|
||||
intptr_t i, oprsz = simd_oprsz(desc);
|
||||
int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
|
||||
int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
|
||||
int is_q = oprsz == 16;
|
||||
uint64_t n_4, m_4;
|
||||
|
||||
/* Pre-load all of the f16 data, avoiding overlap issues. */
|
||||
n_4 = load4_f16(vn, is_q, is_2);
|
||||
/*
|
||||
* Pre-load all of the f16 data, avoiding overlap issues.
|
||||
* Negate all inputs for AH=0 FMLSL at once.
|
||||
*/
|
||||
n_4 = load4_f16(vn, is_q, is_2) ^ negx;
|
||||
m_4 = load4_f16(vm, is_q, is_2);
|
||||
|
||||
/* Negate all inputs for FMLSL at once. */
|
||||
if (is_s) {
|
||||
n_4 ^= 0x8000800080008000ull;
|
||||
}
|
||||
|
||||
for (i = 0; i < oprsz / 4; i++) {
|
||||
float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
|
||||
float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
|
||||
d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
|
||||
d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], negf, fpst);
|
||||
}
|
||||
clear_tail(d, oprsz, simd_maxsz(desc));
|
||||
}
|
||||
@ -2152,14 +2149,28 @@ static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
|
||||
void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
|
||||
CPUARMState *env, uint32_t desc)
|
||||
{
|
||||
do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
|
||||
bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
|
||||
uint64_t negx = is_s ? 0x8000800080008000ull : 0;
|
||||
|
||||
do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, negx, 0, desc,
|
||||
get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a32));
|
||||
}
|
||||
|
||||
void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
|
||||
CPUARMState *env, uint32_t desc)
|
||||
{
|
||||
do_fmlal(vd, vn, vm, &env->vfp.fp_status_a64, desc,
|
||||
bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
|
||||
uint64_t negx = 0;
|
||||
int negf = 0;
|
||||
|
||||
if (is_s) {
|
||||
if (env->vfp.fpcr & FPCR_AH) {
|
||||
negf = float_muladd_negate_product;
|
||||
} else {
|
||||
negx = 0x8000800080008000ull;
|
||||
}
|
||||
}
|
||||
do_fmlal(vd, vn, vm, &env->vfp.fp_status_a64, negx, negf, desc,
|
||||
get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64));
|
||||
}
|
||||
|
||||
@ -2184,29 +2195,25 @@ void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
|
||||
}
|
||||
|
||||
static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
|
||||
uint32_t desc, bool fz16)
|
||||
uint64_t negx, int negf, uint32_t desc, bool fz16)
|
||||
{
|
||||
intptr_t i, oprsz = simd_oprsz(desc);
|
||||
int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
|
||||
int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
|
||||
int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
|
||||
int is_q = oprsz == 16;
|
||||
uint64_t n_4;
|
||||
float32 m_1;
|
||||
|
||||
/* Pre-load all of the f16 data, avoiding overlap issues. */
|
||||
n_4 = load4_f16(vn, is_q, is_2);
|
||||
|
||||
/* Negate all inputs for FMLSL at once. */
|
||||
if (is_s) {
|
||||
n_4 ^= 0x8000800080008000ull;
|
||||
}
|
||||
|
||||
/*
|
||||
* Pre-load all of the f16 data, avoiding overlap issues.
|
||||
* Negate all inputs for AH=0 FMLSL at once.
|
||||
*/
|
||||
n_4 = load4_f16(vn, is_q, is_2) ^ negx;
|
||||
m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);
|
||||
|
||||
for (i = 0; i < oprsz / 4; i++) {
|
||||
float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
|
||||
d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
|
||||
d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], negf, fpst);
|
||||
}
|
||||
clear_tail(d, oprsz, simd_maxsz(desc));
|
||||
}
|
||||
@ -2214,14 +2221,28 @@ static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
|
||||
void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
|
||||
CPUARMState *env, uint32_t desc)
|
||||
{
|
||||
do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
|
||||
bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
|
||||
uint64_t negx = is_s ? 0x8000800080008000ull : 0;
|
||||
|
||||
do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, negx, 0, desc,
|
||||
get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a32));
|
||||
}
|
||||
|
||||
void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
|
||||
CPUARMState *env, uint32_t desc)
|
||||
{
|
||||
do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status_a64, desc,
|
||||
bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
|
||||
uint64_t negx = 0;
|
||||
int negf = 0;
|
||||
|
||||
if (is_s) {
|
||||
if (env->vfp.fpcr & FPCR_AH) {
|
||||
negf = float_muladd_negate_product;
|
||||
} else {
|
||||
negx = 0x8000800080008000ull;
|
||||
}
|
||||
}
|
||||
do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status_a64, negx, negf, desc,
|
||||
get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64));
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user