target/i386: reimplement fyl2x using floatx80 operations

The x87 fyl2x emulation is currently based around conversion to double. This is inherently unsuitable for a good emulation of any floatx80 operation. Reimplement using the soft-float operations, building on top of the reimplementation of fyl2xp1 and factoring out code to be shared between the two instructions. The included test assumes that the result in round-to-nearest mode should always be one of the two closest floating-point numbers to the mathematically exact result (including that it should be exact, in the exact cases which cover more cases than for fyl2xp1). Signed-off-by: Joseph Myers <joseph@codesourcery.com> Message-Id: <alpine.DEB.2.21.2006172321530.20587@digraph.polyomino.org.uk> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2020-06-17 23:24:29 +00:00 · 2020-06-17 23:24:29 +00:00 · 1f18a1e6ab
commit 1f18a1e6ab
parent 5eebc49d2d
2 changed files with 1423 additions and 111 deletions
--- a/target/i386/fpu_helper.c
+++ b/target/i386/fpu_helper.c
@ -1223,21 +1223,6 @@ void helper_f2xm1(CPUX86State *env)
    merge_exception_flags(env, old_flags);
 }
 void helper_fyl2x(CPUX86State *env)
 {
    double fptemp = floatx80_to_double(env, ST0);
    if (fptemp > 0.0) {
        fptemp = log(fptemp) / log(2.0); /* log2(ST) */
        fptemp *= floatx80_to_double(env, ST1);
        ST1 = double_to_floatx80(env, fptemp);
        fpop(env);
    } else {
        env->fpus &= ~0x4700;
        env->fpus |= 0x400;
    }
 }
 void helper_fptan(CPUX86State *env)
 {
    double fptemp = floatx80_to_double(env, ST0);
@ -1395,6 +1380,118 @@ void helper_fprem(CPUX86State *env)
 #define fyl2x_coeff_8 make_floatx80(0x3ffc, 0xac5cf50cc57d6372ULL)
 #define fyl2x_coeff_9 make_floatx80(0x3ffc, 0xb1ed0066d971a103ULL)
 /*
 * Compute an approximation of log2(1+arg), where 1+arg is in the
 * interval [sqrt(2)/2, sqrt(2)].  It is assumed that when this
 * function is called, rounding precision is set to 80 and the
 * round-to-nearest mode is in effect.  arg must not be exactly zero,
 * and must not be so close to zero that underflow might occur.
 */
 static void helper_fyl2x_common(CPUX86State *env, floatx80 arg, int32_t *exp,
                                uint64_t *sig0, uint64_t *sig1)
 {
    uint64_t arg0_sig = extractFloatx80Frac(arg);
    int32_t arg0_exp = extractFloatx80Exp(arg);
    bool arg0_sign = extractFloatx80Sign(arg);
    bool asign;
    int32_t dexp, texp, aexp;
    uint64_t dsig0, dsig1, tsig0, tsig1, rsig0, rsig1, rsig2;
    uint64_t msig0, msig1, msig2, t2sig0, t2sig1, t2sig2, t2sig3;
    uint64_t asig0, asig1, asig2, asig3, bsig0, bsig1;
    floatx80 t2, accum;
    /*
     * Compute an approximation of arg/(2+arg), with extra precision,
     * as the argument to a polynomial approximation.  The extra
     * precision is only needed for the first term of the
     * approximation, with subsequent terms being significantly
     * smaller; the approximation only uses odd exponents, and the
     * square of arg/(2+arg) is at most 17-12*sqrt(2) = 0.029....
     */
    if (arg0_sign) {
        dexp = 0x3fff;
        shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
        sub128(0, 0, dsig0, dsig1, &dsig0, &dsig1);
    } else {
        dexp = 0x4000;
        shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
        dsig0 |= 0x8000000000000000ULL;
    }
    texp = arg0_exp - dexp + 0x3ffe;
    rsig0 = arg0_sig;
    rsig1 = 0;
    rsig2 = 0;
    if (dsig0 <= rsig0) {
        shift128Right(rsig0, rsig1, 1, &rsig0, &rsig1);
        ++texp;
    }
    tsig0 = estimateDiv128To64(rsig0, rsig1, dsig0);
    mul128By64To192(dsig0, dsig1, tsig0, &msig0, &msig1, &msig2);
    sub192(rsig0, rsig1, rsig2, msig0, msig1, msig2,
           &rsig0, &rsig1, &rsig2);
    while ((int64_t) rsig0 < 0) {
        --tsig0;
        add192(rsig0, rsig1, rsig2, 0, dsig0, dsig1,
               &rsig0, &rsig1, &rsig2);
    }
    tsig1 = estimateDiv128To64(rsig1, rsig2, dsig0);
    /*
     * No need to correct any estimation error in tsig1; even with
     * such error, it is accurate enough.  Now compute the square of
     * that approximation.
     */
    mul128To256(tsig0, tsig1, tsig0, tsig1,
                &t2sig0, &t2sig1, &t2sig2, &t2sig3);
    t2 = normalizeRoundAndPackFloatx80(80, false, texp + texp - 0x3ffe,
                                       t2sig0, t2sig1, &env->fp_status);
    /* Compute the lower parts of the polynomial expansion.  */
    accum = floatx80_mul(fyl2x_coeff_9, t2, &env->fp_status);
    accum = floatx80_add(fyl2x_coeff_8, accum, &env->fp_status);
    accum = floatx80_mul(accum, t2, &env->fp_status);
    accum = floatx80_add(fyl2x_coeff_7, accum, &env->fp_status);
    accum = floatx80_mul(accum, t2, &env->fp_status);
    accum = floatx80_add(fyl2x_coeff_6, accum, &env->fp_status);
    accum = floatx80_mul(accum, t2, &env->fp_status);
    accum = floatx80_add(fyl2x_coeff_5, accum, &env->fp_status);
    accum = floatx80_mul(accum, t2, &env->fp_status);
    accum = floatx80_add(fyl2x_coeff_4, accum, &env->fp_status);
    accum = floatx80_mul(accum, t2, &env->fp_status);
    accum = floatx80_add(fyl2x_coeff_3, accum, &env->fp_status);
    accum = floatx80_mul(accum, t2, &env->fp_status);
    accum = floatx80_add(fyl2x_coeff_2, accum, &env->fp_status);
    accum = floatx80_mul(accum, t2, &env->fp_status);
    accum = floatx80_add(fyl2x_coeff_1, accum, &env->fp_status);
    accum = floatx80_mul(accum, t2, &env->fp_status);
    accum = floatx80_add(fyl2x_coeff_0_low, accum, &env->fp_status);
    /*
     * The full polynomial expansion is fyl2x_coeff_0 + accum (where
     * accum has much lower magnitude, and so, in particular, carry
     * out of the addition is not possible), multiplied by t.  (This
     * expansion is only accurate to about 70 bits, not 128 bits.)
     */
    aexp = extractFloatx80Exp(fyl2x_coeff_0);
    asign = extractFloatx80Sign(fyl2x_coeff_0);
    shift128RightJamming(extractFloatx80Frac(accum), 0,
                         aexp - extractFloatx80Exp(accum),
                         &asig0, &asig1);
    bsig0 = extractFloatx80Frac(fyl2x_coeff_0);
    bsig1 = 0;
    if (asign == extractFloatx80Sign(accum)) {
        add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
    } else {
        sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
    }
    /* Multiply by t to compute the required result.  */
    mul128To256(asig0, asig1, tsig0, tsig1,
                &asig0, &asig1, &asig2, &asig3);
    aexp += texp - 0x3ffe;
    *exp = aexp;
    *sig0 = asig0;
    *sig1 = asig1;
 }
 void helper_fyl2xp1(CPUX86State *env)
 {
    uint8_t old_flags = save_exception_flags(env);
@ -1462,109 +1559,18 @@ void helper_fyl2xp1(CPUX86State *env)
        ST1 = normalizeRoundAndPackFloatx80(80, arg0_sign ^ arg1_sign, exp,
                                            sig0, sig1, &env->fp_status);
    } else {
-        bool asign;
+        int32_t aexp;
-        uint32_t dexp, texp, aexp;
+        uint64_t asig0, asig1, asig2;
        uint64_t dsig0, dsig1, tsig0, tsig1, rsig0, rsig1, rsig2;
        uint64_t msig0, msig1, msig2, t2sig0, t2sig1, t2sig2, t2sig3;
        uint64_t asig0, asig1, asig2, asig3, bsig0, bsig1;
        floatx80 t2, accum;
        FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
        signed char save_prec = env->fp_status.floatx80_rounding_precision;
        env->fp_status.float_rounding_mode = float_round_nearest_even;
        env->fp_status.floatx80_rounding_precision = 80;
        helper_fyl2x_common(env, ST0, &aexp, &asig0, &asig1);
        /*
-         * Compute an approximation of ST0/(2+ST0), with extra
+         * Multiply by the second argument to compute the required
-         * precision, as the argument to a polynomial approximation.
+         * result.
         * The extra precision is only needed for the first term of
         * the approximation, with subsequent terms being
         * significantly smaller; the approximation only uses odd
         * exponents, and the square of ST0/(2+ST0) is at most
         * 17-12*sqrt(2) = 0.029....
         */
        if (arg0_sign) {
            dexp = 0x3fff;
            shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
            sub128(0, 0, dsig0, dsig1, &dsig0, &dsig1);
        } else {
            dexp = 0x4000;
            shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
            dsig0 |= 0x8000000000000000ULL;
        }
        texp = arg0_exp - dexp + 0x3ffe;
        rsig0 = arg0_sig;
        rsig1 = 0;
        rsig2 = 0;
        if (dsig0 <= rsig0) {
            shift128Right(rsig0, rsig1, 1, &rsig0, &rsig1);
            ++texp;
        }
        tsig0 = estimateDiv128To64(rsig0, rsig1, dsig0);
        mul128By64To192(dsig0, dsig1, tsig0, &msig0, &msig1, &msig2);
        sub192(rsig0, rsig1, rsig2, msig0, msig1, msig2,
               &rsig0, &rsig1, &rsig2);
        while ((int64_t) rsig0 < 0) {
            --tsig0;
            add192(rsig0, rsig1, rsig2, 0, dsig0, dsig1,
                   &rsig0, &rsig1, &rsig2);
        }
        tsig1 = estimateDiv128To64(rsig1, rsig2, dsig0);
        /*
         * No need to correct any estimation error in tsig1; even with
         * such error, it is accurate enough.  Now compute the square
         * of that approximation.
         */
        mul128To256(tsig0, tsig1, tsig0, tsig1,
                    &t2sig0, &t2sig1, &t2sig2, &t2sig3);
        t2 = normalizeRoundAndPackFloatx80(80, false, texp + texp - 0x3ffe,
                                           t2sig0, t2sig1, &env->fp_status);
        /* Compute the lower parts of the polynomial expansion.  */
        accum = floatx80_mul(fyl2x_coeff_9, t2, &env->fp_status);
        accum = floatx80_add(fyl2x_coeff_8, accum, &env->fp_status);
        accum = floatx80_mul(accum, t2, &env->fp_status);
        accum = floatx80_add(fyl2x_coeff_7, accum, &env->fp_status);
        accum = floatx80_mul(accum, t2, &env->fp_status);
        accum = floatx80_add(fyl2x_coeff_6, accum, &env->fp_status);
        accum = floatx80_mul(accum, t2, &env->fp_status);
        accum = floatx80_add(fyl2x_coeff_5, accum, &env->fp_status);
        accum = floatx80_mul(accum, t2, &env->fp_status);
        accum = floatx80_add(fyl2x_coeff_4, accum, &env->fp_status);
        accum = floatx80_mul(accum, t2, &env->fp_status);
        accum = floatx80_add(fyl2x_coeff_3, accum, &env->fp_status);
        accum = floatx80_mul(accum, t2, &env->fp_status);
        accum = floatx80_add(fyl2x_coeff_2, accum, &env->fp_status);
        accum = floatx80_mul(accum, t2, &env->fp_status);
        accum = floatx80_add(fyl2x_coeff_1, accum, &env->fp_status);
        accum = floatx80_mul(accum, t2, &env->fp_status);
        accum = floatx80_add(fyl2x_coeff_0_low, accum, &env->fp_status);
        /*
         * The full polynomial expansion is fyl2x_coeff_0 + accum
         * (where accum has much lower magnitude, and so, in
         * particular, carry out of the addition is not possible),
         * multiplied by t.  (This expansion is only accurate to about
         * 70 bits, not 128 bits.)
         */
        aexp = extractFloatx80Exp(fyl2x_coeff_0);
        asign = extractFloatx80Sign(fyl2x_coeff_0);
        shift128RightJamming(extractFloatx80Frac(accum), 0,
                             aexp - extractFloatx80Exp(accum),
                             &asig0, &asig1);
        bsig0 = extractFloatx80Frac(fyl2x_coeff_0);
        bsig1 = 0;
        if (asign == extractFloatx80Sign(accum)) {
            add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
        } else {
            sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
        }
        /*
         * Multiply by t and by the second argument to compute the
         * required result.
         */
        mul128To256(asig0, asig1, tsig0, tsig1,
                    &asig0, &asig1, &asig2, &asig3);
        aexp += texp - 0x3ffe;
        if (arg1_exp == 0) {
            normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
        }
@ -1581,6 +1587,151 @@ void helper_fyl2xp1(CPUX86State *env)
    merge_exception_flags(env, old_flags);
 }
 void helper_fyl2x(CPUX86State *env)
 {
    uint8_t old_flags = save_exception_flags(env);
    uint64_t arg0_sig = extractFloatx80Frac(ST0);
    int32_t arg0_exp = extractFloatx80Exp(ST0);
    bool arg0_sign = extractFloatx80Sign(ST0);
    uint64_t arg1_sig = extractFloatx80Frac(ST1);
    int32_t arg1_exp = extractFloatx80Exp(ST1);
    bool arg1_sign = extractFloatx80Sign(ST1);
    if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
        float_raise(float_flag_invalid, &env->fp_status);
        ST1 = floatx80_silence_nan(ST0, &env->fp_status);
    } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
        float_raise(float_flag_invalid, &env->fp_status);
        ST1 = floatx80_silence_nan(ST1, &env->fp_status);
    } else if (floatx80_invalid_encoding(ST0) ||
               floatx80_invalid_encoding(ST1)) {
        float_raise(float_flag_invalid, &env->fp_status);
        ST1 = floatx80_default_nan(&env->fp_status);
    } else if (floatx80_is_any_nan(ST0)) {
        ST1 = ST0;
    } else if (floatx80_is_any_nan(ST1)) {
        /* Pass this NaN through.  */
    } else if (arg0_sign && !floatx80_is_zero(ST0)) {
        float_raise(float_flag_invalid, &env->fp_status);
        ST1 = floatx80_default_nan(&env->fp_status);
    } else if (floatx80_is_infinity(ST1)) {
        FloatRelation cmp = floatx80_compare(ST0, floatx80_one,
                                             &env->fp_status);
        switch (cmp) {
        case float_relation_less:
            ST1 = floatx80_chs(ST1);
            break;
        case float_relation_greater:
            /* Result is infinity of the same sign as ST1.  */
            break;
        default:
            float_raise(float_flag_invalid, &env->fp_status);
            ST1 = floatx80_default_nan(&env->fp_status);
            break;
        }
    } else if (floatx80_is_infinity(ST0)) {
        if (floatx80_is_zero(ST1)) {
            float_raise(float_flag_invalid, &env->fp_status);
            ST1 = floatx80_default_nan(&env->fp_status);
        } else if (arg1_sign) {
            ST1 = floatx80_chs(ST0);
        } else {
            ST1 = ST0;
        }
    } else if (floatx80_is_zero(ST0)) {
        if (floatx80_is_zero(ST1)) {
            float_raise(float_flag_invalid, &env->fp_status);
            ST1 = floatx80_default_nan(&env->fp_status);
        } else {
            /* Result is infinity with opposite sign to ST1.  */
            float_raise(float_flag_divbyzero, &env->fp_status);
            ST1 = make_floatx80(arg1_sign ? 0x7fff : 0xffff,
                                0x8000000000000000ULL);
        }
    } else if (floatx80_is_zero(ST1)) {
        if (floatx80_lt(ST0, floatx80_one, &env->fp_status)) {
            ST1 = floatx80_chs(ST1);
        }
        /* Otherwise, ST1 is already the correct result.  */
    } else if (floatx80_eq(ST0, floatx80_one, &env->fp_status)) {
        if (arg1_sign) {
            ST1 = floatx80_chs(floatx80_zero);
        } else {
            ST1 = floatx80_zero;
        }
    } else {
        int32_t int_exp;
        floatx80 arg0_m1;
        FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
        signed char save_prec = env->fp_status.floatx80_rounding_precision;
        env->fp_status.float_rounding_mode = float_round_nearest_even;
        env->fp_status.floatx80_rounding_precision = 80;
        if (arg0_exp == 0) {
            normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
        }
        if (arg1_exp == 0) {
            normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
        }
        int_exp = arg0_exp - 0x3fff;
        if (arg0_sig > 0xb504f333f9de6484ULL) {
            ++int_exp;
        }
        arg0_m1 = floatx80_sub(floatx80_scalbn(ST0, -int_exp,
                                               &env->fp_status),
                               floatx80_one, &env->fp_status);
        if (floatx80_is_zero(arg0_m1)) {
            /* Exact power of 2; multiply by ST1.  */
            env->fp_status.float_rounding_mode = save_mode;
            ST1 = floatx80_mul(int32_to_floatx80(int_exp, &env->fp_status),
                               ST1, &env->fp_status);
        } else {
            bool asign = extractFloatx80Sign(arg0_m1);
            int32_t aexp;
            uint64_t asig0, asig1, asig2;
            helper_fyl2x_common(env, arg0_m1, &aexp, &asig0, &asig1);
            if (int_exp != 0) {
                bool isign = (int_exp < 0);
                int32_t iexp;
                uint64_t isig;
                int shift;
                int_exp = isign ? -int_exp : int_exp;
                shift = clz32(int_exp) + 32;
                isig = int_exp;
                isig <<= shift;
                iexp = 0x403e - shift;
                shift128RightJamming(asig0, asig1, iexp - aexp,
                                     &asig0, &asig1);
                if (asign == isign) {
                    add128(isig, 0, asig0, asig1, &asig0, &asig1);
                } else {
                    sub128(isig, 0, asig0, asig1, &asig0, &asig1);
                }
                aexp = iexp;
                asign = isign;
            }
            /*
             * Multiply by the second argument to compute the required
             * result.
             */
            if (arg1_exp == 0) {
                normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
            }
            mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2);
            aexp += arg1_exp - 0x3ffe;
            /* This result is inexact.  */
            asig1 |= 1;
            env->fp_status.float_rounding_mode = save_mode;
            ST1 = normalizeRoundAndPackFloatx80(80, asign ^ arg1_sign, aexp,
                                                asig0, asig1, &env->fp_status);
        }
        env->fp_status.floatx80_rounding_precision = save_prec;
    }
    fpop(env);
    merge_exception_flags(env, old_flags);
 }
 void helper_fsqrt(CPUX86State *env)
 {
    uint8_t old_flags = save_exception_flags(env);
--- a/tests/tcg/i386/test-i386-fyl2x.c
+++ b/tests/tcg/i386/test-i386-fyl2x.c