diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c index d94a0022e4..67c85db477 100644 --- a/target/arm/tcg/translate-a64.c +++ b/target/arm/tcg/translate-a64.c @@ -654,6 +654,68 @@ static void write_fp_sreg(DisasContext *s, int reg, TCGv_i32 v) write_fp_dreg(s, reg, tmp); } +/* + * Write a double result to 128 bit vector register reg, honouring FPCR.NEP: + * - if FPCR.NEP == 0, clear the high elements of reg + * - if FPCR.NEP == 1, set the high elements of reg from mergereg + * (i.e. merge the result with those high elements) + * In either case, SVE register bits above 128 are zeroed (per R_WKYLB). + */ +static void write_fp_dreg_merging(DisasContext *s, int reg, int mergereg, + TCGv_i64 v) +{ + if (!s->fpcr_nep) { + write_fp_dreg(s, reg, v); + return; + } + + /* + * Move from mergereg to reg; this sets the high elements and + * clears the bits above 128 as a side effect. + */ + tcg_gen_gvec_mov(MO_64, vec_full_reg_offset(s, reg), + vec_full_reg_offset(s, mergereg), + 16, vec_full_reg_size(s)); + tcg_gen_st_i64(v, tcg_env, vec_full_reg_offset(s, reg)); +} + +/* + * Write a single-prec result, but only clear the higher elements + * of the destination register if FPCR.NEP is 0; otherwise preserve them. + */ +static void write_fp_sreg_merging(DisasContext *s, int reg, int mergereg, + TCGv_i32 v) +{ + if (!s->fpcr_nep) { + write_fp_sreg(s, reg, v); + return; + } + + tcg_gen_gvec_mov(MO_64, vec_full_reg_offset(s, reg), + vec_full_reg_offset(s, mergereg), + 16, vec_full_reg_size(s)); + tcg_gen_st_i32(v, tcg_env, fp_reg_offset(s, reg, MO_32)); +} + +/* + * Write a half-prec result, but only clear the higher elements + * of the destination register if FPCR.NEP is 0; otherwise preserve them. + * The caller must ensure that the top 16 bits of v are zero. + */ +static void write_fp_hreg_merging(DisasContext *s, int reg, int mergereg, + TCGv_i32 v) +{ + if (!s->fpcr_nep) { + write_fp_sreg(s, reg, v); + return; + } + + tcg_gen_gvec_mov(MO_64, vec_full_reg_offset(s, reg), + vec_full_reg_offset(s, mergereg), + 16, vec_full_reg_size(s)); + tcg_gen_st16_i32(v, tcg_env, fp_reg_offset(s, reg, MO_16)); +} + /* Expand a 2-operand AdvSIMD vector operation using an expander function. */ static void gen_gvec_fn2(DisasContext *s, bool is_q, int rd, int rn, GVecGen2Fn *gvec_fn, int vece) @@ -5027,7 +5089,7 @@ typedef struct FPScalar { } FPScalar; static bool do_fp3_scalar_with_fpsttype(DisasContext *s, arg_rrr_e *a, - const FPScalar *f, + const FPScalar *f, int mergereg, ARMFPStatusFlavour fpsttype) { switch (a->esz) { @@ -5036,7 +5098,7 @@ static bool do_fp3_scalar_with_fpsttype(DisasContext *s, arg_rrr_e *a, TCGv_i64 t0 = read_fp_dreg(s, a->rn); TCGv_i64 t1 = read_fp_dreg(s, a->rm); f->gen_d(t0, t0, t1, fpstatus_ptr(fpsttype)); - write_fp_dreg(s, a->rd, t0); + write_fp_dreg_merging(s, a->rd, mergereg, t0); } break; case MO_32: @@ -5044,7 +5106,7 @@ static bool do_fp3_scalar_with_fpsttype(DisasContext *s, arg_rrr_e *a, TCGv_i32 t0 = read_fp_sreg(s, a->rn); TCGv_i32 t1 = read_fp_sreg(s, a->rm); f->gen_s(t0, t0, t1, fpstatus_ptr(fpsttype)); - write_fp_sreg(s, a->rd, t0); + write_fp_sreg_merging(s, a->rd, mergereg, t0); } break; case MO_16: @@ -5055,7 +5117,7 @@ static bool do_fp3_scalar_with_fpsttype(DisasContext *s, arg_rrr_e *a, TCGv_i32 t0 = read_fp_hreg(s, a->rn); TCGv_i32 t1 = read_fp_hreg(s, a->rm); f->gen_h(t0, t0, t1, fpstatus_ptr(fpsttype)); - write_fp_sreg(s, a->rd, t0); + write_fp_hreg_merging(s, a->rd, mergereg, t0); } break; default: @@ -5064,16 +5126,19 @@ static bool do_fp3_scalar_with_fpsttype(DisasContext *s, arg_rrr_e *a, return true; } -static bool do_fp3_scalar(DisasContext *s, arg_rrr_e *a, const FPScalar *f) +static bool do_fp3_scalar(DisasContext *s, arg_rrr_e *a, const FPScalar *f, + int mergereg) { - return do_fp3_scalar_with_fpsttype(s, a, f, + return do_fp3_scalar_with_fpsttype(s, a, f, mergereg, a->esz == MO_16 ? FPST_A64_F16 : FPST_A64); } -static bool do_fp3_scalar_ah(DisasContext *s, arg_rrr_e *a, const FPScalar *f) +static bool do_fp3_scalar_ah(DisasContext *s, arg_rrr_e *a, const FPScalar *f, + int mergereg) { - return do_fp3_scalar_with_fpsttype(s, a, f, select_ah_fpst(s, a->esz)); + return do_fp3_scalar_with_fpsttype(s, a, f, mergereg, + select_ah_fpst(s, a->esz)); } static const FPScalar f_scalar_fadd = { @@ -5081,63 +5146,63 @@ static const FPScalar f_scalar_fadd = { gen_helper_vfp_adds, gen_helper_vfp_addd, }; -TRANS(FADD_s, do_fp3_scalar, a, &f_scalar_fadd) +TRANS(FADD_s, do_fp3_scalar, a, &f_scalar_fadd, a->rn) static const FPScalar f_scalar_fsub = { gen_helper_vfp_subh, gen_helper_vfp_subs, gen_helper_vfp_subd, }; -TRANS(FSUB_s, do_fp3_scalar, a, &f_scalar_fsub) +TRANS(FSUB_s, do_fp3_scalar, a, &f_scalar_fsub, a->rn) static const FPScalar f_scalar_fdiv = { gen_helper_vfp_divh, gen_helper_vfp_divs, gen_helper_vfp_divd, }; -TRANS(FDIV_s, do_fp3_scalar, a, &f_scalar_fdiv) +TRANS(FDIV_s, do_fp3_scalar, a, &f_scalar_fdiv, a->rn) static const FPScalar f_scalar_fmul = { gen_helper_vfp_mulh, gen_helper_vfp_muls, gen_helper_vfp_muld, }; -TRANS(FMUL_s, do_fp3_scalar, a, &f_scalar_fmul) +TRANS(FMUL_s, do_fp3_scalar, a, &f_scalar_fmul, a->rn) static const FPScalar f_scalar_fmax = { gen_helper_vfp_maxh, gen_helper_vfp_maxs, gen_helper_vfp_maxd, }; -TRANS(FMAX_s, do_fp3_scalar, a, &f_scalar_fmax) +TRANS(FMAX_s, do_fp3_scalar, a, &f_scalar_fmax, a->rn) static const FPScalar f_scalar_fmin = { gen_helper_vfp_minh, gen_helper_vfp_mins, gen_helper_vfp_mind, }; -TRANS(FMIN_s, do_fp3_scalar, a, &f_scalar_fmin) +TRANS(FMIN_s, do_fp3_scalar, a, &f_scalar_fmin, a->rn) static const FPScalar f_scalar_fmaxnm = { gen_helper_vfp_maxnumh, gen_helper_vfp_maxnums, gen_helper_vfp_maxnumd, }; -TRANS(FMAXNM_s, do_fp3_scalar, a, &f_scalar_fmaxnm) +TRANS(FMAXNM_s, do_fp3_scalar, a, &f_scalar_fmaxnm, a->rn) static const FPScalar f_scalar_fminnm = { gen_helper_vfp_minnumh, gen_helper_vfp_minnums, gen_helper_vfp_minnumd, }; -TRANS(FMINNM_s, do_fp3_scalar, a, &f_scalar_fminnm) +TRANS(FMINNM_s, do_fp3_scalar, a, &f_scalar_fminnm, a->rn) static const FPScalar f_scalar_fmulx = { gen_helper_advsimd_mulxh, gen_helper_vfp_mulxs, gen_helper_vfp_mulxd, }; -TRANS(FMULX_s, do_fp3_scalar, a, &f_scalar_fmulx) +TRANS(FMULX_s, do_fp3_scalar, a, &f_scalar_fmulx, a->rn) static void gen_fnmul_h(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m, TCGv_ptr s) { @@ -5162,42 +5227,42 @@ static const FPScalar f_scalar_fnmul = { gen_fnmul_s, gen_fnmul_d, }; -TRANS(FNMUL_s, do_fp3_scalar, a, &f_scalar_fnmul) +TRANS(FNMUL_s, do_fp3_scalar, a, &f_scalar_fnmul, a->rn) static const FPScalar f_scalar_fcmeq = { gen_helper_advsimd_ceq_f16, gen_helper_neon_ceq_f32, gen_helper_neon_ceq_f64, }; -TRANS(FCMEQ_s, do_fp3_scalar, a, &f_scalar_fcmeq) +TRANS(FCMEQ_s, do_fp3_scalar, a, &f_scalar_fcmeq, a->rm) static const FPScalar f_scalar_fcmge = { gen_helper_advsimd_cge_f16, gen_helper_neon_cge_f32, gen_helper_neon_cge_f64, }; -TRANS(FCMGE_s, do_fp3_scalar, a, &f_scalar_fcmge) +TRANS(FCMGE_s, do_fp3_scalar, a, &f_scalar_fcmge, a->rm) static const FPScalar f_scalar_fcmgt = { gen_helper_advsimd_cgt_f16, gen_helper_neon_cgt_f32, gen_helper_neon_cgt_f64, }; -TRANS(FCMGT_s, do_fp3_scalar, a, &f_scalar_fcmgt) +TRANS(FCMGT_s, do_fp3_scalar, a, &f_scalar_fcmgt, a->rm) static const FPScalar f_scalar_facge = { gen_helper_advsimd_acge_f16, gen_helper_neon_acge_f32, gen_helper_neon_acge_f64, }; -TRANS(FACGE_s, do_fp3_scalar, a, &f_scalar_facge) +TRANS(FACGE_s, do_fp3_scalar, a, &f_scalar_facge, a->rm) static const FPScalar f_scalar_facgt = { gen_helper_advsimd_acgt_f16, gen_helper_neon_acgt_f32, gen_helper_neon_acgt_f64, }; -TRANS(FACGT_s, do_fp3_scalar, a, &f_scalar_facgt) +TRANS(FACGT_s, do_fp3_scalar, a, &f_scalar_facgt, a->rm) static void gen_fabd_h(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m, TCGv_ptr s) { @@ -5222,21 +5287,21 @@ static const FPScalar f_scalar_fabd = { gen_fabd_s, gen_fabd_d, }; -TRANS(FABD_s, do_fp3_scalar, a, &f_scalar_fabd) +TRANS(FABD_s, do_fp3_scalar, a, &f_scalar_fabd, a->rn) static const FPScalar f_scalar_frecps = { gen_helper_recpsf_f16, gen_helper_recpsf_f32, gen_helper_recpsf_f64, }; -TRANS(FRECPS_s, do_fp3_scalar_ah, a, &f_scalar_frecps) +TRANS(FRECPS_s, do_fp3_scalar_ah, a, &f_scalar_frecps, a->rn) static const FPScalar f_scalar_frsqrts = { gen_helper_rsqrtsf_f16, gen_helper_rsqrtsf_f32, gen_helper_rsqrtsf_f64, }; -TRANS(FRSQRTS_s, do_fp3_scalar_ah, a, &f_scalar_frsqrts) +TRANS(FRSQRTS_s, do_fp3_scalar_ah, a, &f_scalar_frsqrts, a->rn) static bool do_fcmp0_s(DisasContext *s, arg_rr_e *a, const FPScalar *f, bool swap)