From 685b85ce0e471d8dcd64cc97b1460acd784d2743 Mon Sep 17 00:00:00 2001
From: MerryMage
Date: Thu, 25 Jun 2020 19:34:02 +0100
Subject: [PATCH] A32: Implement v8 ASIMD V{MAX,MIN}NM

---
 .../x64/emit_x64_vector_floating_point.cpp   | 117 ++++++++++++++++++
 src/frontend/A32/decoder/asimd.inc           |   2 +
 .../A32/translate/impl/asimd_three_same.cpp  |  12 ++
 .../A32/translate/impl/translate_arm.h       |   2 +
 src/frontend/ir/ir_emitter.cpp               |  20 +++
 src/frontend/ir/ir_emitter.h                 |   2 +
 src/frontend/ir/opcodes.inc                  |   4 +
 7 files changed, 159 insertions(+)

diff --git a/src/backend/x64/emit_x64_vector_floating_point.cpp b/src/backend/x64/emit_x64_vector_floating_point.cpp
index ba41b658..0501cc9d 100644
--- a/src/backend/x64/emit_x64_vector_floating_point.cpp
+++ b/src/backend/x64/emit_x64_vector_floating_point.cpp
@@ -950,6 +950,123 @@ void EmitX64::EmitFPVectorMin64(EmitContext& ctx, IR::Inst* inst) {
     EmitFPVectorMinMax<64, false>(code, ctx, inst);
 }
 
+template<size_t fsize, bool is_max>
+static void EmitFPVectorMinMaxNumeric(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+    using FPT = mp::unsigned_integer_of_size<fsize>;
+    constexpr u8 mantissa_msb_bit = static_cast<u8>(FP::FPInfo<FPT>::explicit_mantissa_width - 1);
+
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+    const bool fpcr_controlled = args[2].GetImmediateU1();
+    const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+    const Xbyak::Xmm xmm_a = ctx.FPCR(fpcr_controlled).FZ() ? ctx.reg_alloc.UseScratchXmm(args[0]) : ctx.reg_alloc.UseXmm(args[0]);
+    const Xbyak::Xmm xmm_b = ctx.FPCR(fpcr_controlled).FZ() ? ctx.reg_alloc.UseScratchXmm(args[1]) : ctx.reg_alloc.UseXmm(args[1]);
+
+    const Xbyak::Xmm eq = ctx.reg_alloc.ScratchXmm();
+    const Xbyak::Xmm tmp0 = ctx.reg_alloc.ScratchXmm();
+    const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm();
+
+    Xbyak::Label end, fallback;
+
+    MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&]{
+        DenormalsAreZero<fsize>(code, ctx.FPCR(fpcr_controlled), {xmm_a, xmm_b}, xmm0);
+
+        if (code.HasAVX()) {
+            FCODE(vcmpeqp)(xmm0, xmm_a, xmm_b);
+            FCODE(vcmpunordp)(tmp0, xmm_a, xmm_a);
+            FCODE(vcmpunordp)(tmp1, xmm_b, xmm_b);
+            code.pand(tmp0, xmm_a);
+            code.vpandn(tmp1, xmm_b, tmp1);
+            FCODE(orp)(tmp0, tmp1);
+            if constexpr (is_max) {
+                code.vpand(eq, xmm_a, xmm_b);
+                FCODE(vmaxp)(result, xmm_a, xmm_b);
+            } else {
+                code.vpor(eq, xmm_a, xmm_b);
+                FCODE(vminp)(result, xmm_a, xmm_b);
+            }
+            ICODE(psll)(tmp0, static_cast<u8>(fsize - mantissa_msb_bit));
+
+            // At this point:
+            // tmp0 = IsSNaN(xmm_a) || IsQNaN(xmm_b)
+            // xmm0 == (xmm_a == xmm_b)
+            // result = xmm_a {<,>} xmm_b ? xmm_a : xmm_b
+
+            FCODE(blendvp)(result, eq);
+            FCODE(vblendvp)(result, result, xmm_a, tmp0);
+        } else {
+            /*
+            FCODE(vcmpunordp)(tmp0, xmm_a, xmm_a);
+            FCODE(vcmpunordp)(tmp1, xmm_b, xmm_b);
+            */
+            code.movaps(tmp0, xmm_a);
+            code.movaps(tmp1, xmm_b);
+            FCODE(cmpunordp)(tmp0, tmp0);
+            FCODE(cmpunordp)(tmp1, tmp1);
+
+            code.pand(tmp0, xmm_a);
+
+            /*
+            code.vpandn(tmp1, xmm_b, tmp1);
+            FCODE(orp)(tmp0, tmp1);
+            */
+            code.movaps(xmm0, xmm_b);
+            code.pandn(xmm0, tmp1);
+            code.por(tmp0, xmm0);
+
+            ICODE(psll)(tmp0, static_cast<u8>(fsize - mantissa_msb_bit));
+            code.psrad(tmp0, 31);
+            if constexpr (fsize == 64) {
+                code.pshufd(tmp0, tmp0, 0b11110101);
+            }
+
+            /*
+            FCODE(vcmpeqp)(xmm0, xmm_a, xmm_b);
+            */
+            code.movaps(xmm0, xmm_a);
+            FCODE(cmpeqp)(xmm0, xmm_b);
+
+            code.movaps(eq, xmm_a);
+            code.movaps(result, xmm_a);
+            if constexpr (is_max) {
+                code.pand(eq, xmm_b);
+                FCODE(maxp)(result, xmm_b);
+            } else {
+                code.por(eq, xmm_b);
+                FCODE(minp)(result, xmm_b);
+            }
+
+            code.pand(eq, xmm0);
+            code.pandn(xmm0, result);
+            code.por(eq, xmm0);
+
+            code.movaps(result, xmm_a);
+            code.pand(result, tmp0);
+            code.pandn(tmp0, eq);
+            code.por(result, tmp0);
+        }
+
+        ForceToDefaultNaN<fsize>(code, ctx.FPCR(fpcr_controlled), result);
+    });
+
+    ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitFPVectorMaxNumeric32(EmitContext& ctx, IR::Inst* inst) {
+    EmitFPVectorMinMaxNumeric<32, true>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPVectorMaxNumeric64(EmitContext& ctx, IR::Inst* inst) {
+    EmitFPVectorMinMaxNumeric<64, true>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPVectorMinNumeric32(EmitContext& ctx, IR::Inst* inst) {
+    EmitFPVectorMinMaxNumeric<32, false>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPVectorMinNumeric64(EmitContext& ctx, IR::Inst* inst) {
+    EmitFPVectorMinMaxNumeric<64, false>(code, ctx, inst);
+}
+
 void EmitX64::EmitFPVectorMul32(EmitContext& ctx, IR::Inst* inst) {
     EmitThreeOpVectorOperation<32, DefaultIndexer>(code, ctx, inst, &Xbyak::CodeGenerator::mulps);
 }
diff --git a/src/frontend/A32/decoder/asimd.inc b/src/frontend/A32/decoder/asimd.inc
index 44029fb6..90048fdb 100644
--- a/src/frontend/A32/decoder/asimd.inc
+++ b/src/frontend/A32/decoder/asimd.inc
@@ -50,6 +50,8 @@ INST(asimd_VCGT_reg_float,    "VCGT (register)",          "111100110D1znnnndddd111
 INST(asimd_VACGE,             "VACGE",                    "111100110Doznnnndddd1110NQM1mmmm") // ASIMD
 INST(asimd_VMAX_float,        "VMAX (floating-point)",    "111100100D0znnnndddd1111NQM0mmmm") // ASIMD
 INST(asimd_VMIN_float,        "VMIN (floating-point)",    "111100100D1znnnndddd1111NQM0mmmm") // ASIMD
+INST(v8_VMAXNM,               "VMAXNM",                   "111100110D0znnnndddd1111NQM1mmmm") // v8
+INST(v8_VMINNM,               "VMINNM",                   "111100110D1znnnndddd1111NQM1mmmm") // v8
 INST(asimd_VRECPS,            "VRECPS",                   "111100100D0znnnndddd1111NQM1mmmm") // ASIMD
 INST(asimd_VRSQRTS,           "VRSQRTS",                  "111100100D1znnnndddd1111NQM1mmmm") // ASIMD
diff --git a/src/frontend/A32/translate/impl/asimd_three_same.cpp b/src/frontend/A32/translate/impl/asimd_three_same.cpp
index 9994e4e7..4c9ac0b2 100644
--- a/src/frontend/A32/translate/impl/asimd_three_same.cpp
+++ b/src/frontend/A32/translate/impl/asimd_three_same.cpp
@@ -795,6 +795,18 @@ bool ArmTranslatorVisitor::asimd_VMIN_float(bool D, bool sz, size_t Vn, size_t V
     });
 }
 
+bool ArmTranslatorVisitor::v8_VMAXNM(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+    return FloatingPointInstruction(*this, D, sz, Vn, Vd, N, Q, M, Vm, [this](const auto&, const auto& reg_n, const auto& reg_m) {
+        return ir.FPVectorMaxNumeric(32, reg_n, reg_m, false);
+    });
+}
+
+bool ArmTranslatorVisitor::v8_VMINNM(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+    return FloatingPointInstruction(*this, D, sz, Vn, Vd, N, Q, M, Vm, [this](const auto&, const auto& reg_n, const auto& reg_m) {
+        return ir.FPVectorMinNumeric(32, reg_n, reg_m, false);
+    });
+}
+
 bool ArmTranslatorVisitor::asimd_VRECPS(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
     return FloatingPointInstruction(*this, D, sz, Vn, Vd, N, Q, M, Vm, [this](const auto&, const auto& reg_n, const auto& reg_m) {
         return ir.FPVectorRecipStepFused(32, reg_n, reg_m, false);
diff --git a/src/frontend/A32/translate/impl/translate_arm.h b/src/frontend/A32/translate/impl/translate_arm.h
index dfc02b1d..4938e0ae 100644
--- a/src/frontend/A32/translate/impl/translate_arm.h
+++ b/src/frontend/A32/translate/impl/translate_arm.h
@@ -503,6 +503,8 @@ struct ArmTranslatorVisitor final {
     bool asimd_VACGE(bool D, bool op, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
     bool asimd_VMAX_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
     bool asimd_VMIN_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+    bool v8_VMAXNM(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+    bool v8_VMINNM(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
     bool asimd_VRECPS(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
     bool asimd_VRSQRTS(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
diff --git a/src/frontend/ir/ir_emitter.cpp b/src/frontend/ir/ir_emitter.cpp
index 7f19e4f8..0e684bec 100644
--- a/src/frontend/ir/ir_emitter.cpp
+++ b/src/frontend/ir/ir_emitter.cpp
@@ -2426,6 +2426,16 @@ U128 IREmitter::FPVectorMax(size_t esize, const U128& a, const U128& b, bool fpc
     UNREACHABLE();
 }
 
+U128 IREmitter::FPVectorMaxNumeric(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) {
+    switch (esize) {
+    case 32:
+        return Inst<U128>(Opcode::FPVectorMaxNumeric32, a, b, Imm1(fpcr_controlled));
+    case 64:
+        return Inst<U128>(Opcode::FPVectorMaxNumeric64, a, b, Imm1(fpcr_controlled));
+    }
+    UNREACHABLE();
+}
+
 U128 IREmitter::FPVectorMin(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) {
     switch (esize) {
     case 32:
@@ -2436,6 +2446,16 @@ U128 IREmitter::FPVectorMin(size_t esize, const U128& a, const U128& b, bool fpc
     UNREACHABLE();
 }
 
+U128 IREmitter::FPVectorMinNumeric(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) {
+    switch (esize) {
+    case 32:
+        return Inst<U128>(Opcode::FPVectorMinNumeric32, a, b, Imm1(fpcr_controlled));
+    case 64:
+        return Inst<U128>(Opcode::FPVectorMinNumeric64, a, b, Imm1(fpcr_controlled));
+    }
+    UNREACHABLE();
+}
+
 U128 IREmitter::FPVectorMul(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) {
     switch (esize) {
     case 32:
diff --git a/src/frontend/ir/ir_emitter.h b/src/frontend/ir/ir_emitter.h
index 95051edb..5125216b 100644
--- a/src/frontend/ir/ir_emitter.h
+++ b/src/frontend/ir/ir_emitter.h
@@ -358,7 +358,9 @@ public:
     U128 FPVectorGreater(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
     U128 FPVectorGreaterEqual(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
     U128 FPVectorMax(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
+    U128 FPVectorMaxNumeric(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
     U128 FPVectorMin(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
+    U128 FPVectorMinNumeric(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
     U128 FPVectorMul(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
     U128 FPVectorMulAdd(size_t esize, const U128& addend, const U128& op1, const U128& op2, bool fpcr_controlled = true);
     U128 FPVectorMulX(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
diff --git a/src/frontend/ir/opcodes.inc b/src/frontend/ir/opcodes.inc
index 4314d5a3..3e144994 100644
--- a/src/frontend/ir/opcodes.inc
+++ b/src/frontend/ir/opcodes.inc
@@ -612,8 +612,12 @@ OPCODE(FPVectorGreaterEqual32,                              U128,           U128
 OPCODE(FPVectorGreaterEqual64,                              U128,           U128,           U128,           U1              )
 OPCODE(FPVectorMax32,                                       U128,           U128,           U128,           U1              )
 OPCODE(FPVectorMax64,                                       U128,           U128,           U128,           U1              )
+OPCODE(FPVectorMaxNumeric32,                                U128,           U128,           U128,           U1              )
+OPCODE(FPVectorMaxNumeric64,                                U128,           U128,           U128,           U1              )
 OPCODE(FPVectorMin32,                                       U128,           U128,           U128,           U1              )
 OPCODE(FPVectorMin64,                                       U128,           U128,           U128,           U1              )
+OPCODE(FPVectorMinNumeric32,                                U128,           U128,           U128,           U1              )
+OPCODE(FPVectorMinNumeric64,                                U128,           U128,           U128,           U1              )
 OPCODE(FPVectorMul32,                                       U128,           U128,           U128,           U1              )
 OPCODE(FPVectorMul64,                                       U128,           U128,           U128,           U1              )
 OPCODE(FPVectorMulAdd16,                                    U128,           U128,           U128,           U128,           U1              )
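
Note (editorial addition, not part of the patch): VMAXNM and VMINNM follow the IEEE 754-2008 maxNum/minNum rules, which is what the per-lane bit manipulation in EmitFPVectorMinMaxNumeric above implements. As a rough scalar sketch of the intended semantics -- ignoring signalling NaNs, default-NaN (FPCR.DN) substitution and flush-to-zero, all of which the backend handles separately -- the behaviour looks like the following. MaxNumeric/MinNumeric are illustrative names, not dynarmic functions.

#include <algorithm>
#include <cmath>

// If exactly one operand is a NaN, the numeric operand wins; otherwise this
// degenerates to an ordinary max/min (two NaNs still produce a NaN).
template<typename T>
T MaxNumeric(T a, T b) {
    if (std::isnan(a) != std::isnan(b)) {
        return std::isnan(a) ? b : a;
    }
    return std::max(a, b);
}

template<typename T>
T MinNumeric(T a, T b) {
    if (std::isnan(a) != std::isnan(b)) {
        return std::isnan(a) ? b : a;
    }
    return std::min(a, b);
}

For the ASIMD encodings added here the translator passes fpcr_controlled = false, so the operation runs under the standard FPSCR value, matching the other A32 ASIMD floating-point instructions in asimd_three_same.cpp.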