diff --git a/src/dynarmic/backend/x64/constants.h b/src/dynarmic/backend/x64/constants.h
index 9d894574..682d8ebd 100644
--- a/src/dynarmic/backend/x64/constants.h
+++ b/src/dynarmic/backend/x64/constants.h
@@ -5,8 +5,11 @@
 
 #pragma once
 
+#include <optional>
+
 #include "dynarmic/common/bit_util.h"
 #include "dynarmic/common/common_types.h"
+#include "dynarmic/common/fp/rounding_mode.h"
 
 namespace Dynarmic::Backend::X64 {
 
@@ -84,4 +87,19 @@ constexpr u32 FixupLUT(FpFixup src_qnan = FpFixup::A,
     return fixup_lut;
 }
 
+constexpr std::optional<int> ConvertRoundingModeToX64Immediate(FP::RoundingMode rounding_mode) {
+    switch (rounding_mode) {
+    case FP::RoundingMode::ToNearest_TieEven:
+        return 0b00;
+    case FP::RoundingMode::TowardsPlusInfinity:
+        return 0b10;
+    case FP::RoundingMode::TowardsMinusInfinity:
+        return 0b01;
+    case FP::RoundingMode::TowardsZero:
+        return 0b11;
+    default:
+        return std::nullopt;
+    }
+}
+
 }  // namespace Dynarmic::Backend::X64
diff --git a/src/dynarmic/backend/x64/emit_x64_floating_point.cpp b/src/dynarmic/backend/x64/emit_x64_floating_point.cpp
index c00121fb..2994a91e 100644
--- a/src/dynarmic/backend/x64/emit_x64_floating_point.cpp
+++ b/src/dynarmic/backend/x64/emit_x64_floating_point.cpp
@@ -80,21 +80,6 @@ constexpr u64 f64_max_u64_lim = 0x43f0000000000000u;  // 2^64 as a double (actua
         }                            \
     }
 
-std::optional<int> ConvertRoundingModeToX64Immediate(FP::RoundingMode rounding_mode) {
-    switch (rounding_mode) {
-    case FP::RoundingMode::ToNearest_TieEven:
-        return 0b00;
-    case FP::RoundingMode::TowardsPlusInfinity:
-        return 0b10;
-    case FP::RoundingMode::TowardsMinusInfinity:
-        return 0b01;
-    case FP::RoundingMode::TowardsZero:
-        return 0b11;
-    default:
-        return std::nullopt;
-    }
-}
-
 template<size_t fsize>
 void DenormalsAreZero(BlockOfCode& code, EmitContext& ctx, std::initializer_list<Xbyak::Xmm> to_daz) {
     if (ctx.FPCR().FZ()) {
diff --git a/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp b/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp
index dfbd8da0..3952527b 100644
--- a/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp
+++ b/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp
@@ -642,6 +642,49 @@ void EmitX64::EmitFPVectorEqual64(EmitContext& ctx, IR::Inst* inst) {
     ctx.reg_alloc.DefineValue(inst, a);
 }
 
+void EmitX64::EmitFPVectorFromHalf32(EmitContext& ctx, IR::Inst* inst) {
+    const auto rounding_mode = static_cast<FP::RoundingMode>(inst->GetArg(1).GetU8());
+    const bool fpcr_controlled = inst->GetArg(2).GetU1();
+
+    if (code.HasHostFeature(HostFeature::F16C) && !ctx.FPCR().AHP() && !ctx.FPCR().FZ16()) {
+        auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+        const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+        const Xbyak::Xmm value = ctx.reg_alloc.UseXmm(args[0]);
+
+        code.vcvtph2ps(result, value);
+        ForceToDefaultNaN<32>(code, ctx.FPCR(fpcr_controlled), result);
+
+        ctx.reg_alloc.DefineValue(inst, result);
+        return;
+    }
+
+    using rounding_list = mp::list<
+        mp::lift_value<FP::RoundingMode::ToNearest_TieEven>,
+        mp::lift_value<FP::RoundingMode::TowardsPlusInfinity>,
+        mp::lift_value<FP::RoundingMode::TowardsMinusInfinity>,
+        mp::lift_value<FP::RoundingMode::TowardsZero>,
+        mp::lift_value<FP::RoundingMode::ToNearest_TieAwayFromZero>>;
+
+    static const auto lut = Common::GenerateLookupTableFromList(
+        [](auto arg) {
+            return std::pair{
+                mp::lower_to_tuple_v<decltype(arg)>,
+                Common::FptrCast(
+                    [](VectorArray<u32>& output, const VectorArray<u16>& input, FP::FPCR fpcr, FP::FPSR& fpsr) {
+                        constexpr auto t = mp::lower_to_tuple_v<decltype(arg)>;
+                        constexpr FP::RoundingMode rounding_mode = std::get<0>(t);
+
+                        for (size_t i = 0; i < output.size(); ++i) {
+                            output[i] = FP::FPConvert<u32, u16>(input[i], fpcr, rounding_mode, fpsr);
+                        }
+                    })};
+        },
+        mp::cartesian_product<rounding_list>{});
+
+    EmitTwoOpFallback<2>(code, ctx, inst, lut.at(std::make_tuple(rounding_mode)));
+}
+
 void EmitX64::EmitFPVectorFromSignedFixed32(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
     const Xbyak::Xmm xmm = ctx.reg_alloc.UseScratchXmm(args[0]);
@@ -1607,6 +1650,53 @@ void EmitX64::EmitFPVectorSub64(EmitContext& ctx, IR::Inst* inst) {
     EmitThreeOpVectorOperation<64, DefaultIndexer>(code, ctx, inst, &Xbyak::CodeGenerator::subpd);
 }
 
+void EmitX64::EmitFPVectorToHalf32(EmitContext& ctx, IR::Inst* inst) {
+    const auto rounding_mode = static_cast<FP::RoundingMode>(inst->GetArg(1).GetU8());
+    const bool fpcr_controlled = inst->GetArg(2).GetU1();
+
+    if (code.HasHostFeature(HostFeature::F16C) && !ctx.FPCR().AHP() && !ctx.FPCR().FZ16()) {
+        auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+        const auto round_imm = ConvertRoundingModeToX64Immediate(rounding_mode);
+
+        const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+
+        ForceToDefaultNaN<32>(code, ctx.FPCR(fpcr_controlled), result);
+        code.vcvtps2ph(result, result, static_cast<u8>(*round_imm));
+
+        ctx.reg_alloc.DefineValue(inst, result);
+        return;
+    }
+
+    using rounding_list = mp::list<
+        mp::lift_value<FP::RoundingMode::ToNearest_TieEven>,
+        mp::lift_value<FP::RoundingMode::TowardsPlusInfinity>,
+        mp::lift_value<FP::RoundingMode::TowardsMinusInfinity>,
+        mp::lift_value<FP::RoundingMode::TowardsZero>,
+        mp::lift_value<FP::RoundingMode::ToNearest_TieAwayFromZero>>;
+
+    static const auto lut = Common::GenerateLookupTableFromList(
+        [](auto arg) {
+            return std::pair{
+                mp::lower_to_tuple_v<decltype(arg)>,
+                Common::FptrCast(
+                    [](VectorArray<u16>& output, const VectorArray<u32>& input, FP::FPCR fpcr, FP::FPSR& fpsr) {
+                        constexpr auto t = mp::lower_to_tuple_v<decltype(arg)>;
+                        constexpr FP::RoundingMode rounding_mode = std::get<0>(t);
+
+                        for (size_t i = 0; i < output.size(); ++i) {
+                            if (i < input.size()) {
+                                output[i] = FP::FPConvert<u16, u32>(input[i], fpcr, rounding_mode, fpsr);
+                            } else {
+                                output[i] = 0;
+                            }
+                        }
+                    })};
+        },
+        mp::cartesian_product<rounding_list>{});
+
+    EmitTwoOpFallback<2>(code, ctx, inst, lut.at(std::make_tuple(rounding_mode)));
+}
+
 template<size_t fsize, bool unsigned_>
 void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
     using FPT = mp::unsigned_integer_of_size<fsize>;
diff --git a/src/dynarmic/frontend/A32/translate/impl/asimd_two_regs_misc.cpp b/src/dynarmic/frontend/A32/translate/impl/asimd_two_regs_misc.cpp
index 449d46da..b618088d 100644
--- a/src/dynarmic/frontend/A32/translate/impl/asimd_two_regs_misc.cpp
+++ b/src/dynarmic/frontend/A32/translate/impl/asimd_two_regs_misc.cpp
@@ -620,24 +620,13 @@ bool TranslatorVisitor::asimd_VCVT_half(bool D, size_t sz, size_t Vd, bool half_
     }
 
     const size_t esize = 8U << sz;
-    const size_t num_elements = 4;
     const auto rounding_mode = FP::RoundingMode::ToNearest_TieEven;  // StandardFPSCRValue().RMode
     const auto d = ToVector(half_to_single, Vd, D);
     const auto m = ToVector(!half_to_single, Vm, M);
 
     const auto operand = ir.GetVector(m);
-    IR::U128 result = ir.ZeroVector();
-    for (size_t i = 0; i < num_elements; i++) {
-        if (half_to_single) {
-            const IR::U16 old_element = ir.VectorGetElement(esize, operand, i);
-            const IR::U32 new_element = ir.FPHalfToSingle(old_element, rounding_mode);
-            result = ir.VectorSetElement(esize * 2, result, i, new_element);
-        } else {
-            const IR::U32 old_element = ir.VectorGetElement(esize * 2, operand, i);
-            const IR::U16 new_element = ir.FPSingleToHalf(old_element, rounding_mode);
-            result = ir.VectorSetElement(esize, result, i, new_element);
-        }
-    }
+    const IR::U128 result = half_to_single ? ir.FPVectorFromHalf(esize * 2, operand, rounding_mode, false)
+                                           : ir.FPVectorToHalf(esize * 2, operand, rounding_mode, false);
     ir.SetVector(d, result);
     return true;
 }
diff --git a/src/dynarmic/ir/ir_emitter.cpp b/src/dynarmic/ir/ir_emitter.cpp
index bd6c1ebc..2f289d67 100644
--- a/src/dynarmic/ir/ir_emitter.cpp
+++ b/src/dynarmic/ir/ir_emitter.cpp
@@ -2404,6 +2404,11 @@ U128 IREmitter::FPVectorEqual(size_t esize, const U128& a, const U128& b, bool f
     UNREACHABLE();
 }
 
+U128 IREmitter::FPVectorFromHalf(size_t esize, const U128& a, FP::RoundingMode rounding, bool fpcr_controlled) {
+    ASSERT(esize == 32);
+    return Inst<U128>(Opcode::FPVectorFromHalf32, a, Imm8(static_cast<u8>(rounding)), Imm1(fpcr_controlled));
+}
+
 U128 IREmitter::FPVectorFromSignedFixed(size_t esize, const U128& a, size_t fbits, FP::RoundingMode rounding, bool fpcr_controlled) {
     ASSERT(fbits <= esize);
     switch (esize) {
@@ -2613,6 +2618,11 @@ U128 IREmitter::FPVectorSub(size_t esize, const U128& a, const U128& b, bool fpc
     UNREACHABLE();
 }
 
+U128 IREmitter::FPVectorToHalf(size_t esize, const U128& a, FP::RoundingMode rounding, bool fpcr_controlled) {
+    ASSERT(esize == 32);
+    return Inst<U128>(Opcode::FPVectorToHalf32, a, Imm8(static_cast<u8>(rounding)), Imm1(fpcr_controlled));
+}
+
 U128 IREmitter::FPVectorToSignedFixed(size_t esize, const U128& a, size_t fbits, FP::RoundingMode rounding, bool fpcr_controlled) {
     ASSERT(fbits <= esize);
 
diff --git a/src/dynarmic/ir/ir_emitter.h b/src/dynarmic/ir/ir_emitter.h
index db542136..caba937d 100644
--- a/src/dynarmic/ir/ir_emitter.h
+++ b/src/dynarmic/ir/ir_emitter.h
@@ -370,6 +370,7 @@ public:
     U128 FPVectorAdd(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
     U128 FPVectorDiv(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
     U128 FPVectorEqual(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
+    U128 FPVectorFromHalf(size_t esize, const U128& a, FP::RoundingMode rounding, bool fpcr_controlled = true);
     U128 FPVectorFromSignedFixed(size_t esize, const U128& a, size_t fbits, FP::RoundingMode rounding, bool fpcr_controlled = true);
     U128 FPVectorFromUnsignedFixed(size_t esize, const U128& a, size_t fbits, FP::RoundingMode rounding, bool fpcr_controlled = true);
     U128 FPVectorGreater(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
@@ -389,6 +390,7 @@
     U128 FPVectorRSqrtStepFused(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
     U128 FPVectorSqrt(size_t esize, const U128& a, bool fpcr_controlled = true);
     U128 FPVectorSub(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
+    U128 FPVectorToHalf(size_t esize, const U128& a, FP::RoundingMode rounding, bool fpcr_controlled = true);
     U128 FPVectorToSignedFixed(size_t esize, const U128& a, size_t fbits, FP::RoundingMode rounding, bool fpcr_controlled = true);
     U128 FPVectorToUnsignedFixed(size_t esize, const U128& a, size_t fbits, FP::RoundingMode rounding, bool fpcr_controlled = true);
 
diff --git a/src/dynarmic/ir/opcodes.inc b/src/dynarmic/ir/opcodes.inc
index 50c36439..f8995b19 100644
--- a/src/dynarmic/ir/opcodes.inc
+++ b/src/dynarmic/ir/opcodes.inc
@@ -613,6 +613,7 @@ OPCODE(FPVectorDiv64,                      U128,       U128
 OPCODE(FPVectorEqual16,                    U128,       U128,       U128,       U1                      )
 OPCODE(FPVectorEqual32,                    U128,       U128,       U128,       U1                      )
 OPCODE(FPVectorEqual64,                    U128,       U128,       U128,       U1                      )
+OPCODE(FPVectorFromHalf32,                 U128,       U128,       U8,         U1                      )
 OPCODE(FPVectorFromSignedFixed32,          U128,       U128,       U8,         U8,         U1          )
 OPCODE(FPVectorFromSignedFixed64,          U128,       U128,       U8,         U8,         U1          )
 OPCODE(FPVectorFromUnsignedFixed32,        U128,       U128,       U8,         U8,         U1          )
@@ -658,6 +659,7 @@ OPCODE(FPVectorSqrt32,                     U128,       U128
 OPCODE(FPVectorSqrt64,                     U128,       U128,       U1                                  )
 OPCODE(FPVectorSub32,                      U128,       U128,       U128,       U1                      )
 OPCODE(FPVectorSub64,                      U128,       U128,       U128,       U1                      )
+OPCODE(FPVectorToHalf32,                   U128,       U128,       U8,         U1                      )
 OPCODE(FPVectorToSignedFixed16,            U128,       U128,       U8,         U8,         U1          )
 OPCODE(FPVectorToSignedFixed32,            U128,       U128,       U8,         U8,         U1          )
 OPCODE(FPVectorToSignedFixed64,            U128,       U128,       U8,         U8,         U1          )