From 25b28bb234e9bfed06ca7bbfe9e1313dbaa98467 Mon Sep 17 00:00:00 2001
From: MerryMage
Date: Wed, 25 Jul 2018 11:29:50 +0100
Subject: [PATCH] emit_x64_vector_floating_point: Simplify EmitVectorOperation*

---
 .../emit_x64_vector_floating_point.cpp | 171 +++++++-----------
 1 file changed, 66 insertions(+), 105 deletions(-)

diff --git a/src/backend_x64/emit_x64_vector_floating_point.cpp b/src/backend_x64/emit_x64_vector_floating_point.cpp
index c963bf83..c312c637 100644
--- a/src/backend_x64/emit_x64_vector_floating_point.cpp
+++ b/src/backend_x64/emit_x64_vector_floating_point.cpp
@@ -4,16 +4,20 @@
  * General Public License version 2 or any later version.
  */
 
+#include <array>
 #include <type_traits>
+#include <utility>
 
 #include "backend_x64/abi.h"
 #include "backend_x64/block_of_code.h"
 #include "backend_x64/emit_x64.h"
 #include "common/bit_util.h"
 #include "common/fp/fpcr.h"
+#include "common/fp/info.h"
 #include "common/fp/op.h"
 #include "common/fp/util.h"
 #include "common/mp/function_info.h"
+#include "common/mp/integer.h"
 #include "frontend/ir/basic_block.h"
 #include "frontend/ir/microinstruction.h"
 
@@ -22,23 +26,38 @@ namespace Dynarmic::BackendX64 {
 using namespace Xbyak::util;
 namespace mp = Common::mp;
 
-template <typename T>
-struct NaNWrapper;
+template <size_t fsize, typename T>
+static T ChooseOnFsize(T f32, T f64) {
+    static_assert(fsize == 32 || fsize == 64, "fsize must be either 32 or 64");
 
-template <>
-struct NaNWrapper<u32> {
-    static constexpr u32 value = 0x7fc00000;
-};
+    if constexpr (fsize == 32) {
+        return f32;
+    } else {
+        return f64;
+    }
+}
 
-template <>
-struct NaNWrapper<u64> {
-    static constexpr u64 value = 0x7ff8'0000'0000'0000;
-};
+#define FCODE(NAME) (code.*ChooseOnFsize<fsize>(&Xbyak::CodeGenerator::NAME##s, &Xbyak::CodeGenerator::NAME##d))
 
-template <typename T, auto IndexFunction>
-static void HandleNaNs(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& xmm_a,
-                       const Xbyak::Xmm& xmm_b, const Xbyak::Xmm& result, const Xbyak::Xmm& nan_mask) {
-    static_assert(std::is_same_v<T, u32> || std::is_same_v<T, u64>, "T must be either u32 or u64");
+template <typename T, auto IndexFunction, size_t... argi>
+static auto GetRuntimeNaNFunction(std::index_sequence<argi...>) {
+    auto result = [](std::array<VectorArray<T>, sizeof...(argi) + 1>& values) {
+        VectorArray<T>& result = values[0];
+        for (size_t elementi = 0; elementi < result.size(); ++elementi) {
+            const auto current_values = IndexFunction(elementi, values[argi + 1]...);
+            if (auto r = FP::ProcessNaNs(std::get<argi>(current_values)...)) {
+                result[elementi] = *r;
+            } else if (FP::IsNaN(result[elementi])) {
+                result[elementi] = FP::FPInfo<T>::DefaultNaN();
+            }
+        }
+    };
+    return static_cast<mp::equivalent_function_type_t<decltype(result)>*>(result);
+}
+
+template <size_t fsize, size_t nargs, auto IndexFunction>
+static void HandleNaNs(BlockOfCode& code, EmitContext& ctx, std::array<Xbyak::Xmm, nargs + 1> xmms, const Xbyak::Xmm& nan_mask) {
+    static_assert(fsize == 32 || fsize == 64, "fsize must be either 32 or 64");
 
     if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) {
         code.ptest(nan_mask, nan_mask);
@@ -57,29 +76,21 @@ static void HandleNaNs(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xm
 
     code.SwitchToFarCode();
     code.L(nan);
+
+    const Xbyak::Xmm result = xmms[0];
+
     code.sub(rsp, 8);
     ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
 
-    const size_t stack_space = 3 * 16;
-    code.sub(rsp, stack_space + ABI_SHADOW_SPACE);
-    code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE + 0 * 16]);
-    code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + 1 * 16]);
-    code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE + 2 * 16]);
-    code.movaps(xword[code.ABI_PARAM1], result);
-    code.movaps(xword[code.ABI_PARAM2], xmm_a);
-    code.movaps(xword[code.ABI_PARAM3], xmm_b);
-    code.CallFunction(static_cast<void(*)(VectorArray<T>&, const VectorArray<T>&, const VectorArray<T>&)>(
-        [](VectorArray<T>& result, const VectorArray<T>& a, const VectorArray<T>& b) {
-            for (size_t i = 0; i < result.size(); ++i) {
-                auto [first, second] = IndexFunction(i, a, b);
-                if (auto r = FP::ProcessNaNs(first, second)) {
-                    result[i] = *r;
-                } else if (FP::IsNaN(result[i])) {
-                    result[i] = NaNWrapper<T>::value;
-                }
-            }
-        }
-    ));
+    const size_t stack_space = xmms.size() * 16;
+    code.sub(rsp, stack_space + ABI_SHADOW_SPACE);
+    for (size_t i = 0; i < xmms.size(); ++i) {
+        code.movaps(xword[rsp + ABI_SHADOW_SPACE + i * 16], xmms[i]);
+    }
+    code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE + 0 * 16]);
+
+    using T = mp::unsigned_integer_of_size<fsize>;
+    code.CallFunction(GetRuntimeNaNFunction<T, IndexFunction>(std::make_index_sequence<nargs>{}));
 
     code.movaps(result, xword[rsp + ABI_SHADOW_SPACE + 0 * 16]);
     code.add(rsp, stack_space + ABI_SHADOW_SPACE);
@@ -123,8 +134,10 @@ static std::tuple<u64, u64> PairedLowerIndexFunction64(size_t i, const VectorArr
     return i == 0 ? std::make_tuple(a[0], b[0]) : std::make_tuple(u64(0), u64(0));
 }
 
-template <auto IndexFunction, typename Function>
-static void EmitVectorOperation32(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
+template <size_t fsize, auto IndexFunction, typename Function>
+static void EmitThreeOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
+    static_assert(fsize == 32 || fsize == 64, "fsize must be either 32 or 64");
+
     if (!ctx.AccurateNaN() || ctx.FPSCR_DN()) {
         auto args = ctx.reg_alloc.GetArgumentInfo(inst);
         Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
@@ -141,10 +154,10 @@ static void EmitVectorOperation32(BlockOfCode& code, EmitContext& ctx, IR::Inst*
             Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
             code.pcmpeqw(tmp, tmp);
             code.movaps(nan_mask, xmm_a);
-            code.cmpordps(nan_mask, nan_mask);
+            FCODE(cmpordp)(nan_mask, nan_mask);
             code.andps(xmm_a, nan_mask);
            code.xorps(nan_mask, tmp);
-            code.andps(nan_mask, code.MConst(xword, 0x7fc0'0000'7fc0'0000, 0x7fc0'0000'7fc0'0000));
+            code.andps(nan_mask, fsize == 32 ? code.MConst(xword, 0x7fc0'0000'7fc0'0000, 0x7fc0'0000'7fc0'0000) : code.MConst(xword, 0x7ff8'0000'0000'0000, 0x7ff8'0000'0000'0000));
             code.orps(xmm_a, nan_mask);
         }
 
@@ -161,67 +174,15 @@ static void EmitVectorOperation32(BlockOfCode& code, EmitContext& ctx, IR::Inst*
 
     code.movaps(nan_mask, xmm_b);
     code.movaps(result, xmm_a);
-    code.cmpunordps(nan_mask, xmm_a);
+    FCODE(cmpunordp)(nan_mask, xmm_a);
     if constexpr (std::is_member_function_pointer_v<Function>) {
         (code.*fn)(result, xmm_b);
     } else {
         fn(result, xmm_b);
     }
-    code.cmpunordps(nan_mask, result);
+    FCODE(cmpunordp)(nan_mask, result);
 
-    HandleNaNs<u32, IndexFunction>(code, ctx, xmm_a, xmm_b, result, nan_mask);
-
-    ctx.reg_alloc.DefineValue(inst, result);
-}
-
-template <auto IndexFunction, typename Function>
-static void EmitVectorOperation64(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
-    if (!ctx.AccurateNaN() || ctx.FPSCR_DN()) {
-        auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-        Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
-        Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
-
-        if constexpr (std::is_member_function_pointer_v<Function>) {
-            (code.*fn)(xmm_a, xmm_b);
-        } else {
-            fn(xmm_a, xmm_b);
-        }
-
-        if (ctx.FPSCR_DN()) {
-            Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm();
-            Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
-            code.pcmpeqw(tmp, tmp);
-            code.movaps(nan_mask, xmm_a);
-            code.cmpordpd(nan_mask, nan_mask);
-            code.andps(xmm_a, nan_mask);
-            code.xorps(nan_mask, tmp);
-            code.andps(nan_mask, code.MConst(xword, 0x7ff8'0000'0000'0000, 0x7ff8'0000'0000'0000));
-            code.orps(xmm_a, nan_mask);
-        }
-
-        ctx.reg_alloc.DefineValue(inst, xmm_a);
-        return;
-    }
-
-    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
-    Xbyak::Label end, nan;
-    Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
-    Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(args[0]);
-    Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
-    Xbyak::Xmm nan_mask = ctx.reg_alloc.ScratchXmm();
-
-    code.movaps(nan_mask, xmm_b);
-    code.movaps(result, xmm_a);
-    code.cmpunordpd(nan_mask, xmm_a);
-    if constexpr (std::is_member_function_pointer_v<Function>) {
-        (code.*fn)(result, xmm_b);
-    } else {
-        fn(result, xmm_b);
-    }
-    code.cmpunordpd(nan_mask, result);
-
-    HandleNaNs<u64, IndexFunction>(code, ctx, xmm_a, xmm_b, result, nan_mask);
+    HandleNaNs<fsize, 2, IndexFunction>(code, ctx, {result, xmm_a, xmm_b}, nan_mask);
 
     ctx.reg_alloc.DefineValue(inst, result);
 }
@@ -329,19 +290,19 @@ void EmitX64::EmitFPVectorAbs64(EmitContext& ctx, IR::Inst* inst) {
 }
 
 void EmitX64::EmitFPVectorAdd32(EmitContext& ctx, IR::Inst* inst) {
-    EmitVectorOperation32<DefaultIndexFunction32>(code, ctx, inst, &Xbyak::CodeGenerator::addps);
+    EmitThreeOpVectorOperation<32, DefaultIndexFunction32>(code, ctx, inst, &Xbyak::CodeGenerator::addps);
 }
 
 void EmitX64::EmitFPVectorAdd64(EmitContext& ctx, IR::Inst* inst) {
-    EmitVectorOperation64<DefaultIndexFunction64>(code, ctx, inst, &Xbyak::CodeGenerator::addpd);
+    EmitThreeOpVectorOperation<64, DefaultIndexFunction64>(code, ctx, inst, &Xbyak::CodeGenerator::addpd);
 }
 
 void EmitX64::EmitFPVectorDiv32(EmitContext& ctx, IR::Inst* inst) {
-    EmitVectorOperation32<DefaultIndexFunction32>(code, ctx, inst, &Xbyak::CodeGenerator::divps);
+    EmitThreeOpVectorOperation<32, DefaultIndexFunction32>(code, ctx, inst, &Xbyak::CodeGenerator::divps);
 }
 
 void EmitX64::EmitFPVectorDiv64(EmitContext& ctx, IR::Inst* inst) {
-    EmitVectorOperation64<DefaultIndexFunction64>(code, ctx, inst, &Xbyak::CodeGenerator::divpd);
+    EmitThreeOpVectorOperation<64, DefaultIndexFunction64>(code, ctx, inst, &Xbyak::CodeGenerator::divpd);
 }
 
 void EmitX64::EmitFPVectorEqual32(EmitContext& ctx, IR::Inst* inst) {
@@ -405,23 +366,23 @@ void EmitX64::EmitFPVectorGreaterEqual64(EmitContext& ctx, IR::Inst* inst) {
 }
 
 void EmitX64::EmitFPVectorMul32(EmitContext& ctx, IR::Inst* inst) {
-    EmitVectorOperation32<DefaultIndexFunction32>(code, ctx, inst, &Xbyak::CodeGenerator::mulps);
+    EmitThreeOpVectorOperation<32, DefaultIndexFunction32>(code, ctx, inst, &Xbyak::CodeGenerator::mulps);
 }
 
 void EmitX64::EmitFPVectorMul64(EmitContext& ctx, IR::Inst* inst) {
-    EmitVectorOperation64<DefaultIndexFunction64>(code, ctx, inst, &Xbyak::CodeGenerator::mulpd);
+    EmitThreeOpVectorOperation<64, DefaultIndexFunction64>(code, ctx, inst, &Xbyak::CodeGenerator::mulpd);
 }
 
 void EmitX64::EmitFPVectorPairedAdd32(EmitContext& ctx, IR::Inst* inst) {
-    EmitVectorOperation32<PairedIndexFunction32>(code, ctx, inst, &Xbyak::CodeGenerator::haddps);
+    EmitThreeOpVectorOperation<32, PairedIndexFunction32>(code, ctx, inst, &Xbyak::CodeGenerator::haddps);
 }
 
 void EmitX64::EmitFPVectorPairedAdd64(EmitContext& ctx, IR::Inst* inst) {
-    EmitVectorOperation64<PairedIndexFunction64>(code, ctx, inst, &Xbyak::CodeGenerator::haddpd);
+    EmitThreeOpVectorOperation<64, PairedIndexFunction64>(code, ctx, inst, &Xbyak::CodeGenerator::haddpd);
 }
 
 void EmitX64::EmitFPVectorPairedAddLower32(EmitContext& ctx, IR::Inst* inst) {
-    EmitVectorOperation32<PairedLowerIndexFunction32>(code, ctx, inst, [&](Xbyak::Xmm result, Xbyak::Xmm xmm_b) {
+    EmitThreeOpVectorOperation<32, PairedLowerIndexFunction32>(code, ctx, inst, [&](Xbyak::Xmm result, Xbyak::Xmm xmm_b) {
         const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm();
         code.xorps(zero, zero);
         code.punpcklqdq(result, xmm_b);
@@ -430,7 +391,7 @@ void EmitX64::EmitFPVectorPairedAddLower32(EmitContext& ctx, IR::Inst* inst) {
 }
 
 void EmitX64::EmitFPVectorPairedAddLower64(EmitContext& ctx, IR::Inst* inst) {
-    EmitVectorOperation64<PairedLowerIndexFunction64>(code, ctx, inst, [&](Xbyak::Xmm result, Xbyak::Xmm xmm_b) {
+    EmitThreeOpVectorOperation<64, PairedLowerIndexFunction64>(code, ctx, inst, [&](Xbyak::Xmm result, Xbyak::Xmm xmm_b) {
         const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm();
         code.xorps(zero, zero);
         code.punpcklqdq(result, xmm_b);
@@ -523,11 +484,11 @@ void EmitX64::EmitFPVectorS64ToDouble(EmitContext& ctx, IR::Inst* inst) {
 }
 
 void EmitX64::EmitFPVectorSub32(EmitContext& ctx, IR::Inst* inst) {
-    EmitVectorOperation32<DefaultIndexFunction32>(code, ctx, inst, &Xbyak::CodeGenerator::subps);
+    EmitThreeOpVectorOperation<32, DefaultIndexFunction32>(code, ctx, inst, &Xbyak::CodeGenerator::subps);
 }
 
 void EmitX64::EmitFPVectorSub64(EmitContext& ctx, IR::Inst* inst) {
-    EmitVectorOperation64<DefaultIndexFunction64>(code, ctx, inst, &Xbyak::CodeGenerator::subpd);
+    EmitThreeOpVectorOperation<64, DefaultIndexFunction64>(code, ctx, inst, &Xbyak::CodeGenerator::subpd);
 }
 
 void EmitX64::EmitFPVectorU32ToSingle(EmitContext& ctx, IR::Inst* inst) {