From 7162f6f254cd6df1dc07c433055075c05f552585 Mon Sep 17 00:00:00 2001 From: MerryMage Date: Sat, 22 Sep 2018 17:25:19 +0100 Subject: [PATCH] emit_x64_vector_floating_point: SSE4.1 implementation of EmitFPVectorToFixed --- .../x64/emit_x64_vector_floating_point.cpp | 110 ++++++++++++++++++ 1 file changed, 110 insertions(+) diff --git a/src/backend/x64/emit_x64_vector_floating_point.cpp b/src/backend/x64/emit_x64_vector_floating_point.cpp index 4039554b..b4c8b07b 100644 --- a/src/backend/x64/emit_x64_vector_floating_point.cpp +++ b/src/backend/x64/emit_x64_vector_floating_point.cpp @@ -5,6 +5,7 @@ */ #include +#include #include #include #include @@ -183,6 +184,19 @@ void ForceToDefaultNaN(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm result) { } } +template +void ZeroIfNaN(BlockOfCode& code, Xbyak::Xmm result) { + const Xbyak::Xmm nan_mask = xmm0; + if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { + FCODE(vcmpordp)(nan_mask, result, result); + FCODE(vandp)(result, result, nan_mask); + } else { + code.movaps(nan_mask, result); + FCODE(cmpordp)(nan_mask, nan_mask); + code.andps(result, nan_mask); + } +} + template void DenormalsAreZero(BlockOfCode& code, EmitContext& ctx, std::initializer_list to_daz, Xbyak::Xmm tmp) { if (ctx.FPSCR_FTZ()) { @@ -1237,6 +1251,102 @@ void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { const size_t fbits = inst->GetArg(1).GetU8(); const auto rounding = static_cast(inst->GetArg(2).GetU8()); + // TODO: AVX512 implementation + + if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41) && rounding != FP::RoundingMode::ToNearest_TieAwayFromZero) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Xmm src = ctx.reg_alloc.UseScratchXmm(args[0]); + + const int round_imm = [&]{ + switch (rounding) { + case FP::RoundingMode::ToNearest_TieEven: + default: + return 0b00; + case FP::RoundingMode::TowardsPlusInfinity: + return 0b10; + case FP::RoundingMode::TowardsMinusInfinity: + return 0b01; + case 
FP::RoundingMode::TowardsZero: + return 0b11; + } + }(); + + const auto perform_conversion = [&code, &ctx](const Xbyak::Xmm& src) { + // MSVC doesn't allow us to use a [&] capture, so we have to do this instead. + (void)ctx; + + if constexpr (fsize == 32) { + code.cvttps2dq(src, src); + } else { + const Xbyak::Reg64 hi = ctx.reg_alloc.ScratchGpr(); + const Xbyak::Reg64 lo = ctx.reg_alloc.ScratchGpr(); + + code.cvttsd2si(lo, src); + code.punpckhqdq(src, src); + code.cvttsd2si(hi, src); + code.movq(src, lo); + code.pinsrq(src, hi, 1); + + ctx.reg_alloc.Release(hi); + ctx.reg_alloc.Release(lo); + } + }; + + if (fbits != 0) { + const u64 scale_factor = fsize == 32 + ? static_cast(fbits + 127) << 23 + : static_cast(fbits + 1023) << 52; + FCODE(mulp)(src, GetVectorOf(code, scale_factor)); + } + + FCODE(roundp)(src, src, static_cast(round_imm)); + ZeroIfNaN(code, src); + + constexpr u64 float_upper_limit_signed = fsize == 32 ? 0x4f000000 : 0x43e0000000000000; + [[maybe_unused]] constexpr u64 float_upper_limit_unsigned = fsize == 32 ? 0x4f800000 : 0x43f0000000000000; + + if constexpr (unsigned_) { + // Zero is minimum + code.xorps(xmm0, xmm0); + FCODE(cmplep)(xmm0, src); + FCODE(andp)(src, xmm0); + + // Will we exceed unsigned range? + const Xbyak::Xmm exceed_unsigned = ctx.reg_alloc.ScratchXmm(); + code.movaps(exceed_unsigned, GetVectorOf(code)); + FCODE(cmplep)(exceed_unsigned, src); + + // Will we exceed signed range? 
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); + code.movaps(tmp, GetVectorOf(code)); + code.movaps(xmm0, tmp); + FCODE(cmplep)(xmm0, src); + FCODE(andp)(tmp, xmm0); + FCODE(subp)(src, tmp); + perform_conversion(src); + if constexpr (fsize == 32) { + code.pslld(xmm0, 31); + } else { + code.psllq(xmm0, 63); + } + FCODE(orp)(src, xmm0); + + // Saturate to max + FCODE(orp)(src, exceed_unsigned); + } else { + constexpr u64 integer_max = static_cast(std::numeric_limits>>::max()); + + code.movaps(xmm0, GetVectorOf(code)); + FCODE(cmplep)(xmm0, src); + perform_conversion(src); + FCODE(blendvp)(src, GetVectorOf(code)); + } + + ctx.reg_alloc.DefineValue(inst, src); + return; + } + using fbits_list = mp::vllift>; using rounding_list = mp::list< std::integral_constant,