From 82417da7803e2cf18efc28a1cd3f3d0a4b6045ae Mon Sep 17 00:00:00 2001
From: MerryMage
Date: Sat, 11 Jul 2020 14:02:55 +0100
Subject: [PATCH] emit_x64{_vector}_floating_point: Add unsafe optimizations
 for RSqrtEstimate and RecipEstimate
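
The slow path computes these estimates bit-exactly through the softfloat
helpers FP::FPRecipEstimate and FP::FPRSqrtEstimate. With unsafe
optimizations enabled, emit the x64 approximation instructions directly
instead: rcpss/rcpps for RecipEstimate and rsqrtss/rsqrtps for
RSqrtEstimate, with a round-trip through single precision for the 64-bit
variants. These instructions keep the relative error within 1.5 * 2^-12,
but their results are not bit-identical to the ARM-defined estimates,
hence "unsafe". The fp16 variants keep the fallback.

A minimal standalone check of the rcpss approximation error (an
illustrative sketch using SSE intrinsics, not code from this patch):

    #include <cstdio>
    #include <xmmintrin.h>

    int main() {
        const float x = 3.0f;
        // rcpss: the approximation used on the unsafe path.
        const float approx = _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ss(x)));
        const float exact = 1.0f / x;
        std::printf("approx=%.9g exact=%.9g rel_err=%.3g\n",
                    approx, exact, (approx - exact) / exact);
    }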
---
 src/backend/x64/emit_x64_floating_point.cpp  | 68 +++++++++++++++----
 .../x64/emit_x64_vector_floating_point.cpp   | 58 +++++++++++++---
 2 files changed, 106 insertions(+), 20 deletions(-)

diff --git a/src/backend/x64/emit_x64_floating_point.cpp b/src/backend/x64/emit_x64_floating_point.cpp
index 2f170bbe..5efd5258 100644
--- a/src/backend/x64/emit_x64_floating_point.cpp
+++ b/src/backend/x64/emit_x64_floating_point.cpp
@@ -738,8 +738,29 @@ void EmitX64::EmitFPMulX64(EmitContext& ctx, IR::Inst* inst) {
     EmitFPMulX<64>(code, ctx, inst);
 }
 
-template<typename FPT>
+template<size_t fsize>
 static void EmitFPRecipEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+    using FPT = mp::unsigned_integer_of_size<fsize>;
+
+    if constexpr (fsize != 16) {
+        if (ctx.UnsafeOptimizations()) {
+            auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+            const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(args[0]);
+            const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+
+            if constexpr (fsize == 32) {
+                code.rcpss(result, operand);
+            } else {
+                code.cvtsd2ss(result, operand);
+                code.rcpss(result, result);
+                code.cvtss2sd(result, result);
+            }
+
+            ctx.reg_alloc.DefineValue(inst, result);
+            return;
+        }
+    }
+
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
     ctx.reg_alloc.HostCall(inst, args[0]);
     code.mov(code.ABI_PARAM2.cvt32(), ctx.FPCR().Value());
@@ -748,19 +769,21 @@ static void EmitFPRecipEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* i
 }
 
 void EmitX64::EmitFPRecipEstimate16(EmitContext& ctx, IR::Inst* inst) {
-    EmitFPRecipEstimate<u16>(code, ctx, inst);
+    EmitFPRecipEstimate<16>(code, ctx, inst);
 }
 
 void EmitX64::EmitFPRecipEstimate32(EmitContext& ctx, IR::Inst* inst) {
-    EmitFPRecipEstimate<u32>(code, ctx, inst);
+    EmitFPRecipEstimate<32>(code, ctx, inst);
 }
 
 void EmitX64::EmitFPRecipEstimate64(EmitContext& ctx, IR::Inst* inst) {
-    EmitFPRecipEstimate<u64>(code, ctx, inst);
+    EmitFPRecipEstimate<64>(code, ctx, inst);
 }
 
-template<typename FPT>
+template<size_t fsize>
 static void EmitFPRecipExponent(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+    using FPT = mp::unsigned_integer_of_size<fsize>;
+
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
     ctx.reg_alloc.HostCall(inst, args[0]);
     code.mov(code.ABI_PARAM2.cvt32(), ctx.FPCR().Value());
@@ -769,15 +792,15 @@ static void EmitFPRecipExponent(BlockOfCode& code, EmitContext& ctx, IR::Inst* i
 }
 
 void EmitX64::EmitFPRecipExponent16(EmitContext& ctx, IR::Inst* inst) {
-    EmitFPRecipExponent<u16>(code, ctx, inst);
+    EmitFPRecipExponent<16>(code, ctx, inst);
 }
 
 void EmitX64::EmitFPRecipExponent32(EmitContext& ctx, IR::Inst* inst) {
-    EmitFPRecipExponent<u32>(code, ctx, inst);
+    EmitFPRecipExponent<32>(code, ctx, inst);
 }
 
 void EmitX64::EmitFPRecipExponent64(EmitContext& ctx, IR::Inst* inst) {
-    EmitFPRecipExponent<u64>(code, ctx, inst);
+    EmitFPRecipExponent<64>(code, ctx, inst);
 }
 
 template<size_t fsize>
@@ -911,8 +934,29 @@ void EmitX64::EmitFPRoundInt64(EmitContext& ctx, IR::Inst* inst) {
     EmitFPRound(code, ctx, inst, 64);
 }
 
-template<typename FPT>
+template<size_t fsize>
 static void EmitFPRSqrtEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+    using FPT = mp::unsigned_integer_of_size<fsize>;
+
+    if constexpr (fsize != 16) {
+        if (ctx.UnsafeOptimizations()) {
+            auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+            const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(args[0]);
+            const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+
+            if constexpr (fsize == 32) {
+                code.rsqrtss(result, operand);
+            } else {
+                code.cvtsd2ss(result, operand);
+                code.rsqrtss(result, result);
+                code.cvtss2sd(result, result);
+            }
+
+            ctx.reg_alloc.DefineValue(inst, result);
+            return;
+        }
+    }
+
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
     ctx.reg_alloc.HostCall(inst, args[0]);
     code.mov(code.ABI_PARAM2.cvt32(), ctx.FPCR().Value());
@@ -921,15 +965,15 @@ static void EmitFPRSqrtEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* i
 }
 
 void EmitX64::EmitFPRSqrtEstimate16(EmitContext& ctx, IR::Inst* inst) {
-    EmitFPRSqrtEstimate<u16>(code, ctx, inst);
+    EmitFPRSqrtEstimate<16>(code, ctx, inst);
 }
 
 void EmitX64::EmitFPRSqrtEstimate32(EmitContext& ctx, IR::Inst* inst) {
-    EmitFPRSqrtEstimate<u32>(code, ctx, inst);
+    EmitFPRSqrtEstimate<32>(code, ctx, inst);
 }
 
 void EmitX64::EmitFPRSqrtEstimate64(EmitContext& ctx, IR::Inst* inst) {
-    EmitFPRSqrtEstimate<u64>(code, ctx, inst);
+    EmitFPRSqrtEstimate<64>(code, ctx, inst);
 }
 
 template<size_t fsize>
diff --git a/src/backend/x64/emit_x64_vector_floating_point.cpp b/src/backend/x64/emit_x64_vector_floating_point.cpp
index e1ac36a2..f5fe2c84 100644
--- a/src/backend/x64/emit_x64_vector_floating_point.cpp
+++ b/src/backend/x64/emit_x64_vector_floating_point.cpp
@@ -1178,8 +1178,29 @@ void EmitX64::EmitFPVectorPairedAddLower64(EmitContext& ctx, IR::Inst* inst) {
     });
 }
 
-template<typename FPT>
+template<size_t fsize>
 static void EmitRecipEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+    using FPT = mp::unsigned_integer_of_size<fsize>;
+
+    if constexpr (fsize != 16) {
+        if (ctx.UnsafeOptimizations()) {
+            auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+            const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(args[0]);
+            const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+
+            if constexpr (fsize == 32) {
+                code.rcpps(result, operand);
+            } else {
+                code.cvtpd2ps(result, operand);
+                code.rcpps(result, result);
+                code.cvtps2pd(result, result);
+            }
+
+            ctx.reg_alloc.DefineValue(inst, result);
+            return;
+        }
+    }
+
     EmitTwoOpFallback(code, ctx, inst, [](VectorArray<FPT>& result, const VectorArray<FPT>& operand, FP::FPCR fpcr, FP::FPSR& fpsr) {
         for (size_t i = 0; i < result.size(); i++) {
             result[i] = FP::FPRecipEstimate<FPT>(operand[i], fpcr, fpsr);
@@ -1188,15 +1209,15 @@ static void EmitRecipEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* ins
 }
 
 void EmitX64::EmitFPVectorRecipEstimate16(EmitContext& ctx, IR::Inst* inst) {
-    EmitRecipEstimate<u16>(code, ctx, inst);
+    EmitRecipEstimate<16>(code, ctx, inst);
 }
 
 void EmitX64::EmitFPVectorRecipEstimate32(EmitContext& ctx, IR::Inst* inst) {
-    EmitRecipEstimate<u32>(code, ctx, inst);
+    EmitRecipEstimate<32>(code, ctx, inst);
 }
 
 void EmitX64::EmitFPVectorRecipEstimate64(EmitContext& ctx, IR::Inst* inst) {
-    EmitRecipEstimate<u64>(code, ctx, inst);
+    EmitRecipEstimate<64>(code, ctx, inst);
 }
 
 template<size_t fsize>
@@ -1337,8 +1358,29 @@ void EmitX64::EmitFPVectorRoundInt64(EmitContext& ctx, IR::Inst* inst) {
     EmitFPVectorRoundInt<64>(code, ctx, inst);
 }
 
-template<typename FPT>
+template<size_t fsize>
 static void EmitRSqrtEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+    using FPT = mp::unsigned_integer_of_size<fsize>;
+
+    if constexpr (fsize != 16) {
+        if (ctx.UnsafeOptimizations()) {
+            auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+            const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(args[0]);
+            const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+
+            if constexpr (fsize == 32) {
+                code.rsqrtps(result, operand);
+            } else {
+                code.cvtpd2ps(result, operand);
+                code.rsqrtps(result, result);
+                code.cvtps2pd(result, result);
+            }
+
+            ctx.reg_alloc.DefineValue(inst, result);
+            return;
+        }
+    }
+
     EmitTwoOpFallback(code, ctx, inst, [](VectorArray<FPT>& result, const VectorArray<FPT>& operand, FP::FPCR fpcr, FP::FPSR& fpsr) {
         for (size_t i = 0; i < result.size(); i++) {
             result[i] = FP::FPRSqrtEstimate<FPT>(operand[i], fpcr, fpsr);
@@ -1347,15 +1389,15 @@ static void EmitRSqrtEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* ins
 }
 
 void EmitX64::EmitFPVectorRSqrtEstimate16(EmitContext& ctx, IR::Inst* inst) {
-    EmitRSqrtEstimate<u16>(code, ctx, inst);
+    EmitRSqrtEstimate<16>(code, ctx, inst);
 }
 
 void EmitX64::EmitFPVectorRSqrtEstimate32(EmitContext& ctx, IR::Inst* inst) {
-    EmitRSqrtEstimate<u32>(code, ctx, inst);
+    EmitRSqrtEstimate<32>(code, ctx, inst);
 }
 
 void EmitX64::EmitFPVectorRSqrtEstimate64(EmitContext& ctx, IR::Inst* inst) {
-    EmitRSqrtEstimate<u64>(code, ctx, inst);
+    EmitRSqrtEstimate<64>(code, ctx, inst);
 }
 
 template<size_t fsize>
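
-- 
The unsafe 64-bit path above is roughly equivalent to the following C++
(a sketch only; the helper name unsafe_recip_estimate_f64 is hypothetical
and not part of this patch). Narrowing to single precision means the
result keeps only about 12 bits of precision:

    #include <xmmintrin.h>

    // Narrow to float, approximate with rcpss, widen back: matches the
    // emitted sequence cvtsd2ss -> rcpss -> cvtss2sd.
    double unsafe_recip_estimate_f64(double x) {
        const float narrowed = static_cast<float>(x);
        const float approx = _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ss(narrowed)));
        return static_cast<double>(approx);
    }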