diff --git a/src/backend_x64/emit_x64_floating_point.cpp b/src/backend_x64/emit_x64_floating_point.cpp index 29dd100b..0ca86768 100644 --- a/src/backend_x64/emit_x64_floating_point.cpp +++ b/src/backend_x64/emit_x64_floating_point.cpp @@ -108,75 +108,6 @@ void ZeroIfNaN(BlockOfCode& code, Xbyak::Xmm xmm_value, Xbyak::Xmm xmm_scratch) code.pand(xmm_value, xmm_scratch); } -template -void PreProcessNaNs(BlockOfCode& code, Xbyak::Xmm a, Xbyak::Xmm b, Xbyak::Label& end) { - using FPT = mp::unsigned_integer_of_size; - - Xbyak::Label nan; - - FCODE(ucomis)(a, b); - code.jp(nan, code.T_NEAR); - code.SwitchToFarCode(); - code.L(nan); - - code.sub(rsp, 8); - ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(a.getIdx())); - code.movq(code.ABI_PARAM1, a); - code.movq(code.ABI_PARAM2, b); - code.CallFunction(static_cast([](FPT a, FPT b) -> FPT { - return *FP::ProcessNaNs(a, b); - })); - code.movq(a, code.ABI_RETURN); - ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(a.getIdx())); - code.add(rsp, 8); - - code.jmp(end, code.T_NEAR); - code.SwitchToNearCode(); -} - -template -void PreProcessNaNs(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm a, Xbyak::Xmm b, Xbyak::Xmm c, Xbyak::Label& end, NaNHandler nan_handler) { - using FPT = mp::unsigned_integer_of_size; - - Xbyak::Label nan; - - FCODE(ucomis)(a, b); - code.jp(nan, code.T_NEAR); - FCODE(ucomis)(c, c); - code.jp(nan, code.T_NEAR); - code.SwitchToFarCode(); - code.L(nan); - - code.sub(rsp, 8); - ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(a.getIdx())); - code.movq(code.ABI_PARAM1, a); - code.movq(code.ABI_PARAM2, b); - code.movq(code.ABI_PARAM3, c); - code.mov(code.ABI_PARAM4, ctx.FPCR()); - code.CallFunction(static_cast(nan_handler)); - code.movq(a, code.ABI_RETURN); - ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(a.getIdx())); - code.add(rsp, 8); - - code.jmp(end, code.T_NEAR); - code.SwitchToNearCode(); -} - -template -void PostProcessNaNs(BlockOfCode& code, Xbyak::Xmm result, Xbyak::Xmm tmp) { - if constexpr (fsize == 32) { - code.movaps(tmp, result); - code.cmpunordps(tmp, tmp); - code.pslld(tmp, 31); - code.xorps(result, tmp); - } else { - code.movaps(tmp, result); - code.cmpunordpd(tmp, tmp); - code.psllq(tmp, 63); - code.xorps(result, tmp); - } -} - template void ForceToDefaultNaN(BlockOfCode& code, Xbyak::Xmm result) { if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { @@ -207,6 +138,21 @@ Xbyak::Label ProcessNaN(BlockOfCode& code, Xbyak::Xmm a) { return end; } +template +void PostProcessNaN(BlockOfCode& code, Xbyak::Xmm result, Xbyak::Xmm tmp) { + if constexpr (fsize == 32) { + code.movaps(tmp, result); + code.cmpunordps(tmp, tmp); + code.pslld(tmp, 31); + code.xorps(result, tmp); + } else { + code.movaps(tmp, result); + code.cmpunordpd(tmp, tmp); + code.psllq(tmp, 63); + code.xorps(result, tmp); + } +} + // This is necessary because x86 and ARM differ in they way they return NaNs from floating point operations // // ARM behaviour: @@ -372,47 +318,7 @@ void FPTwoOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) { if (ctx.FPSCR_DN()) { ForceToDefaultNaN(code, result); } else if (ctx.AccurateNaN()) { - PostProcessNaNs(code, result, ctx.reg_alloc.ScratchXmm()); - } - code.L(end); - - ctx.reg_alloc.DefineValue(inst, result); -} - -enum class CallDenormalsAreZero { - Yes, - No, -}; - -template -void FPThreeOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, [[maybe_unused]] PreprocessFunction preprocess, Function fn, CallDenormalsAreZero call_denormals_are_zero = CallDenormalsAreZero::No) { - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - - Xbyak::Label end; - - Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); - Xbyak::Xmm operand = ctx.reg_alloc.UseScratchXmm(args[1]); - Xbyak::Reg64 gpr_scratch = ctx.reg_alloc.ScratchGpr(); - - if (ctx.FPSCR_FTZ() && call_denormals_are_zero == CallDenormalsAreZero::Yes) { - DenormalsAreZero(code, result, gpr_scratch); - DenormalsAreZero(code, operand, gpr_scratch); - } - if constexpr(!std::is_same_v) { - preprocess(result, operand, gpr_scratch, end); - } - if (ctx.AccurateNaN() && !ctx.FPSCR_DN()) { - PreProcessNaNs(code, result, operand, end); - } - if constexpr (std::is_member_function_pointer_v) { - (code.*fn)(result, operand); - } else { - fn(result, operand); - } - if (ctx.FPSCR_DN()) { - ForceToDefaultNaN(code, result); - } else if (ctx.AccurateNaN()) { - PostProcessNaNs(code, result, operand); + PostProcessNaN(code, result, ctx.reg_alloc.ScratchXmm()); } code.L(end); @@ -528,8 +434,8 @@ void EmitX64::EmitFPDiv64(EmitContext& ctx, IR::Inst* inst) { FPThreeOp<64>(code, ctx, inst, &Xbyak::CodeGenerator::divsd); } -template -static void EmitFPMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { +template +static void EmitFPMinMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); @@ -545,14 +451,22 @@ static void EmitFPMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { FCODE(ucomis)(result, operand); code.jz(equal, code.T_NEAR); - FCODE(maxs)(result, operand); + if constexpr (is_max) { + FCODE(maxs)(result, operand); + } else { + FCODE(mins)(result, operand); + } code.L(end); code.SwitchToFarCode(); code.L(equal); code.jp(nan); - code.andps(result, operand); + if constexpr (is_max) { + code.andps(result, operand); + } else { + code.orps(result, operand); + } code.jmp(end); code.L(nan); @@ -568,196 +482,137 @@ static void EmitFPMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { ctx.reg_alloc.DefineValue(inst, result); } -void EmitX64::EmitFPMax32(EmitContext& ctx, IR::Inst* inst) { - EmitFPMax<32>(code, ctx, inst); -} +template +static void EmitFPMinMaxNumeric(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { + using FPT = mp::unsigned_integer_of_size; + constexpr u8 mantissa_msb_bit = static_cast(FP::FPInfo::explicit_mantissa_width - 1); -void EmitX64::EmitFPMax64(EmitContext& ctx, IR::Inst* inst) { - EmitFPMax<64>(code, ctx, inst); -} - -void EmitX64::EmitFPMaxNumeric32(EmitContext& ctx, IR::Inst* inst) { - FPThreeOp<32>(code, ctx, inst, [&](Xbyak::Xmm result, Xbyak::Xmm operand, Xbyak::Reg64 scratch, Xbyak::Label& end){ - Xbyak::Label normal, normal_or_equal, result_is_result; - - code.ucomiss(result, operand); - code.jnp(normal_or_equal); - // If operand == QNaN, result = result. - code.movd(scratch.cvt32(), operand); - code.shl(scratch.cvt32(), 1); - code.cmp(scratch.cvt32(), 0xff800000u); - code.jae(result_is_result); - // If operand == SNaN, let usual NaN code handle it. - code.cmp(scratch.cvt32(), 0xff000000u); - code.ja(normal); - // If result == SNaN, && operand != NaN, result = result. - code.movd(scratch.cvt32(), result); - code.shl(scratch.cvt32(), 1); - code.cmp(scratch.cvt32(), 0xff800000u); - code.jnae(result_is_result); - // If result == QNaN && operand != NaN, result = operand. - code.movaps(result, operand); - code.jmp(end, code.T_NEAR); - - code.L(result_is_result); - code.movaps(operand, result); - code.jmp(normal); - - code.L(normal_or_equal); - code.jnz(normal); - code.andps(operand, result); - code.L(normal); - }, &Xbyak::CodeGenerator::maxss, CallDenormalsAreZero::Yes); -} - -void EmitX64::EmitFPMaxNumeric64(EmitContext& ctx, IR::Inst* inst) { - FPThreeOp<64>(code, ctx, inst, [&](Xbyak::Xmm result, Xbyak::Xmm operand, Xbyak::Reg64 scratch, Xbyak::Label& end){ - Xbyak::Label normal, normal_or_equal, result_is_result; - - code.ucomisd(result, operand); - code.jnp(normal_or_equal); - // If operand == QNaN, result = result. - code.movq(scratch, operand); - code.shl(scratch, 1); - code.cmp(scratch, code.MConst(qword, 0xfff0'0000'0000'0000u)); - code.jae(result_is_result); - // If operand == SNaN, let usual NaN code handle it. - code.cmp(scratch, code.MConst(qword, 0xffe0'0000'0000'0000u)); - code.ja(normal); - // If result == SNaN, && operand != NaN, result = result. - code.movq(scratch, result); - code.shl(scratch, 1); - code.cmp(scratch, code.MConst(qword, 0xfff0'0000'0000'0000u)); - code.jnae(result_is_result); - // If result == QNaN && operand != NaN, result = operand. - code.movaps(result, operand); - code.jmp(end, code.T_NEAR); - - code.L(result_is_result); - code.movaps(operand, result); - code.jmp(normal); - - code.L(normal_or_equal); - code.jnz(normal); - code.andps(operand, result); - code.L(normal); - }, &Xbyak::CodeGenerator::maxsd, CallDenormalsAreZero::Yes); -} - -template -static void EmitFPMin(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); - const Xbyak::Xmm operand = ctx.reg_alloc.UseScratchXmm(args[1]); - const Xbyak::Reg64 gpr_scratch = ctx.reg_alloc.ScratchGpr(); + const Xbyak::Xmm op1 = ctx.reg_alloc.UseScratchXmm(args[0]); + const Xbyak::Xmm op2 = ctx.reg_alloc.UseScratchXmm(args[1]); // Result stored here! + Xbyak::Reg tmp = ctx.reg_alloc.ScratchGpr(); + tmp.setBit(fsize); + + const auto move_to_tmp = [&](const Xbyak::Xmm& xmm) { + if constexpr (fsize == 32) { + code.movd(tmp.cvt32(), xmm); + } else { + code.movq(tmp.cvt64(), xmm); + } + }; if (ctx.FPSCR_FTZ()) { - DenormalsAreZero(code, result, gpr_scratch); - DenormalsAreZero(code, operand, gpr_scratch); + DenormalsAreZero(code, op1, tmp.cvt64()); + DenormalsAreZero(code, op2, tmp.cvt64()); } - Xbyak::Label equal, end, nan; + Xbyak::Label end, z, nan, op2_is_nan, snan, maybe_both_nan, normal; - FCODE(ucomis)(result, operand); - code.jz(equal, code.T_NEAR); - FCODE(mins)(result, operand); + FCODE(ucomis)(op1, op2); + code.jz(z, code.T_NEAR); + code.L(normal); + if constexpr (is_max) { + FCODE(maxs)(op2, op1); + } else { + FCODE(mins)(op2, op1); + } code.L(end); code.SwitchToFarCode(); - code.L(equal); + code.L(z); code.jp(nan); - code.orps(result, operand); + if constexpr (is_max) { + code.andps(op2, op1); + } else { + code.orps(op2, op1); + } code.jmp(end); + // NaN requirements: + // op1 op2 result + // SNaN anything op1 + // !SNaN SNaN op2 + // QNaN !NaN op2 + // !NaN QNaN op1 + // QNaN QNaN op1 + code.L(nan); + FCODE(ucomis)(op1, op1); + code.jnp(op2_is_nan); + + // op1 is NaN + move_to_tmp(op1); + code.bt(tmp, mantissa_msb_bit); + code.jc(maybe_both_nan); if (ctx.FPSCR_DN()) { - code.movaps(result, code.MConst(xword, fsize == 32 ? f32_nan : f64_nan)); + code.L(snan); + code.movaps(op2, code.MConst(xword, FP::FPInfo::DefaultNaN())); code.jmp(end); } else { - EmitProcessNaNs(code, result, result, operand, gpr_scratch, end); + code.movaps(op2, op1); + code.L(snan); + code.orps(op2, code.MConst(xword, FP::FPInfo::mantissa_msb)); + code.jmp(end); } + code.L(maybe_both_nan); + FCODE(ucomis)(op2, op2); + code.jnp(end, code.T_NEAR); + if (ctx.FPSCR_DN()) { + code.jmp(snan); + } else { + move_to_tmp(op2); + code.bt(tmp.cvt64(), mantissa_msb_bit); + code.jnc(snan); + code.movaps(op2, op1); + code.jmp(end); + } + + // op2 is NaN + code.L(op2_is_nan); + move_to_tmp(op2); + code.bt(tmp, mantissa_msb_bit); + code.jnc(snan); + code.movaps(op2, op1); + code.jmp(end); + code.SwitchToNearCode(); - ctx.reg_alloc.DefineValue(inst, result); + ctx.reg_alloc.DefineValue(inst, op2); +} + +void EmitX64::EmitFPMax32(EmitContext& ctx, IR::Inst* inst) { + EmitFPMinMax<32, true>(code, ctx, inst); +} + +void EmitX64::EmitFPMax64(EmitContext& ctx, IR::Inst* inst) { + EmitFPMinMax<64, true>(code, ctx, inst); +} + +void EmitX64::EmitFPMaxNumeric32(EmitContext& ctx, IR::Inst* inst) { + EmitFPMinMaxNumeric<32, true>(code, ctx, inst); +} + +void EmitX64::EmitFPMaxNumeric64(EmitContext& ctx, IR::Inst* inst) { + EmitFPMinMaxNumeric<64, true>(code, ctx, inst); } void EmitX64::EmitFPMin32(EmitContext& ctx, IR::Inst* inst) { - EmitFPMin<32>(code, ctx, inst); + EmitFPMinMax<32, false>(code, ctx, inst); } void EmitX64::EmitFPMin64(EmitContext& ctx, IR::Inst* inst) { - EmitFPMin<64>(code, ctx, inst); + EmitFPMinMax<64, false>(code, ctx, inst); } void EmitX64::EmitFPMinNumeric32(EmitContext& ctx, IR::Inst* inst) { - FPThreeOp<32>(code, ctx, inst, [&](Xbyak::Xmm result, Xbyak::Xmm operand, Xbyak::Reg64 scratch, Xbyak::Label& end){ - Xbyak::Label normal, normal_or_equal, result_is_result; - - code.ucomiss(result, operand); - code.jnp(normal_or_equal); - // If operand == QNaN, result = result. - code.movd(scratch.cvt32(), operand); - code.shl(scratch.cvt32(), 1); - code.cmp(scratch.cvt32(), 0xff800000u); - code.jae(result_is_result); - // If operand == SNaN, let usual NaN code handle it. - code.cmp(scratch.cvt32(), 0xff000000u); - code.ja(normal); - // If result == SNaN, && operand != NaN, result = result. - code.movd(scratch.cvt32(), result); - code.shl(scratch.cvt32(), 1); - code.cmp(scratch.cvt32(), 0xff800000u); - code.jnae(result_is_result); - // If result == QNaN && operand != NaN, result = operand. - code.movaps(result, operand); - code.jmp(end, code.T_NEAR); - - code.L(result_is_result); - code.movaps(operand, result); - code.jmp(normal); - - code.L(normal_or_equal); - code.jnz(normal); - code.orps(operand, result); - code.L(normal); - }, &Xbyak::CodeGenerator::minss, CallDenormalsAreZero::Yes); + EmitFPMinMaxNumeric<32, false>(code, ctx, inst); } void EmitX64::EmitFPMinNumeric64(EmitContext& ctx, IR::Inst* inst) { - FPThreeOp<64>(code, ctx, inst, [&](Xbyak::Xmm result, Xbyak::Xmm operand, Xbyak::Reg64 scratch, Xbyak::Label& end){ - Xbyak::Label normal, normal_or_equal, result_is_result; - - code.ucomisd(result, operand); - code.jnp(normal_or_equal); - // If operand == QNaN, result = result. - code.movq(scratch, operand); - code.shl(scratch, 1); - code.cmp(scratch, code.MConst(qword, 0xfff0'0000'0000'0000u)); - code.jae(result_is_result); - // If operand == SNaN, let usual NaN code handle it. - code.cmp(scratch, code.MConst(qword, 0xffe0'0000'0000'0000u)); - code.ja(normal); - // If result == SNaN, && operand != NaN, result = result. - code.movq(scratch, result); - code.shl(scratch, 1); - code.cmp(scratch, code.MConst(qword, 0xfff0'0000'0000'0000u)); - code.jnae(result_is_result); - // If result == QNaN && operand != NaN, result = operand. - code.movaps(result, operand); - code.jmp(end, code.T_NEAR); - - code.L(result_is_result); - code.movaps(operand, result); - code.jmp(normal); - - code.L(normal_or_equal); - code.jnz(normal); - code.orps(operand, result); - code.L(normal); - }, &Xbyak::CodeGenerator::minsd, CallDenormalsAreZero::Yes); + EmitFPMinMaxNumeric<64, false>(code, ctx, inst); } void EmitX64::EmitFPMul32(EmitContext& ctx, IR::Inst* inst) {