From 56e3bf57d2451173af0b6698028590178ffc8554 Mon Sep 17 00:00:00 2001
From: Merry
Date: Fri, 28 May 2021 15:10:38 +0100
Subject: [PATCH] emit_x64_vector_saturated: Consolidate unsigned operations
 into EmitVectorUnsignedSaturated

---
 .../x64/emit_x64_vector_saturation.cpp        | 321 ++++++------
 1 file changed, 98 insertions(+), 223 deletions(-)

diff --git a/src/dynarmic/backend/x64/emit_x64_vector_saturation.cpp b/src/dynarmic/backend/x64/emit_x64_vector_saturation.cpp
index 8dfb2f6e..24edc515 100644
--- a/src/dynarmic/backend/x64/emit_x64_vector_saturation.cpp
+++ b/src/dynarmic/backend/x64/emit_x64_vector_saturation.cpp
@@ -157,6 +157,100 @@ void EmitVectorSignedSaturated(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
     }
 }
 
+template<Op op, size_t esize>
+void EmitVectorUnsignedSaturated(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+    static_assert(esize == 32 || esize == 64);
+
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512DQ)) {
+        const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
+        const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
+        const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+        const Xbyak::Reg8 overflow = ctx.reg_alloc.ScratchGpr().cvt8();
+
+        if constexpr (op == Op::Add) {
+            ICODE(vpadd)(result, operand1, operand2);
+            ICODE(vpcmpu)(k1, result, operand1, CmpInt::LessThan);
+            ICODE(vpternlog)(result | k1, result, result, u8(0xFF));
+        } else {
+            ICODE(vpsub)(result, operand1, operand2);
+            ICODE(vpcmpu)(k1, result, operand1, CmpInt::GreaterThan);
+            ICODE(vpxor)(result | k1, result, result);
+        }
+
+        code.ktestb(k1, k1);
+        code.setnz(overflow);
+        code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], overflow);
+
+        ctx.reg_alloc.DefineValue(inst, result);
+        return;
+    }
+
+    const Xbyak::Xmm operand1 = code.HasHostFeature(HostFeature::AVX) ? ctx.reg_alloc.UseXmm(args[0]) : ctx.reg_alloc.UseScratchXmm(args[0]);
+    const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
+    const Xbyak::Xmm result = code.HasHostFeature(HostFeature::AVX) ? ctx.reg_alloc.ScratchXmm() : operand1;
+    const Xbyak::Reg8 overflow = ctx.reg_alloc.ScratchGpr().cvt8();
+    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+    if constexpr (op == Op::Add) {
+        if (code.HasHostFeature(HostFeature::AVX)) {
+            code.vpxor(xmm0, operand1, operand2);
+            code.vpand(tmp, operand1, operand2);
+            ICODE(vpadd)(result, operand1, operand2);
+        } else {
+            code.movaps(tmp, operand1);
+            code.movaps(xmm0, operand1);
+
+            code.pxor(xmm0, operand2);
+            code.pand(tmp, operand2);
+            ICODE(padd)(result, operand2);
+        }
+
+        ICODE(psrl)(xmm0, 1);
+        ICODE(padd)(tmp, xmm0);
+    } else {
+        if (code.HasHostFeature(HostFeature::AVX)) {
+            code.vpxor(tmp, operand1, operand2);
+            ICODE(vpsub)(result, operand1, operand2);
+            code.vpand(xmm0, operand2, tmp);
+        } else {
+            code.movaps(tmp, operand1);
+            code.movaps(xmm0, operand2);
+
+            code.pxor(tmp, operand2);
+            ICODE(psub)(result, operand2);
+            code.pand(xmm0, tmp);
+        }
+
+        ICODE(psrl)(tmp, 1);
+        ICODE(psub)(tmp, xmm0);
+    }
+
+    code.psrad(tmp, 31);
+    if constexpr (esize == 64) {
+        code.pshufd(tmp, tmp, 0b11110101);
+    }
+
+    if (code.HasHostFeature(HostFeature::SSE41)) {
+        code.ptest(tmp, tmp);
+    } else {
+        FCODE(movmskp)(overflow.cvt32(), tmp);
+        code.test(overflow.cvt32(), overflow.cvt32());
+    }
+
+    code.setnz(overflow);
+    code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], overflow);
+
+    if constexpr (op == Op::Add) {
+        code.por(result, tmp);
+        ctx.reg_alloc.DefineValue(inst, result);
+    } else {
+        code.pandn(tmp, result);
+        ctx.reg_alloc.DefineValue(inst, tmp);
+    }
+}
+
 }  // anonymous namespace
 
 void EmitX64::EmitVectorSignedSaturatedAdd8(EmitContext& ctx, IR::Inst* inst) {
@@ -200,123 +294,11 @@ void EmitX64::EmitVectorUnsignedSaturatedAdd16(EmitContext& ctx, IR::Inst* inst)
 }
 
 void EmitX64::EmitVectorUnsignedSaturatedAdd32(EmitContext& ctx, IR::Inst* inst) {
-    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
-    if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512DQ)) {
-        const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
-        const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
-        const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
-        const Xbyak::Reg8 overflow = ctx.reg_alloc.ScratchGpr().cvt8();
-
-        code.vpaddd(result, operand1, operand2);
-        code.vpcmpud(k1, result, operand2, CmpInt::LessThan);
-        code.vpternlogd(result | k1, result, result, 0xFF);
-        code.ktestb(k1, k1);
-
-        code.setnz(overflow);
-        code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], overflow);
-
-        ctx.reg_alloc.DefineValue(inst, result);
-        return;
-    }
-
-    const Xbyak::Xmm operand1 = code.HasHostFeature(HostFeature::AVX) ? ctx.reg_alloc.UseXmm(args[0]) : ctx.reg_alloc.UseScratchXmm(args[0]);
-    const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
-    const Xbyak::Xmm result = code.HasHostFeature(HostFeature::AVX) ? ctx.reg_alloc.ScratchXmm() : operand1;
-    const Xbyak::Reg8 overflow = ctx.reg_alloc.ScratchGpr().cvt8();
-    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
-
-    if (code.HasHostFeature(HostFeature::AVX)) {
-        code.vpxor(xmm0, operand1, operand2);
-        code.vpand(tmp, operand1, operand2);
-        code.vpaddd(result, operand1, operand2);
-    } else {
-        code.movaps(tmp, operand1);
-        code.movaps(xmm0, operand1);
-
-        code.pxor(xmm0, operand2);
-        code.pand(tmp, operand2);
-        code.paddd(result, operand2);
-    }
-
-    code.psrld(xmm0, 1);
-    code.paddd(tmp, xmm0);
-    code.psrad(tmp, 31);
-
-    code.por(result, tmp);
-
-    if (code.HasHostFeature(HostFeature::SSE41)) {
-        code.ptest(tmp, tmp);
-    } else {
-        code.movmskps(overflow.cvt32(), tmp);
-        code.test(overflow.cvt32(), overflow.cvt32());
-    }
-
-    code.setnz(overflow);
-    code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], overflow);
-
-    ctx.reg_alloc.DefineValue(inst, result);
+    EmitVectorUnsignedSaturated<Op::Add, 32>(code, ctx, inst);
 }
 
 void EmitX64::EmitVectorUnsignedSaturatedAdd64(EmitContext& ctx, IR::Inst* inst) {
-    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
-    if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512DQ)) {
-        const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
-        const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
-        const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
-        const Xbyak::Reg8 overflow = ctx.reg_alloc.ScratchGpr().cvt8();
-
-        code.vpaddq(result, operand1, operand2);
-        code.vpcmpuq(k1, result, operand1, CmpInt::LessThan);
-        code.vpternlogq(result | k1, result, result, 0xFF);
-        code.ktestb(k1, k1);
-
-        code.setnz(overflow);
-        code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], overflow);
-
-        ctx.reg_alloc.DefineValue(inst, result);
-
-        return;
-    }
-
-    const Xbyak::Xmm operand1 = code.HasHostFeature(HostFeature::AVX) ? ctx.reg_alloc.UseXmm(args[0]) : ctx.reg_alloc.UseScratchXmm(args[0]);
-    const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
-    const Xbyak::Xmm result = code.HasHostFeature(HostFeature::AVX) ? ctx.reg_alloc.ScratchXmm() : operand1;
-    const Xbyak::Reg8 overflow = ctx.reg_alloc.ScratchGpr().cvt8();
-    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
-
-    if (code.HasHostFeature(HostFeature::AVX)) {
-        code.vpxor(xmm0, operand1, operand2);
-        code.vpand(tmp, operand1, operand2);
-        code.vpaddq(result, operand1, operand2);
-    } else {
-        code.movaps(xmm0, operand1);
-        code.movaps(tmp, operand1);
-
-        code.pxor(xmm0, operand2);
-        code.pand(tmp, operand2);
-        code.paddq(result, operand2);
-    }
-
-    code.psrlq(xmm0, 1);
-    code.paddq(tmp, xmm0);
-    code.psrad(tmp, 31);
-    code.pshufd(tmp, tmp, 0b11110101);
-
-    code.por(result, tmp);
-
-    if (code.HasHostFeature(HostFeature::SSE41)) {
-        code.ptest(tmp, tmp);
-    } else {
-        code.movmskpd(overflow.cvt32(), tmp);
-        code.test(overflow.cvt32(), overflow.cvt32());
-    }
-
-    code.setnz(overflow);
-    code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], overflow);
-
-    ctx.reg_alloc.DefineValue(inst, result);
+    EmitVectorUnsignedSaturated<Op::Add, 64>(code, ctx, inst);
 }
 
 void EmitX64::EmitVectorUnsignedSaturatedSub8(EmitContext& ctx, IR::Inst* inst) {
@@ -328,118 +310,11 @@ void EmitX64::EmitVectorUnsignedSaturatedSub16(EmitContext& ctx, IR::Inst* inst)
 }
 
 void EmitX64::EmitVectorUnsignedSaturatedSub32(EmitContext& ctx, IR::Inst* inst) {
-    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
-    if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512DQ)) {
-        const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
-        const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
-        const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
-        const Xbyak::Reg8 overflow = ctx.reg_alloc.ScratchGpr().cvt8();
-
-        code.vpsubd(result, operand1, operand2);
-        code.vpcmpud(k1, result, operand1, CmpInt::GreaterThan);
-        code.vpxord(result | k1, result, result);
-        code.ktestb(k1, k1);
-
-        code.setnz(overflow);
-        code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], overflow);
-
-        ctx.reg_alloc.DefineValue(inst, result);
-        return;
-    }
-
-    const Xbyak::Xmm operand1 = code.HasHostFeature(HostFeature::AVX) ? ctx.reg_alloc.UseXmm(args[0]) : ctx.reg_alloc.UseScratchXmm(args[0]);
-    const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
-    const Xbyak::Xmm result = code.HasHostFeature(HostFeature::AVX) ? ctx.reg_alloc.ScratchXmm() : operand1;
-    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
-    const Xbyak::Reg8 overflow = ctx.reg_alloc.ScratchGpr().cvt8();
-
-    if (code.HasHostFeature(HostFeature::AVX)) {
-        code.vpxor(tmp, operand1, operand2);
-        code.vpsubd(result, operand1, operand2);
-        code.vpand(xmm0, operand2, tmp);
-    } else {
-        code.movaps(tmp, operand1);
-        code.movaps(xmm0, operand2);
-
-        code.pxor(tmp, operand2);
-        code.psubd(result, operand2);
-        code.pand(xmm0, tmp);
-    }
-
-    code.psrld(tmp, 1);
-    code.psubd(tmp, xmm0);
-    code.psrad(tmp, 31);
-
-    if (code.HasHostFeature(HostFeature::SSE41)) {
-        code.ptest(tmp, tmp);
-    } else {
-        code.movmskps(overflow.cvt32(), tmp);
-        code.test(overflow.cvt32(), overflow.cvt32());
-    }
-    code.setnz(overflow);
-    code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], overflow);
-
-    code.pandn(tmp, result);
-    ctx.reg_alloc.DefineValue(inst, tmp);
+    EmitVectorUnsignedSaturated<Op::Sub, 32>(code, ctx, inst);
 }
 
 void EmitX64::EmitVectorUnsignedSaturatedSub64(EmitContext& ctx, IR::Inst* inst) {
-    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
-    if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512DQ)) {
-        const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
-        const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
-        const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
-        const Xbyak::Reg8 overflow = ctx.reg_alloc.ScratchGpr().cvt8();
-
-        code.vpsubq(result, operand1, operand2);
-        code.vpcmpuq(k1, result, operand1, CmpInt::GreaterThan);
-        code.vpxorq(result | k1, result, result);
-        code.ktestb(k1, k1);
-
-        code.setnz(overflow);
-        code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], overflow);
-
-        ctx.reg_alloc.DefineValue(inst, result);
-        return;
-    }
-
-    const Xbyak::Xmm operand1 = code.HasHostFeature(HostFeature::AVX) ? ctx.reg_alloc.UseXmm(args[0]) : ctx.reg_alloc.UseScratchXmm(args[0]);
-    const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
-    const Xbyak::Xmm result = code.HasHostFeature(HostFeature::AVX) ? ctx.reg_alloc.ScratchXmm() : operand1;
-    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
-    const Xbyak::Reg8 overflow = ctx.reg_alloc.ScratchGpr().cvt8();
-
-    if (code.HasHostFeature(HostFeature::AVX)) {
-        code.vpxor(tmp, operand1, operand2);
-        code.vpsubq(result, operand1, operand2);
-        code.vpand(xmm0, operand2, tmp);
-    } else {
-        code.movaps(tmp, operand1);
-        code.movaps(xmm0, operand2);
-
-        code.pxor(tmp, operand2);
-        code.psubq(result, operand2);
-        code.pand(xmm0, tmp);
-    }
-
-    code.psrlq(tmp, 1);
-    code.psubq(tmp, xmm0);
-    code.psrad(tmp, 31);
-    code.pshufd(tmp, tmp, 0b11110101);
-
-    if (code.HasHostFeature(HostFeature::SSE41)) {
-        code.ptest(tmp, tmp);
-    } else {
-        code.movmskpd(overflow.cvt32(), tmp);
-        code.test(overflow.cvt32(), overflow.cvt32());
-    }
-    code.setnz(overflow);
-    code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], overflow);
-
-    code.pandn(tmp, result);
-    ctx.reg_alloc.DefineValue(inst, tmp);
+    EmitVectorUnsignedSaturated<Op::Sub, 64>(code, ctx, inst);
 }
 
 }  // namespace Dynarmic::Backend::X64
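
Reviewer note, not part of the patch: the consolidated non-AVX512 path detects per-lane overflow with a carry-recurrence identity. For an unsigned add, the carry out of a + b is the sign bit of (a & b) + ((a ^ b) >> 1); for a - b, the borrow out is the sign bit of ((a ^ b) >> 1) - (~a & b). Those are exactly the pand/pxor/psrl/padd (resp. psub) sequences above, with psrad broadcasting the sign across each lane. A minimal scalar sketch of the same checks for one 32-bit lane follows; the helper names are illustrative and do not exist in the codebase.

    #include <cassert>
    #include <cstdint>

    // Scalar model of the vector Add path: the sign bit of
    // (a & b) + ((a ^ b) >> 1) is the carry out of a + b
    // (psrad 31 broadcasts it; por saturates the lane to all-ones).
    static uint32_t unsigned_saturated_add32(uint32_t a, uint32_t b) {
        const uint32_t sum = a + b;
        const uint32_t overflowed = ((a & b) + ((a ^ b) >> 1)) >> 31;
        return overflowed ? 0xFFFFFFFFu : sum;
    }

    // Scalar model of the vector Sub path: the sign bit of
    // ((a ^ b) >> 1) - (~a & b) is the borrow out of a - b
    // (the vector code then uses pandn to clamp the lane to zero).
    static uint32_t unsigned_saturated_sub32(uint32_t a, uint32_t b) {
        const uint32_t diff = a - b;
        const uint32_t underflowed = (((a ^ b) >> 1) - (~a & b)) >> 31;
        return underflowed ? 0u : diff;
    }

    int main() {
        assert(unsigned_saturated_add32(0xFFFFFFF0u, 0x20u) == 0xFFFFFFFFu);
        assert(unsigned_saturated_add32(1u, 2u) == 3u);
        assert(unsigned_saturated_sub32(1u, 2u) == 0u);
        assert(unsigned_saturated_sub32(2u, 1u) == 1u);
    }

The identity avoids needing a wider element type: (a ^ b) >> 1 plus (a & b) reconstructs the top bit of the 33-bit sum without ever overflowing 32 bits, which is why the same sequence extends to 64-bit lanes (where no unsigned compare or wider type is available in SSE) with only the extra pshufd to replicate the sign into both halves of each quadword.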