From 06b31448aaf2588606d7f8bc6e8db8a9c269f58a Mon Sep 17 00:00:00 2001
From: MerryMage
Date: Sat, 15 Sep 2018 09:04:19 +0100
Subject: [PATCH] emit_x64_vector: Changes to VectorSignedSaturatedDoublingMultiply

* Return both the upper and lower parts of the multiply if required
* SSE2 does not support the pmuldq instruction; do sign correction on an unsigned result instead
* Improve port utilisation where possible (punpck instructions were a bottleneck)
---
 src/backend/x64/emit_x64_vector.cpp           | 223 +++++++++++++-----
 .../A64/translate/impl/simd_three_same.cpp    |   2 +-
 .../impl/simd_vector_x_indexed_element.cpp    |   2 +-
 src/frontend/ir/ir_emitter.cpp                |  26 +-
 src/frontend/ir/ir_emitter.h                  |   2 +-
 src/frontend/ir/microinstruction.cpp          |   2 -
 src/frontend/ir/opcodes.inc                   |   4 +-
 tests/A64/a64.cpp                             |  42 ++++
 8 files changed, 233 insertions(+), 70 deletions(-)

diff --git a/src/backend/x64/emit_x64_vector.cpp b/src/backend/x64/emit_x64_vector.cpp
index 8999370a..27839e83 100644
--- a/src/backend/x64/emit_x64_vector.cpp
+++ b/src/backend/x64/emit_x64_vector.cpp
@@ -3077,74 +3077,189 @@ void EmitX64::EmitVectorSignedSaturatedAccumulateUnsigned64(EmitContext& ctx, IR
     EmitVectorSignedSaturatedAccumulateUnsigned<64>(code, ctx, inst);
 }
 
-void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyReturnHigh16(EmitContext& ctx, IR::Inst* inst) {
+void EmitX64::EmitVectorSignedSaturatedDoublingMultiply16(EmitContext& ctx, IR::Inst* inst) {
+    const auto upper_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetUpperFromOp);
+    const auto lower_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetLowerFromOp);
+
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+    const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(args[0]);
+    const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]);
+    const Xbyak::Xmm upper_tmp = ctx.reg_alloc.ScratchXmm();
+    const Xbyak::Xmm lower_tmp = ctx.reg_alloc.ScratchXmm();
+
+    if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
+        code.vpmulhw(upper_tmp, x, y);
+    } else {
+        code.movdqa(upper_tmp, x);
+        code.pmulhw(upper_tmp, y);
+    }
+
+    if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
+        code.vpmullw(lower_tmp, x, y);
+    } else {
+        code.movdqa(lower_tmp, x);
+        code.pmullw(lower_tmp, y);
+    }
+
+    ctx.reg_alloc.Release(x);
+    ctx.reg_alloc.Release(y);
+
+    if (lower_inst) {
+        const Xbyak::Xmm lower_result = ctx.reg_alloc.ScratchXmm();
+
+        if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
+            code.vpaddw(lower_result, lower_tmp, lower_tmp);
+        } else {
+            code.movdqa(lower_result, lower_tmp);
+            code.paddw(lower_result, lower_result);
+        }
+
+        ctx.reg_alloc.DefineValue(lower_inst, lower_result);
+        ctx.EraseInstruction(lower_inst);
+    }
+
+    if (upper_inst) {
+        const Xbyak::Xmm upper_result = ctx.reg_alloc.ScratchXmm();
+
+        if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
+            code.vpsrlw(lower_tmp, lower_tmp, 15);
+            code.vpaddw(upper_tmp, upper_tmp, upper_tmp);
+            code.vpor(upper_result, upper_tmp, lower_tmp);
+            code.vpcmpeqw(upper_tmp, upper_result, code.MConst(xword, 0x8000800080008000, 0x8000800080008000));
+            code.vpxor(upper_result, upper_result, upper_tmp);
+        } else {
+            code.paddw(upper_tmp, upper_tmp);
+            code.psrlw(lower_tmp, 15);
+            code.movdqa(upper_result, upper_tmp);
+            code.por(upper_result, lower_tmp);
+            code.movdqa(upper_tmp, code.MConst(xword, 0x8000800080008000, 0x8000800080008000));
+            code.pcmpeqw(upper_tmp, upper_result);
+            code.pxor(upper_result, upper_tmp);
+        }
+
+        const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr().cvt32();
+        code.pmovmskb(bit, upper_tmp);
+        code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit);
+
+        ctx.reg_alloc.DefineValue(upper_inst, upper_result);
+        ctx.EraseInstruction(upper_inst);
+    }
+}
+
+void EmitX64::EmitVectorSignedSaturatedDoublingMultiply32(EmitContext& ctx, IR::Inst* inst) {
+    const auto upper_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetUpperFromOp);
+    const auto lower_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetLowerFromOp);
+
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) {
+        const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
+        const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]);
+        const Xbyak::Xmm odds = ctx.reg_alloc.ScratchXmm();
+        const Xbyak::Xmm even = ctx.reg_alloc.ScratchXmm();
+
+        code.vpmuldq(odds, x, y);
+        code.vpsrlq(x, x, 32);
+        code.vpsrlq(y, y, 32);
+        code.vpmuldq(even, x, y);
+
+        ctx.reg_alloc.Release(x);
+        ctx.reg_alloc.Release(y);
+
+        code.vpaddq(odds, odds, odds);
+        code.vpaddq(even, even, even);
+
+        if (upper_inst) {
+            const Xbyak::Xmm upper_result = ctx.reg_alloc.ScratchXmm();
+
+            code.vpsrlq(upper_result, odds, 32);
+            code.vblendps(upper_result, upper_result, even, 0b1010);
+
+            const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm();
+            const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr().cvt32();
+
+            code.vpcmpeqd(mask, upper_result, code.MConst(xword, 0x8000000080000000, 0x8000000080000000));
+            code.vpxor(upper_result, upper_result, mask);
+            code.pmovmskb(bit, mask);
+            code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit);
+
+            ctx.reg_alloc.Release(mask);
+            ctx.reg_alloc.Release(bit);
+
+            ctx.reg_alloc.DefineValue(upper_inst, upper_result);
+            ctx.EraseInstruction(upper_inst);
+        }
+
+        if (lower_inst) {
+            const Xbyak::Xmm lower_result = ctx.reg_alloc.ScratchXmm();
+
+            code.vpsllq(lower_result, even, 32);
+            code.vblendps(lower_result, lower_result, odds, 0b0101);
+
+            ctx.reg_alloc.DefineValue(lower_inst, lower_result);
+            ctx.EraseInstruction(lower_inst);
+        }
+
+        return;
+    }
 
     const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
     const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]);
     const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+    const Xbyak::Xmm sign_correction = ctx.reg_alloc.ScratchXmm();
+    const Xbyak::Xmm upper_result = ctx.reg_alloc.ScratchXmm();
+    const Xbyak::Xmm lower_result = ctx.reg_alloc.ScratchXmm();
 
+    // calculate sign correction
     code.movdqa(tmp, x);
-    code.pmulhw(tmp, y);
-    code.paddw(tmp, tmp);
-    code.pmullw(y, x);
-    code.psrlw(y, 15);
-    code.por(y, tmp);
+    code.movdqa(sign_correction, y);
+    code.psrad(tmp, 31);
+    code.psrad(sign_correction, 31);
+    code.pand(tmp, y);
+    code.pand(sign_correction, x);
+    code.paddd(sign_correction, tmp);
+    code.pslld(sign_correction, 1);
 
-    code.movdqa(x, code.MConst(xword, 0x8000800080008000, 0x8000800080008000));
-    code.pcmpeqw(x, y);
+    // unsigned multiply
     code.movdqa(tmp, x);
-    code.pxor(x, y);
+    code.pmuludq(tmp, y);
+    code.psrlq(x, 32);
+    code.psrlq(y, 32);
+    code.pmuludq(x, y);
 
-    // Check if any saturation occurred (i.e. if any halfwords in x were
-    // 0x8000 before saturating
-    const Xbyak::Reg32 mask = ctx.reg_alloc.ScratchGpr().cvt32();
-    code.pmovmskb(mask, tmp);
-    code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], mask);
+    // double
+    code.paddq(tmp, tmp);
+    code.paddq(x, x);
 
-    ctx.reg_alloc.DefineValue(inst, x);
-}
+    // put everything into place
+    code.pcmpeqw(upper_result, upper_result);
+    code.pcmpeqw(lower_result, lower_result);
+    code.psllq(upper_result, 32);
+    code.psrlq(lower_result, 32);
+    code.pand(upper_result, x);
+    code.pand(lower_result, tmp);
+    code.psrlq(tmp, 32);
+    code.psllq(x, 32);
+    code.por(upper_result, tmp);
+    code.por(lower_result, x);
+    code.psubd(upper_result, sign_correction);
 
-void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyReturnHigh32(EmitContext& ctx, IR::Inst* inst) {
-    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+    if (upper_inst) {
+        const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr().cvt32();
 
-    const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
-    const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]);
-    const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm();
-    const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm();
+        code.movdqa(tmp, code.MConst(xword, 0x8000000080000000, 0x8000000080000000));
+        code.pcmpeqd(tmp, upper_result);
+        code.pxor(upper_result, tmp);
+        code.pmovmskb(bit, tmp);
+        code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], bit);
 
-    code.movdqa(tmp1, x);
-    code.punpckldq(tmp1, y);
-
-    code.movdqa(tmp2, y);
-    code.punpckldq(tmp2, x);
-
-    code.pmuldq(tmp2, tmp1);
-    code.paddq(tmp2, tmp2);
-
-    code.movdqa(tmp1, x);
-    code.punpckhdq(tmp1, y);
-    code.punpckhdq(y, x);
-
-    code.pmuldq(y, tmp1);
-    code.paddq(y, y);
-
-    code.pshufd(tmp1, tmp2, 0b11101101);
-    code.pshufd(x, y, 0b11101101);
-    code.punpcklqdq(tmp1, x);
-
-    code.movdqa(x, code.MConst(xword, 0x8000000080000000, 0x8000000080000000));
-    code.pcmpeqd(x, tmp1);
-    code.movdqa(tmp2, x);
-    code.pxor(x, tmp1);
-
-    // Check if any saturation occurred (i.e. if any words in x were
-    // 0x80000000 before saturating
-    const Xbyak::Reg32 mask = ctx.reg_alloc.ScratchGpr().cvt32();
-    code.pmovmskb(mask, tmp2);
-    code.or_(code.dword[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], mask);
-
-    ctx.reg_alloc.DefineValue(inst, x);
+        ctx.reg_alloc.DefineValue(upper_inst, upper_result);
+        ctx.EraseInstruction(upper_inst);
+    }
+    if (lower_inst) {
+        ctx.reg_alloc.DefineValue(lower_inst, lower_result);
+        ctx.EraseInstruction(lower_inst);
+    }
 }
 
 static void EmitVectorSignedSaturatedNarrowToSigned(size_t original_esize, BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
diff --git a/src/frontend/A64/translate/impl/simd_three_same.cpp b/src/frontend/A64/translate/impl/simd_three_same.cpp
index 19d6265d..eb2594ca 100644
--- a/src/frontend/A64/translate/impl/simd_three_same.cpp
+++ b/src/frontend/A64/translate/impl/simd_three_same.cpp
@@ -432,7 +432,7 @@ bool TranslatorVisitor::SQDMULH_vec_2(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec V
     const IR::U128 operand1 = V(datasize, Vn);
     const IR::U128 operand2 = V(datasize, Vm);
 
-    const IR::U128 result = ir.VectorSignedSaturatedDoublingMultiplyReturnHigh(esize, operand1, operand2);
+    const IR::U128 result = ir.VectorSignedSaturatedDoublingMultiply(esize, operand1, operand2).upper;
 
     V(datasize, Vd, result);
     return true;
diff --git a/src/frontend/A64/translate/impl/simd_vector_x_indexed_element.cpp b/src/frontend/A64/translate/impl/simd_vector_x_indexed_element.cpp
index eb28013b..4efcfdb8 100644
--- a/src/frontend/A64/translate/impl/simd_vector_x_indexed_element.cpp
+++ b/src/frontend/A64/translate/impl/simd_vector_x_indexed_element.cpp
@@ -233,7 +233,7 @@ bool TranslatorVisitor::SQDMULH_elt_2(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, I
     const IR::U128 operand1 = V(datasize, Vn);
     const IR::U128 operand2 = V(idxsize, concatenate(Vmhi, Vmlo).ZeroExtend());
     const IR::U128 index_vector = ir.VectorBroadcast(esize, ir.VectorGetElement(esize, operand2, index));
 
-    const IR::U128 result = ir.VectorSignedSaturatedDoublingMultiplyReturnHigh(esize, operand1, index_vector);
+    const IR::U128 result = ir.VectorSignedSaturatedDoublingMultiply(esize, operand1, index_vector).upper;
 
     V(datasize, Vd, result);
     return true;
diff --git a/src/frontend/ir/ir_emitter.cpp b/src/frontend/ir/ir_emitter.cpp
index 6729eacd..7b73efa2 100644
--- a/src/frontend/ir/ir_emitter.cpp
+++ b/src/frontend/ir/ir_emitter.cpp
@@ -1575,15 +1575,23 @@ U128 IREmitter::VectorSignedSaturatedAccumulateUnsigned(size_t esize, const U128
     return {};
 }
 
-U128 IREmitter::VectorSignedSaturatedDoublingMultiplyReturnHigh(size_t esize, const U128& a, const U128& b) {
-    switch (esize) {
-    case 16:
-        return Inst(Opcode::VectorSignedSaturatedDoublingMultiplyReturnHigh16, a, b);
-    case 32:
-        return Inst(Opcode::VectorSignedSaturatedDoublingMultiplyReturnHigh32, a, b);
-    }
-    UNREACHABLE();
-    return {};
+UpperAndLower IREmitter::VectorSignedSaturatedDoublingMultiply(size_t esize, const U128& a, const U128& b) {
+    const Value multiply = [&] {
+        switch (esize) {
+        case 16:
+            return Inst(Opcode::VectorSignedSaturatedDoublingMultiply16, a, b);
+        case 32:
+            return Inst(Opcode::VectorSignedSaturatedDoublingMultiply32, a, b);
+        default:
+            UNREACHABLE();
+            return Value{};
+        }
+    }();
+
+    return {
+        Inst<U128>(Opcode::GetUpperFromOp, multiply),
+        Inst<U128>(Opcode::GetLowerFromOp, multiply),
+    };
 }
 
 U128 IREmitter::VectorSignedSaturatedNarrowToSigned(size_t original_esize, const U128& a) {
diff --git a/src/frontend/ir/ir_emitter.h b/src/frontend/ir/ir_emitter.h
index d7764451..9385f29c 100644
--- a/src/frontend/ir/ir_emitter.h
+++ b/src/frontend/ir/ir_emitter.h
@@ -273,7 +273,7 @@ public:
     UpperAndLower VectorSignedMultiply(size_t esize, const U128& a, const U128& b);
     U128 VectorSignedSaturatedAbs(size_t esize, const U128& a);
     U128 VectorSignedSaturatedAccumulateUnsigned(size_t esize, const U128& a, const U128& b);
-    U128 VectorSignedSaturatedDoublingMultiplyReturnHigh(size_t esize, const U128& a, const U128& b);
+    UpperAndLower VectorSignedSaturatedDoublingMultiply(size_t esize, const U128& a, const U128& b);
     U128 VectorSignedSaturatedNarrowToSigned(size_t original_esize, const U128& a);
     U128 VectorSignedSaturatedNarrowToUnsigned(size_t original_esize, const U128& a);
     U128 VectorSignedSaturatedNeg(size_t esize, const U128& a);
diff --git a/src/frontend/ir/microinstruction.cpp b/src/frontend/ir/microinstruction.cpp
index 78c690f5..299be55b 100644
--- a/src/frontend/ir/microinstruction.cpp
+++ b/src/frontend/ir/microinstruction.cpp
@@ -361,8 +361,6 @@ bool Inst::WritesToFPSRCumulativeSaturationBit() const {
     case Opcode::VectorSignedSaturatedNarrowToUnsigned16:
    case Opcode::VectorSignedSaturatedNarrowToUnsigned32:
     case Opcode::VectorSignedSaturatedNarrowToUnsigned64:
-    case Opcode::VectorSignedSaturatedDoublingMultiplyReturnHigh16:
-    case Opcode::VectorSignedSaturatedDoublingMultiplyReturnHigh32:
     case Opcode::VectorSignedSaturatedNeg8:
     case Opcode::VectorSignedSaturatedNeg16:
     case Opcode::VectorSignedSaturatedNeg32:
diff --git a/src/frontend/ir/opcodes.inc b/src/frontend/ir/opcodes.inc
index 585f712b..03279c31 100644
--- a/src/frontend/ir/opcodes.inc
+++ b/src/frontend/ir/opcodes.inc
@@ -408,8 +408,8 @@ OPCODE(VectorSignedSaturatedAccumulateUnsigned8,            U128, U128,
 OPCODE(VectorSignedSaturatedAccumulateUnsigned16,           U128, U128, U128                                            )
 OPCODE(VectorSignedSaturatedAccumulateUnsigned32,           U128, U128, U128                                            )
 OPCODE(VectorSignedSaturatedAccumulateUnsigned64,           U128, U128, U128                                            )
-OPCODE(VectorSignedSaturatedDoublingMultiplyReturnHigh16,   U128, U128, U128                                            )
-OPCODE(VectorSignedSaturatedDoublingMultiplyReturnHigh32,   U128, U128, U128                                            )
+OPCODE(VectorSignedSaturatedDoublingMultiply16,             Void, U128, U128                                            )
+OPCODE(VectorSignedSaturatedDoublingMultiply32,             Void, U128, U128                                            )
 OPCODE(VectorSignedSaturatedNarrowToSigned16,               U128, U128                                                  )
 OPCODE(VectorSignedSaturatedNarrowToSigned32,               U128, U128                                                  )
 OPCODE(VectorSignedSaturatedNarrowToSigned64,               U128, U128                                                  )
diff --git a/tests/A64/a64.cpp b/tests/A64/a64.cpp
index 0d24fe55..e62a2b83 100644
--- a/tests/A64/a64.cpp
+++ b/tests/A64/a64.cpp
@@ -494,3 +494,45 @@ TEST_CASE("A64: FRSQRTS", "[a64]") {
 
     REQUIRE(jit.GetVector(13) == Vector{0xff7fffff, 0});
 }
+
+TEST_CASE("A64: SQDMULH.8H (saturate)", "[a64]") {
+    A64TestEnv env;
+    Dynarmic::A64::Jit jit{Dynarmic::A64::UserConfig{&env}};
+
+    env.code_mem.emplace_back(0x4e62b420); // SQDMULH.8H V0, V1, V2
+    env.code_mem.emplace_back(0x14000000); // B .
+
+    // Make sure that saturating values are tested
+
+    jit.SetPC(0);
+    jit.SetVector(1, {0x7fff80007ffe8001, 0x7fff80007ffe8001});
+    jit.SetVector(2, {0x7fff80007ffe8001, 0x80007fff80017ffe});
+    jit.SetFpsr(0);
+
+    env.ticks_left = 2;
+    jit.Run();
+
+    REQUIRE(jit.GetVector(0) == Vector{0x7ffe7fff7ffc7ffe, 0x8001800180028002});
+    REQUIRE(FP::FPSR{jit.GetFpsr()}.QC() == true);
+}
+
+TEST_CASE("A64: SQDMULH.4S (saturate)", "[a64]") {
+    A64TestEnv env;
+    Dynarmic::A64::Jit jit{Dynarmic::A64::UserConfig{&env}};
+
+    env.code_mem.emplace_back(0x4ea2b420); // SQDMULH.4S V0, V1, V2
+    env.code_mem.emplace_back(0x14000000); // B .
+ + // Make sure that saturating values are tested + + jit.SetPC(0); + jit.SetVector(1, {0x7fffffff80000000, 0x7fffffff80000000}); + jit.SetVector(2, {0x7fffffff80000000, 0x800000007fffffff}); + jit.SetFpsr(0); + + env.ticks_left = 2; + jit.Run(); + + REQUIRE(jit.GetVector(0) == Vector{0x7ffffffe7fffffff, 0x8000000180000001}); + REQUIRE(FP::FPSR{jit.GetFpsr()}.QC() == true); +}
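
Supplementary sketches follow; they are standalone C++ illustrations, and none of the helper names below exist in dynarmic.

The 16-bit path avoids widening lanes by recombining the pmulhw/pmullw halves: the upper half of 2*a*b is (hi << 1) | (lo >> 15), and that recombined value can equal 0x8000 only when both operands are -32768, which is exactly the case the pcmpeqw/pxor pair saturates and reports through FPSR.QC. A scalar model of the identity (the function name and the qc out-parameter are illustrative):

    #include <cstdint>

    // Upper half of the doubled product 2*a*b, rebuilt from the two halves
    // of a*b exactly as pmulhw/pmullw deliver them per lane.
    int16_t DoublingMultiplyHigh16(int16_t a, int16_t b, bool& qc) {
        const uint32_t product = static_cast<uint32_t>(int32_t{a} * int32_t{b});
        const uint16_t hi = static_cast<uint16_t>(product >> 16); // pmulhw half
        const uint16_t lo = static_cast<uint16_t>(product);       // pmullw half
        uint16_t result = static_cast<uint16_t>((hi << 1) | (lo >> 15));
        if (result == 0x8000) { // reachable only for (-32768) * (-32768)
            qc = true;          // accumulated into FPSR.QC by the emitted or_
            result ^= 0xFFFF;   // saturate to 0x7FFF, as pcmpeqw/pxor does
        }
        return static_cast<int16_t>(result);
    }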
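
The SSE2 fallback in the 32-bit case cannot use pmuldq (an SSE4.1 instruction), so it derives the signed product from pmuludq plus a correction built from sign masks: modulo 2^64, signed(x) * signed(y) equals unsigned(x) * unsigned(y) minus 2^32 * ((x < 0 ? y : 0) + (y < 0 ? x : 0)). The psrad/pand/paddd sequence computes that bracketed sum per lane, and pslld by 1 pre-doubles it so a single psubd corrects the already-doubled upper halves. A self-contained demonstration of the identity (names are illustrative):

    #include <cassert>
    #include <cstdint>

    // Signed 32x32->64 multiply expressed through an unsigned multiply and a
    // sign correction, mirroring the pmuludq/psrad/pand/paddd/psubd sequence.
    uint64_t SignedMultiplyViaUnsigned(int32_t x, int32_t y) {
        const uint32_t ux = static_cast<uint32_t>(x);
        const uint32_t uy = static_cast<uint32_t>(y);
        const uint64_t unsigned_product = uint64_t{ux} * uy;
        const uint32_t correction = (x < 0 ? uy : 0) + (y < 0 ? ux : 0); // wraps like paddd
        return unsigned_product - (uint64_t{correction} << 32);
    }

    int main() {
        assert(static_cast<int64_t>(SignedMultiplyViaUnsigned(-3, 7)) == -21);
        assert(static_cast<int64_t>(SignedMultiplyViaUnsigned(-40000, -50000)) == 2000000000);
        assert(static_cast<int64_t>(SignedMultiplyViaUnsigned(INT32_MIN, INT32_MIN)) == int64_t{1} << 62);
    }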
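
Finally, the expected vectors in the two tests can be cross-checked against a scalar model of the opcode pair's per-lane semantics; SQDMULH consumes only the upper half, while the lower half feeds any GetLowerFromOp user. A minimal sketch (illustrative name and qc out-parameter, not dynarmic API):

    #include <cstdint>
    #include <utility>

    // {upper, lower} halves of 2*a*b. Only INT32_MIN * INT32_MIN overflows
    // (2 * 2^62 == 2^63): its upper half saturates 0x80000000 -> 0x7FFFFFFF
    // and sets QC, which is what the SQDMULH.4S test exercises.
    std::pair<int32_t, int32_t> DoublingMultiply32(int32_t a, int32_t b, bool& qc) {
        if (a == INT32_MIN && b == INT32_MIN) {
            qc = true;
            return {INT32_MAX, 0}; // doubled product is exactly 2^63
        }
        const int64_t doubled = 2 * int64_t{a} * int64_t{b};
        return {static_cast<int32_t>(static_cast<uint64_t>(doubled) >> 32),
                static_cast<int32_t>(static_cast<uint32_t>(doubled))};
    }

For example, a == 0x80000000 and b == 0x7fffffff give upper 0x80000001 without setting QC, matching lanes 2 and 3 of the SQDMULH.4S expectation.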