From 4c0adbb7f1db156a95c3dfb4b69afb9554b3cf8c Mon Sep 17 00:00:00 2001
From: Lioncash
Date: Sat, 8 Sep 2018 21:08:34 -0400
Subject: [PATCH] ir: Add opcodes for signed saturated accumulations of
 unsigned values

---
 src/backend/x64/emit_x64_vector.cpp  | 92 ++++++++++++++++++++++++++++
 src/frontend/ir/ir_emitter.cpp       | 15 +++++
 src/frontend/ir/ir_emitter.h         |  1 +
 src/frontend/ir/microinstruction.cpp |  4 ++
 src/frontend/ir/opcodes.inc          |  4 ++
 5 files changed, 116 insertions(+)

diff --git a/src/backend/x64/emit_x64_vector.cpp b/src/backend/x64/emit_x64_vector.cpp
index 230b69ae..5b52861f 100644
--- a/src/backend/x64/emit_x64_vector.cpp
+++ b/src/backend/x64/emit_x64_vector.cpp
@@ -99,6 +99,34 @@ static void EmitOneArgumentFallbackWithSaturation(BlockOfCode& code, EmitContext
     ctx.reg_alloc.DefineValue(inst, result);
 }
 
+template <typename Lambda>
+static void EmitTwoArgumentFallbackWithSaturation(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) {
+    const auto fn = static_cast<mp::equivalent_function_type_t<Lambda>*>(lambda);
+    constexpr u32 stack_space = 3 * 16;
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+    const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(args[0]);
+    const Xbyak::Xmm arg2 = ctx.reg_alloc.UseXmm(args[1]);
+    const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+    ctx.reg_alloc.EndOfAllocScope();
+
+    ctx.reg_alloc.HostCall(nullptr);
+    code.sub(rsp, stack_space + ABI_SHADOW_SPACE);
+    code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE + 0 * 16]);
+    code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + 1 * 16]);
+    code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE + 2 * 16]);
+
+    code.movaps(xword[code.ABI_PARAM2], arg1);
+    code.movaps(xword[code.ABI_PARAM3], arg2);
+    code.CallFunction(fn);
+    code.movaps(result, xword[rsp + ABI_SHADOW_SPACE + 0 * 16]);
+
+    code.add(rsp, stack_space + ABI_SHADOW_SPACE);
+
+    code.or_(code.byte[code.r15 + code.GetJitStateInfo().offsetof_fpsr_qc], code.ABI_RETURN.cvt8());
+
+    ctx.reg_alloc.DefineValue(inst, result);
+}
+
 template <typename Lambda>
 static void EmitTwoArgumentFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) {
     const auto fn = static_cast<mp::equivalent_function_type_t<Lambda>*>(lambda);
@@ -2764,6 +2792,70 @@ void EmitX64::EmitVectorSignedSaturatedAbs64(EmitContext& ctx, IR::Inst* inst) {
     });
 }
 
+// Simple generic case for 8, 16, and 32-bit values. 64-bit values
+// will need to be special-cased as we can't simply use a larger integral size.
+template <typename T>
+static bool EmitSignedSaturatedAccumulateUnsigned(VectorArray<T>& result, const VectorArray<T>& lhs, const VectorArray<T>& rhs) {
+    static_assert(std::is_signed_v<T>, "T must be signed.");
+    static_assert(sizeof(T) < sizeof(u64), "T must be less than 64 bits in size.");
+
+    bool qc_flag = false;
+
+    for (size_t i = 0; i < result.size(); i++) {
+        // We treat lhs' members as unsigned, so cast to unsigned before signed to inhibit sign-extension.
+        // We use the unsigned equivalent of T, as we want zero-extension to occur, rather than a plain move.
+        const s64 x = static_cast<s64>(static_cast<std::make_unsigned_t<T>>(lhs[i]));
+        const s64 y = rhs[i];
+        const s64 sum = x + y;
+
+        if (sum > std::numeric_limits<T>::max()) {
+            result[i] = std::numeric_limits<T>::max();
+            qc_flag = true;
+        } else if (sum < std::numeric_limits<T>::min()) {
+            result[i] = std::numeric_limits<T>::min();
+            qc_flag = true;
+        } else {
+            result[i] = static_cast<T>(sum);
+        }
+    }
+
+    return qc_flag;
+}
+
+void EmitX64::EmitVectorSignedSaturatedAccumulateUnsigned8(EmitContext& ctx, IR::Inst* inst) {
+    EmitTwoArgumentFallbackWithSaturation(code, ctx, inst, EmitSignedSaturatedAccumulateUnsigned<s8>);
+}
+
+void EmitX64::EmitVectorSignedSaturatedAccumulateUnsigned16(EmitContext& ctx, IR::Inst* inst) {
+    EmitTwoArgumentFallbackWithSaturation(code, ctx, inst, EmitSignedSaturatedAccumulateUnsigned<s16>);
+}
+
+void EmitX64::EmitVectorSignedSaturatedAccumulateUnsigned32(EmitContext& ctx, IR::Inst* inst) {
+    EmitTwoArgumentFallbackWithSaturation(code, ctx, inst, EmitSignedSaturatedAccumulateUnsigned<s32>);
+}
+
+void EmitX64::EmitVectorSignedSaturatedAccumulateUnsigned64(EmitContext& ctx, IR::Inst* inst) {
+    EmitTwoArgumentFallbackWithSaturation(code, ctx, inst, [](VectorArray<u64>& result, const VectorArray<u64>& lhs, const VectorArray<u64>& rhs) {
+        bool qc_flag = false;
+
+        for (size_t i = 0; i < result.size(); i++) {
+            const u64 x = lhs[i];
+            const u64 y = rhs[i];
+            const u64 res = x + y;
+
+            // Check sign bits to determine if an overflow occurred.
+            if (((x & res) | (~y & res) | (x & ~y)) & 0x8000000000000000) {
+                result[i] = static_cast<u64>(INT64_MAX);
+                qc_flag = true;
+            } else {
+                result[i] = res;
+            }
+        }
+
+        return qc_flag;
+    });
+}
+
 void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyReturnHigh16(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
diff --git a/src/frontend/ir/ir_emitter.cpp b/src/frontend/ir/ir_emitter.cpp
index 9e83be06..6c14720b 100644
--- a/src/frontend/ir/ir_emitter.cpp
+++ b/src/frontend/ir/ir_emitter.cpp
@@ -1529,6 +1529,21 @@ U128 IREmitter::VectorSignedSaturatedAbs(size_t esize, const U128& a) {
     return {};
 }
 
+U128 IREmitter::VectorSignedSaturatedAccumulateUnsigned(size_t esize, const U128& a, const U128& b) {
+    switch (esize) {
+    case 8:
+        return Inst<U128>(Opcode::VectorSignedSaturatedAccumulateUnsigned8, a, b);
+    case 16:
+        return Inst<U128>(Opcode::VectorSignedSaturatedAccumulateUnsigned16, a, b);
+    case 32:
+        return Inst<U128>(Opcode::VectorSignedSaturatedAccumulateUnsigned32, a, b);
+    case 64:
+        return Inst<U128>(Opcode::VectorSignedSaturatedAccumulateUnsigned64, a, b);
+    }
+    UNREACHABLE();
+    return {};
+}
+
 U128 IREmitter::VectorSignedSaturatedDoublingMultiplyReturnHigh(size_t esize, const U128& a, const U128& b) {
     switch (esize) {
     case 16:
diff --git a/src/frontend/ir/ir_emitter.h b/src/frontend/ir/ir_emitter.h
index 2d947391..7fa48028 100644
--- a/src/frontend/ir/ir_emitter.h
+++ b/src/frontend/ir/ir_emitter.h
@@ -265,6 +265,7 @@ public:
     U128 VectorSignExtend(size_t original_esize, const U128& a);
     U128 VectorSignedAbsoluteDifference(size_t esize, const U128& a, const U128& b);
     U128 VectorSignedSaturatedAbs(size_t esize, const U128& a);
+    U128 VectorSignedSaturatedAccumulateUnsigned(size_t esize, const U128& a, const U128& b);
     U128 VectorSignedSaturatedDoublingMultiplyReturnHigh(size_t esize, const U128& a, const U128& b);
     U128 VectorSignedSaturatedNarrowToSigned(size_t original_esize, const U128& a);
     U128 VectorSignedSaturatedNarrowToUnsigned(size_t original_esize, const U128& a);
diff --git a/src/frontend/ir/microinstruction.cpp b/src/frontend/ir/microinstruction.cpp
index 9e20bd90..85128b5b 100644
--- a/src/frontend/ir/microinstruction.cpp
+++ b/src/frontend/ir/microinstruction.cpp
@@ -351,6 +351,10 @@ bool Inst::WritesToFPSRCumulativeSaturationBit() const {
     case Opcode::VectorSignedSaturatedAbs16:
     case Opcode::VectorSignedSaturatedAbs32:
     case Opcode::VectorSignedSaturatedAbs64:
+    case Opcode::VectorSignedSaturatedAccumulateUnsigned8:
+    case Opcode::VectorSignedSaturatedAccumulateUnsigned16:
+    case Opcode::VectorSignedSaturatedAccumulateUnsigned32:
+    case Opcode::VectorSignedSaturatedAccumulateUnsigned64:
     case Opcode::VectorSignedSaturatedNarrowToSigned16:
     case Opcode::VectorSignedSaturatedNarrowToSigned32:
     case Opcode::VectorSignedSaturatedNarrowToSigned64:
diff --git a/src/frontend/ir/opcodes.inc b/src/frontend/ir/opcodes.inc
index e3a76411..5f79c424 100644
--- a/src/frontend/ir/opcodes.inc
+++ b/src/frontend/ir/opcodes.inc
@@ -397,6 +397,10 @@ OPCODE(VectorSignedSaturatedAbs8, U128, U128
 OPCODE(VectorSignedSaturatedAbs16,                          U128, U128                                    )
 OPCODE(VectorSignedSaturatedAbs32,                          U128, U128                                    )
 OPCODE(VectorSignedSaturatedAbs64,                          U128, U128                                    )
+OPCODE(VectorSignedSaturatedAccumulateUnsigned8,            U128, U128, U128                              )
+OPCODE(VectorSignedSaturatedAccumulateUnsigned16,           U128, U128, U128                              )
+OPCODE(VectorSignedSaturatedAccumulateUnsigned32,           U128, U128, U128                              )
+OPCODE(VectorSignedSaturatedAccumulateUnsigned64,           U128, U128, U128                              )
 OPCODE(VectorSignedSaturatedDoublingMultiplyReturnHigh16,   U128, U128, U128                              )
 OPCODE(VectorSignedSaturatedDoublingMultiplyReturnHigh32,   U128, U128, U128                              )
 OPCODE(VectorSignedSaturatedNarrowToSigned16,               U128, U128                                    )
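
A note on the generic fallback: per lane it widens both operands to s64 (the
lhs lane zero-extended, the rhs lane sign-extended), adds exactly, and clamps
to the signed range of T. Below is a minimal standalone sketch of one s8 lane;
the helper name accumulate_lane_s8 is illustrative only and not part of the
patch:

    #include <cassert>
    #include <cstdint>
    #include <limits>

    static int8_t accumulate_lane_s8(int8_t lhs, int8_t rhs, bool& qc) {
        // The lhs lane is read as unsigned (zero-extended); rhs stays signed.
        const int64_t x = static_cast<uint8_t>(lhs); // 0..255
        const int64_t y = rhs;                       // -128..127
        const int64_t sum = x + y;                   // exact in 64 bits

        if (sum > std::numeric_limits<int8_t>::max()) {
            qc = true; // saturation occurred; QC is sticky
            return std::numeric_limits<int8_t>::max();
        }
        if (sum < std::numeric_limits<int8_t>::min()) {
            qc = true;
            return std::numeric_limits<int8_t>::min();
        }
        return static_cast<int8_t>(sum);
    }

    int main() {
        bool qc = false;
        // 0xF0 reads as 240 unsigned; 240 + 10 = 250 > 127, so the lane saturates.
        assert(accumulate_lane_s8(static_cast<int8_t>(0xF0), 10, qc) == 127);
        assert(qc);

        qc = false;
        // 100 + (-30) = 70 fits in s8; no saturation, QC untouched.
        assert(accumulate_lane_s8(100, -30, qc) == 70);
        assert(!qc);
        return 0;
    }

Since x >= 0 and y >= -128, the lower clamp can never fire for this operand
mix; it is kept here only to mirror the shape of the patch's helper.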
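
The 64-bit path cannot widen further, so it derives the overflow condition
from sign bits alone: with X, Y, R the top bits of x, y, and the wrapping sum
res, the lane saturates iff (X & R) | (~Y & R) | (X & ~Y). A standalone sanity
check of that bit trick against exact 128-bit arithmetic (this sketch assumes
a compiler providing __int128, e.g. GCC or Clang):

    #include <cassert>
    #include <cstdint>

    int main() {
        const uint64_t msb = 0x8000000000000000ULL;
        const uint64_t samples[] = {0, 1, 2, 0x7FFFFFFFFFFFFFFFULL,
                                    msb, msb + 1, ~0ULL};

        for (uint64_t x : samples) {        // lhs lane, treated as unsigned
            for (uint64_t y : samples) {    // rhs lane, treated as signed
                const uint64_t res = x + y; // wrapping sum, as in the lambda
                const bool flagged =
                    (((x & res) | (~y & res) | (x & ~y)) & msb) != 0;

                // Exact sum of unsigned x and signed y, computed in 128 bits.
                const __int128 exact =
                    static_cast<__int128>(x) + static_cast<int64_t>(y);
                const bool overflows = exact > static_cast<__int128>(INT64_MAX);

                assert(flagged == overflows);
            }
        }
        return 0;
    }

Because x is non-negative and y >= INT64_MIN, the exact sum can only exceed
INT64_MAX, never fall below INT64_MIN, which is why the 64-bit lambda
saturates in one direction only.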