From a6c2d1952acee976dbe19a219b9bbc428822c2cc Mon Sep 17 00:00:00 2001 From: SachinVin Date: Sat, 28 Sep 2019 18:14:44 +0530 Subject: [PATCH] backend/A64/emit_a64_saturation.cpp: Implement EmitSignedSaturation and EmitUnsignedSaturation Implements SSAT SSAT16 USAT USAT16 QASX QSAX UQASX UQSAX --- src/CMakeLists.txt | 3 +- src/backend/A64/emit_a64_saturation.cpp | 79 ++++++++++++++++++++++++- src/backend/A64/opcodes.inc | 4 +- src/frontend/A32/decoder/arm_a64.inc | 16 ++--- 4 files changed, 89 insertions(+), 13 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 23249422..d3084a34 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -360,14 +360,13 @@ elseif(ARCHITECTURE_Aarch64) backend/A64/emit_a64_data_processing.cpp backend/A64/emit_a64_floating_point.cpp backend/A64/emit_a64_packed.cpp - # backend/A64/emit_a64_saturation.cpp + backend/A64/emit_a64_saturation.cpp # backend/A64/emit_a64_sm4.cpp # backend/A64/emit_a64_vector.cpp # backend/A64/emit_a64_vector_floating_point.cpp backend/A64/hostloc.cpp backend/A64/hostloc.h backend/A64/jitstate_info.h - # backend/A64/oparg.h backend/A64/opcodes.inc backend/A64/perf_map.cpp backend/A64/perf_map.h diff --git a/src/backend/A64/emit_a64_saturation.cpp b/src/backend/A64/emit_a64_saturation.cpp index b53e8d7e..43959517 100644 --- a/src/backend/A64/emit_a64_saturation.cpp +++ b/src/backend/A64/emit_a64_saturation.cpp @@ -20,5 +20,82 @@ namespace Dynarmic::BackendA64 { namespace mp = Dynarmic::Common::mp; -namespace { +void EmitA64::EmitSignedSaturation(EmitContext& ctx, IR::Inst* inst) { /* Emits host code that clamps the value in args[0] to the signed N-bit range, where N is the immediate in args[1] (asserted to be 1..32). When a GetOverflowFromOp pseudo-operation is attached to inst, it is also given a value here. */ + const auto overflow_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetOverflowFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const size_t N = args[1].GetImmediateU8(); + ASSERT(N >= 1 && N <= 32); + + if (N == 32) { /* Full-width case: no code is emitted. The input argument is forwarded as the result and every use of the overflow pseudo-op is replaced with constant false. */ + if (overflow_inst) { + const auto no_overflow = IR::Value(false); + overflow_inst->ReplaceUsesWith(no_overflow); + } + ctx.reg_alloc.DefineValue(inst, args[0]); + return; + 
} + + const u32 mask = (1u << N) - 1; /* low N bits set; N is at most 31 here because N == 32 returned above */ + const u32 positive_saturated_value = (1u << (N - 1)) - 1; /* upper bound: 2^(N-1) - 1 */ + const u32 negative_saturated_value = 1u << (N - 1); /* 2^(N-1): used both as the bias added below and as the N-bit pattern of the lower bound */ + const u32 sext_negative_satured_value = Common::SignExtend(N, negative_saturated_value); /* NOTE(review): presumably the lower bound widened from N bits to a full-width negative value; confirm against Common::SignExtend */ + + const ARM64Reg result = DecodeReg(ctx.reg_alloc.ScratchGpr()); + const ARM64Reg reg_a = DecodeReg(ctx.reg_alloc.UseGpr(args[0])); + const ARM64Reg overflow = DecodeReg(ctx.reg_alloc.ScratchGpr()); + const ARM64Reg tmp = DecodeReg(ctx.reg_alloc.ScratchGpr()); + + // overflow now contains a value between 0 and mask if it was originally between {negative,positive}_saturated_value. + code.ADDI2R(overflow, reg_a, negative_saturated_value, overflow); + + // Put the appropriate saturated value in result + code.MOVI2R(tmp, positive_saturated_value); + code.CMP(reg_a, tmp); + code.MOVI2R(result, sext_negative_satured_value); + code.CSEL(result, tmp, result, CC_GT); /* result = upper bound when reg_a is greater than it (signed condition GT on the CMP above), otherwise the lower bound */ + + // Do the saturation + code.CMPI2R(overflow, mask, tmp); + code.CSEL(result, reg_a, result, CC_LS); /* biased value is lower-or-same than mask (unsigned condition LS): the input was already in range, so pass reg_a through */ + + if (overflow_inst) { + code.CSET(overflow, CC_HI); /* overflow = 1 when the biased value was above mask, i.e. the input was clamped. NOTE(review): this reads the flags from the CMPI2R above and so relies on the intervening CSEL not modifying them; confirm against the A64 ISA reference */ + + ctx.reg_alloc.DefineValue(overflow_inst, overflow); + ctx.EraseInstruction(overflow_inst); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitUnsignedSaturation(EmitContext& ctx, IR::Inst* inst) { /* Emits host code that clamps the value in args[0] to the range 0 .. (1 << N) - 1, where N is the immediate in args[1] (asserted to be at most 31). When a GetOverflowFromOp pseudo-operation is attached to inst, it is also given a value here. */ + const auto overflow_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetOverflowFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const size_t N = args[1].GetImmediateU8(); + ASSERT(N <= 31); + + const u32 saturated_value = (1u << N) - 1; /* upper bound: low N bits set (0 when N == 0) */ + + const ARM64Reg result = DecodeReg(ctx.reg_alloc.ScratchGpr()); + const ARM64Reg reg_a = DecodeReg(ctx.reg_alloc.UseGpr(args[0])); + const ARM64Reg overflow = DecodeReg(ctx.reg_alloc.ScratchGpr()); /* only written inside the overflow_inst branch below, but allocated unconditionally */ + + // Pseudocode: result = clamp(reg_a, 0, saturated_value); + code.MOVI2R(result, saturated_value); + code.CMP(reg_a, result); + code.CSEL(result, WZR, result, CC_LE); /* signed reg_a <= saturated_value: provisionally select zero, which is the final answer for negative inputs */ + code.CSEL(result, reg_a, result, CC_LS); /* unsigned reg_a <= saturated_value: input already in range, pass it through; otherwise keep the value chosen on the previous line */ + + if (overflow_inst) { + 
code.CSET(overflow, CC_HI); /* overflow = 1 when reg_a, compared as unsigned, was above saturated_value; a negative input also satisfies this */ + + ctx.reg_alloc.DefineValue(overflow_inst, overflow); + ctx.EraseInstruction(overflow_inst); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + } // namespace Dynarmic::BackendA64 diff --git a/src/backend/A64/opcodes.inc b/src/backend/A64/opcodes.inc index 72caf173..cee8895a 100644 --- a/src/backend/A64/opcodes.inc +++ b/src/backend/A64/opcodes.inc @@ -168,7 +168,7 @@ OPCODE(CountLeadingZeros64, U64, U64 //OPCODE(SignedSaturatedSub16, U16, U16, U16 ) //OPCODE(SignedSaturatedSub32, U32, U32, U32 ) //OPCODE(SignedSaturatedSub64, U64, U64, U64 ) -//OPCODE(SignedSaturation, U32, U32, U8 ) +OPCODE(SignedSaturation, U32, U32, U8 ) //OPCODE(UnsignedSaturatedAdd8, U8, U8, U8 ) //OPCODE(UnsignedSaturatedAdd16, U16, U16, U16 ) //OPCODE(UnsignedSaturatedAdd32, U32, U32, U32 ) @@ -177,7 +177,7 @@ OPCODE(CountLeadingZeros64, U64, U64 //OPCODE(UnsignedSaturatedSub16, U16, U16, U16 ) //OPCODE(UnsignedSaturatedSub32, U32, U32, U32 ) //OPCODE(UnsignedSaturatedSub64, U64, U64, U64 ) -//OPCODE(UnsignedSaturation, U32, U32, U8 ) +OPCODE(UnsignedSaturation, U32, U32, U8 ) // Packed instructions OPCODE(PackedAddU8, U32, U32, U32 ) diff --git a/src/frontend/A32/decoder/arm_a64.inc b/src/frontend/A32/decoder/arm_a64.inc index 595e57f2..eafb39a8 100644 --- a/src/frontend/A32/decoder/arm_a64.inc +++ b/src/frontend/A32/decoder/arm_a64.inc @@ -200,10 +200,10 @@ INST(arm_REV16, "REV16", "cccc011010111111dddd11111011mmmm INST(arm_REVSH, "REVSH", "cccc011011111111dddd11111011mmmm") // v6 // Saturation instructions -//INST(arm_SSAT, "SSAT", "cccc0110101vvvvvddddvvvvvr01nnnn") // v6 -//INST(arm_SSAT16, "SSAT16", "cccc01101010vvvvdddd11110011nnnn") // v6 -//INST(arm_USAT, "USAT", "cccc0110111vvvvvddddvvvvvr01nnnn") // v6 -//INST(arm_USAT16, "USAT16", "cccc01101110vvvvdddd11110011nnnn") // v6 +INST(arm_SSAT, "SSAT", "cccc0110101vvvvvddddvvvvvr01nnnn") // v6 +INST(arm_SSAT16, "SSAT16", "cccc01101010vvvvdddd11110011nnnn") // v6 +INST(arm_USAT, "USAT", 
"cccc0110111vvvvvddddvvvvvr01nnnn") // v6 +INST(arm_USAT16, "USAT16", "cccc01101110vvvvdddd11110011nnnn") // v6 // Divide instructions INST(arm_SDIV, "SDIV", "cccc01110001dddd1111mmmm0001nnnn") // v7a @@ -260,14 +260,14 @@ INST(arm_USUB16, "USUB16", "cccc01100101nnnndddd11110111mmmm // Parallel Add/Subtract (Saturating) instructions INST(arm_QADD8, "QADD8", "cccc01100010nnnndddd11111001mmmm") // v6 INST(arm_QADD16, "QADD16", "cccc01100010nnnndddd11110001mmmm") // v6 -//INST(arm_QASX, "QASX", "cccc01100010nnnndddd11110011mmmm") // v6 -//INST(arm_QSAX, "QSAX", "cccc01100010nnnndddd11110101mmmm") // v6 +INST(arm_QASX, "QASX", "cccc01100010nnnndddd11110011mmmm") // v6 +INST(arm_QSAX, "QSAX", "cccc01100010nnnndddd11110101mmmm") // v6 INST(arm_QSUB8, "QSUB8", "cccc01100010nnnndddd11111111mmmm") // v6 INST(arm_QSUB16, "QSUB16", "cccc01100010nnnndddd11110111mmmm") // v6 INST(arm_UQADD8, "UQADD8", "cccc01100110nnnndddd11111001mmmm") // v6 INST(arm_UQADD16, "UQADD16", "cccc01100110nnnndddd11110001mmmm") // v6 -//INST(arm_UQASX, "UQASX", "cccc01100110nnnndddd11110011mmmm") // v6 -//INST(arm_UQSAX, "UQSAX", "cccc01100110nnnndddd11110101mmmm") // v6 +INST(arm_UQASX, "UQASX", "cccc01100110nnnndddd11110011mmmm") // v6 +INST(arm_UQSAX, "UQSAX", "cccc01100110nnnndddd11110101mmmm") // v6 INST(arm_UQSUB8, "UQSUB8", "cccc01100110nnnndddd11111111mmmm") // v6 INST(arm_UQSUB16, "UQSUB16", "cccc01100110nnnndddd11110111mmmm") // v6