From 6b167a68e408c04d03fe0f4bbc6f56a7b841adb1 Mon Sep 17 00:00:00 2001 From: SachinVin Date: Sat, 27 Jul 2019 17:34:49 +0530 Subject: [PATCH] backend\A64\emit_a64_packed.cpp: Implement AddSub halving and non halving --- src/backend/A64/emit_a64_packed.cpp | 102 ++++++++++++++++++++++++++- src/backend/A64/opcodes.inc | 16 ++--- src/frontend/A32/decoder/arm_a64.inc | 16 ++--- 3 files changed, 117 insertions(+), 17 deletions(-) diff --git a/src/backend/A64/emit_a64_packed.cpp b/src/backend/A64/emit_a64_packed.cpp index b45db657..fb54361d 100644 --- a/src/backend/A64/emit_a64_packed.cpp +++ b/src/backend/A64/emit_a64_packed.cpp @@ -190,7 +190,6 @@ void EmitA64::EmitPackedHalvingAddU8(EmitContext& ctx, IR::Inst* inst) { code.fp_emitter.UHADD(B, a, a, b); ctx.reg_alloc.DefineValue(inst, a); - } void EmitA64::EmitPackedHalvingAddU16(EmitContext& ctx, IR::Inst* inst) { @@ -262,6 +261,107 @@ void EmitA64::EmitPackedHalvingSubS16(EmitContext& ctx, IR::Inst* inst) { code.fp_emitter.SHSUB(H, a, a, b); ctx.reg_alloc.DefineValue(inst, a); } + +void EmitPackedSubAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, bool hi_is_sum, bool is_signed, bool is_halving) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp); + + const ARM64Reg reg_a_hi = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[0])); + const ARM64Reg reg_b_hi = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[1])); + const ARM64Reg reg_a_lo = DecodeReg(ctx.reg_alloc.ScratchGpr()); + const ARM64Reg reg_b_lo = DecodeReg(ctx.reg_alloc.ScratchGpr()); + ARM64Reg reg_sum, reg_diff; + + if (is_signed) { + code.SXTH(reg_a_lo, reg_a_hi); + code.SXTH(reg_b_lo, reg_b_hi); + code.ASR(reg_a_hi, reg_a_hi, 16); + code.ASR(reg_b_hi, reg_b_hi, 16); + } else { + code.UXTH(reg_a_lo, reg_a_hi); + code.UXTH(reg_b_lo, reg_b_hi); + code.LSR(reg_a_hi, reg_a_hi, 16); + code.LSR(reg_b_hi, reg_b_hi, 16); + } + + if (hi_is_sum) { + code.SUB(reg_a_lo, reg_a_lo, reg_b_hi); + code.ADD(reg_a_hi, reg_a_hi, reg_b_lo); + reg_diff = reg_a_lo; + reg_sum = reg_a_hi; + } else { + code.ADD(reg_a_lo, reg_a_lo, reg_b_hi); + code.SUB(reg_a_hi, reg_a_hi, reg_b_lo); + reg_diff = reg_a_hi; + reg_sum = reg_a_lo; + } + + if (ge_inst) { + // The reg_b registers are no longer required. + const ARM64Reg ge_sum = reg_b_hi; + const ARM64Reg ge_diff = reg_b_lo; + + if (!is_signed) { + code.LSL(ge_sum, reg_sum, 15); + code.ASR(ge_sum, ge_sum, 31); + } else { + code.MVN(ge_sum, reg_sum); + code.ASR(ge_sum, ge_sum, 31); + } + code.MVN(ge_diff, reg_diff); + code.ASR(ge_diff, ge_diff, 31); + code.ANDI2R(ge_sum, ge_sum, hi_is_sum ? 0xFFFF0000 : 0x0000FFFF); + code.ANDI2R(ge_diff, ge_diff, hi_is_sum ? 0x0000FFFF : 0xFFFF0000); + code.ORR(ge_sum, ge_sum, ge_diff); + + ctx.reg_alloc.DefineValue(ge_inst, ge_sum); + ctx.EraseInstruction(ge_inst); + } + + if (is_halving) { + code.LSR(reg_a_hi, reg_a_hi, 1); + code.LSR(reg_a_lo, reg_a_lo, 1); + } + + // reg_a_lo now contains the low word and reg_a_hi now contains the high word. + // Merge them. + code.BFM(reg_a_lo, reg_a_hi, 16, 15); + + ctx.reg_alloc.DefineValue(inst, reg_a_lo); +} + +void EmitA64::EmitPackedAddSubU16(EmitContext& ctx, IR::Inst* inst) { + EmitPackedSubAdd(code, ctx, inst, true, false, false); +} + +void EmitA64::EmitPackedAddSubS16(EmitContext& ctx, IR::Inst* inst) { + EmitPackedSubAdd(code, ctx, inst, true, true, false); +} + +void EmitA64::EmitPackedSubAddU16(EmitContext& ctx, IR::Inst* inst) { + EmitPackedSubAdd(code, ctx, inst, false, false, false); +} + +void EmitA64::EmitPackedSubAddS16(EmitContext& ctx, IR::Inst* inst) { + EmitPackedSubAdd(code, ctx, inst, false, true, false); +} + +void EmitA64::EmitPackedHalvingAddSubU16(EmitContext& ctx, IR::Inst* inst) { + EmitPackedSubAdd(code, ctx, inst, true, false, true); +} + +void EmitA64::EmitPackedHalvingAddSubS16(EmitContext& ctx, IR::Inst* inst) { + EmitPackedSubAdd(code, ctx, inst, true, true, true); +} + +void EmitA64::EmitPackedHalvingSubAddU16(EmitContext& ctx, IR::Inst* inst) { + EmitPackedSubAdd(code, ctx, inst, false, false, true); +} + +void EmitA64::EmitPackedHalvingSubAddS16(EmitContext& ctx, IR::Inst* inst) { + EmitPackedSubAdd(code, ctx, inst, false, true, true); +} + void EmitA64::EmitPackedSaturatedAddU8(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); diff --git a/src/backend/A64/opcodes.inc b/src/backend/A64/opcodes.inc index 04eab7b5..e52c15f4 100644 --- a/src/backend/A64/opcodes.inc +++ b/src/backend/A64/opcodes.inc @@ -187,15 +187,11 @@ OPCODE(PackedSubS8, U32, U32, OPCODE(PackedAddU16, U32, U32, U32 ) OPCODE(PackedAddS16, U32, U32, U32 ) OPCODE(PackedSubU16, U32, U32, U32 ) -//OPCODE(PackedAddSubU16, U32, U32, U32 ) -//OPCODE(PackedAddSubS16, U32, U32, U32 ) -//OPCODE(PackedSubAddU16, U32, U32, U32 ) -//OPCODE(PackedSubAddS16, U32, U32, U32 ) -//OPCODE(PackedHalvingAddSubU16, U32, U32, U32 ) -//OPCODE(PackedHalvingAddSubS16, U32, U32, U32 ) -//OPCODE(PackedHalvingSubAddU16, U32, U32, U32 ) -//OPCODE(PackedHalvingSubAddS16, U32, U32, U32 ) OPCODE(PackedSubS16, U32, U32, U32 ) +OPCODE(PackedAddSubU16, U32, U32, U32 ) +OPCODE(PackedAddSubS16, U32, U32, U32 ) +OPCODE(PackedSubAddU16, U32, U32, U32 ) +OPCODE(PackedSubAddS16, U32, U32, U32 ) OPCODE(PackedHalvingAddU8, U32, U32, U32 ) OPCODE(PackedHalvingAddS8, U32, U32, U32 ) OPCODE(PackedHalvingSubU8, U32, U32, U32 ) @@ -204,6 +200,10 @@ OPCODE(PackedHalvingAddU16, U32, U32, OPCODE(PackedHalvingAddS16, U32, U32, U32 ) OPCODE(PackedHalvingSubU16, U32, U32, U32 ) OPCODE(PackedHalvingSubS16, U32, U32, U32 ) +OPCODE(PackedHalvingAddSubU16, U32, U32, U32 ) +OPCODE(PackedHalvingAddSubS16, U32, U32, U32 ) +OPCODE(PackedHalvingSubAddU16, U32, U32, U32 ) +OPCODE(PackedHalvingSubAddS16, U32, U32, U32 ) OPCODE(PackedSaturatedAddU8, U32, U32, U32 ) OPCODE(PackedSaturatedAddS8, U32, U32, U32 ) OPCODE(PackedSaturatedSubU8, U32, U32, U32 ) diff --git a/src/frontend/A32/decoder/arm_a64.inc b/src/frontend/A32/decoder/arm_a64.inc index ade85048..595e57f2 100644 --- a/src/frontend/A32/decoder/arm_a64.inc +++ b/src/frontend/A32/decoder/arm_a64.inc @@ -244,16 +244,16 @@ INST(arm_SMUAD, "SMUAD", "cccc01110000dddd1111mmmm00M1nnnn INST(arm_SMUSD, "SMUSD", "cccc01110000dddd1111mmmm01M1nnnn") // v6 // Parallel Add/Subtract (Modulo) instructions -//INST(arm_SASX, "SASX", "cccc01100001nnnndddd11110011mmmm") // v6 -//INST(arm_SSAX, "SSAX", "cccc01100001nnnndddd11110101mmmm") // v6 +INST(arm_SASX, "SASX", "cccc01100001nnnndddd11110011mmmm") // v6 +INST(arm_SSAX, "SSAX", "cccc01100001nnnndddd11110101mmmm") // v6 INST(arm_SADD8, "SADD8", "cccc01100001nnnndddd11111001mmmm") // v6 INST(arm_SADD16, "SADD16", "cccc01100001nnnndddd11110001mmmm") // v6 INST(arm_SSUB8, "SSUB8", "cccc01100001nnnndddd11111111mmmm") // v6 INST(arm_SSUB16, "SSUB16", "cccc01100001nnnndddd11110111mmmm") // v6 INST(arm_UADD8, "UADD8", "cccc01100101nnnndddd11111001mmmm") // v6 INST(arm_UADD16, "UADD16", "cccc01100101nnnndddd11110001mmmm") // v6 -//INST(arm_UASX, "UASX", "cccc01100101nnnndddd11110011mmmm") // v6 -//INST(arm_USAX, "USAX", "cccc01100101nnnndddd11110101mmmm") // v6 +INST(arm_UASX, "UASX", "cccc01100101nnnndddd11110011mmmm") // v6 +INST(arm_USAX, "USAX", "cccc01100101nnnndddd11110101mmmm") // v6 INST(arm_USUB8, "USUB8", "cccc01100101nnnndddd11111111mmmm") // v6 INST(arm_USUB16, "USUB16", "cccc01100101nnnndddd11110111mmmm") // v6 @@ -272,10 +272,10 @@ INST(arm_UQSUB8, "UQSUB8", "cccc01100110nnnndddd11111111mmmm INST(arm_UQSUB16, "UQSUB16", "cccc01100110nnnndddd11110111mmmm") // v6 // Parallel Add/Subtract (Halving) instructions -//INST(arm_SHASX, "SHASX", "cccc01100011nnnndddd11110011mmmm") // v6 -//INST(arm_SHSAX, "SHSAX", "cccc01100011nnnndddd11110101mmmm") // v6 -//INST(arm_UHASX, "UHASX", "cccc01100111nnnndddd11110011mmmm") // v6 -//INST(arm_UHSAX, "UHSAX", "cccc01100111nnnndddd11110101mmmm") // v6 +INST(arm_SHASX, "SHASX", "cccc01100011nnnndddd11110011mmmm") // v6 +INST(arm_SHSAX, "SHSAX", "cccc01100011nnnndddd11110101mmmm") // v6 +INST(arm_UHASX, "UHASX", "cccc01100111nnnndddd11110011mmmm") // v6 +INST(arm_UHSAX, "UHSAX", "cccc01100111nnnndddd11110101mmmm") // v6 INST(arm_SHADD8, "SHADD8", "cccc01100011nnnndddd11111001mmmm") // v6 INST(arm_SHADD16, "SHADD16", "cccc01100011nnnndddd11110001mmmm") // v6 INST(arm_SHSUB8, "SHSUB8", "cccc01100011nnnndddd11111111mmmm") // v6