backend\A64\emit_a64_packed.cpp: Implement halving and non-halving packed AddSub/SubAdd

This commit is contained in:
SachinVin 2019-07-27 17:34:49 +05:30
parent a87b13cabf
commit 6b167a68e4
3 changed files with 117 additions and 17 deletions

View File

@ -190,7 +190,6 @@ void EmitA64::EmitPackedHalvingAddU8(EmitContext& ctx, IR::Inst* inst) {
code.fp_emitter.UHADD(B, a, a, b);
ctx.reg_alloc.DefineValue(inst, a);
}
void EmitA64::EmitPackedHalvingAddU16(EmitContext& ctx, IR::Inst* inst) {
@ -262,6 +261,107 @@ void EmitA64::EmitPackedHalvingSubS16(EmitContext& ctx, IR::Inst* inst) {
code.fp_emitter.SHSUB(H, a, a, b);
ctx.reg_alloc.DefineValue(inst, a);
}
// Shared emitter for the packed 16-bit "cross" add/subtract family
// (ASX/SAX and their halving variants). Each 32-bit operand holds two
// 16-bit lanes; the operation pairs the low lane of one operand with the
// high lane of the other:
//   hi_is_sum == true:  result.hi = a.hi + b.lo, result.lo = a.lo - b.hi
//   hi_is_sum == false: result.hi = a.hi - b.lo, result.lo = a.lo + b.hi
// is_signed selects sign- vs zero-extension of the lanes (and the matching
// GE-flag computation); is_halving shifts each result right by one bit.
void EmitPackedSubAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, bool hi_is_sum, bool is_signed, bool is_halving) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    // GE-flag pseudo-operation attached to this instruction, if any consumer exists.
    const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp);

    const ARM64Reg reg_a_hi = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[0]));
    const ARM64Reg reg_b_hi = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[1]));
    const ARM64Reg reg_a_lo = DecodeReg(ctx.reg_alloc.ScratchGpr());
    const ARM64Reg reg_b_lo = DecodeReg(ctx.reg_alloc.ScratchGpr());
    // Aliases assigned below once we know which lane holds the sum.
    ARM64Reg reg_sum, reg_diff;

    // Split each operand: *_lo receives the extended low halfword, *_hi is
    // shifted down so it holds the extended high halfword. Extending to
    // 32 bits preserves the full 17-bit intermediate results.
    if (is_signed) {
        code.SXTH(reg_a_lo, reg_a_hi);
        code.SXTH(reg_b_lo, reg_b_hi);
        code.ASR(reg_a_hi, reg_a_hi, 16);
        code.ASR(reg_b_hi, reg_b_hi, 16);
    } else {
        code.UXTH(reg_a_lo, reg_a_hi);
        code.UXTH(reg_b_lo, reg_b_hi);
        code.LSR(reg_a_hi, reg_a_hi, 16);
        code.LSR(reg_b_hi, reg_b_hi, 16);
    }

    // Cross add/subtract: a's high lane pairs with b's low lane and vice versa.
    if (hi_is_sum) {
        code.SUB(reg_a_lo, reg_a_lo, reg_b_hi);
        code.ADD(reg_a_hi, reg_a_hi, reg_b_lo);
        reg_diff = reg_a_lo;
        reg_sum = reg_a_hi;
    } else {
        code.ADD(reg_a_lo, reg_a_lo, reg_b_hi);
        code.SUB(reg_a_hi, reg_a_hi, reg_b_lo);
        reg_diff = reg_a_hi;
        reg_sum = reg_a_lo;
    }

    if (ge_inst) {
        // The reg_b registers are no longer required.
        const ARM64Reg ge_sum = reg_b_hi;
        const ARM64Reg ge_diff = reg_b_lo;

        if (!is_signed) {
            // Unsigned sum lane: GE is the carry out of the 16-bit addition,
            // i.e. bit 16 of the 17-bit result, broadcast across the register
            // (LSL 15 moves bit 16 to bit 31; ASR 31 replicates it).
            code.LSL(ge_sum, reg_sum, 15);
            code.ASR(ge_sum, ge_sum, 31);
        } else {
            // Signed sum lane: GE is set when the sum is non-negative —
            // broadcast the inverted sign bit.
            code.MVN(ge_sum, reg_sum);
            code.ASR(ge_sum, ge_sum, 31);
        }
        // Difference lane: GE is set when the difference is non-negative
        // (for unsigned inputs this means no borrow occurred).
        code.MVN(ge_diff, reg_diff);
        code.ASR(ge_diff, ge_diff, 31);

        // Keep only the halfword each result lives in, then combine both
        // halves into the packed GE value.
        code.ANDI2R(ge_sum, ge_sum, hi_is_sum ? 0xFFFF0000 : 0x0000FFFF);
        code.ANDI2R(ge_diff, ge_diff, hi_is_sum ? 0x0000FFFF : 0xFFFF0000);
        code.ORR(ge_sum, ge_sum, ge_diff);

        ctx.reg_alloc.DefineValue(ge_inst, ge_sum);
        ctx.EraseInstruction(ge_inst);
    }

    if (is_halving) {
        // Halve each result. Only bits [15:0] of each register survive the
        // BFM merge below, so a logical shift suffices even for signed values.
        code.LSR(reg_a_hi, reg_a_hi, 1);
        code.LSR(reg_a_lo, reg_a_lo, 1);
    }

    // reg_a_lo now contains the low word and reg_a_hi now contains the high word.
    // Merge them.
    code.BFM(reg_a_lo, reg_a_hi, 16, 15);
    ctx.reg_alloc.DefineValue(inst, reg_a_lo);
}
// Unsigned cross add/sub (ARM UASX): sum in the high halfword, no halving.
void EmitA64::EmitPackedAddSubU16(EmitContext& ctx, IR::Inst* inst) {
    constexpr bool hi_is_sum = true;
    constexpr bool is_signed = false;
    constexpr bool is_halving = false;
    EmitPackedSubAdd(code, ctx, inst, hi_is_sum, is_signed, is_halving);
}
// Signed cross add/sub (ARM SASX): sum in the high halfword, no halving.
void EmitA64::EmitPackedAddSubS16(EmitContext& ctx, IR::Inst* inst) {
    constexpr bool hi_is_sum = true;
    constexpr bool is_signed = true;
    constexpr bool is_halving = false;
    EmitPackedSubAdd(code, ctx, inst, hi_is_sum, is_signed, is_halving);
}
// Unsigned cross sub/add (ARM USAX): sum in the low halfword, no halving.
void EmitA64::EmitPackedSubAddU16(EmitContext& ctx, IR::Inst* inst) {
    constexpr bool hi_is_sum = false;
    constexpr bool is_signed = false;
    constexpr bool is_halving = false;
    EmitPackedSubAdd(code, ctx, inst, hi_is_sum, is_signed, is_halving);
}
// Signed cross sub/add (ARM SSAX): sum in the low halfword, no halving.
void EmitA64::EmitPackedSubAddS16(EmitContext& ctx, IR::Inst* inst) {
    constexpr bool hi_is_sum = false;
    constexpr bool is_signed = true;
    constexpr bool is_halving = false;
    EmitPackedSubAdd(code, ctx, inst, hi_is_sum, is_signed, is_halving);
}
// Unsigned halving cross add/sub (ARM UHASX): sum in the high halfword.
void EmitA64::EmitPackedHalvingAddSubU16(EmitContext& ctx, IR::Inst* inst) {
    constexpr bool hi_is_sum = true;
    constexpr bool is_signed = false;
    constexpr bool is_halving = true;
    EmitPackedSubAdd(code, ctx, inst, hi_is_sum, is_signed, is_halving);
}
// Signed halving cross add/sub (ARM SHASX): sum in the high halfword.
void EmitA64::EmitPackedHalvingAddSubS16(EmitContext& ctx, IR::Inst* inst) {
    constexpr bool hi_is_sum = true;
    constexpr bool is_signed = true;
    constexpr bool is_halving = true;
    EmitPackedSubAdd(code, ctx, inst, hi_is_sum, is_signed, is_halving);
}
// Unsigned halving cross sub/add (ARM UHSAX): sum in the low halfword.
void EmitA64::EmitPackedHalvingSubAddU16(EmitContext& ctx, IR::Inst* inst) {
    constexpr bool hi_is_sum = false;
    constexpr bool is_signed = false;
    constexpr bool is_halving = true;
    EmitPackedSubAdd(code, ctx, inst, hi_is_sum, is_signed, is_halving);
}
// Signed halving cross sub/add (ARM SHSAX): sum in the low halfword.
void EmitA64::EmitPackedHalvingSubAddS16(EmitContext& ctx, IR::Inst* inst) {
    constexpr bool hi_is_sum = false;
    constexpr bool is_signed = true;
    constexpr bool is_halving = true;
    EmitPackedSubAdd(code, ctx, inst, hi_is_sum, is_signed, is_halving);
}
void EmitA64::EmitPackedSaturatedAddU8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);

View File

@ -187,15 +187,11 @@ OPCODE(PackedSubS8, U32, U32,
OPCODE(PackedAddU16, U32, U32, U32 )
OPCODE(PackedAddS16, U32, U32, U32 )
OPCODE(PackedSubU16, U32, U32, U32 )
//OPCODE(PackedAddSubU16, U32, U32, U32 )
//OPCODE(PackedAddSubS16, U32, U32, U32 )
//OPCODE(PackedSubAddU16, U32, U32, U32 )
//OPCODE(PackedSubAddS16, U32, U32, U32 )
//OPCODE(PackedHalvingAddSubU16, U32, U32, U32 )
//OPCODE(PackedHalvingAddSubS16, U32, U32, U32 )
//OPCODE(PackedHalvingSubAddU16, U32, U32, U32 )
//OPCODE(PackedHalvingSubAddS16, U32, U32, U32 )
OPCODE(PackedSubS16, U32, U32, U32 )
OPCODE(PackedAddSubU16, U32, U32, U32 )
OPCODE(PackedAddSubS16, U32, U32, U32 )
OPCODE(PackedSubAddU16, U32, U32, U32 )
OPCODE(PackedSubAddS16, U32, U32, U32 )
OPCODE(PackedHalvingAddU8, U32, U32, U32 )
OPCODE(PackedHalvingAddS8, U32, U32, U32 )
OPCODE(PackedHalvingSubU8, U32, U32, U32 )
@ -204,6 +200,10 @@ OPCODE(PackedHalvingAddU16, U32, U32,
OPCODE(PackedHalvingAddS16, U32, U32, U32 )
OPCODE(PackedHalvingSubU16, U32, U32, U32 )
OPCODE(PackedHalvingSubS16, U32, U32, U32 )
OPCODE(PackedHalvingAddSubU16, U32, U32, U32 )
OPCODE(PackedHalvingAddSubS16, U32, U32, U32 )
OPCODE(PackedHalvingSubAddU16, U32, U32, U32 )
OPCODE(PackedHalvingSubAddS16, U32, U32, U32 )
OPCODE(PackedSaturatedAddU8, U32, U32, U32 )
OPCODE(PackedSaturatedAddS8, U32, U32, U32 )
OPCODE(PackedSaturatedSubU8, U32, U32, U32 )

View File

@ -244,16 +244,16 @@ INST(arm_SMUAD, "SMUAD", "cccc01110000dddd1111mmmm00M1nnnn
INST(arm_SMUSD, "SMUSD", "cccc01110000dddd1111mmmm01M1nnnn") // v6
// Parallel Add/Subtract (Modulo) instructions
//INST(arm_SASX, "SASX", "cccc01100001nnnndddd11110011mmmm") // v6
//INST(arm_SSAX, "SSAX", "cccc01100001nnnndddd11110101mmmm") // v6
INST(arm_SASX, "SASX", "cccc01100001nnnndddd11110011mmmm") // v6
INST(arm_SSAX, "SSAX", "cccc01100001nnnndddd11110101mmmm") // v6
INST(arm_SADD8, "SADD8", "cccc01100001nnnndddd11111001mmmm") // v6
INST(arm_SADD16, "SADD16", "cccc01100001nnnndddd11110001mmmm") // v6
INST(arm_SSUB8, "SSUB8", "cccc01100001nnnndddd11111111mmmm") // v6
INST(arm_SSUB16, "SSUB16", "cccc01100001nnnndddd11110111mmmm") // v6
INST(arm_UADD8, "UADD8", "cccc01100101nnnndddd11111001mmmm") // v6
INST(arm_UADD16, "UADD16", "cccc01100101nnnndddd11110001mmmm") // v6
//INST(arm_UASX, "UASX", "cccc01100101nnnndddd11110011mmmm") // v6
//INST(arm_USAX, "USAX", "cccc01100101nnnndddd11110101mmmm") // v6
INST(arm_UASX, "UASX", "cccc01100101nnnndddd11110011mmmm") // v6
INST(arm_USAX, "USAX", "cccc01100101nnnndddd11110101mmmm") // v6
INST(arm_USUB8, "USUB8", "cccc01100101nnnndddd11111111mmmm") // v6
INST(arm_USUB16, "USUB16", "cccc01100101nnnndddd11110111mmmm") // v6
@ -272,10 +272,10 @@ INST(arm_UQSUB8, "UQSUB8", "cccc01100110nnnndddd11111111mmmm
INST(arm_UQSUB16, "UQSUB16", "cccc01100110nnnndddd11110111mmmm") // v6
// Parallel Add/Subtract (Halving) instructions
//INST(arm_SHASX, "SHASX", "cccc01100011nnnndddd11110011mmmm") // v6
//INST(arm_SHSAX, "SHSAX", "cccc01100011nnnndddd11110101mmmm") // v6
//INST(arm_UHASX, "UHASX", "cccc01100111nnnndddd11110011mmmm") // v6
//INST(arm_UHSAX, "UHSAX", "cccc01100111nnnndddd11110101mmmm") // v6
INST(arm_SHASX, "SHASX", "cccc01100011nnnndddd11110011mmmm") // v6
INST(arm_SHSAX, "SHSAX", "cccc01100011nnnndddd11110101mmmm") // v6
INST(arm_UHASX, "UHASX", "cccc01100111nnnndddd11110011mmmm") // v6
INST(arm_UHSAX, "UHSAX", "cccc01100111nnnndddd11110101mmmm") // v6
INST(arm_SHADD8, "SHADD8", "cccc01100011nnnndddd11111001mmmm") // v6
INST(arm_SHADD16, "SHADD16", "cccc01100011nnnndddd11110001mmmm") // v6
INST(arm_SHSUB8, "SHSUB8", "cccc01100011nnnndddd11111111mmmm") // v6