From 8c66a1609ee55bcc411ba3157017a363fde89028 Mon Sep 17 00:00:00 2001
From: SachinVin
Date: Tue, 8 Oct 2019 12:52:45 +0530
Subject: [PATCH] backend/A64: Use CSEL instead of branches for LSL, LSR and
 ASR + minor cleanup

---
 src/backend/A64/emit_a64_data_processing.cpp | 122 ++++++-------------
 1 file changed, 36 insertions(+), 86 deletions(-)

diff --git a/src/backend/A64/emit_a64_data_processing.cpp b/src/backend/A64/emit_a64_data_processing.cpp
index 81aaa5da..37ebe292 100644
--- a/src/backend/A64/emit_a64_data_processing.cpp
+++ b/src/backend/A64/emit_a64_data_processing.cpp
@@ -19,7 +19,6 @@ void EmitA64::EmitPack2x32To1x64(EmitContext& ctx, IR::Inst* inst) {
     ARM64Reg lo = ctx.reg_alloc.UseScratchGpr(args[0]);
     ARM64Reg hi = ctx.reg_alloc.UseScratchGpr(args[1]);
 
-    // code.MOV(lo, DecodeReg(lo)); // Zero extend to 64-bits
     code.ORR(lo, lo, hi, ArithOption{hi, ST_LSL, 32});
 
     ctx.reg_alloc.DefineValue(inst, lo);
@@ -98,7 +97,7 @@ void EmitA64::EmitIsZero64(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
     ARM64Reg result = ctx.reg_alloc.UseScratchGpr(args[0]);
     // TODO: Flag optimization
-    code.CMP(result, WZR);
+    code.CMP(result, ZR);
     code.CSET(result, CC_EQ);
     ctx.reg_alloc.DefineValue(inst, result);
 }
@@ -231,8 +230,7 @@ void EmitA64::EmitLogicalShiftLeft32(EmitContext& ctx, IR::Inst* inst) {
             if (shift == 0) {
                 // There is nothing more to do.
             } else if (shift < 32) {
-                code.LSL(carry, result, shift - 1);
-                code.LSR(carry, carry, 31);
+                code.UBFX(carry, result, 32 - shift, 1);
                 code.LSL(result, result, shift);
             } else if (shift > 32) {
                 code.MOV(result, WZR);
@@ -246,44 +244,26 @@ void EmitA64::EmitLogicalShiftLeft32(EmitContext& ctx, IR::Inst* inst) {
             ctx.EraseInstruction(carry_inst);
             ctx.reg_alloc.DefineValue(inst, result);
         } else {
-            //ctx.reg_alloc.Use(shift_arg, HostLoc::X0);
             Arm64Gen::ARM64Reg shift = DecodeReg(ctx.reg_alloc.UseScratchGpr(shift_arg));
             Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg));
             Arm64Gen::ARM64Reg carry = DecodeReg(ctx.reg_alloc.UseScratchGpr(carry_arg));
 
-            // TODO: Optimize this.
-            // TODO: Use CSEL instead?
-            FixupBranch Rs_gt32, Rs_eq32;
-            std::vector<FixupBranch> end;
+            FixupBranch end;
+
+            code.ANDSI2R(shift, shift, 0xFF);
+            // if (Rs & 0xFF == 0) goto end;
+            end = code.B(CC_EQ);
 
-            code.ANDI2R(shift, shift, 0xFF);
-            code.CMP(shift, WZR);
-            // if (Rs & 0xFF == 0) {
-            end.push_back(code.B(CC_EQ));
-            // }
             code.CMPI2R(shift, 32);
-            Rs_gt32 = code.B(CC_GT);
-            Rs_eq32 = code.B(CC_EQ);
-            // } else if (Rs & 0xFF < 32) {
             code.SUBI2R(shift, shift, 1); // Subtract 1 so that the bit shifted out ends up in the MSB.
             code.LSLV(result, result, shift);
             code.UBFX(carry, result, 31, 1);
             code.LSL(result, result, 1);
-            end.push_back(code.B());
-            // } else if (Rs & 0xFF > 32) {
-            code.SetJumpTarget(Rs_gt32);
-            code.MOV(result, WZR);
-            code.MOV(carry, WZR);
-            end.push_back(code.B());
-            // } else if (Rs & 0xFF == 32) {
-            code.SetJumpTarget(Rs_eq32);
-            code.ANDI2R(carry, result, 1);
-            code.MOV(result, WZR);
-            // }
-            for (FixupBranch e : end) {
-                code.SetJumpTarget(e);
-            }
+            code.CSEL(result, result, WZR, CC_LT);
+            code.CSEL(carry, carry, WZR, CC_LE);
+
+            code.SetJumpTarget(end);
 
             ctx.reg_alloc.DefineValue(carry_inst, carry);
             ctx.EraseInstruction(carry_inst);
@@ -332,17 +312,16 @@ void EmitA64::EmitLogicalShiftRight32(EmitContext& ctx, IR::Inst* inst) {
             u8 shift = shift_arg.GetImmediateU8();
 
             if (shift <= 31) {
-                code.LSR(result,result, shift);
+                code.LSR(result, result, shift);
             } else {
                 code.MOVI2R(result, 0);
             }
 
             ctx.reg_alloc.DefineValue(inst, result);
         } else {
-            //ctx.reg_alloc.Use(shift_arg, HostLoc::X0);
             Arm64Gen::ARM64Reg shift = DecodeReg(ctx.reg_alloc.UseScratchGpr(shift_arg));
             Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg));
 
-            // The 32-bit x64 SHR instruction masks the shift count by 0x1F before performing the shift.
+            // The 32-bit A64 LSR instruction masks the shift count by 0x1F before performing the shift.
             // ARM differs from the behaviour: It does not mask the count, so shifts above 31 result in zeros.
             code.ANDI2R(shift, shift, 0xFF);
@@ -365,8 +344,7 @@ void EmitA64::EmitLogicalShiftRight32(EmitContext& ctx, IR::Inst* inst) {
                 code.ANDI2R(carry, carry, 1);
                 code.LSR(result,result, shift);
             } else if (shift == 32) {
-                code.LSR(carry, result, 31);
-                code.ANDI2R(carry, carry, 1);
+                code.UBFX(carry, result, 31, 1);
                 code.MOV(result, WZR);
             } else {
                 code.MOV(result, WZR);
@@ -377,44 +355,27 @@ void EmitA64::EmitLogicalShiftRight32(EmitContext& ctx, IR::Inst* inst) {
             ctx.EraseInstruction(carry_inst);
             ctx.reg_alloc.DefineValue(inst, result);
         } else {
-            //ctx.reg_alloc.Use(shift_arg, HostLoc::X0);
             Arm64Gen::ARM64Reg shift = DecodeReg(ctx.reg_alloc.UseScratchGpr(shift_arg));
             Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg));
             Arm64Gen::ARM64Reg carry = DecodeReg(ctx.reg_alloc.UseScratchGpr(carry_arg));
 
             // TODO: Optimize this.
-            // TODO: Use CSEL instead?
-            FixupBranch Rs_gt32 , Rs_eq32;
-            std::vector<FixupBranch> end;
+            FixupBranch end;
 
-            code.ANDI2R(shift, shift, 0xFF);
-            code.CMPI2R(shift, 32);
-            Rs_gt32 = code.B(CC_GT);
-            Rs_eq32 = code.B(CC_EQ);
+            code.ANDSI2R(shift, shift, 0xFF);
             // if (Rs & 0xFF == 0) goto end;
-            code.CMP(shift, WZR);
-            end.push_back(code.B(CC_EQ));
-            // if (Rs & 0xFF <= 31) {
+            end = code.B(CC_EQ);
+
+            code.CMPI2R(shift, 32);
             code.SUBI2R(shift, shift, 1); // Subtract 1 to get the bit that is shifted out to the carry.
             code.LSRV(result, result, shift);
             code.ANDI2R(carry, result, 1);
             code.LSR(result, result, 1);
-            end.push_back(code.B());
-            // else if (Rs & 0xFF == 32) {
-            code.SetJumpTarget(Rs_eq32);
-            code.LSR(carry, result, 31);
-            code.ANDI2R(carry, carry, 1);
-            code.MOV(result, WZR);
-            end.push_back(code.B());
-            // } else if (Rs & 0xFF > 32) {
-            code.SetJumpTarget(Rs_gt32);
-            code.MOV(result, WZR);
-            code.MOV(carry, WZR);
-            // }
-            for(FixupBranch e : end){
-                code.SetJumpTarget(e);
-            }
+            code.CSEL(result, result, WZR, CC_LT);
+            code.CSEL(carry, carry, WZR, CC_LE);
+
+            code.SetJumpTarget(end);
 
             ctx.reg_alloc.DefineValue(carry_inst, carry);
             ctx.EraseInstruction(carry_inst);
@@ -474,7 +435,7 @@ void EmitA64::EmitArithmeticShiftRight32(EmitContext& ctx, IR::Inst* inst) {
             Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg));
             Arm64Gen::ARM64Reg const31 = DecodeReg(ctx.reg_alloc.ScratchGpr());
 
-            // The 32-bit arm64 SAR instruction masks the shift count by 0x1F before performing the shift.
+            // The 32-bit arm64 ASR instruction masks the shift count by 0x1F before performing the shift.
             // ARM differs from the behaviour: It does not mask the count.
             // We note that all shift values above 31 have the same behaviour as 31 does, so we saturate `shift` to 31.
@@ -507,37 +468,28 @@ void EmitA64::EmitArithmeticShiftRight32(EmitContext& ctx, IR::Inst* inst) {
             ctx.EraseInstruction(carry_inst);
             ctx.reg_alloc.DefineValue(inst, result);
         } else {
-            //ctx.reg_alloc.Use(shift_arg, HostLoc::X0);
             Arm64Gen::ARM64Reg shift = DecodeReg(ctx.reg_alloc.UseScratchGpr(shift_arg));
             Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg));
             Arm64Gen::ARM64Reg carry = DecodeReg(ctx.reg_alloc.UseScratchGpr(carry_arg));
 
             // TODO: Optimize this.
-            std::vector<FixupBranch> end;
-            FixupBranch Rs_gt31;
+            FixupBranch end;
 
-            code.ANDI2R(shift, shift, 0xFF);
-            code.CMPI2R(shift, u32(31));
-            Rs_gt31 = code.B(CC_GT);
+            code.ANDSI2R(shift, shift, 0xFF);
             // if (Rs & 0xFF == 0) goto end;
-            code.CMP(shift, WZR);
-            end.push_back(code.B(CC_EQ));
-            // if (Rs & 0xFF <= 31) {
+            end = code.B(CC_EQ);
+            // else {
+            code.MOVI2R(carry, 32);
+            code.CMPI2R(shift, u32(32));
+            code.CSEL(shift, shift, carry, CC_LE); // Saturate the shift to 32; all larger shifts give the same result and carry (the sign bit).
             code.SUBI2R(shift, shift, 1);
             code.ASRV(result, result, shift);
             code.ANDI2R(carry, result, 1);
             code.ASR(result, result, 1);
-            end.push_back(code.B());
-            // } else if (Rs & 0xFF > 31) {
-            code.SetJumpTarget(Rs_gt31);
-            code.ASR(result, result, 31); // 31 produces the same results as anything above 31
-            code.ANDI2R(carry, result, 1);
             // }
-
-            for (FixupBranch e : end) {
-                code.SetJumpTarget(e);
-            }
+
+            code.SetJumpTarget(end);
 
             ctx.reg_alloc.DefineValue(carry_inst, carry);
             ctx.EraseInstruction(carry_inst);
@@ -594,11 +546,11 @@ void EmitA64::EmitRotateRight32(EmitContext& ctx, IR::Inst* inst) {
 
             ctx.reg_alloc.DefineValue(inst, result);
         } else {
-            ctx.reg_alloc.Use(shift_arg, HostLoc::X0);
+            Arm64Gen::ARM64Reg shift = DecodeReg(ctx.reg_alloc.UseGpr(shift_arg));
             Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg));
 
             // aarch64 ROR instruction does (shift & 0x1F) for us.
-            code.RORV(result, result, W0);
+            code.RORV(result, result, shift);
 
             ctx.reg_alloc.DefineValue(inst, result);
         }
@@ -622,7 +574,6 @@ void EmitA64::EmitRotateRight32(EmitContext& ctx, IR::Inst* inst) {
             ctx.EraseInstruction(carry_inst);
             ctx.reg_alloc.DefineValue(inst, result);
         } else {
-            //ctx.reg_alloc.UseScratch(shift_arg, HostLoc::X0)
             Arm64Gen::ARM64Reg shift = DecodeReg(ctx.reg_alloc.UseScratchGpr(shift_arg));
             Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg));
             Arm64Gen::ARM64Reg carry = DecodeReg(ctx.reg_alloc.UseScratchGpr(carry_arg));
@@ -634,7 +585,6 @@ void EmitA64::EmitRotateRight32(EmitContext& ctx, IR::Inst* inst) {
             code.ANDSI2R(shift, shift, u32(0xFF));
             // if (Rs & 0xFF == 0) goto end;
-            code.CMP(shift, WZR);
             end.push_back(code.B(CC_EQ));
 
             code.ANDSI2R(shift, shift, u32(0x1F));
             zero_1F = code.B(CC_EQ);
@@ -695,7 +645,7 @@ void EmitA64::EmitRotateRightExtended(EmitContext& ctx, IR::Inst* inst) {
         code.MOV(temp, result);
     }
 
-    // Set carry to the LSB and ROR
+    // Insert the carry into the LSB, then ROR by one so it becomes the MSB.
     code.BFI(result, carry, 0, 1);
     code.ROR(result, result, 1);
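
For reference, the branchless sequence the patch now emits for the LSL-by-register case is equivalent to the following value-level sketch (illustrative only, not part of the patch; the helper name and the plain uint32_t references are invented for this example). The key point is that ANDSI2R provides the flags for the early exit and CMPI2R #32 provides the flags consumed by the two CSELs, while the SUBI2R/LSLV/UBFX/LSL in between do not touch the flags:

#include <cstdint>

// Value-level sketch of the emitted LSL-by-register sequence.
// `carry` holds the carry-in on entry and the carry-out on return.
static void LogicalShiftLeft32Sketch(std::uint32_t& result, std::uint32_t& carry,
                                     std::uint32_t shift_reg) {
    const std::uint32_t shift = shift_reg & 0xFF;                  // ANDSI2R
    if (shift == 0) {                                              // B(CC_EQ): nothing is shifted out
        return;
    }
    const std::uint32_t partial = result << ((shift - 1) & 0x1F);  // LSLV masks the count by 0x1F
    const std::uint32_t out_bit = (partial >> 31) & 1;             // UBFX: the bit about to fall out
    const std::uint32_t full = partial << 1;                       // finish the shift
    result = (shift < 32) ? full : 0;                              // CSEL result, result, WZR, CC_LT
    carry = (shift <= 32) ? out_bit : 0;                           // CSEL carry, carry, WZR, CC_LE
}

The same compare-once/CSEL-twice shape is reused for LSR, while the ASR case saturates the shift amount before a single shift, so no trailing CSELs are needed there.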