backend\A64\emit_a64_data_processing.cpp: Implement Shift and Rotate ops

SachinVin 2019-07-16 17:42:03 +05:30 committed by xperia64
parent 85fa3096dd
commit 361d221741
2 changed files with 532 additions and 6 deletions


@@ -191,4 +191,530 @@ void EmitA64::EmitConditionalSelect64(EmitContext& ctx, IR::Inst* inst) {
void EmitA64::EmitConditionalSelectNZCV(EmitContext& ctx, IR::Inst* inst) {
EmitConditionalSelect(code, ctx, inst, 32);
}
void EmitA64::EmitLogicalShiftLeft32(EmitContext& ctx, IR::Inst* inst) {
auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp);
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
auto& operand_arg = args[0];
auto& shift_arg = args[1];
auto& carry_arg = args[2];
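// ARM32 LSL-by-register semantics implemented below:
//   shift == 0      -> result and carry unchanged
//   0 < shift < 32  -> result = value << shift, carry-out = bit (32 - shift) of value
//   shift == 32     -> result = 0, carry-out = bit 0 of value
//   shift > 32      -> result = 0, carry-out = 0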
if (!carry_inst) {
if (shift_arg.IsImmediate()) {
Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg));
u8 shift = shift_arg.GetImmediateU8();
if (shift <= 31) {
code.LSL(result, result, shift);
} else {
code.MOV(result, WZR);
}
ctx.reg_alloc.DefineValue(inst, result);
} else {
//ctx.reg_alloc.Use(shift_arg, HostLoc::X0);
Arm64Gen::ARM64Reg shift = DecodeReg(ctx.reg_alloc.UseScratchGpr(shift_arg));
Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg));
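// A64 LSLV uses only the low five bits of the shift register, so shift amounts of 32..255 would wrap around;
// mask down to the byte provided by the IR and CSEL in zero whenever the shift is >= 32.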
code.ANDI2R(shift, shift, 0xFF);
code.LSLV(result, result, shift);
code.CMPI2R(shift, 32);
code.CSEL(result, WZR, result, CC_GE);
ctx.reg_alloc.DefineValue(inst, result);
}
} else {
if (shift_arg.IsImmediate()) {
u8 shift = shift_arg.GetImmediateU8();
Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg));
Arm64Gen::ARM64Reg carry = DecodeReg(ctx.reg_alloc.UseScratchGpr(carry_arg));
if (shift == 0) {
// There is nothing more to do.
} else if (shift < 32) {
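// Shift a copy left by (shift - 1) so the outgoing bit lands in bit 31, move it down to form the carry,
// then perform the full shift on the result.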
code.LSL(carry, result, shift - 1);
code.LSR(carry, carry, 31);
code.LSL(result, result, shift);
} else if (shift > 32) {
code.MOV(result, WZR);
code.MOV(carry, WZR);
} else {
code.ANDI2R(carry, result, 1);
code.MOV(result, WZR);
}
ctx.reg_alloc.DefineValue(carry_inst, carry);
ctx.EraseInstruction(carry_inst);
ctx.reg_alloc.DefineValue(inst, result);
} else {
//ctx.reg_alloc.Use(shift_arg, HostLoc::X0);
Arm64Gen::ARM64Reg shift = DecodeReg(ctx.reg_alloc.UseScratchGpr(shift_arg));
Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg));
Arm64Gen::ARM64Reg carry = DecodeReg(ctx.reg_alloc.UseScratchGpr(carry_arg));
// TODO: Optimize this.
// TODO: Use CSEL instead?
FixupBranch Rs_gt32, Rs_eq32;
std::vector<FixupBranch> end;
code.ANDI2R(shift, shift, 0xFF);
code.CMP(shift, WZR);
// if (Rs & 0xFF == 0) {
end.push_back(code.B(CC_EQ));
// }
code.CMPI2R(shift, 32);
Rs_gt32 = code.B(CC_GT);
Rs_eq32 = code.B(CC_EQ);
// } else if (Rs & 0xFF < 32) {
code.SUBI2R(shift, shift, 1); // Subtract 1 to get the bit that is shifted out into the MSB.
code.LSLV(result, result, shift);
code.UBFX(carry, result, 31, 1);
code.LSL(result, result, 1);
end.push_back(code.B());
// } else if (Rs & 0xFF > 32) {
code.SetJumpTarget(Rs_gt32);
code.MOV(result, WZR);
code.MOV(carry, WZR);
end.push_back(code.B());
// } else if (Rs & 0xFF == 32) {
code.SetJumpTarget(Rs_eq32);
code.ANDI2R(carry, result, 1);
code.MOV(result, WZR);
// }
for (FixupBranch e : end) {
code.SetJumpTarget(e);
}
ctx.reg_alloc.DefineValue(carry_inst, carry);
ctx.EraseInstruction(carry_inst);
ctx.reg_alloc.DefineValue(inst, result);
}
}
}
//void EmitA64::EmitLogicalShiftLeft64(EmitContext& ctx, IR::Inst* inst) {
// auto args = ctx.reg_alloc.GetArgumentInfo(inst);
// auto& operand_arg = args[0];
// auto& shift_arg = args[1];
//
// if (shift_arg.IsImmediate()) {
// Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(operand_arg);
// u8 shift = shift_arg.GetImmediateU8();
//
// if (shift < 64) {
// code.shl(result, shift);
// } else {
// code.xor_(result.cvt32(), result.cvt32());
// }
//
// ctx.reg_alloc.DefineValue(inst, result);
// } else {
// ctx.reg_alloc.Use(shift_arg, HostLoc::RCX);
// Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(operand_arg);
// Xbyak::Reg64 zero = ctx.reg_alloc.ScratchGpr();
//
// // The x64 SHL instruction masks the shift count by 0x1F before performing the shift.
// // ARM differs from the behaviour: It does not mask the count, so shifts above 31 result in zeros.
//
// code.shl(result, code.cl);
// code.xor_(zero.cvt32(), zero.cvt32());
// code.cmp(code.cl, 64);
// code.cmovnb(result, zero);
//
// ctx.reg_alloc.DefineValue(inst, result);
// }
//}
void EmitA64::EmitLogicalShiftRight32(EmitContext& ctx, IR::Inst* inst) {
auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp);
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
auto& operand_arg = args[0];
auto& shift_arg = args[1];
auto& carry_arg = args[2];
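// ARM32 LSR-by-register semantics implemented below:
//   shift == 0      -> result and carry unchanged
//   0 < shift < 32  -> result = value >> shift, carry-out = bit (shift - 1) of value
//   shift == 32     -> result = 0, carry-out = bit 31 of value
//   shift > 32      -> result = 0, carry-out = 0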
if (!carry_inst) {
if (shift_arg.IsImmediate()) {
Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg));
u8 shift = shift_arg.GetImmediateU8();
if (shift <= 31) {
code.LSR(result, result, shift);
} else {
code.MOVI2R(result, 0);
}
ctx.reg_alloc.DefineValue(inst, result);
} else {
//ctx.reg_alloc.Use(shift_arg, HostLoc::X0);
Arm64Gen::ARM64Reg shift = DecodeReg(ctx.reg_alloc.UseScratchGpr(shift_arg));
Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg));
// The A64 LSRV instruction masks the shift count by 0x1F before performing the shift.
// ARM32 differs: it does not mask the count, so shifts of 32 or more result in zero.
code.ANDI2R(shift, shift, 0xFF);
code.LSRV(result, result, shift);
code.CMPI2R(shift, 31);
code.CSEL(result, WZR, result, CC_GT);
ctx.reg_alloc.DefineValue(inst, result);
}
} else {
if (shift_arg.IsImmediate()) {
u8 shift = shift_arg.GetImmediateU8();
Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg));
Arm64Gen::ARM64Reg carry = DecodeReg(ctx.reg_alloc.UseScratchGpr(carry_arg));
if (shift == 0) {
// There is nothing more to do.
} else if (shift < 32) {
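// Bit (shift - 1) of the operand is the last bit shifted out; extract it as the carry before completing the shift.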
code.LSR(carry, result, shift - 1);
code.ANDI2R(carry, carry, 1);
code.LSR(result, result, shift);
} else if (shift == 32) {
code.LSR(carry, result, 31);
code.ANDI2R(carry, carry, 1);
code.MOV(result, WZR);
} else {
code.MOV(result, WZR);
code.MOV(carry, WZR);
}
ctx.reg_alloc.DefineValue(carry_inst, carry);
ctx.EraseInstruction(carry_inst);
ctx.reg_alloc.DefineValue(inst, result);
} else {
//ctx.reg_alloc.Use(shift_arg, HostLoc::X0);
Arm64Gen::ARM64Reg shift = DecodeReg(ctx.reg_alloc.UseScratchGpr(shift_arg));
Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg));
Arm64Gen::ARM64Reg carry = DecodeReg(ctx.reg_alloc.UseScratchGpr(carry_arg));
// TODO: Optimize this.
// TODO: Use CSEL instead?
FixupBranch Rs_gt32, Rs_eq32;
std::vector<FixupBranch> end;
code.ANDI2R(shift, shift, 0xFF);
code.CMPI2R(shift, 32);
Rs_gt32 = code.B(CC_GT);
Rs_eq32 = code.B(CC_EQ);
// if (Rs & 0xFF == 0) goto end;
code.CMP(shift, WZR);
end.push_back(code.B(CC_EQ));
// if (Rs & 0xFF <= 32) {
code.SUBI2R(shift, shift, 1); // Subtract 1 to get the bit that is shifted out to the carry.
code.LSRV(result, result, shift);
code.ANDI2R(carry, result, 1);
code.LSR(result, result, 1);
end.push_back(code.B());
// else if (Rs & 0xFF == 32) {
code.SetJumpTarget(Rs_eq32);
code.LSR(carry, result, 31);
code.ANDI2R(carry, carry, 1);
code.MOV(result, WZR);
end.push_back(code.B());
// } else if (Rs & 0xFF > 32) {
code.SetJumpTarget(Rs_gt32);
code.MOV(result, WZR);
code.MOV(carry, WZR);
// }
for (FixupBranch e : end) {
code.SetJumpTarget(e);
}
ctx.reg_alloc.DefineValue(carry_inst, carry);
ctx.EraseInstruction(carry_inst);
ctx.reg_alloc.DefineValue(inst, result);
}
}
}
void EmitA64::EmitLogicalShiftRight64(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
auto& operand_arg = args[0];
auto& shift_arg = args[1];
if (shift_arg.IsImmediate()) {
ARM64Reg result = ctx.reg_alloc.UseScratchGpr(operand_arg);
u8 shift = shift_arg.GetImmediateU8();
if (shift < 64) {
code.LSR(result, result, shift);
} else {
code.MOV(result, ZR);
}
ctx.reg_alloc.DefineValue(inst, result);
} else {
ARM64Reg shift = ctx.reg_alloc.UseScratchGpr(shift_arg);
ARM64Reg result = ctx.reg_alloc.UseScratchGpr(operand_arg);
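// A64 LSRV on a 64-bit register masks the shift amount by 0x3F, so shifts of 64 or more would wrap around;
// select zero for any shift above 63 instead.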
code.ANDI2R(shift, shift, 0xFF);
code.LSRV(result, result, shift);
code.CMPI2R(shift, 63);
code.CSEL(result, ZR, result, CC_GT);
ctx.reg_alloc.DefineValue(inst, result);
}
}
void EmitA64::EmitArithmeticShiftRight32(EmitContext& ctx, IR::Inst* inst) {
auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp);
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
auto& operand_arg = args[0];
auto& shift_arg = args[1];
auto& carry_arg = args[2];
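// ARM32 ASR-by-register semantics implemented below:
//   shift == 0      -> result and carry unchanged
//   0 < shift < 32  -> result = value ASR shift, carry-out = bit (shift - 1) of value
//   shift >= 32     -> result = value ASR 31 (all sign bits), carry-out = bit 31 of value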
if (!carry_inst) {
if (shift_arg.IsImmediate()) {
u8 shift = shift_arg.GetImmediateU8();
Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg));
code.ASR(result, result, u8(shift < 31 ? shift : 31));
ctx.reg_alloc.DefineValue(inst, result);
} else {
//ctx.reg_alloc.UseScratch(shift_arg, HostLoc::X0);
Arm64Gen::ARM64Reg shift = DecodeReg(ctx.reg_alloc.UseScratchGpr(shift_arg));
Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg));
Arm64Gen::ARM64Reg const31 = DecodeReg(ctx.reg_alloc.ScratchGpr());
// The A64 ASRV instruction masks the shift count by 0x1F before performing the shift.
// ARM32 differs: it does not mask the count.
// Since every shift amount above 31 behaves the same as a shift of 31, saturate `shift` to 31.
code.ANDI2R(shift, shift, 0xFF);
code.MOVI2R(const31, 31);
code.CMPI2R(shift, u32(31));
code.CSEL(shift, shift, const31, CC_LE);
code.ASRV(result, result, shift);
ctx.reg_alloc.DefineValue(inst, result);
}
} else {
if (shift_arg.IsImmediate()) {
u8 shift = shift_arg.GetImmediateU8();
Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg));
Arm64Gen::ARM64Reg carry = DecodeReg(ctx.reg_alloc.UseScratchGpr(carry_arg));
if (shift == 0) {
// There is nothing more to do.
} else if (shift <= 31) {
code.ASR(result, result, shift - 1);
code.ANDI2R(carry, result, 1);
code.ASR(result, result, 1);
} else {
code.ASR(result, result, 31);
code.ANDI2R(carry, result, 1);
}
ctx.reg_alloc.DefineValue(carry_inst, carry);
ctx.EraseInstruction(carry_inst);
ctx.reg_alloc.DefineValue(inst, result);
} else {
//ctx.reg_alloc.Use(shift_arg, HostLoc::X0);
Arm64Gen::ARM64Reg shift = DecodeReg(ctx.reg_alloc.UseScratchGpr(shift_arg));
Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg));
Arm64Gen::ARM64Reg carry = DecodeReg(ctx.reg_alloc.UseScratchGpr(carry_arg));
// TODO: Optimize this.
std::vector<FixupBranch> end;
FixupBranch Rs_gt31;
code.ANDI2R(shift, shift, 0xFF);
code.CMPI2R(shift, u32(31));
Rs_gt31 = code.B(CC_GT);
// if (Rs & 0xFF == 0) goto end;
code.CMP(shift, WZR);
end.push_back(code.B(CC_EQ));
// if (Rs & 0xFF <= 31) {
code.SUBI2R(shift, shift, 1);
code.ASRV(result, result, shift);
code.ANDI2R(carry, result, 1);
code.ASR(result, result, 1);
end.push_back(code.B());
// } else if (Rs & 0xFF > 31) {
code.SetJumpTarget(Rs_gt31);
code.ASR(result, result, 31); // 31 produces the same results as anything above 31
code.ANDI2R(carry, result, 1);
// }
for (FixupBranch e : end) {
code.SetJumpTarget(e);
}
ctx.reg_alloc.DefineValue(carry_inst, carry);
ctx.EraseInstruction(carry_inst);
ctx.reg_alloc.DefineValue(inst, result);
}
}
}
//void EmitA64::EmitArithmeticShiftRight64(EmitContext& ctx, IR::Inst* inst) {
// auto args = ctx.reg_alloc.GetArgumentInfo(inst);
// auto& operand_arg = args[0];
// auto& shift_arg = args[1];
//
// if (shift_arg.IsImmediate()) {
// u8 shift = shift_arg.GetImmediateU8();
// Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(operand_arg);
//
// code.sar(result, u8(shift < 63 ? shift : 63));
//
// ctx.reg_alloc.DefineValue(inst, result);
// } else {
// ctx.reg_alloc.UseScratch(shift_arg, HostLoc::RCX);
// Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(operand_arg);
// Xbyak::Reg64 const63 = ctx.reg_alloc.ScratchGpr();
//
// // The 64-bit x64 SAR instruction masks the shift count by 0x3F before performing the shift.
// // ARM differs from the behaviour: It does not mask the count.
//
// // We note that all shift values above 63 have the same behaviour as 63 does, so we saturate `shift` to 63.
// code.mov(const63, 63);
// code.movzx(code.ecx, code.cl);
// code.cmp(code.ecx, u32(63));
// code.cmovg(code.ecx, const63);
// code.sar(result, code.cl);
//
// ctx.reg_alloc.DefineValue(inst, result);
// }
//}
void EmitA64::EmitRotateRight32(EmitContext& ctx, IR::Inst* inst) {
auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp);
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
auto& operand_arg = args[0];
auto& shift_arg = args[1];
auto& carry_arg = args[2];
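// ARM32 ROR-by-register semantics implemented below:
//   (shift & 0xFF) == 0 -> result and carry unchanged
//   (shift & 0x1F) == 0 -> result unchanged, carry-out = bit 31 of value
//   otherwise           -> result = value ROR (shift & 0x1F), carry-out = bit 31 of result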
if (!carry_inst) {
if (shift_arg.IsImmediate()) {
u8 shift = shift_arg.GetImmediateU8();
Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg));
code.ROR(result, result, u8(shift & 0x1F));
ctx.reg_alloc.DefineValue(inst, result);
} else {
ctx.reg_alloc.Use(shift_arg, HostLoc::X0);
Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg));
// The A64 RORV instruction applies (shift & 0x1F) for us.
code.RORV(result, result, W0);
ctx.reg_alloc.DefineValue(inst, result);
}
} else {
if (shift_arg.IsImmediate()) {
u8 shift = shift_arg.GetImmediateU8();
Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg));
Arm64Gen::ARM64Reg carry = DecodeReg(ctx.reg_alloc.UseScratchGpr(carry_arg));
if (shift == 0) {
// There is nothing more to do.
} else if ((shift & 0x1F) == 0) {
code.MOV(carry, result, ArithOption{result, ST_LSR, 31});
} else {
code.ROR(result, result, (shift & 0x1F) - 1);
code.ANDI2R(carry, result, 1);
code.ROR(result, result, 1);
}
ctx.reg_alloc.DefineValue(carry_inst, carry);
ctx.EraseInstruction(carry_inst);
ctx.reg_alloc.DefineValue(inst, result);
} else {
//ctx.reg_alloc.UseScratch(shift_arg, HostLoc::X0)
Arm64Gen::ARM64Reg shift = DecodeReg(ctx.reg_alloc.UseScratchGpr(shift_arg));
Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg));
Arm64Gen::ARM64Reg carry = DecodeReg(ctx.reg_alloc.UseScratchGpr(carry_arg));
// TODO: Optimize
std::vector<FixupBranch> end;
FixupBranch zero_1F;
code.ANDSI2R(shift, shift, u32(0xFF));
// if (Rs & 0xFF == 0) goto end;
code.CMP(shift, WZR);
end.push_back(code.B(CC_EQ));
code.ANDSI2R(shift, shift, u32(0x1F));
zero_1F = code.B(CC_EQ);
// if (Rs & 0x1F != 0) {
code.SUBI2R(shift, shift, 1);
code.RORV(result, result, shift);
code.ANDI2R(carry, result, 1);
code.ROR(result, result, 1);
end.push_back(code.B());
// } else {
code.SetJumpTarget(zero_1F);
code.MOV(carry, result, ArithOption{result, ST_LSR, 31});
// }
for (FixupBranch e : end) {
code.SetJumpTarget(e);
}
ctx.reg_alloc.DefineValue(carry_inst, carry);
ctx.EraseInstruction(carry_inst);
ctx.reg_alloc.DefineValue(inst, result);
}
}
}
//void EmitA64::EmitRotateRight64(EmitContext& ctx, IR::Inst* inst) {
// auto args = ctx.reg_alloc.GetArgumentInfo(inst);
// auto& operand_arg = args[0];
// auto& shift_arg = args[1];
//
// if (shift_arg.IsImmediate()) {
// u8 shift = shift_arg.GetImmediateU8();
// Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(operand_arg);
//
// code.ror(result, u8(shift & 0x3F));
//
// ctx.reg_alloc.DefineValue(inst, result);
// } else {
// ctx.reg_alloc.Use(shift_arg, HostLoc::RCX);
// Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(operand_arg);
//
// // x64 ROR instruction does (shift & 0x3F) for us.
// code.ror(result, code.cl);
//
// ctx.reg_alloc.DefineValue(inst, result);
// }
//}
void EmitA64::EmitRotateRightExtended(EmitContext& ctx, IR::Inst* inst) {
auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp);
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[0]));
Arm64Gen::ARM64Reg carry = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[1]));
Arm64Gen::ARM64Reg temp = DecodeReg(ctx.reg_alloc.ScratchGpr());
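// RRX semantics: result = (carry_in << 31) | (value >> 1), carry-out = bit 0 of the original value.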
if (carry_inst) {
code.MOV(temp, result);
}
// Insert the carry flag into bit 0, then rotate right by one so it becomes the new bit 31.
code.BFI(result, carry, 0, 1);
code.ROR(result, result, 1);
if (carry_inst) {
code.ANDI2R(carry, temp, 1);
ctx.reg_alloc.DefineValue(carry_inst, carry);
ctx.EraseInstruction(carry_inst);
}
ctx.reg_alloc.DefineValue(inst, result);
}
} // namespace Dynarmic::BackendA64


@@ -97,24 +97,24 @@ OPCODE(MostSignificantBit, U1, U32
OPCODE(IsZero32, U1, U32 )
OPCODE(IsZero64, U1, U64 )
//OPCODE(TestBit, U1, U64, U8 )
//OPCODE(LogicalShiftLeft32, U32, U32, U8, U1 )
OPCODE(ConditionalSelect32, U32, Cond, U32, U32 )
OPCODE(ConditionalSelect64, U64, Cond, U64, U64 )
OPCODE(ConditionalSelectNZCV, NZCV, Cond, NZCV, NZCV )
OPCODE(LogicalShiftLeft32, U32, U32, U8, U1 )
//OPCODE(LogicalShiftLeft64, U64, U64, U8 )
//OPCODE(LogicalShiftRight32, U32, U32, U8, U1 )
//OPCODE(LogicalShiftRight64, U64, U64, U8 )
//OPCODE(ArithmeticShiftRight32, U32, U32, U8, U1 )
OPCODE(LogicalShiftRight32, U32, U32, U8, U1 )
OPCODE(LogicalShiftRight64, U64, U64, U8 )
OPCODE(ArithmeticShiftRight32, U32, U32, U8, U1 )
//OPCODE(ArithmeticShiftRight64, U64, U64, U8 )
//OPCODE(RotateRight32, U32, U32, U8, U1 )
OPCODE(RotateRight32, U32, U32, U8, U1 )
//OPCODE(RotateRight64, U64, U64, U8 )
//OPCODE(RotateRightExtended, U32, U32, U1 )
//OPCODE(Add32, U32, U32, U32, U1 )
//OPCODE(Add64, U64, U64, U64, U1 )
//OPCODE(Sub32, U32, U32, U32, U1 )
//OPCODE(Sub64, U64, U64, U64, U1 )
//OPCODE(Mul32, U32, U32, U32 )
//OPCODE(Mul64, U64, U64, U64 )
OPCODE(RotateRightExtended, U32, U32, U1 )
//OPCODE(SignedMultiplyHigh64, U64, U64, U64 )
//OPCODE(UnsignedMultiplyHigh64, U64, U64, U64 )
//OPCODE(UnsignedDiv32, U32, U32, U32 )