From 3d4caa5ee1234f38ecc1151a86492513cf0be58f Mon Sep 17 00:00:00 2001 From: SachinVin Date: Wed, 31 Jul 2019 22:43:42 +0530 Subject: [PATCH] backend /A64: cleanup --- src/backend/A64/a32_emit_a64.cpp | 69 +------ src/backend/A64/emit_a64_floating_point.cpp | 211 +------------------- src/backend/A64/oparg.h | 76 ------- 3 files changed, 10 insertions(+), 346 deletions(-) delete mode 100644 src/backend/A64/oparg.h diff --git a/src/backend/A64/a32_emit_a64.cpp b/src/backend/A64/a32_emit_a64.cpp index 577a2947..360f7d72 100644 --- a/src/backend/A64/a32_emit_a64.cpp +++ b/src/backend/A64/a32_emit_a64.cpp @@ -393,31 +393,11 @@ static u32 GetCpsrImpl(A32JitState* jit_state) { } void A32EmitA64::EmitA32GetCpsr(A32EmitContext& ctx, IR::Inst* inst) { - //if (code.DoesCpuSupport(Xbyak::util::Cpu::tBMI2)) { - // Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32(); - // Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32(); + // TODO:Inline + ctx.reg_alloc.HostCall(inst); + code.MOV(code.ABI_PARAM1, X28); + code.QuickCallFunction(&GetCpsrImpl); - // // Here we observe that CPSR_et and CPSR_ge are right next to each other in memory, - // // so we load them both at the same time with one 64-bit read. This allows us to - // // extract all of their bits together at once with one pext. - // static_assert(offsetof(A32JitState, CPSR_et) + 4 == offsetof(A32JitState, CPSR_ge)); - // code.mov(result.cvt64(), qword[r15 + offsetof(A32JitState, CPSR_et)]); - // code.mov(tmp.cvt64(), 0x80808080'00000003ull); - // code.pext(result.cvt64(), result.cvt64(), tmp.cvt64()); - // code.mov(tmp, 0x000f0220); - // code.pdep(result, result, tmp); - // code.mov(tmp, dword[r15 + offsetof(A32JitState, CPSR_q)]); - // code.shl(tmp, 27); - // code.or_(result, tmp); - // code.or_(result, dword[r15 + offsetof(A32JitState, CPSR_nzcv)]); - // code.or_(result, dword[r15 + offsetof(A32JitState, CPSR_jaifm)]); - - // ctx.reg_alloc.DefineValue(inst, result); - //} else { - ctx.reg_alloc.HostCall(inst); - code.MOV(code.ABI_PARAM1, X28); - code.QuickCallFunction(&GetCpsrImpl); - //} } static void SetCpsrImpl(u32 value, A32JitState* jit_state) { @@ -426,43 +406,12 @@ static void SetCpsrImpl(u32 value, A32JitState* jit_state) { void A32EmitA64::EmitA32SetCpsr(A32EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + // TODO:Inline + ctx.reg_alloc.HostCall(nullptr, args[0]); + code.MOV(code.ABI_PARAM2, X28); + code.QuickCallFunction(&SetCpsrImpl); - //if (code.DoesCpuSupport(Xbyak::util::Cpu::tBMI2)) { - // Xbyak::Reg32 cpsr = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32(); - // Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32(); - // Xbyak::Reg32 tmp2 = ctx.reg_alloc.ScratchGpr().cvt32(); - - // // CPSR_q - // code.bt(cpsr, 27); - // code.setc(code.byte[r15 + offsetof(A32JitState, CPSR_q)]); - - // // CPSR_nzcv - // code.mov(tmp, cpsr); - // code.and_(tmp, 0xF0000000); - // code.mov(dword[r15 + offsetof(A32JitState, CPSR_nzcv)], tmp); - - // // CPSR_jaifm - // code.mov(tmp, cpsr); - // code.and_(tmp, 0x07F0FDDF); - // code.mov(dword[r15 + offsetof(A32JitState, CPSR_jaifm)], tmp); - - // // CPSR_et and CPSR_ge - // static_assert(offsetof(A32JitState, CPSR_et) + 4 == offsetof(A32JitState, CPSR_ge)); - // code.mov(tmp, 0x000f0220); - // code.pext(cpsr, cpsr, tmp); - // code.mov(tmp.cvt64(), 0x01010101'00000003ull); - // code.pdep(cpsr.cvt64(), cpsr.cvt64(), tmp.cvt64()); - // // We perform SWAR partitioned subtraction here, to negate the GE bytes. 
- // code.mov(tmp.cvt64(), 0x80808080'00000003ull); - // code.mov(tmp2.cvt64(), tmp.cvt64()); - // code.sub(tmp.cvt64(), cpsr.cvt64()); - // code.xor_(tmp.cvt64(), tmp2.cvt64()); - // code.mov(qword[r15 + offsetof(A32JitState, CPSR_et)], tmp.cvt64()); - //} else { - ctx.reg_alloc.HostCall(nullptr, args[0]); - code.MOV(code.ABI_PARAM2, X28); - code.QuickCallFunction(&SetCpsrImpl); - //} } void A32EmitA64::EmitA32SetCpsrNZCV(A32EmitContext& ctx, IR::Inst* inst) { diff --git a/src/backend/A64/emit_a64_floating_point.cpp b/src/backend/A64/emit_a64_floating_point.cpp index 8c21657f..da957cef 100644 --- a/src/backend/A64/emit_a64_floating_point.cpp +++ b/src/backend/A64/emit_a64_floating_point.cpp @@ -36,42 +36,6 @@ namespace mp = Dynarmic::Common::mp; namespace { -const ARM64Reg INVALID_REG = ARM64Reg(-1); - -constexpr u64 f16_negative_zero = 0x8000; -constexpr u64 f16_non_sign_mask = 0x7fff; - -constexpr u64 f32_negative_zero = 0x80000000u; -constexpr u64 f32_nan = 0x7fc00000u; -constexpr u64 f32_non_sign_mask = 0x7fffffffu; -constexpr u64 f32_smallest_normal = 0x00800000u; - -constexpr u64 f64_negative_zero = 0x8000000000000000u; -constexpr u64 f64_nan = 0x7ff8000000000000u; -constexpr u64 f64_non_sign_mask = 0x7fffffffffffffffu; -constexpr u64 f64_smallest_normal = 0x0010000000000000u; - -constexpr u64 f64_penultimate_positive_denormal = 0x000ffffffffffffeu; -constexpr u64 f64_max_s32 = 0x41dfffffffc00000u; // 2147483647 as a double -constexpr u64 f64_min_u32 = 0x0000000000000000u; // 0 as a double -constexpr u64 f64_max_u32 = 0x41efffffffe00000u; // 4294967295 as a double -constexpr u64 f64_max_s64_lim = 0x43e0000000000000u; // 2^63 as a double (actual maximum unrepresentable) -constexpr u64 f64_min_u64 = 0x0000000000000000u; // 0 as a double -constexpr u64 f64_max_u64_lim = 0x43f0000000000000u; // 2^64 as a double (actual maximum unrepresentable) - -//template -//T ChooseOnFsize([[maybe_unused]] T f32, [[maybe_unused]] T f64) { -// static_assert(fsize == 32 || fsize == 64, "fsize must be either 32 or 64"); -// -// if constexpr (fsize == 32) { -// return f32; -// } else { -// return f64; -// } -//} -// -//#define FCODE(NAME) (code.*ChooseOnFsize(&Xbyak::CodeGenerator::NAME##s, &Xbyak::CodeGenerator::NAME##d)) - Arm64Gen::RoundingMode ConvertRoundingModeToA64RoundingMode(FP::RoundingMode rounding_mode) { switch (rounding_mode) { case FP::RoundingMode::ToNearest_TieEven: @@ -89,179 +53,6 @@ Arm64Gen::RoundingMode ConvertRoundingModeToA64RoundingMode(FP::RoundingMode rou } } -//template -//void DenormalsAreZero(BlockOfCode& code, ARM64Reg xmm_value, ARM64Reg gpr_scratch) { -// Xbyak::Label end; -// -// if constexpr (fsize == 32) { -// code.fp_emitter.FMOV(DecodeReg(gpr_scratch), xmm_value); -// code.ANDI2R(DecodeReg(gpr_scratch), DecodeReg(gpr_scratch), u32(0x7FFFFFFF)); -// code.SUBI2R(DecodeReg(gpr_scratch), DecodeReg(gpr_scratch), u32(1)); -// code.CMPI2R(DecodeReg(gpr_scratch), DecodeReg(gpr_scratch), u32(0x007FFFFE)); -// } else { -// auto mask = code.MConst(xword, f64_non_sign_mask); -// mask.setBit(64); -// auto penult_denormal = code.MConst(xword, f64_penultimate_positive_denormal); -// penult_denormal.setBit(64); -// -// code.movq(gpr_scratch, xmm_value); -// code.and_(gpr_scratch, mask); -// code.sub(gpr_scratch, u32(1)); -// code.cmp(gpr_scratch, penult_denormal); -// } -// -// // We need to report back whether we've found a denormal on input. -// // SSE doesn't do this for us when SSE's DAZ is enabled. -// -// code.ja(end); -// code.andps(xmm_value, code.MConst(xword, fsize == 32 ? 
f32_negative_zero : f64_negative_zero)); -// code.mov(dword[r15 + code.GetJitStateInfo().offsetof_FPSCR_IDC], u32(1 << 7)); -// code.L(end); -//} -// -//template -//void ZeroIfNaN(BlockOfCode& code, ARM64Reg xmm_value, ARM64Reg xmm_scratch) { -// code.xorps(xmm_scratch, xmm_scratch); -// FCODE(cmpords)(xmm_scratch, xmm_value); // true mask when ordered (i.e.: when not an NaN) -// code.pand(xmm_value, xmm_scratch); -//} - -//template -//void ForceToDefaultNaN(BlockOfCode& code, ARM64Reg result) { -// FixupBranch end; -// -// code.fp_emitter.FCMP(result); -// end = code.B(CC_VC); -// code.LDR(result, code.MConst(fsize == 32 ? f32_nan : f64_nan)); -// code.SetJumpTarget(end); -//} - -//template -//FixupBranch ProcessNaN(BlockOfCode& code, ARM64Reg a, ARM64Reg scratch) { -// FixupBranch not_nan, end; -// -// code.fp_emitter.FCMP(a); -// not_nan = code.B(CC_VC); -// -// //code.SwitchToFarCode(); -// //code.SetJumpTarget(nan); -// -// code.EmitPatchLDR(scratch, fsize == 32 ? 0x00400000 : 0x0008'0000'0000'0000); -// code.fp_emitter.ORR(EncodeRegToDouble(a), a, scratch); -// -// end = code.B(); -// -// //code.FlushIcache(); -// //code.SwitchToNearCode(); -// code.SetJumpTarget(not_nan); -// return end; -//} -// -//template -//void PostProcessNaN(BlockOfCode& code, Xbyak::Xmm result, Xbyak::Xmm tmp) { -// if constexpr (fsize == 32) { -// code.movaps(tmp, result); -// code.cmpunordps(tmp, tmp); -// code.pslld(tmp, 31); -// code.xorps(result, tmp); -// } -// else { -// code.movaps(tmp, result); -// code.cmpunordpd(tmp, tmp); -// code.psllq(tmp, 63); -// code.xorps(result, tmp); -// } -//} - -// This is necessary because x86 and ARM differ in they way they return NaNs from floating point operations -// -// ARM behaviour: -// op1 op2 result -// SNaN SNaN/QNaN op1 -// QNaN SNaN op2 -// QNaN QNaN op1 -// SNaN/QNaN other op1 -// other SNaN/QNaN op2 -// -// x86 behaviour: -// op1 op2 result -// SNaN/QNaN SNaN/QNaN op1 -// SNaN/QNaN other op1 -// other SNaN/QNaN op2 -// -// With ARM: SNaNs take priority. With x86: it doesn't matter. -// -// From the above we can see what differs between the architectures is -// the case when op1 == QNaN and op2 == SNaN. -// -// We assume that registers op1 and op2 are read-only. This function also trashes xmm0. -// We allow for the case where op1 and result are the same register. We do not read from op1 once result is written to. -//template -//void EmitPostProcessNaNs(BlockOfCode& code, ARM64Reg result, ARM64Reg op1, ARM64Reg op2, Xbyak::Reg64 tmp, Xbyak::Label end) { -// using FPT = mp::unsigned_integer_of_size; -// constexpr FPT exponent_mask = FP::FPInfo::exponent_mask; -// constexpr FPT mantissa_msb = FP::FPInfo::mantissa_msb; -// constexpr u8 mantissa_msb_bit = static_cast(FP::FPInfo::explicit_mantissa_width - 1); -// -// // At this point we know that at least one of op1 and op2 is a NaN. -// // Thus in op1 ^ op2 at least one of the two would have all 1 bits in the exponent. -// // Keeping in mind xor is commutative, there are only four cases: -// // SNaN ^ SNaN/Inf -> exponent == 0, mantissa_msb == 0 -// // QNaN ^ QNaN -> exponent == 0, mantissa_msb == 0 -// // QNaN ^ SNaN/Inf -> exponent == 0, mantissa_msb == 1 -// // SNaN/QNaN ^ Otherwise -> exponent != 0, mantissa_msb == ? -// // -// // We're only really interested in op1 == QNaN and op2 == SNaN, -// // so we filter out everything else. -// // -// // We do it this way instead of checking that op1 is QNaN because -// // op1 == QNaN && op2 == QNaN is the most common case. 
With this method -// // that case would only require one branch. -// -// if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { -// code.vxorps(xmm0, op1, op2); -// } else { -// code.movaps(xmm0, op1); -// code.xorps(xmm0, op2); -// } -// -// constexpr size_t shift = fsize == 32 ? 0 : 48; -// if constexpr (fsize == 32) { -// code.movd(tmp.cvt32(), xmm0); -// } else { -// // We do this to avoid requiring 64-bit immediates -// code.pextrw(tmp.cvt32(), xmm0, shift / 16); -// } -// code.and_(tmp.cvt32(), static_cast((exponent_mask | mantissa_msb) >> shift)); -// code.cmp(tmp.cvt32(), static_cast(mantissa_msb >> shift)); -// code.jne(end, code.T_NEAR); -// -// // If we're here there are four cases left: -// // op1 == SNaN && op2 == QNaN -// // op1 == Inf && op2 == QNaN -// // op1 == QNaN && op2 == SNaN <<< The problematic case -// // op1 == QNaN && op2 == Inf -// -// if constexpr (fsize == 32) { -// code.movd(tmp.cvt32(), op2); -// code.shl(tmp.cvt32(), 32 - mantissa_msb_bit); -// } else { -// code.movq(tmp, op2); -// code.shl(tmp, 64 - mantissa_msb_bit); -// } -// // If op2 is a SNaN, CF = 0 and ZF = 0. -// code.jna(end, code.T_NEAR); -// -// // Silence the SNaN as required by spec. -// if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX)) { -// code.vorps(result, op2, code.MConst(xword, mantissa_msb)); -// } else { -// code.movaps(result, op2); -// code.orps(result, code.MConst(xword, mantissa_msb)); -// } -// code.jmp(end, code.T_NEAR); -//} - template void FPTwoOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); @@ -686,4 +477,4 @@ void EmitA64::EmitFPFixedU64ToSingle(EmitContext& ctx, IR::Inst* inst) { ctx.reg_alloc.DefineValue(inst, result); } -} // namespace Dynarmic::BackendX64 +} // namespace Dynarmic::BackendA64 diff --git a/src/backend/A64/oparg.h b/src/backend/A64/oparg.h deleted file mode 100644 index 4e9760c9..00000000 --- a/src/backend/A64/oparg.h +++ /dev/null @@ -1,76 +0,0 @@ -/* This file is part of the dynarmic project. - * Copyright (c) 2016 MerryMage - * This software may be used and distributed according to the terms of the GNU - * General Public License version 2 or any later version. - */ - -#pragma once - -#include "common/assert.h" - -namespace Dynarmic::BackendA64 { - -struct OpArg { - OpArg() : type(Type::Operand), inner_operand() {} - /* implicit */ OpArg(const Xbyak::Address& address) : type(Type::Address), inner_address(address) {} - /* implicit */ OpArg(const Xbyak::Reg& reg) : type(Type::Reg), inner_reg(reg) {} - - Xbyak::Operand& operator*() { - switch (type) { - case Type::Address: - return inner_address; - case Type::Operand: - return inner_operand; - case Type::Reg: - return inner_reg; - } - ASSERT_MSG(false, "Unreachable"); - } - - void setBit(int bits) { - switch (type) { - case Type::Address: - inner_address.setBit(bits); - return; - case Type::Operand: - inner_operand.setBit(bits); - return; - case Type::Reg: - switch (bits) { - case 8: - inner_reg = inner_reg.cvt8(); - return; - case 16: - inner_reg = inner_reg.cvt16(); - return; - case 32: - inner_reg = inner_reg.cvt32(); - return; - case 64: - inner_reg = inner_reg.cvt64(); - return; - default: - ASSERT_MSG(false, "Invalid bits"); - return; - } - } - ASSERT_MSG(false, "Unreachable"); - } - -private: - enum class Type { - Operand, - Address, - Reg, - }; - - Type type; - - union { - Xbyak::Operand inner_operand; - Xbyak::Address inner_address; - Xbyak::Reg inner_reg; - }; -}; - -} // namespace Dynarmic::BackendX64
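
For reference, the GetCpsr/SetCpsr paths now fall back to a host call (GetCpsrImpl/SetCpsrImpl) with a TODO to inline them later; the deleted comments describe how the x64 backend kept the CPSR split across several jit-state words and reassembled it with pext/pdep. A minimal C++ sketch of how a GetCpsrImpl-style helper could rebuild the full word is below. The SplitCpsr struct and its field names are assumptions for illustration, not the real A32JitState; the bit positions follow the removed masks (the pdep mask 0x000f0220, the shift by 27 for Q, and the ORs with the NZCV and jaifm words).

    #include <cstdint>

    // Illustrative stand-in for the split CPSR storage the deleted comments
    // refer to; the struct and field names are assumptions, not the real
    // A32JitState layout.
    struct SplitCpsr {
        std::uint32_t nzcv;   // N, Z, C, V already in bits 31:28
        std::uint32_t q;      // 0 or 1; sticky saturation flag, CPSR bit 27
        std::uint32_t et;     // bit 0 = T, bit 1 = E
        std::uint32_t ge;     // one byte per GE flag, 0x00 or 0xFF
        std::uint32_t jaifm;  // J, A, I, F and mode bits, already in place
    };

    // Rebuild the full CPSR word the way a GetCpsrImpl-style helper could.
    // Bit positions follow the removed pdep mask 0x000f0220 (T at bit 5,
    // E at bit 9, GE at bits 19:16), plus Q at bit 27 and NZCV at 31:28.
    inline std::uint32_t RebuildCpsr(const SplitCpsr& s) {
        std::uint32_t cpsr = 0;
        cpsr |= (s.et & 1u) << 5;           // T
        cpsr |= ((s.et >> 1) & 1u) << 9;    // E
        for (unsigned i = 0; i < 4; ++i)
            cpsr |= ((s.ge >> (8 * i + 7)) & 1u) << (16 + i);  // GE[i] from each byte's top bit
        cpsr |= (s.q & 1u) << 27;           // Q
        cpsr |= s.nzcv | s.jaifm;           // N, Z, C, V plus the remaining bits
        return cpsr;
    }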
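
The removed SetCpsr comment mentions a "SWAR partitioned subtraction" to negate the GE bytes: after the pdep, each GE flag sits in its own byte of the upper half as 0x00 or 0x01, and a single 64-bit subtract-and-xor against 0x80808080'00000003 turns those bytes into 0x00/0xFF masks while the two E/T bits in the low half pass through unchanged. A standalone sketch of just that trick, assuming the 0/1-per-byte input layout produced by the removed pdep:

    #include <cstdint>

    // Per byte of the upper half: 0x80 - 0x01 = 0x7F and 0x7F ^ 0x80 = 0xFF,
    // while 0x80 - 0x00 = 0x80 and 0x80 ^ 0x80 = 0x00. In the low half,
    // (3 - et) ^ 3 == et, so E and T are preserved. For valid inputs no
    // borrow ever crosses a byte or the 32-bit boundary.
    constexpr std::uint64_t NegateGeBytes(std::uint64_t packed) {
        const std::uint64_t k = 0x80808080'00000003ull;
        const std::uint64_t diff = k - packed;
        return diff ^ k;
    }

    // Two GE flags set, E and T both set.
    static_assert(NegateGeBytes(0x00010001'00000003ull) == 0x00ff00ff'00000003ull,
                  "set GE flags become 0xFF bytes, E/T bits are preserved");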
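
The deleted DenormalsAreZero helper flushed denormal inputs to a signed zero and recorded the event for FPSCR.IDC; it cleared the sign, subtracted 1 so that zero wraps around, and then classified the value with one unsigned compare against the penultimate positive denormal. A plain C++ sketch of the same f32 check, written with two explicit conditions instead of the wraparound trick; the input_denormal flag stands in for however the jit state accumulates IDC and is an assumption here:

    #include <cstdint>

    constexpr std::uint32_t f32_non_sign_mask   = 0x7fffffffu;
    constexpr std::uint32_t f32_smallest_normal = 0x00800000u;

    // Flush a denormal single-precision input to +/-0 and report it.
    inline std::uint32_t FlushDenormalToZero(std::uint32_t value, bool& input_denormal) {
        const std::uint32_t magnitude = value & f32_non_sign_mask;
        if (magnitude != 0 && magnitude < f32_smallest_normal) {
            input_denormal = true;              // the x64 path set FPSCR.IDC here
            return value & ~f32_non_sign_mask;  // keep only the sign bit: +/-0
        }
        return value;
    }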
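
The long comment being removed documents why EmitPostProcessNaNs existed on x64: ARM gives a signalling NaN priority over a quiet one when both operands are NaNs, while x86 simply returns op1, so the only case needing a fixup is op1 == QNaN with op2 == SNaN. Below is a scalar reference model of the ARM selection table over raw f32 bit patterns, assuming (as the removed helper does) that at least one operand is already a NaN; it only illustrates the rule and is not the emitter's code path.

    #include <cstdint>

    constexpr std::uint32_t exponent_mask = 0x7f800000u;
    constexpr std::uint32_t mantissa_mask = 0x007fffffu;
    constexpr std::uint32_t mantissa_msb  = 0x00400000u;  // the quiet bit

    constexpr bool IsNaN(std::uint32_t v) {
        return (v & exponent_mask) == exponent_mask && (v & mantissa_mask) != 0;
    }
    constexpr bool IsSNaN(std::uint32_t v) {
        return IsNaN(v) && (v & mantissa_msb) == 0;
    }

    // An SNaN operand wins over a QNaN one; otherwise op1 wins. The chosen
    // NaN is then silenced by setting the quiet bit, as the spec requires.
    constexpr std::uint32_t ArmPropagateNaN(std::uint32_t op1, std::uint32_t op2) {
        const std::uint32_t chosen = (IsSNaN(op2) && !IsSNaN(op1)) ? op2
                                   : IsNaN(op1)                    ? op1
                                                                   : op2;
        return chosen | mantissa_msb;
    }

    // The one case where ARM and x86 ordering disagree: op1 quiet, op2 signalling.
    static_assert(ArmPropagateNaN(0x7fc00000u, 0x7f800001u) == 0x7fc00001u,
                  "a signalling op2 wins over a quiet op1 and comes back quietened");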