diff --git a/CMakeLists.txt b/CMakeLists.txt index adfea8cf..48339efc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,9 +8,25 @@ if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR) set(MASTER_PROJECT ON) endif() +# Add the module directory to the list of paths +list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/CMakeModules") + +# Arch detection +include(DetectArchitecture) +if (NOT DEFINED ARCHITECTURE) + message(FATAL_ERROR "Unsupported architecture encountered. Ending CMake generation.") +endif() +message(STATUS "Target architecture: ${ARCHITECTURE}") + +set(REQUIRES_NO_EXECUTE_SUPPORT OFF) +# Apple Silicon chips require W^X +if(APPLE AND ARCHITECTURE STREQUAL "arm64") + set(REQUIRES_NO_EXECUTE_SUPPORT ON) +endif() + # Dynarmic project options option(DYNARMIC_ENABLE_CPU_FEATURE_DETECTION "Turning this off causes dynarmic to assume the host CPU doesn't support anything later than SSE3" ON) -option(DYNARMIC_ENABLE_NO_EXECUTE_SUPPORT "Enables support for systems that require W^X" OFF) +option(DYNARMIC_ENABLE_NO_EXECUTE_SUPPORT "Enables support for systems that require W^X" ${REQUIRES_NO_EXECUTE_SUPPORT}) option(DYNARMIC_FATAL_ERRORS "Errors are fatal" OFF) option(DYNARMIC_IGNORE_ASSERTS "Ignore asserts" OFF) option(DYNARMIC_TESTS "Build tests" ${MASTER_PROJECT}) @@ -39,9 +55,6 @@ if ("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_BINARY_DIR}") message(SEND_ERROR "In-source builds are not allowed.") endif() -# Add the module directory to the list of paths -list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/CMakeModules") - # Compiler flags if (MSVC) set(DYNARMIC_CXX_FLAGS @@ -105,13 +118,6 @@ else() endif() endif() -# Arch detection -include(DetectArchitecture) -if (NOT DEFINED ARCHITECTURE) - message(FATAL_ERROR "Unsupported architecture encountered. 
Ending CMake generation.") -endif() -message(STATUS "Target architecture: ${ARCHITECTURE}") - # Include Boost if (NOT TARGET boost) if (NOT Boost_INCLUDE_DIRS) diff --git a/src/dynarmic/CMakeLists.txt b/src/dynarmic/CMakeLists.txt index 8ae02cf3..6dd194cf 100644 --- a/src/dynarmic/CMakeLists.txt +++ b/src/dynarmic/CMakeLists.txt @@ -365,55 +365,66 @@ if (ARCHITECTURE STREQUAL "x86_64") else() target_sources(dynarmic PRIVATE backend/x64/exception_handler_generic.cpp) endif() + elseif(ARCHITECTURE STREQUAL "arm64") - target_link_libraries(dynarmic PRIVATE $) - target_sources(dynarmic PRIVATE - backend/arm64/a32_jitstate.cpp - backend/arm64/a32_jitstate.h - backend/arm64/abi.cpp - backend/arm64/abi.h - backend/arm64/devirtualize.h - backend/arm64/emit_arm64.cpp - backend/arm64/emit_arm64.h - backend/arm64/emit_arm64_a32.cpp - backend/arm64/emit_arm64_a32_coprocessor.cpp - backend/arm64/emit_arm64_a32_memory.cpp - backend/arm64/emit_arm64_a64.cpp - backend/arm64/emit_arm64_a64_memory.cpp - backend/arm64/emit_arm64_cryptography.cpp - backend/arm64/emit_arm64_data_processing.cpp - backend/arm64/emit_arm64_floating_point.cpp - backend/arm64/emit_arm64_packed.cpp - backend/arm64/emit_arm64_saturation.cpp - backend/arm64/emit_arm64_vector.cpp - backend/arm64/emit_arm64_vector_floating_point.cpp - backend/arm64/emit_arm64_vector_saturation.cpp - backend/arm64/emit_context.h - backend/arm64/exclusive_monitor.cpp - backend/arm64/fpsr_manager.cpp - backend/arm64/fpsr_manager.h - backend/arm64/reg_alloc.cpp - backend/arm64/reg_alloc.h - backend/arm64/stack_layout.h - common/spin_lock_arm64.cpp - common/spin_lock_arm64.h + backend/A64/emitter/a64_emitter.cpp + backend/A64/emitter/a64_emitter.h + backend/A64/emitter/arm_common.h + backend/A64/emitter/code_block.h + # backend/A64/a64_emit_a64.cpp + # backend/A64/a64_emit_a64.h + # backend/A64/a64_exclusive_monitor.cpp + # backend/A64/a64_interface.cpp + # backend/A64/a64_jitstate.cpp + # backend/A64/a64_jitstate.h + backend/A64/abi.cpp + backend/A64/abi.h + backend/A64/block_of_code.cpp + backend/A64/block_of_code.h + backend/A64/block_range_information.cpp + backend/A64/block_range_information.h + backend/A64/callback.cpp + backend/A64/callback.h + backend/A64/constant_pool.cpp + backend/A64/constant_pool.h + backend/A64/devirtualize.h + backend/A64/emit_a64.cpp + backend/A64/emit_a64.h + # backend/A64/emit_a64_aes.cpp + # backend/A64/emit_a64_crc32.cpp + backend/A64/emit_a64_data_processing.cpp + backend/A64/emit_a64_floating_point.cpp + backend/A64/emit_a64_packed.cpp + backend/A64/emit_a64_saturation.cpp + # backend/A64/emit_a64_sm4.cpp + # backend/A64/emit_a64_vector.cpp + # backend/A64/emit_a64_vector_floating_point.cpp + backend/A64/exception_handler.h + backend/A64/hostloc.cpp + backend/A64/hostloc.h + backend/A64/jitstate_info.h + backend/A64/opcodes.inc + backend/A64/perf_map.cpp + backend/A64/perf_map.h + backend/A64/reg_alloc.cpp + backend/A64/reg_alloc.h ) - + if ("A32" IN_LIST DYNARMIC_FRONTENDS) target_sources(dynarmic PRIVATE - backend/arm64/a32_address_space.cpp - backend/arm64/a32_address_space.h - backend/arm64/a32_core.h - backend/arm64/a32_interface.cpp - - # Move this to the list below when implemented - backend/arm64/a64_interface.cpp + backend/A64/a32_emit_a64.cpp + backend/A64/a32_emit_a64.h + backend/A64/a32_interface.cpp + backend/A64/a32_jitstate.cpp + backend/A64/a32_jitstate.h ) endif() - - if ("A64" IN_LIST DYNARMIC_FRONTENDS) - message(FATAL_ERROR "TODO: Unimplemented frontend for this host architecture") + + if (UNIX) + 
target_sources(dynarmic PRIVATE backend/A64/exception_handler_posix.cpp) + else() + target_sources(dynarmic PRIVATE backend/A64/exception_handler_generic.cpp) endif() else() message(FATAL_ERROR "Unsupported architecture") diff --git a/src/dynarmic/backend/A64/a32_emit_a64.cpp b/src/dynarmic/backend/A64/a32_emit_a64.cpp new file mode 100644 index 00000000..192f0838 --- /dev/null +++ b/src/dynarmic/backend/A64/a32_emit_a64.cpp @@ -0,0 +1,1594 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. + */ + +#include +#include +#include +#include + +#include +#include + +#include + +#include "backend/A64/a32_emit_a64.h" +#include "backend/A64/a32_jitstate.h" +#include "backend/A64/abi.h" +#include "backend/A64/block_of_code.h" +#include "backend/A64/devirtualize.h" +#include "backend/A64/emit_a64.h" +#include "backend/A64/emitter/a64_emitter.h" +#include "backend/A64/perf_map.h" +#include "common/assert.h" +#include "common/bit_util.h" +#include "common/common_types.h" +#include "common/scope_exit.h" +#include "common/variant_util.h" +#include "frontend/A32/location_descriptor.h" +#include "frontend/A32/types.h" +#include "frontend/ir/basic_block.h" +#include "frontend/ir/microinstruction.h" +#include "frontend/ir/opcodes.h" + +// TODO: Have ARM flags in host flags and not have them use up GPR registers unless necessary. +// TODO: Actually implement that proper instruction selector you've always wanted to sweetheart. + +namespace Dynarmic::BackendA64 { + +// Note that unlike the x64 backend these only returns ONLY the offset to register and not the address! +static size_t MJitStateReg(A32::Reg reg) { + return offsetof(A32JitState, Reg) + sizeof(u32) * static_cast(reg); +} + +static size_t MJitStateExtReg(A32::ExtReg reg) { + if (A32::IsSingleExtReg(reg)) { + size_t index = static_cast(reg) - static_cast(A32::ExtReg::S0); + return offsetof(A32JitState, ExtReg) + sizeof(u32) * index; + } + if (A32::IsDoubleExtReg(reg)) { + size_t index = static_cast(reg) - static_cast(A32::ExtReg::D0); + return offsetof(A32JitState, ExtReg) + sizeof(u64) * index; + } + ASSERT_FALSE("Should never happen."); +} + +A32EmitContext::A32EmitContext(RegAlloc& reg_alloc, IR::Block& block) : EmitContext(reg_alloc, block) {} + +A32::LocationDescriptor A32EmitContext::Location() const { + return A32::LocationDescriptor{block.Location()}; +} + +bool A32EmitContext::IsSingleStep() const { + return A32::LocationDescriptor{block.Location()}.SingleStepping(); +} + +FP::RoundingMode A32EmitContext::FPSCR_RMode() const { + return Location().FPSCR().RMode(); +} + +u32 A32EmitContext::FPCR() const { + return Location().FPSCR().Value(); +} + +bool A32EmitContext::FPSCR_FTZ() const { + return Location().FPSCR().FTZ(); +} + +bool A32EmitContext::FPSCR_DN() const { + return Location().FPSCR().DN(); +} + +std::ptrdiff_t A32EmitContext::GetInstOffset(IR::Inst* inst) const { + return std::distance(block.begin(), IR::Block::iterator(inst)); +} + +A32EmitA64::A32EmitA64(BlockOfCode& code, A32::UserConfig config, A32::Jit* jit_interface) + : EmitA64(code), config(std::move(config)), jit_interface(jit_interface) { + exception_handler.Register(code, [this](CodePtr PC){FastmemCallback(PC);}); + GenMemoryAccessors(); + GenTerminalHandlers(); + code.PreludeComplete(); + ClearFastDispatchTable(); + fastmem_patch_info.clear(); +} + +A32EmitA64::~A32EmitA64() = default; + 
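// Usage note: generated code keeps the A32JitState pointer in X28, so the offsets returned by
// MJitStateReg / MJitStateExtReg are consumed directly as unsigned-offset loads and stores,
// e.g. EmitA32GetRegister further down emits
//     code.LDR(INDEX_UNSIGNED, result, X28, MJitStateReg(reg));
// whereas the x64 backend's equivalent helper returns a complete address expression.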
+A32EmitA64::BlockDescriptor A32EmitA64::Emit(IR::Block& block) { + code.EnableWriting(); + SCOPE_EXIT { + code.DisableWriting(); + }; + + RegAlloc reg_alloc{code, A32JitState::SpillCount, SpillToOpArg}; + A32EmitContext ctx{reg_alloc, block}; + + const u8* entrypoint = code.AlignCode16(); + + // Start emitting. + EmitCondPrelude(ctx); + + for (auto iter = block.begin(); iter != block.end(); ++iter) { + IR::Inst* inst = &*iter; + + // Call the relevant Emit* member function. + switch (inst->GetOpcode()) { + +#define OPCODE(name, type, ...) \ + case IR::Opcode::name: \ + A32EmitA64::Emit##name(ctx, inst); \ + break; +#define A32OPC(name, type, ...) \ + case IR::Opcode::A32##name: \ + A32EmitA64::EmitA32##name(ctx, inst); \ + break; +#define A64OPC(...) +#include "backend/A64/opcodes.inc" +#undef OPCODE +#undef A32OPC +#undef A64OPC + + default: + ASSERT_FALSE("Invalid opcode: {}", inst->GetOpcode()); + break; + } + + reg_alloc.EndOfAllocScope(); + } + + reg_alloc.AssertNoMoreUses(); + + EmitAddCycles(block.CycleCount()); + EmitA64::EmitTerminal(block.GetTerminal(), ctx.Location().SetSingleStepping(false), ctx.IsSingleStep()); + code.BRK(0); + code.PatchConstPool(); + code.FlushIcacheSection(entrypoint, code.GetCodePtr()); + + const size_t size = static_cast(code.GetCodePtr() - entrypoint); + + const A32::LocationDescriptor descriptor{block.Location()}; + const A32::LocationDescriptor end_location{block.EndLocation()}; + + const auto range = boost::icl::discrete_interval::closed(descriptor.PC(), end_location.PC() - 1); + block_ranges.AddRange(range, descriptor); + + return RegisterBlock(descriptor, entrypoint, size); +} + +void A32EmitA64::ClearCache() { + EmitA64::ClearCache(); + block_ranges.ClearCache(); + ClearFastDispatchTable(); + fastmem_patch_info.clear(); +} + +void A32EmitA64::InvalidateCacheRanges(const boost::icl::interval_set& ranges) { + InvalidateBasicBlocks(block_ranges.InvalidateRanges(ranges)); +} + +void A32EmitA64::EmitCondPrelude(const A32EmitContext& ctx) { + if (ctx.block.GetCondition() == IR::Cond::AL) { + ASSERT(!ctx.block.HasConditionFailedLocation()); + return; + } + + ASSERT(ctx.block.HasConditionFailedLocation()); + + FixupBranch pass = EmitCond(ctx.block.GetCondition()); + EmitAddCycles(ctx.block.ConditionFailedCycleCount()); + EmitTerminal(IR::Term::LinkBlock{ctx.block.ConditionFailedLocation()}, ctx.block.Location(), ctx.IsSingleStep()); + code.SetJumpTarget(pass); +} + +void A32EmitA64::ClearFastDispatchTable() { + if (config.enable_fast_dispatch) { + fast_dispatch_table.fill({}); + } +} + +void A32EmitA64::GenMemoryAccessors() { + code.AlignCode16(); + read_memory_8 = code.GetCodePtr(); + // Push lr and fp onto the stack + code.ABI_PushRegisters(0x60000000); + code.ADD(X29, SP, 0); + ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, ABI_RETURN); + Devirtualize<&A32::UserCallbacks::MemoryRead8>(config.callbacks).EmitCall(code); + ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, ABI_RETURN); + code.ABI_PopRegisters(0x60000000); + code.RET(); + PerfMapRegister(read_memory_8, code.GetCodePtr(), "a32_read_memory_8"); + + code.AlignCode16(); + read_memory_16 = code.GetCodePtr(); + // Push lr and fp onto the stack + code.ABI_PushRegisters(0x60000000); + code.ADD(X29, SP, 0); + ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, ABI_RETURN); + Devirtualize<&A32::UserCallbacks::MemoryRead16>(config.callbacks).EmitCall(code); + ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, ABI_RETURN); + code.ABI_PopRegisters(0x60000000); + code.RET(); + 
PerfMapRegister(read_memory_16, code.GetCodePtr(), "a32_read_memory_16"); + + code.AlignCode16(); + read_memory_32 = code.GetCodePtr(); + // Push lr and fp onto the stack + code.ABI_PushRegisters(0x60000000); + code.ADD(X29, SP, 0); + ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, ABI_RETURN); + Devirtualize<&A32::UserCallbacks::MemoryRead32>(config.callbacks).EmitCall(code); + ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, ABI_RETURN); + code.ABI_PopRegisters(0x60000000); + code.RET(); + PerfMapRegister(read_memory_32, code.GetCodePtr(), "a32_read_memory_32"); + + code.AlignCode16(); + read_memory_64 = code.GetCodePtr(); + // Push lr and fp onto the stack + code.ABI_PushRegisters(0x60000000); + code.ADD(X29, SP, 0); + ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, ABI_RETURN); + Devirtualize<&A32::UserCallbacks::MemoryRead64>(config.callbacks).EmitCall(code); + ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, ABI_RETURN); + code.ABI_PopRegisters(0x60000000); + code.RET(); + PerfMapRegister(read_memory_64, code.GetCodePtr(), "a32_read_memory_64"); + + code.AlignCode16(); + write_memory_8 = code.GetCodePtr(); + // Push lr and fp onto the stack + code.ABI_PushRegisters(0x60000000); + code.ADD(X29, SP, 0); + ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, ABI_RETURN); + Devirtualize<&A32::UserCallbacks::MemoryWrite8>(config.callbacks).EmitCall(code); + ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, ABI_RETURN); + code.ABI_PopRegisters(0x60000000); + code.RET(); + PerfMapRegister(write_memory_8, code.GetCodePtr(), "a32_write_memory_8"); + + code.AlignCode16(); + write_memory_16 = code.GetCodePtr(); + // Push lr and fp onto the stack + code.ABI_PushRegisters(0x60000000); + code.ADD(X29, SP, 0); + ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, ABI_RETURN); + Devirtualize<&A32::UserCallbacks::MemoryWrite16>(config.callbacks).EmitCall(code); + ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, ABI_RETURN); + code.ABI_PopRegisters(0x60000000); + code.RET(); + PerfMapRegister(write_memory_16, code.GetCodePtr(), "a32_write_memory_16"); + + code.AlignCode16(); + write_memory_32 = code.GetCodePtr(); + // Push lr and fp onto the stack + code.ABI_PushRegisters(0x60000000); + code.ADD(X29, SP, 0); + ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, ABI_RETURN); + Devirtualize<&A32::UserCallbacks::MemoryWrite32>(config.callbacks).EmitCall(code); + ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, ABI_RETURN); + code.ABI_PopRegisters(0x60000000); + code.RET(); + PerfMapRegister(write_memory_32, code.GetCodePtr(), "a32_write_memory_32"); + + code.AlignCode16(); + write_memory_64 = code.GetCodePtr(); + // Push lr and fp onto the stack + code.ABI_PushRegisters(0x60000000); + code.ADD(X29, SP, 0); + ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, ABI_RETURN); + Devirtualize<&A32::UserCallbacks::MemoryWrite64>(config.callbacks).EmitCall(code); + ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, ABI_RETURN); + code.ABI_PopRegisters(0x60000000); + code.RET(); + PerfMapRegister(write_memory_64, code.GetCodePtr(), "a32_write_memory_64"); +} + +void A32EmitA64::GenTerminalHandlers() { + const ARM64Reg fast_dispatch_entry_reg = X19; + const ARM64Reg location_descriptor_reg = X20; + + // PC ends up in fast_dispatch_entry_reg, location_descriptor ends up in location_descriptor_reg. 
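// The lambda below rebuilds the 64-bit location descriptor the same way IREmitter::PushRSB
// packs it: upper_location_descriptor in the high 32 bits ORed with the current PC in the
// low 32 bits, i.e. descriptor = (u64(upper) << 32) | PC.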
+ const auto calculate_location_descriptor = [this](ARM64Reg fast_dispatch_entry_reg_, ARM64Reg location_descriptor_reg_) { + // This calculation has to match up with IREmitter::PushRSB + code.LDR(INDEX_UNSIGNED, DecodeReg(location_descriptor_reg_), X28, offsetof(A32JitState, upper_location_descriptor)); + code.LDR(INDEX_UNSIGNED, DecodeReg(fast_dispatch_entry_reg_), X28, MJitStateReg(A32::Reg::PC)); + code.ORR(location_descriptor_reg_, fast_dispatch_entry_reg_, location_descriptor_reg_, ArithOption{location_descriptor_reg_, ST_LSL, 32}); + }; + + FixupBranch fast_dispatch_cache_miss, rsb_cache_miss; + + code.AlignCode16(); + terminal_handler_pop_rsb_hint = code.GetCodePtr(); + calculate_location_descriptor(fast_dispatch_entry_reg, location_descriptor_reg); + code.LDR(INDEX_UNSIGNED, DecodeReg(code.ABI_SCRATCH1), X28, offsetof(A32JitState, rsb_ptr)); + code.SUBI2R(code.ABI_SCRATCH1, DecodeReg(code.ABI_SCRATCH1), 1); + code.ANDI2R(code.ABI_SCRATCH1, DecodeReg(code.ABI_SCRATCH1), u32(A32JitState::RSBPtrMask)); + code.STR(INDEX_UNSIGNED, DecodeReg(code.ABI_SCRATCH1), X28, offsetof(A32JitState, rsb_ptr)); + + // cmp(location_descriptor_reg, qword[r15 + offsetof(A32JitState, rsb_location_descriptors) + rsb_ptr * sizeof(u64)]); + code.ADD(code.ABI_SCRATCH1, X28, code.ABI_SCRATCH1, ArithOption{code.ABI_SCRATCH1, ST_LSL, 3}); + code.LDR(INDEX_UNSIGNED, X8, code.ABI_SCRATCH1, offsetof(A32JitState, rsb_location_descriptors)); + code.CMP(location_descriptor_reg, X8); + if (config.enable_fast_dispatch) { + rsb_cache_miss = code.B(CC_NEQ); + } else { + code.B(CC_NEQ, code.GetReturnFromRunCodeAddress()); + } + code.LDR(INDEX_UNSIGNED, code.ABI_SCRATCH1, code.ABI_SCRATCH1, offsetof(A32JitState, rsb_codeptrs)); + code.BR(code.ABI_SCRATCH1); + PerfMapRegister(terminal_handler_pop_rsb_hint, code.GetCodePtr(), "a32_terminal_handler_pop_rsb_hint"); + + if (config.enable_fast_dispatch) { + terminal_handler_fast_dispatch_hint = code.AlignCode16(); + calculate_location_descriptor(fast_dispatch_entry_reg, location_descriptor_reg); + code.SetJumpTarget(rsb_cache_miss); + code.MOVI2R(code.ABI_SCRATCH1, reinterpret_cast(fast_dispatch_table.data())); + code.CRC32CW(DecodeReg(fast_dispatch_entry_reg), DecodeReg(fast_dispatch_entry_reg), DecodeReg(code.ABI_SCRATCH1)); + code.ANDI2R(fast_dispatch_entry_reg, fast_dispatch_entry_reg, fast_dispatch_table_mask); + code.ADD(fast_dispatch_entry_reg, fast_dispatch_entry_reg, code.ABI_SCRATCH1); + + code.LDR(INDEX_UNSIGNED, code.ABI_SCRATCH1, fast_dispatch_entry_reg, offsetof(FastDispatchEntry, location_descriptor)); + code.CMP(location_descriptor_reg, code.ABI_SCRATCH1); + fast_dispatch_cache_miss = code.B(CC_NEQ); + code.LDR(INDEX_UNSIGNED, code.ABI_SCRATCH1, fast_dispatch_entry_reg, offsetof(FastDispatchEntry, code_ptr)); + code.BR(code.ABI_SCRATCH1); + + code.SetJumpTarget(fast_dispatch_cache_miss); + code.STR(INDEX_UNSIGNED, location_descriptor_reg, fast_dispatch_entry_reg, offsetof(FastDispatchEntry, location_descriptor) ); + code.LookupBlock(); + code.STR(INDEX_UNSIGNED, code.ABI_RETURN, fast_dispatch_entry_reg, offsetof(FastDispatchEntry, code_ptr)); + code.BR(code.ABI_RETURN); + PerfMapRegister(terminal_handler_fast_dispatch_hint, code.GetCodePtr(), "a32_terminal_handler_fast_dispatch_hint"); + + code.AlignCode16(); + fast_dispatch_table_lookup = reinterpret_cast(code.GetWritableCodePtr()); + code.MOVI2R(code.ABI_PARAM2, reinterpret_cast(fast_dispatch_table.data())); + code.CRC32CW(DecodeReg(code.ABI_PARAM1), DecodeReg(code.ABI_PARAM1), DecodeReg(code.ABI_PARAM2)); + 
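// As in the fast-dispatch terminal handler above, the CRC32C hash is masked with
// fast_dispatch_table_mask and added to the table base to form the address of the
// FastDispatchEntry slot for this location descriptor.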
code.ANDI2R(DecodeReg(code.ABI_PARAM1), DecodeReg(code.ABI_PARAM1), fast_dispatch_table_mask); + code.ADD(code.ABI_RETURN, code.ABI_PARAM1, code.ABI_PARAM2); + code.RET(); + } +} + + +void A32EmitA64::EmitA32GetRegister(A32EmitContext& ctx, IR::Inst* inst) { + A32::Reg reg = inst->GetArg(0).GetA32RegRef(); + + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.ScratchGpr()); + code.LDR(INDEX_UNSIGNED, result, X28, MJitStateReg(reg)); + ctx.reg_alloc.DefineValue(inst, result); +} + +void A32EmitA64::EmitA32GetExtendedRegister32(A32EmitContext& ctx, IR::Inst* inst) { + A32::ExtReg reg = inst->GetArg(0).GetA32ExtRegRef(); + ASSERT(A32::IsSingleExtReg(reg)); + + ARM64Reg result = ctx.reg_alloc.ScratchFpr(); + code.fp_emitter.LDR(32, INDEX_UNSIGNED, result, X28, MJitStateExtReg(reg)); + ctx.reg_alloc.DefineValue(inst, result); +} + +void A32EmitA64::EmitA32GetExtendedRegister64(A32EmitContext& ctx, IR::Inst* inst) { + A32::ExtReg reg = inst->GetArg(0).GetA32ExtRegRef(); + ASSERT(A32::IsDoubleExtReg(reg)); + + ARM64Reg result = ctx.reg_alloc.ScratchFpr(); + code.fp_emitter.LDR(64, INDEX_UNSIGNED, result, X28, MJitStateExtReg(reg)); + ctx.reg_alloc.DefineValue(inst, result); +} + +void A32EmitA64::EmitA32SetRegister(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + A32::Reg reg = inst->GetArg(0).GetA32RegRef(); + if (args[1].IsInFpr()) { + Arm64Gen::ARM64Reg to_store = ctx.reg_alloc.UseFpr(args[1]); + code.fp_emitter.STR(32, INDEX_UNSIGNED, to_store, X28, MJitStateReg(reg)); + } else { + Arm64Gen::ARM64Reg to_store = DecodeReg(ctx.reg_alloc.UseGpr(args[1])); + code.STR(INDEX_UNSIGNED, to_store, X28, MJitStateReg(reg)); + } +} + +void A32EmitA64::EmitA32SetExtendedRegister32(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + A32::ExtReg reg = inst->GetArg(0).GetA32ExtRegRef(); + ASSERT(A32::IsSingleExtReg(reg)); + if (args[1].IsInFpr()) { + ARM64Reg to_store = ctx.reg_alloc.UseFpr(args[1]); + code.fp_emitter.STR(32, INDEX_UNSIGNED, to_store, X28, MJitStateExtReg(reg)); + } else { + ARM64Reg to_store = DecodeReg(ctx.reg_alloc.UseGpr(args[1])); + code.STR(INDEX_UNSIGNED, to_store, X28, MJitStateExtReg(reg)); + } +} + +void A32EmitA64::EmitA32SetExtendedRegister64(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + A32::ExtReg reg = inst->GetArg(0).GetA32ExtRegRef(); + ASSERT(A32::IsDoubleExtReg(reg)); + if (args[1].IsInFpr()) { + ARM64Reg to_store = ctx.reg_alloc.UseFpr(args[1]); + code.fp_emitter.STR(64, INDEX_UNSIGNED, to_store, X28, MJitStateExtReg(reg)); + } + else { + ARM64Reg to_store = ctx.reg_alloc.UseGpr(args[1]); + code.STR(INDEX_UNSIGNED, to_store, X28, MJitStateExtReg(reg)); + } +} + +static u32 GetCpsrImpl(A32JitState* jit_state) { + return jit_state->Cpsr(); +} + +void A32EmitA64::EmitA32GetCpsr(A32EmitContext& ctx, IR::Inst* inst) { + // TODO:Inline + ctx.reg_alloc.HostCall(inst); + code.MOV(code.ABI_PARAM1, X28); + code.QuickCallFunction(&GetCpsrImpl); +} + +static void SetCpsrImpl(u32 value, A32JitState* jit_state) { + jit_state->SetCpsr(value); +} + +void A32EmitA64::EmitA32SetCpsr(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + // TODO:Inline + ctx.reg_alloc.HostCall(nullptr, args[0]); + + // Use an unused HostCall register + ARM64Reg host_fpsr = X9; + + if (config.always_little_endian) { + code.ANDI2R(code.ABI_PARAM1, code.ABI_PARAM1, 0xFFFFFDFF, ctx.reg_alloc.ScratchGpr()); + } + + // Since this is one of the 
only places where the ~sticky~ + // guest's Q flag can be cleared it is also a great place to clear the host's Q flag + code.MRS(host_fpsr, FIELD_FPSR); + code.ANDI2R(host_fpsr, host_fpsr, ~(1 << 27)); + code._MSR(FIELD_FPSR, host_fpsr); + + code.MOV(code.ABI_PARAM2, X28); + code.QuickCallFunction(&SetCpsrImpl); +} + +void A32EmitA64::EmitA32SetCpsrNZCVRaw(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ARM64Reg a = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[0])); + + code.ANDI2R(a, a, 0xF0000000); + code.STR(INDEX_UNSIGNED, a, X28, offsetof(A32JitState, cpsr_nzcv)); +} + +void A32EmitA64::EmitA32SetCpsrNZCV(A32EmitContext& ctx, IR::Inst* inst) { + EmitA32SetCpsrNZCVRaw(ctx, inst); +} + +void A32EmitA64::EmitA32SetCpsrNZCVQ(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ARM64Reg host_fpsr = ctx.reg_alloc.ScratchGpr(); + if (args[0].IsImmediate()) { + u32 imm = args[0].GetImmediateU32(); + ARM64Reg a = DecodeReg(ctx.reg_alloc.ScratchGpr()); + + code.MOVI2R(a, u32(imm & 0xF0000000)); + code.STR(INDEX_UNSIGNED, a, X28, offsetof(A32JitState, cpsr_nzcv)); + code.MOVI2R(a, u8((imm & 0x08000000) != 0 ? 1 : 0)); + code.STR(INDEX_UNSIGNED, a, X28, offsetof(A32JitState, cpsr_q)); + } else { + ARM64Reg a = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[0])); + ARM64Reg q = DecodeReg(ctx.reg_alloc.ScratchGpr()); + + code.UBFX(q, a, 27, 1); + code.STR(INDEX_UNSIGNED, q, X28, offsetof(A32JitState, cpsr_q)); + code.ANDI2R(a, a, 0xF0000000); + code.STR(INDEX_UNSIGNED, a, X28, offsetof(A32JitState, cpsr_nzcv)); + } + + // Since this is one of the only places where the ~sticky~ + // guest's Q flag can be cleared it is also a great place to clear the host's Q flag. + // TODO : possibly a better job at explaining. 
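// On AArch64 the cumulative saturation (sticky) flag is FPSR.QC, bit 27, which is what the
// ~(1 << 27) mask below clears; this keeps the host's sticky state in step with the guest
// Q flag that was just written above.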
+ code.MRS(host_fpsr, FIELD_FPSR); + code.ANDI2R(host_fpsr, host_fpsr, ~(1 << 27)); + code._MSR(FIELD_FPSR, host_fpsr); +} + +void A32EmitA64::EmitA32GetNFlag(A32EmitContext& ctx, IR::Inst* inst) { + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.ScratchGpr()); + code.LDR(INDEX_UNSIGNED, result, X28, offsetof(A32JitState, cpsr_nzcv)); + code.UBFX(result, result, 31, 1); + ctx.reg_alloc.DefineValue(inst, result); +} + +void A32EmitA64::EmitA32SetNFlag(A32EmitContext& ctx, IR::Inst* inst) { + constexpr size_t flag_bit = 31; + constexpr u32 flag_mask = 1u << flag_bit; + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + Arm64Gen::ARM64Reg nzcv = DecodeReg(ctx.reg_alloc.ScratchGpr()); + + code.LDR(INDEX_UNSIGNED, nzcv, X28, offsetof(A32JitState, cpsr_nzcv)); + if (args[0].IsImmediate()) { + if (args[0].GetImmediateU1()) { + code.ORRI2R(nzcv, nzcv, flag_mask); + } else { + code.ANDI2R(nzcv, nzcv, ~flag_mask); + } + } else { + Arm64Gen::ARM64Reg to_store = DecodeReg(ctx.reg_alloc.UseGpr(args[0])); + + code.BFI(nzcv, to_store, flag_bit, 1); + } + code.STR(INDEX_UNSIGNED, nzcv, X28, offsetof(A32JitState, cpsr_nzcv)); +} + +void A32EmitA64::EmitA32GetZFlag(A32EmitContext& ctx, IR::Inst* inst) { + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.ScratchGpr()); + code.LDR(INDEX_UNSIGNED, result, X28, offsetof(A32JitState, cpsr_nzcv)); + code.UBFX(result, result, 30, 1); + ctx.reg_alloc.DefineValue(inst, result); +} + +void A32EmitA64::EmitA32SetZFlag(A32EmitContext& ctx, IR::Inst* inst) { + constexpr size_t flag_bit = 30; + constexpr u32 flag_mask = 1u << flag_bit; + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + Arm64Gen::ARM64Reg nzcv = DecodeReg(ctx.reg_alloc.ScratchGpr()); + + code.LDR(INDEX_UNSIGNED, nzcv, X28, offsetof(A32JitState, cpsr_nzcv)); + if (args[0].IsImmediate()) { + if (args[0].GetImmediateU1()) { + code.ORRI2R(nzcv, nzcv, flag_mask); + } else { + code.ANDI2R(nzcv, nzcv, ~flag_mask); + } + } else { + Arm64Gen::ARM64Reg to_store = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[0])); + + code.BFI(nzcv, to_store, flag_bit, 1); + } + code.STR(INDEX_UNSIGNED, nzcv, X28, offsetof(A32JitState, cpsr_nzcv)); +} + +void A32EmitA64::EmitA32SetCheckBit(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const ARM64Reg to_store = DecodeReg(ctx.reg_alloc.UseGpr(args[0])); + code.STRB(INDEX_UNSIGNED, to_store, X28, offsetof(A32JitState, check_bit)); +} + +void A32EmitA64::EmitA32GetCFlag(A32EmitContext& ctx, IR::Inst* inst) { + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.ScratchGpr()); + code.LDR(INDEX_UNSIGNED, result, X28, offsetof(A32JitState, cpsr_nzcv)); + code.UBFX(result, result, 29, 1); + ctx.reg_alloc.DefineValue(inst, result); +} + +void A32EmitA64::EmitA32SetCFlag(A32EmitContext& ctx, IR::Inst* inst) { + constexpr size_t flag_bit = 29; + constexpr u32 flag_mask = 1u << flag_bit; + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + Arm64Gen::ARM64Reg nzcv = DecodeReg(ctx.reg_alloc.ScratchGpr()); + + code.LDR(INDEX_UNSIGNED, nzcv, X28, offsetof(A32JitState, cpsr_nzcv)); + if (args[0].IsImmediate()) { + if (args[0].GetImmediateU1()) { + code.ORRI2R(nzcv, nzcv, flag_mask); + } else { + code.ANDI2R(nzcv, nzcv, ~flag_mask); + } + } else { + Arm64Gen::ARM64Reg to_store = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[0])); + code.BFI(nzcv, to_store, flag_bit, 1); + } + code.STR(INDEX_UNSIGNED, nzcv, X28, offsetof(A32JitState, cpsr_nzcv)); +} + +void A32EmitA64::EmitA32GetVFlag(A32EmitContext& ctx, IR::Inst* inst) { + Arm64Gen::ARM64Reg result 
= DecodeReg(ctx.reg_alloc.ScratchGpr()); + code.LDR(INDEX_UNSIGNED, result, X28, offsetof(A32JitState, cpsr_nzcv)); + code.UBFX(result, result, 28, 1); + ctx.reg_alloc.DefineValue(inst, result); +} + +void A32EmitA64::EmitA32SetVFlag(A32EmitContext& ctx, IR::Inst* inst) { + constexpr size_t flag_bit = 28; + constexpr u32 flag_mask = 1u << flag_bit; + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + Arm64Gen::ARM64Reg nzcv = DecodeReg(ctx.reg_alloc.ScratchGpr()); + + code.LDR(INDEX_UNSIGNED, nzcv, X28, offsetof(A32JitState, cpsr_nzcv)); + if (args[0].IsImmediate()) { + if (args[0].GetImmediateU1()) { + code.ORRI2R(nzcv, nzcv, flag_mask); + } else { + code.ANDI2R(nzcv, nzcv, ~flag_mask); + } + } else { + Arm64Gen::ARM64Reg to_store = DecodeReg(ctx.reg_alloc.UseGpr(args[0])); + + code.BFI(nzcv, to_store, flag_bit, 1); + } + code.STR(INDEX_UNSIGNED, nzcv, X28, offsetof(A32JitState, cpsr_nzcv)); +} + +void A32EmitA64::EmitA32OrQFlag(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + if (args[0].IsImmediate()) { + if (args[0].GetImmediateU1()) { + ARM64Reg to_store = DecodeReg(ctx.reg_alloc.UseGpr(args[0])); + code.STR(INDEX_UNSIGNED, to_store, X28, offsetof(A32JitState, cpsr_q)); + } + } else { + ARM64Reg to_store = ctx.reg_alloc.UseGpr(args[0]); + ARM64Reg scratch = DecodeReg(ctx.reg_alloc.ScratchGpr()); + + code.LDR(INDEX_UNSIGNED, scratch, X28, offsetof(A32JitState, cpsr_q)); + code.ORR(scratch, scratch, to_store); + code.STR(INDEX_UNSIGNED, scratch, X28, offsetof(A32JitState, cpsr_q)); + } +} + +void A32EmitA64::EmitA32GetGEFlags(A32EmitContext& ctx, IR::Inst* inst) { + ARM64Reg result = EncodeRegToSingle(ctx.reg_alloc.ScratchFpr()); + code.LDR(INDEX_UNSIGNED, result, X28, offsetof(A32JitState, cpsr_ge)); + ctx.reg_alloc.DefineValue(inst, result); +} + +void A32EmitA64::EmitA32SetGEFlags(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ASSERT(!args[0].IsImmediate()); + ARM64Reg to_store = INVALID_REG; + if (args[0].IsInFpr()) { + to_store = EncodeRegToSingle(ctx.reg_alloc.UseFpr(args[0])); + } else { + to_store = DecodeReg(ctx.reg_alloc.UseGpr(args[0])); + } + code.STR(INDEX_UNSIGNED, to_store, X28, offsetof(A32JitState, cpsr_ge)); +} + +void A32EmitA64::EmitA32SetGEFlagsCompressed(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + if (args[0].IsImmediate()) { + ARM64Reg to_store = DecodeReg(ctx.reg_alloc.ScratchGpr()); + u32 imm = args[0].GetImmediateU32(); + u32 ge = 0; + ge |= Common::Bit<19>(imm) ? 0xFF000000 : 0; + ge |= Common::Bit<18>(imm) ? 0x00FF0000 : 0; + ge |= Common::Bit<17>(imm) ? 0x0000FF00 : 0; + ge |= Common::Bit<16>(imm) ? 
0x000000FF : 0; + + code.MOVI2R(to_store, ge); + code.STR(INDEX_UNSIGNED, to_store, X28, offsetof(A32JitState, cpsr_ge)); + } else { + ARM64Reg a = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[0])); + ARM64Reg scratch = DecodeReg(ctx.reg_alloc.ScratchGpr()); + + code.UBFX(a, a, 16, 4); + code.MOVI2R(scratch, 0x00204081); + code.MUL(a, a, scratch); + code.ANDI2R(a, a, 0x01010101); + code.ORR(a, a, a, ArithOption{a, ST_LSL, 1}); + code.STR(INDEX_UNSIGNED, a, X28, offsetof(A32JitState, cpsr_ge)); + } +} + +void A32EmitA64::EmitA32BXWritePC(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto& arg = args[0]; + + const u32 upper_without_t = (ctx.Location().SetSingleStepping(false).UniqueHash() >> 32) & 0xFFFFFFFE; + + // Pseudocode: + // if (new_pc & 1) { + // new_pc &= 0xFFFFFFFE; + // cpsr.T = true; + // } else { + // new_pc &= 0xFFFFFFFC; + // cpsr.T = false; + // } + // We rely on the fact we disallow EFlag from changing within a block. + + if (arg.IsImmediate()) { + const ARM64Reg scratch = DecodeReg(ctx.reg_alloc.ScratchGpr()); + u32 new_pc = arg.GetImmediateU32(); + const u32 mask = Common::Bit<0>(new_pc) ? 0xFFFFFFFE : 0xFFFFFFFC; + const u32 new_upper = upper_without_t | (Common::Bit<0>(new_pc) ? 1 : 0); + + code.MOVI2R(scratch, new_pc & mask); + code.STR(INDEX_UNSIGNED, scratch, X28, MJitStateReg(A32::Reg::PC)); + code.MOVI2R(scratch, new_upper); + code.STR(INDEX_UNSIGNED, scratch, X28, offsetof(A32JitState, upper_location_descriptor)); + } else { + const ARM64Reg new_pc = DecodeReg(ctx.reg_alloc.UseScratchGpr(arg)); + const ARM64Reg mask = DecodeReg(ctx.reg_alloc.ScratchGpr()); + const ARM64Reg new_upper = DecodeReg(ctx.reg_alloc.ScratchGpr()); + + code.ANDI2R(mask, new_pc, 1); + code.MOVI2R(new_upper, upper_without_t); + code.ADD(new_upper, new_upper, mask); + code.STR(INDEX_UNSIGNED, new_upper, X28, offsetof(A32JitState, upper_location_descriptor)); + code.LSL(mask, mask, 1); + code.SUBI2R(mask, mask, 4); // mask = pc & 1 ? 
0xFFFFFFFE : 0xFFFFFFFC + code.AND(new_pc, new_pc, mask); + code.STR(INDEX_UNSIGNED, new_pc, X28, MJitStateReg(A32::Reg::PC)); + } +} + +void A32EmitA64::EmitA32CallSupervisor(A32EmitContext& ctx, IR::Inst* inst) { + ctx.reg_alloc.HostCall(nullptr); + + code.SwitchFpscrOnExit(); + code.LDR(INDEX_UNSIGNED, code.ABI_PARAM2, X28, offsetof(A32JitState, cycles_to_run)); + code.SUB(code.ABI_PARAM2, code.ABI_PARAM2, X26); + + Devirtualize<&A32::UserCallbacks::AddTicks>(config.callbacks).EmitCall(code); + ctx.reg_alloc.EndOfAllocScope(); + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ctx.reg_alloc.HostCall(nullptr, {}, args[0]); + Devirtualize<&A32::UserCallbacks::CallSVC>(config.callbacks).EmitCall(code); + Devirtualize<&A32::UserCallbacks::GetTicksRemaining>(config.callbacks).EmitCall(code); + code.STR(INDEX_UNSIGNED, code.ABI_RETURN, X28, offsetof(A32JitState, cycles_to_run)); + code.MOV(X26, code.ABI_RETURN); + code.SwitchFpscrOnEntry(); +} + +void A32EmitA64::EmitA32ExceptionRaised(A32EmitContext& ctx, IR::Inst* inst) { + ctx.reg_alloc.HostCall(nullptr); + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ASSERT(args[0].IsImmediate() && args[1].IsImmediate()); + u32 pc = args[0].GetImmediateU32(); + u64 exception = args[1].GetImmediateU64(); + Devirtualize<&A32::UserCallbacks::ExceptionRaised>(config.callbacks).EmitCall(code, [&](RegList param) { + code.MOVI2R(param[0], pc); + code.MOVI2R(param[1], exception); + }); +} + +static u32 GetFpscrImpl(A32JitState* jit_state) { + return jit_state->Fpscr(); +} + +void A32EmitA64::EmitA32GetFpscr(A32EmitContext& ctx, IR::Inst* inst) { + ctx.reg_alloc.HostCall(inst); + // Use an unused HostCall register + const ARM64Reg fpsr = X9; + const ARM64Reg fpcr = X10; + code.MOV(code.ABI_PARAM1, X28); + + code.MRS(fpsr, FIELD_FPSR); + code.MRS(fpcr, FIELD_FPCR); + code.STR(INDEX_UNSIGNED, fpsr, X28, offsetof(A32JitState, guest_fpsr)); + code.STR(INDEX_UNSIGNED, fpcr, X28, offsetof(A32JitState, guest_fpcr)); + code.QuickCallFunction(&GetFpscrImpl); +} + +static void SetFpscrImpl(u32 value, A32JitState* jit_state) { + jit_state->SetFpscr(value); +} + +void A32EmitA64::EmitA32SetFpscr(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ctx.reg_alloc.HostCall(nullptr, args[0]); + // Use an unused HostCall register + const ARM64Reg fpsr = X9; + const ARM64Reg fpcr = X10; + + code.MOV(code.ABI_PARAM2, X28); + + code.QuickCallFunction(&SetFpscrImpl); + + code.LDR(INDEX_UNSIGNED, fpsr, X28, offsetof(A32JitState, guest_fpsr)); + code.LDR(INDEX_UNSIGNED, fpcr, X28, offsetof(A32JitState, guest_fpcr)); + code._MSR(FIELD_FPSR, fpsr); + code._MSR(FIELD_FPCR, fpcr); +} + +void A32EmitA64::EmitA32GetFpscrNZCV(A32EmitContext& ctx, IR::Inst* inst) { + ARM64Reg result = DecodeReg(ctx.reg_alloc.ScratchGpr()); + code.LDR(INDEX_UNSIGNED, result, X28, offsetof(A32JitState, fpsr_nzcv)); + ctx.reg_alloc.DefineValue(inst, result); +} + +void A32EmitA64::EmitA32SetFpscrNZCV(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ARM64Reg value = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[0])); + + code.ANDI2R(value, value, 0xF0000000); + + code.STR(INDEX_UNSIGNED, value, X28, offsetof(A32JitState, fpsr_nzcv)); +} + +void A32EmitA64::EmitA32ClearExclusive(A32EmitContext&, IR::Inst*) { + code.STR(INDEX_UNSIGNED, WZR, X28, offsetof(A32JitState, exclusive_state)); +} + +void A32EmitA64::EmitA32SetExclusive(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + 
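// Arms the exclusive monitor: exclusive_state is set to 1 and the address recorded, so that
// ExclusiveWrite further down commits its store only if exclusive_state is still set and the
// address matches under A32JitState::RESERVATION_GRANULE_MASK.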
ASSERT(args[1].IsImmediate()); + Arm64Gen::ARM64Reg address = DecodeReg(ctx.reg_alloc.UseGpr(args[0])); + Arm64Gen::ARM64Reg state = DecodeReg(ctx.reg_alloc.ScratchGpr()); + + code.MOVI2R(state, u8(1)); + code.STR(INDEX_UNSIGNED, state, X28, offsetof(A32JitState, exclusive_state)); + code.STR(INDEX_UNSIGNED, address, X28, offsetof(A32JitState, exclusive_address)); +} + +A32EmitA64::DoNotFastmemMarker A32EmitA64::GenerateDoNotFastmemMarker(A32EmitContext& ctx, IR::Inst* inst) { + return std::make_tuple(ctx.Location(), ctx.GetInstOffset(inst)); +} + +bool A32EmitA64::ShouldFastmem(const DoNotFastmemMarker& marker) const { + return config.fastmem_pointer && exception_handler.SupportsFastmem() && do_not_fastmem.count(marker) == 0; +} + +void A32EmitA64::DoNotFastmem(const DoNotFastmemMarker& marker) { + do_not_fastmem.emplace(marker); + InvalidateBasicBlocks({std::get<0>(marker)}); +} + +template +void A32EmitA64::ReadMemory(A32EmitContext& ctx, IR::Inst* inst, const CodePtr callback_fn) { + constexpr size_t bit_size = Common::BitSize(); + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + ctx.reg_alloc.UseScratch(args[0], ABI_PARAM2); + ctx.reg_alloc.ScratchGpr({ABI_RETURN}); + + ARM64Reg result = ctx.reg_alloc.ScratchGpr(); + ARM64Reg vaddr = DecodeReg(code.ABI_PARAM2); + ARM64Reg tmp = code.ABI_RETURN; + + const auto do_not_fastmem_marker = GenerateDoNotFastmemMarker(ctx, inst); + + const auto page_table_lookup = [this, result, vaddr, tmp, callback_fn](FixupBranch& end) { + constexpr size_t bit_size = Common::BitSize(); + + code.MOVP2R(result, config.page_table); + code.MOV(tmp, vaddr, ArithOption{vaddr, ST_LSR, 12}); + code.LDR(result, result, ArithOption{tmp, true}); + FixupBranch abort = code.CBZ(result); + code.ANDI2R(vaddr, vaddr, 4095); + switch (bit_size) { + case 8: + code.LDRB(DecodeReg(result), result, vaddr); + break; + case 16: + code.LDRH(DecodeReg(result), result, vaddr); + break; + case 32: + code.LDR(DecodeReg(result), result, vaddr); + break; + case 64: + code.LDR(result, result, vaddr); + break; + default: + ASSERT_FALSE("Invalid bit_size"); + break; + } + end = code.B(); + code.SetJumpTarget(abort); + code.BL(callback_fn); + code.MOV(result, code.ABI_RETURN); + }; + + + if (ShouldFastmem(do_not_fastmem_marker)) { + const CodePtr patch_location = code.GetCodePtr(); + switch (bit_size) { + case 8: + code.LDRB(DecodeReg(result), X27, vaddr); + break; + case 16: + code.LDRH(DecodeReg(result), X27, vaddr); + break; + case 32: + code.LDR(DecodeReg(result), X27, vaddr); + break; + case 64: + code.LDR(result, X27, vaddr); + break; + default: + ASSERT_FALSE("Invalid bit_size"); + break; + } + + fastmem_patch_info.emplace( + patch_location, + FastmemPatchInfo{ + [this, patch_location, page_table_lookup, callback_fn, result, do_not_fastmem_marker]{ + CodePtr save_code_ptr = code.GetCodePtr(); + code.SetCodePtr(patch_location); + FixupBranch thunk = code.B(); + u8* end_ptr = code.GetWritableCodePtr(); + code.FlushIcacheSection(reinterpret_cast(patch_location), end_ptr); + code.SetCodePtr(save_code_ptr); + code.SwitchToFarCode(); + code.SetJumpTarget(thunk); + if (config.page_table) { + FixupBranch end{}; + page_table_lookup(end); + code.SetJumpTarget(end, end_ptr); + } else { + code.BL(callback_fn); + code.MOV(result, code.ABI_RETURN); + } + code.B(end_ptr); + code.FlushIcache(); + code.SwitchToNearCode(); + + DoNotFastmem(do_not_fastmem_marker); + } + }); + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + if (!config.page_table) { + code.BL(callback_fn); + 
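// No page table configured: the read goes through the memory-read thunk, which leaves the
// loaded value in ABI_RETURN; it is then moved into the register allocated for the result.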
code.MOV(result, code.ABI_RETURN); + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + FixupBranch end{}; + page_table_lookup(end); + code.SetJumpTarget(end); + + ctx.reg_alloc.DefineValue(inst, result); +} + +template +void A32EmitA64::WriteMemory(A32EmitContext& ctx, IR::Inst* inst, const CodePtr callback_fn) { + constexpr size_t bit_size = Common::BitSize(); + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + ctx.reg_alloc.ScratchGpr({ABI_RETURN}); + ctx.reg_alloc.UseScratch(args[0], ABI_PARAM2); + ctx.reg_alloc.UseScratch(args[1], ABI_PARAM3); + + ARM64Reg vaddr = DecodeReg(code.ABI_PARAM2); + ARM64Reg value = code.ABI_PARAM3; + ARM64Reg page_index = ctx.reg_alloc.ScratchGpr(); + ARM64Reg addr = ctx.reg_alloc.ScratchGpr(); + + const auto do_not_fastmem_marker = GenerateDoNotFastmemMarker(ctx, inst); + + const auto page_table_lookup = [this, vaddr, value, page_index, addr, callback_fn](FixupBranch& end) { + constexpr size_t bit_size = Common::BitSize(); + + code.MOVP2R(addr, config.page_table); + code.MOV(DecodeReg(page_index), vaddr, ArithOption{vaddr, ST_LSR, 12}); + code.LDR(addr, addr, ArithOption{page_index, true}); + FixupBranch abort = code.CBZ(addr); + code.ANDI2R(vaddr, vaddr, 4095); + switch (bit_size) { + case 8: + code.STRB(DecodeReg(value), addr, vaddr); + break; + case 16: + code.STRH(DecodeReg(value), addr, vaddr); + break; + case 32: + code.STR(DecodeReg(value), addr, vaddr);; + break; + case 64: + code.STR(value, addr, vaddr); + break; + default: + ASSERT_FALSE("Invalid bit_size"); + break; + } + end = code.B(); + code.SetJumpTarget(abort); + code.BL(callback_fn); + }; + + if (ShouldFastmem(do_not_fastmem_marker)) { + const CodePtr patch_location = code.GetCodePtr(); + switch (bit_size) { + case 8: + code.STRB(DecodeReg(value), X27, vaddr); + break; + case 16: + code.STRH(DecodeReg(value), X27, vaddr); + break; + case 32: + code.STR(DecodeReg(value), X27, vaddr); + break; + case 64: + code.STR(value, X27, vaddr); + break; + default: + ASSERT_FALSE("Invalid bit_size"); + break; + } + + fastmem_patch_info.emplace( + patch_location, + FastmemPatchInfo{ + [this, patch_location, page_table_lookup, callback_fn, do_not_fastmem_marker]{ + CodePtr save_code_ptr = code.GetCodePtr(); + code.SetCodePtr(patch_location); + FixupBranch thunk = code.B(); + u8* end_ptr = code.GetWritableCodePtr(); + code.FlushIcacheSection(reinterpret_cast(patch_location), end_ptr); + code.SetCodePtr(save_code_ptr); + code.SwitchToFarCode(); + code.SetJumpTarget(thunk); + if (config.page_table) { + FixupBranch end{}; + page_table_lookup(end); + code.SetJumpTarget(end, end_ptr); + } else { + code.BL(callback_fn); + } + code.B(end_ptr); + code.FlushIcache(); + code.SwitchToNearCode(); + + DoNotFastmem(do_not_fastmem_marker); + } + }); + return; + } + + if (!config.page_table) { + code.BL(callback_fn); + return; + } + + FixupBranch end{}; + page_table_lookup(end); + code.SetJumpTarget(end); +} + +void A32EmitA64::EmitA32ReadMemory8(A32EmitContext& ctx, IR::Inst* inst) { + ReadMemory(ctx, inst, read_memory_8); +} + +void A32EmitA64::EmitA32ReadMemory16(A32EmitContext& ctx, IR::Inst* inst) { + ReadMemory(ctx, inst, read_memory_16); +} + +void A32EmitA64::EmitA32ReadMemory32(A32EmitContext& ctx, IR::Inst* inst) { + ReadMemory(ctx, inst, read_memory_32); +} + +void A32EmitA64::EmitA32ReadMemory64(A32EmitContext& ctx, IR::Inst* inst) { + ReadMemory(ctx, inst, read_memory_64); +} + +void A32EmitA64::EmitA32WriteMemory8(A32EmitContext& ctx, IR::Inst* inst) { + WriteMemory(ctx, inst, write_memory_8); 
+} + +void A32EmitA64::EmitA32WriteMemory16(A32EmitContext& ctx, IR::Inst* inst) { + WriteMemory(ctx, inst, write_memory_16); +} + +void A32EmitA64::EmitA32WriteMemory32(A32EmitContext& ctx, IR::Inst* inst) { + WriteMemory(ctx, inst, write_memory_32); +} + +void A32EmitA64::EmitA32WriteMemory64(A32EmitContext& ctx, IR::Inst* inst) { + WriteMemory(ctx, inst, write_memory_64); +} + +template +static void ExclusiveWrite(BlockOfCode& code, RegAlloc& reg_alloc, IR::Inst* inst, const A32::UserConfig& config) { + auto args = reg_alloc.GetArgumentInfo(inst); + reg_alloc.HostCall(nullptr, {}, args[0], args[1]); + + // Use unused HostCall registers + ARM64Reg passed = W9; + ARM64Reg tmp = W10; + + std::vector end; + + code.MOVI2R(passed, u32(1)); + code.LDR(INDEX_UNSIGNED, tmp, X28, offsetof(A32JitState, exclusive_state)); + end.push_back(code.CBZ(tmp)); + code.LDR(INDEX_UNSIGNED, tmp, X28, offsetof(A32JitState, exclusive_address)); + code.EOR(tmp, code.ABI_PARAM2, tmp); + code.TSTI2R(tmp, A32JitState::RESERVATION_GRANULE_MASK, reg_alloc.ScratchGpr()); + end.push_back(code.B(CC_NEQ)); + code.STR(INDEX_UNSIGNED, WZR, X28, offsetof(A32JitState, exclusive_state)); + + Devirtualize(config.callbacks).EmitCall(code); + code.MOVI2R(passed, 0); + + for (FixupBranch e : end) { + code.SetJumpTarget(e); + } + + reg_alloc.DefineValue(inst, passed); +} + +void A32EmitA64::EmitA32ExclusiveWriteMemory8(A32EmitContext& ctx, IR::Inst* inst) { + ExclusiveWrite(code, ctx.reg_alloc, inst, config); +} + +void A32EmitA64::EmitA32ExclusiveWriteMemory16(A32EmitContext& ctx, IR::Inst* inst) { + ExclusiveWrite(code, ctx.reg_alloc, inst, config); +} + +void A32EmitA64::EmitA32ExclusiveWriteMemory32(A32EmitContext& ctx, IR::Inst* inst) { + ExclusiveWrite(code, ctx.reg_alloc, inst, config); +} + +void A32EmitA64::EmitA32ExclusiveWriteMemory64(A32EmitContext& ctx, IR::Inst* inst) { + ExclusiveWrite(code, ctx.reg_alloc, inst, config); +} + +static void EmitCoprocessorException() { + ASSERT_FALSE("Should raise coproc exception here"); +} + +static void CallCoprocCallback(BlockOfCode& code, RegAlloc& reg_alloc, A32::Jit* jit_interface, A32::Coprocessor::Callback callback, + IR::Inst* inst = nullptr, std::optional arg0 = {}, std::optional arg1 = {}) { + reg_alloc.HostCall(inst, {}, {}, arg0, arg1); + + code.MOVP2R(code.ABI_PARAM1, jit_interface); + if (callback.user_arg) { + code.MOVP2R(code.ABI_PARAM2, *callback.user_arg); + } + + code.QuickCallFunction(callback.function); +} + +void A32EmitA64::EmitA32CoprocInternalOperation(A32EmitContext& ctx, IR::Inst* inst) { + auto coproc_info = inst->GetArg(0).GetCoprocInfo(); + + size_t coproc_num = coproc_info[0]; + bool two = coproc_info[1] != 0; + unsigned opc1 = static_cast(coproc_info[2]); + A32::CoprocReg CRd = static_cast(coproc_info[3]); + A32::CoprocReg CRn = static_cast(coproc_info[4]); + A32::CoprocReg CRm = static_cast(coproc_info[5]); + unsigned opc2 = static_cast(coproc_info[6]); + + std::shared_ptr coproc = config.coprocessors[coproc_num]; + if (!coproc) { + EmitCoprocessorException(); + return; + } + + auto action = coproc->CompileInternalOperation(two, opc1, CRd, CRn, CRm, opc2); + if (!action) { + EmitCoprocessorException(); + return; + } + + CallCoprocCallback(code, ctx.reg_alloc, jit_interface, *action); +} + +void A32EmitA64::EmitA32CoprocSendOneWord(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto coproc_info = inst->GetArg(0).GetCoprocInfo(); + + size_t coproc_num = coproc_info[0]; + bool two = coproc_info[1] != 0; + 
unsigned opc1 = static_cast(coproc_info[2]); + A32::CoprocReg CRn = static_cast(coproc_info[3]); + A32::CoprocReg CRm = static_cast(coproc_info[4]); + unsigned opc2 = static_cast(coproc_info[5]); + + std::shared_ptr coproc = config.coprocessors[coproc_num]; + if (!coproc) { + EmitCoprocessorException(); + return; + } + + auto action = coproc->CompileSendOneWord(two, opc1, CRn, CRm, opc2); + switch (action.index()) { + case 0: + EmitCoprocessorException(); + return; + case 1: + CallCoprocCallback(code, ctx.reg_alloc, jit_interface, std::get(action), nullptr, args[1]); + return; + case 2: { + u32* destination_ptr = std::get(action); + + ARM64Reg reg_word = DecodeReg(ctx.reg_alloc.UseGpr(args[1])); + ARM64Reg reg_destination_addr = ctx.reg_alloc.ScratchGpr(); + + code.MOVP2R(reg_destination_addr, destination_ptr); + code.STR(INDEX_UNSIGNED, reg_word, reg_destination_addr, 0); + + return; + } + default: + UNREACHABLE(); + } +} + +void A32EmitA64::EmitA32CoprocSendTwoWords(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto coproc_info = inst->GetArg(0).GetCoprocInfo(); + + size_t coproc_num = coproc_info[0]; + bool two = coproc_info[1] != 0; + unsigned opc = static_cast(coproc_info[2]); + A32::CoprocReg CRm = static_cast(coproc_info[3]); + + std::shared_ptr coproc = config.coprocessors[coproc_num]; + if (!coproc) { + EmitCoprocessorException(); + return; + } + + auto action = coproc->CompileSendTwoWords(two, opc, CRm); + switch (action.index()) { + case 0: + EmitCoprocessorException(); + return; + case 1: + CallCoprocCallback(code, ctx.reg_alloc, jit_interface, std::get(action), nullptr, args[1], args[2]); + return; + case 2: { + auto destination_ptrs = std::get>(action); + + ARM64Reg reg_word1 = DecodeReg(ctx.reg_alloc.UseGpr(args[1])); + ARM64Reg reg_word2 = DecodeReg(ctx.reg_alloc.UseGpr(args[2])); + ARM64Reg reg_destination_addr = ctx.reg_alloc.ScratchGpr(); + + code.MOVP2R(reg_destination_addr, destination_ptrs[0]); + code.STR(INDEX_UNSIGNED, reg_word1, reg_destination_addr, 0); + code.MOVP2R(reg_destination_addr, destination_ptrs[1]); + code.STR(INDEX_UNSIGNED, reg_word2, reg_destination_addr, 0); + + return; + } + default: + UNREACHABLE(); + } +} + +void A32EmitA64::EmitA32CoprocGetOneWord(A32EmitContext& ctx, IR::Inst* inst) { + auto coproc_info = inst->GetArg(0).GetCoprocInfo(); + + size_t coproc_num = coproc_info[0]; + bool two = coproc_info[1] != 0; + unsigned opc1 = static_cast(coproc_info[2]); + A32::CoprocReg CRn = static_cast(coproc_info[3]); + A32::CoprocReg CRm = static_cast(coproc_info[4]); + unsigned opc2 = static_cast(coproc_info[5]); + + std::shared_ptr coproc = config.coprocessors[coproc_num]; + if (!coproc) { + EmitCoprocessorException(); + return; + } + + auto action = coproc->CompileGetOneWord(two, opc1, CRn, CRm, opc2); + switch (action.index()) { + case 0: + EmitCoprocessorException(); + return; + case 1: + CallCoprocCallback(code, ctx.reg_alloc, jit_interface, std::get(action), inst); + return; + case 2: { + u32* source_ptr = std::get(action); + + ARM64Reg result = ctx.reg_alloc.ScratchGpr(); + + code.MOVP2R(result, source_ptr); + code.LDR(INDEX_UNSIGNED, DecodeReg(result), result, 0); + + ctx.reg_alloc.DefineValue(inst, result); + + return; + } + default: + UNREACHABLE(); + } +} + +void A32EmitA64::EmitA32CoprocGetTwoWords(A32EmitContext& ctx, IR::Inst* inst) { + auto coproc_info = inst->GetArg(0).GetCoprocInfo(); + + size_t coproc_num = coproc_info[0]; + bool two = coproc_info[1] != 0; + unsigned opc = coproc_info[2]; + 
A32::CoprocReg CRm = static_cast(coproc_info[3]); + + std::shared_ptr coproc = config.coprocessors[coproc_num]; + if (!coproc) { + EmitCoprocessorException(); + return; + } + + auto action = coproc->CompileGetTwoWords(two, opc, CRm); + switch (action.index()) { + case 0: + EmitCoprocessorException(); + return; + case 1: + CallCoprocCallback(code, ctx.reg_alloc, jit_interface, std::get(action), inst); + return; + case 2: { + auto source_ptrs = std::get>(action); + + ARM64Reg reg_result = ctx.reg_alloc.ScratchGpr(); + ARM64Reg reg_tmp = ctx.reg_alloc.ScratchGpr(); + + code.MOVP2R(reg_tmp, source_ptrs[1]); + code.LDR(INDEX_UNSIGNED, DecodeReg(reg_result), reg_tmp, 0); + code.MOVP2R(reg_tmp, source_ptrs[0]); + code.LDR(INDEX_UNSIGNED, DecodeReg(reg_tmp), reg_tmp, 0); + code.ORR(reg_result, reg_tmp, reg_result, ArithOption{ reg_result , ST_LSL, 32}); + + ctx.reg_alloc.DefineValue(inst, reg_result); + + return; + } + default: + UNREACHABLE(); + } +} + +void A32EmitA64::EmitA32CoprocLoadWords(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto coproc_info = inst->GetArg(0).GetCoprocInfo(); + + size_t coproc_num = coproc_info[0]; + bool two = coproc_info[1] != 0; + bool long_transfer = coproc_info[2] != 0; + A32::CoprocReg CRd = static_cast(coproc_info[3]); + bool has_option = coproc_info[4] != 0; + std::optional option = std::nullopt; + if (has_option) { + option = coproc_info[5]; + } + + + std::shared_ptr coproc = config.coprocessors[coproc_num]; + if (!coproc) { + EmitCoprocessorException(); + return; + } + + auto action = coproc->CompileLoadWords(two, long_transfer, CRd, option); + if (!action) { + EmitCoprocessorException(); + return; + } + + CallCoprocCallback(code, ctx.reg_alloc, jit_interface, *action, nullptr, args[1]); +} + +void A32EmitA64::EmitA32CoprocStoreWords(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto coproc_info = inst->GetArg(0).GetCoprocInfo(); + + size_t coproc_num = coproc_info[0]; + bool two = coproc_info[1] != 0; + bool long_transfer = coproc_info[2] != 0; + A32::CoprocReg CRd = static_cast(coproc_info[3]); + bool has_option = coproc_info[4] != 0; + std::optional option = std::nullopt; + if (has_option) { + option = coproc_info[5]; + } + + std::shared_ptr coproc = config.coprocessors[coproc_num]; + if (!coproc) { + EmitCoprocessorException(); + return; + } + + auto action = coproc->CompileStoreWords(two, long_transfer, CRd, option); + if (!action) { + EmitCoprocessorException(); + return; + } + + CallCoprocCallback(code, ctx.reg_alloc, jit_interface, *action, nullptr, args[1]); +} + + +std::string A32EmitA64::LocationDescriptorToFriendlyName(const IR::LocationDescriptor& ir_descriptor) const { + const A32::LocationDescriptor descriptor{ir_descriptor}; + return fmt::format("a32_{}{:08X}_{}_fpcr{:08X}", descriptor.TFlag() ? "t" : "a", descriptor.PC(), descriptor.EFlag() ? 
"be" : "le", + descriptor.FPSCR().Value()); +} + +void A32EmitA64::FastmemCallback(CodePtr PC) { + const auto iter = fastmem_patch_info.find(PC); + ASSERT(iter != fastmem_patch_info.end()); + iter->second.callback(); + fastmem_patch_info.erase(iter); +} + +void A32EmitA64::EmitTerminalImpl(IR::Term::Interpret terminal, IR::LocationDescriptor initial_location, bool) { + ASSERT_MSG(A32::LocationDescriptor{terminal.next}.TFlag() == A32::LocationDescriptor{initial_location}.TFlag(), "Unimplemented"); + ASSERT_MSG(A32::LocationDescriptor{terminal.next}.EFlag() == A32::LocationDescriptor{initial_location}.EFlag(), "Unimplemented"); + + code.MOVI2R(DecodeReg(code.ABI_PARAM2), A32::LocationDescriptor{terminal.next}.PC()); + code.MOVI2R(DecodeReg(code.ABI_PARAM3), terminal.num_instructions); + code.STR(INDEX_UNSIGNED, DecodeReg(code.ABI_PARAM2), X28, MJitStateReg(A32::Reg::PC)); + code.SwitchFpscrOnExit(); + Devirtualize<&A32::UserCallbacks::InterpreterFallback>(config.callbacks).EmitCall(code); + code.ReturnFromRunCode(true); // TODO: Check cycles +} + +void A32EmitA64::EmitTerminalImpl(IR::Term::ReturnToDispatch, IR::LocationDescriptor, bool) { + code.ReturnFromRunCode(); +} + +void A32EmitA64::EmitSetUpperLocationDescriptor(IR::LocationDescriptor new_location, IR::LocationDescriptor old_location) { + auto get_upper = [](const IR::LocationDescriptor &desc) -> u32 { + return static_cast(A32::LocationDescriptor{desc}.SetSingleStepping(false).UniqueHash() >> 32); + }; + + const u32 old_upper = get_upper(old_location); + const u32 new_upper = [&] { + const u32 mask = ~u32(config.always_little_endian ? 0x2 : 0); + return get_upper(new_location) & mask; + }(); + + if (old_upper != new_upper) { + code.MOVI2R(DecodeReg(code.ABI_SCRATCH1), new_upper); + code.STR(INDEX_UNSIGNED, DecodeReg(code.ABI_SCRATCH1), X28, offsetof(A32JitState, upper_location_descriptor)); + } +} + +void A32EmitA64::EmitTerminalImpl(IR::Term::LinkBlock terminal, IR::LocationDescriptor initial_location, bool is_single_step) { + EmitSetUpperLocationDescriptor(terminal.next, initial_location); + + if (!config.enable_optimizations || is_single_step) { + code.MOVI2R(DecodeReg(code.ABI_SCRATCH1), A32::LocationDescriptor{terminal.next}.PC()); + code.STR(INDEX_UNSIGNED, DecodeReg(code.ABI_SCRATCH1), X28, MJitStateReg(A32::Reg::PC)); + code.ReturnFromRunCode(); + return; + } + + code.CMP(X26, ZR); + + patch_information[terminal.next].jg.emplace_back(code.GetCodePtr()); + if (auto next_bb = GetBasicBlock(terminal.next)) { + EmitPatchJg(terminal.next, next_bb->entrypoint); + } else { + EmitPatchJg(terminal.next); + } + FixupBranch dest = code.B(); + + code.SwitchToFarCode(); + code.AlignCode16(); + code.SetJumpTarget(dest); + code.MOVI2R(DecodeReg(code.ABI_SCRATCH1), A32::LocationDescriptor{terminal.next}.PC()); + code.STR(INDEX_UNSIGNED, DecodeReg(code.ABI_SCRATCH1), X28, MJitStateReg(A32::Reg::PC)); + PushRSBHelper(X1, X2, terminal.next); + code.ForceReturnFromRunCode(); + + //Todo: find a better/generic place to FlushIcache when switching between + // far code and near code + code.FlushIcache(); + code.SwitchToNearCode(); +} + +void A32EmitA64::EmitTerminalImpl(IR::Term::LinkBlockFast terminal, IR::LocationDescriptor initial_location, bool is_single_step) { + EmitSetUpperLocationDescriptor(terminal.next, initial_location); + + if (!config.enable_optimizations || is_single_step) { + code.MOVI2R(DecodeReg(code.ABI_SCRATCH1), A32::LocationDescriptor{terminal.next}.PC()); + code.STR(INDEX_UNSIGNED, DecodeReg(code.ABI_SCRATCH1), X28, 
MJitStateReg(A32::Reg::PC)); + code.ReturnFromRunCode(); + return; + } + + patch_information[terminal.next].jmp.emplace_back(code.GetCodePtr()); + if (auto next_bb = GetBasicBlock(terminal.next)) { + EmitPatchJmp(terminal.next, next_bb->entrypoint); + } else { + EmitPatchJmp(terminal.next); + } +} + +void A32EmitA64::EmitTerminalImpl(IR::Term::PopRSBHint, IR::LocationDescriptor, bool is_single_step) { + if (!config.enable_optimizations || is_single_step) { + code.ReturnFromRunCode(); + return; + } + code.B(terminal_handler_pop_rsb_hint); +} + +void A32EmitA64::EmitTerminalImpl(IR::Term::FastDispatchHint, IR::LocationDescriptor, bool is_single_step) { + if (config.enable_fast_dispatch && !is_single_step) { + code.B(terminal_handler_fast_dispatch_hint); + } else { + code.ReturnFromRunCode(); + } +} + +void A32EmitA64::EmitTerminalImpl(IR::Term::If terminal, IR::LocationDescriptor initial_location, bool is_single_step) { + FixupBranch pass = EmitCond(terminal.if_); + EmitTerminal(terminal.else_, initial_location, is_single_step); + code.SetJumpTarget(pass); + EmitTerminal(terminal.then_, initial_location, is_single_step); +} + +void A32EmitA64::EmitTerminalImpl(IR::Term::CheckBit terminal, IR::LocationDescriptor initial_location, bool is_single_step) { + FixupBranch fail; + code.LDRB(INDEX_UNSIGNED, DecodeReg(code.ABI_SCRATCH1), X28, offsetof(A32JitState, check_bit)); + fail = code.CBZ(DecodeReg(code.ABI_SCRATCH1)); + EmitTerminal(terminal.then_, initial_location, is_single_step); + code.SetJumpTarget(fail); + EmitTerminal(terminal.else_, initial_location, is_single_step); +} + +void A32EmitA64::EmitTerminalImpl(IR::Term::CheckHalt terminal, IR::LocationDescriptor initial_location, bool is_single_step) { + code.LDRB(INDEX_UNSIGNED, DecodeReg(code.ABI_SCRATCH1), X28, offsetof(A32JitState, halt_requested)); + // Conditional branch only gives +/- 1MB of branch distance + FixupBranch zero = code.CBZ(DecodeReg(code.ABI_SCRATCH1)); + code.B(code.GetForceReturnFromRunCodeAddress()); + code.SetJumpTarget(zero); + EmitTerminal(terminal.else_, initial_location, is_single_step); +} + +void A32EmitA64::EmitPatchJg(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr) { + const CodePtr patch_location = code.GetCodePtr(); + + auto long_branch_gt = [this](CodePtr ptr){ + const s64 distance = reinterpret_cast(ptr) - reinterpret_cast(code.GetCodePtr()); + + if((distance >> 2) >= -0x40000 && (distance >> 2) <= 0x3FFFF) { + code.B(CC_GT, ptr); + return; + } + + FixupBranch cc_le = code.B(CC_LE); + code.B(ptr); + code.SetJumpTarget(cc_le); + }; + + if (target_code_ptr) { + long_branch_gt(target_code_ptr); + } else { + code.MOVI2R(DecodeReg(code.ABI_SCRATCH1), A32::LocationDescriptor{target_desc}.PC()); + code.STR(INDEX_UNSIGNED, DecodeReg(code.ABI_SCRATCH1), X28, MJitStateReg(A32::Reg::PC)); + long_branch_gt(code.GetReturnFromRunCodeAddress()); + } + code.EnsurePatchLocationSize(patch_location, 24); +} + +void A32EmitA64::EmitPatchJmp(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr) { + const CodePtr patch_location = code.GetCodePtr(); + if (target_code_ptr) { + code.B(target_code_ptr); + } else { + code.MOVI2R(DecodeReg(code.ABI_SCRATCH1), A32::LocationDescriptor{target_desc}.PC()); + code.STR(INDEX_UNSIGNED, DecodeReg(code.ABI_SCRATCH1), X28, MJitStateReg(A32::Reg::PC)); + code.B(code.GetReturnFromRunCodeAddress()); + } + code.EnsurePatchLocationSize(patch_location, 20); +} + +void A32EmitA64::EmitPatchMovX0(CodePtr target_code_ptr) { + if (!target_code_ptr) { + target_code_ptr 
= code.GetReturnFromRunCodeAddress(); + } + const CodePtr patch_location = code.GetCodePtr(); + code.MOVP2R(X0, target_code_ptr); + code.EnsurePatchLocationSize(patch_location, 16); +} + +void A32EmitA64::Unpatch(const IR::LocationDescriptor& location) { + EmitA64::Unpatch(location); + if (config.enable_fast_dispatch) { + code.DisableWriting(); + SCOPE_EXIT { code.EnableWriting(); }; + + (*fast_dispatch_table_lookup)(location.Value()) = {}; + } +} + +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/a32_emit_a64.h b/src/dynarmic/backend/A64/a32_emit_a64.h new file mode 100644 index 00000000..4989cccb --- /dev/null +++ b/src/dynarmic/backend/A64/a32_emit_a64.h @@ -0,0 +1,138 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "backend/A64/a32_jitstate.h" +#include "backend/A64/block_range_information.h" +#include "backend/A64/emit_a64.h" +#include "backend/A64/exception_handler.h" +#include "dynarmic/A32/a32.h" +#include "dynarmic/A32/config.h" +#include "frontend/A32/location_descriptor.h" +#include "frontend/ir/terminal.h" + +namespace Dynarmic::BackendA64 { + +struct A64State; +class RegAlloc; + +struct A32EmitContext final : public EmitContext { + A32EmitContext(RegAlloc& reg_alloc, IR::Block& block); + A32::LocationDescriptor Location() const; + bool IsSingleStep() const; + FP::RoundingMode FPSCR_RMode() const override; + u32 FPCR() const override; + bool FPSCR_FTZ() const override; + bool FPSCR_DN() const override; + std::ptrdiff_t GetInstOffset(IR::Inst* inst) const; +}; + +class A32EmitA64 final : public EmitA64 { +public: + A32EmitA64(BlockOfCode& code, A32::UserConfig config, A32::Jit* jit_interface); + ~A32EmitA64() override; + + /** + * Emit host machine code for a basic block with intermediate representation `ir`. + * @note ir is modified. 
+ */ + BlockDescriptor Emit(IR::Block& ir); + + void ClearCache() override; + + void InvalidateCacheRanges(const boost::icl::interval_set& ranges); + + void FastmemCallback(CodePtr PC); + +protected: + const A32::UserConfig config; + A32::Jit* jit_interface; + BlockRangeInformation block_ranges; + ExceptionHandler exception_handler; + + void EmitCondPrelude(const A32EmitContext& ctx); + + struct FastDispatchEntry { + u64 location_descriptor = 0xFFFF'FFFF'FFFF'FFFFull; + const void* code_ptr = nullptr; + }; + static_assert(sizeof(FastDispatchEntry) == 0x10); + static constexpr u64 fast_dispatch_table_mask = 0xFFFF0; + static constexpr size_t fast_dispatch_table_size = 0x10000; + std::array fast_dispatch_table; + void ClearFastDispatchTable(); + + using DoNotFastmemMarker = std::tuple; + std::set do_not_fastmem; + DoNotFastmemMarker GenerateDoNotFastmemMarker(A32EmitContext& ctx, IR::Inst* inst); + void DoNotFastmem(const DoNotFastmemMarker& marker); + bool ShouldFastmem(const DoNotFastmemMarker& marker) const; + + const void* read_memory_8; + const void* read_memory_16; + const void* read_memory_32; + const void* read_memory_64; + const void* write_memory_8; + const void* write_memory_16; + const void* write_memory_32; + const void* write_memory_64; + void GenMemoryAccessors(); + template + void ReadMemory(A32EmitContext& ctx, IR::Inst* inst, const CodePtr callback_fn); + template + void WriteMemory(A32EmitContext& ctx, IR::Inst* inst, const CodePtr callback_fn); + + const void* terminal_handler_pop_rsb_hint; + const void* terminal_handler_fast_dispatch_hint = nullptr; + FastDispatchEntry& (*fast_dispatch_table_lookup)(u64) = nullptr; + void GenTerminalHandlers(); + + // Microinstruction emitters +#define OPCODE(...) +#define A32OPC(name, type, ...) void EmitA32##name(A32EmitContext& ctx, IR::Inst* inst); +#define A64OPC(...) 
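Editorial aside: the include on the next line expands the three macros just defined over the IR opcode table, so the class picks up one EmitA32<Name> declaration per A32 opcode without listing them by hand. A minimal, self-contained sketch of the same X-macro technique, with made-up opcode names and an illustrative Emitter type (dynarmic's real table lives in opcodes.inc and its A32OPC macro takes more parameters):

#include <cstdio>

// Stand-in for frontend/ir/opcodes.inc: each entry names one opcode.
#define OPCODE_LIST(X) \
    X(GetRegister)     \
    X(SetRegister)     \
    X(UpdateFlags)

struct Emitter {
    // First expansion: declare one member function per opcode.
#define A32OPC(name) void EmitA32##name();
    OPCODE_LIST(A32OPC)
#undef A32OPC
};

// Second expansion: trivial bodies, just for the sketch.
#define A32OPC(name) \
    void Emitter::EmitA32##name() { std::puts("EmitA32" #name); }
OPCODE_LIST(A32OPC)
#undef A32OPC

int main() {
    Emitter e;
    e.EmitA32GetRegister();  // prints "EmitA32GetRegister"
}

The same list drives both the declarations here and the dispatch table elsewhere, which is the point of the idiom: one source of truth for the opcode set.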
+#include "frontend/ir/opcodes.inc" +#undef OPCODE +#undef A32OPC +#undef A64OPC + + // Helpers + std::string LocationDescriptorToFriendlyName(const IR::LocationDescriptor&) const override; + + // Fastmem + struct FastmemPatchInfo { + std::function callback; + }; + std::unordered_map fastmem_patch_info; + + // Terminal instruction emitters + void EmitSetUpperLocationDescriptor(IR::LocationDescriptor new_location, IR::LocationDescriptor old_location); + void EmitTerminalImpl(IR::Term::Interpret terminal, IR::LocationDescriptor initial_location, bool is_single_step) override; + void EmitTerminalImpl(IR::Term::ReturnToDispatch terminal, IR::LocationDescriptor initial_location, bool is_single_step) override; + void EmitTerminalImpl(IR::Term::LinkBlock terminal, IR::LocationDescriptor initial_location, bool is_single_step) override; + void EmitTerminalImpl(IR::Term::LinkBlockFast terminal, IR::LocationDescriptor initial_location, bool is_single_step) override; + void EmitTerminalImpl(IR::Term::PopRSBHint terminal, IR::LocationDescriptor initial_location, bool is_single_step) override; + void EmitTerminalImpl(IR::Term::FastDispatchHint terminal, IR::LocationDescriptor initial_location, bool is_single_step) override; + void EmitTerminalImpl(IR::Term::If terminal, IR::LocationDescriptor initial_location, bool is_single_step) override; + void EmitTerminalImpl(IR::Term::CheckBit terminal, IR::LocationDescriptor initial_location, bool is_single_step) override; + void EmitTerminalImpl(IR::Term::CheckHalt terminal, IR::LocationDescriptor initial_location, bool is_single_step) override; + + // Patching + void Unpatch(const IR::LocationDescriptor& target_desc) override; + void EmitPatchJg(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr = nullptr) override; + void EmitPatchJmp(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr = nullptr) override; + void EmitPatchMovX0(CodePtr target_code_ptr = nullptr) override; +}; + +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/a32_interface.cpp b/src/dynarmic/backend/A64/a32_interface.cpp new file mode 100644 index 00000000..ce2b0225 --- /dev/null +++ b/src/dynarmic/backend/A64/a32_interface.cpp @@ -0,0 +1,323 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. 
+ */ + +#include + +#include +#include + +#include +#include + +#include "backend/A64/a32_emit_a64.h" +#include "backend/A64/a32_jitstate.h" +#include "backend/A64/block_of_code.h" +#include "backend/A64/callback.h" +#include "backend/A64/devirtualize.h" +#include "backend/A64/jitstate_info.h" +#include "common/assert.h" +#include "common/common_types.h" +#include "common/llvm_disassemble.h" +#include "common/scope_exit.h" +#include "frontend/A32/translate/translate.h" +#include "frontend/ir/basic_block.h" +#include "frontend/ir/location_descriptor.h" +#include "ir_opt/passes.h" + +namespace Dynarmic::A32 { + +using namespace BackendA64; + +static RunCodeCallbacks GenRunCodeCallbacks(const A32::UserConfig& config, CodePtr (*LookupBlock)(void* lookup_block_arg), void* arg) { + return RunCodeCallbacks{ + std::make_unique(LookupBlock, reinterpret_cast(arg)), + std::make_unique(Devirtualize<&A32::UserCallbacks::AddTicks>(config.callbacks)), + std::make_unique(Devirtualize<&A32::UserCallbacks::GetTicksRemaining>(config.callbacks)), + reinterpret_cast(config.fastmem_pointer), + }; +} + +struct Jit::Impl { + Impl(Jit* jit, A32::UserConfig config) + : block_of_code(GenRunCodeCallbacks(config, &GetCurrentBlockThunk, this), JitStateInfo{jit_state}) + , emitter(block_of_code, config, jit) + , config(std::move(config)) + , jit_interface(jit) + {} + + A32JitState jit_state; + BlockOfCode block_of_code; + A32EmitA64 emitter; + + const A32::UserConfig config; + + // Requests made during execution to invalidate the cache are queued up here. + size_t invalid_cache_generation = 0; + boost::icl::interval_set invalid_cache_ranges; + bool invalidate_entire_cache = false; + + void Execute() { + const CodePtr current_codeptr = [this]{ + // RSB optimization + const u32 new_rsb_ptr = (jit_state.rsb_ptr - 1) & A32JitState::RSBPtrMask; + if (jit_state.GetUniqueHash() == jit_state.rsb_location_descriptors[new_rsb_ptr]) { + jit_state.rsb_ptr = new_rsb_ptr; + return reinterpret_cast(jit_state.rsb_codeptrs[new_rsb_ptr]); + } + + return GetCurrentBlock(); + }(); + + block_of_code.RunCode(&jit_state, current_codeptr); + } + + void Step() { + block_of_code.StepCode(&jit_state, GetCurrentSingleStep()); + } + + std::string Disassemble(const IR::LocationDescriptor& descriptor) { + auto block = GetBasicBlock(descriptor); + std::string result = fmt::format("address: {}\nsize: {} bytes\n", block.entrypoint, block.size); +#ifdef DYNARMIC_USE_LLVM + for (const u32* pos = reinterpret_cast(block.entrypoint); + reinterpret_cast(pos) < reinterpret_cast(block.entrypoint) + block.size; pos += 1) { + fmt::print("0x{:02x} 0x{:02x} ", reinterpret_cast(pos), *pos); + fmt::print("{}", Common::DisassembleAArch64(*pos, reinterpret_cast(pos))); + result += Common::DisassembleAArch64(*pos, reinterpret_cast(pos)); + } +#endif + return result; + } + + void PerformCacheInvalidation() { + if (invalidate_entire_cache) { + jit_state.ResetRSB(); + block_of_code.ClearCache(); + emitter.ClearCache(); + + invalid_cache_ranges.clear(); + invalidate_entire_cache = false; + invalid_cache_generation++; + return; + } + + if (invalid_cache_ranges.empty()) { + return; + } + + jit_state.ResetRSB(); + emitter.InvalidateCacheRanges(invalid_cache_ranges); + invalid_cache_ranges.clear(); + invalid_cache_generation++; + } + + void RequestCacheInvalidation() { + if (jit_interface->is_executing) { + jit_state.halt_requested = true; + return; + } + + PerformCacheInvalidation(); + } + +private: + Jit* jit_interface; + + static CodePtr GetCurrentBlockThunk(void* 
this_voidptr) { + Jit::Impl& this_ = *static_cast(this_voidptr); + return this_.GetCurrentBlock(); + } + + IR::LocationDescriptor GetCurrentLocation() const { + return IR::LocationDescriptor{jit_state.GetUniqueHash()}; + } + + CodePtr GetCurrentBlock() { + return GetBasicBlock(GetCurrentLocation()).entrypoint; + } + + CodePtr GetCurrentSingleStep() { + return GetBasicBlock(A32::LocationDescriptor{GetCurrentLocation()}.SetSingleStepping(true)).entrypoint; + } + + A32EmitA64::BlockDescriptor GetBasicBlock(IR::LocationDescriptor descriptor) { + auto block = emitter.GetBasicBlock(descriptor); + if (block) + return *block; + + constexpr size_t MINIMUM_REMAINING_CODESIZE = 1 * 1024 * 1024; + if (block_of_code.SpaceRemaining() < MINIMUM_REMAINING_CODESIZE) { + invalidate_entire_cache = true; + PerformCacheInvalidation(); + } + + IR::Block ir_block = A32::Translate(A32::LocationDescriptor{descriptor}, [this](u32 vaddr) { return config.callbacks->MemoryReadCode(vaddr); }, {config.define_unpredictable_behaviour, config.hook_hint_instructions}); + if (config.enable_optimizations) { + Optimization::A32GetSetElimination(ir_block); + Optimization::DeadCodeElimination(ir_block); + Optimization::A32ConstantMemoryReads(ir_block, config.callbacks); + Optimization::ConstantPropagation(ir_block); + Optimization::DeadCodeElimination(ir_block); + Optimization::A32MergeInterpretBlocksPass(ir_block, config.callbacks); + } + Optimization::VerificationPass(ir_block); + return emitter.Emit(ir_block); + } +}; + +Jit::Jit(UserConfig config) : impl(std::make_unique(this, std::move(config))) {} + +Jit::~Jit() = default; + +void Jit::Run() { + ASSERT(!is_executing); + is_executing = true; + SCOPE_EXIT { this->is_executing = false; }; + + impl->jit_state.halt_requested = false; + + impl->Execute(); + + impl->PerformCacheInvalidation(); +} + +void Jit::Step() { + ASSERT(!is_executing); + is_executing = true; + SCOPE_EXIT { this->is_executing = false; }; + + impl->jit_state.halt_requested = true; + + impl->Step(); + + impl->PerformCacheInvalidation(); +} + +void Jit::ClearCache() { + impl->invalidate_entire_cache = true; + impl->RequestCacheInvalidation(); +} + +void Jit::InvalidateCacheRange(std::uint32_t start_address, std::size_t length) { + impl->invalid_cache_ranges.add(boost::icl::discrete_interval::closed(start_address, static_cast(start_address + length - 1))); + impl->RequestCacheInvalidation(); +} + +void Jit::Reset() { + ASSERT(!is_executing); + impl->jit_state = {}; +} + +void Jit::HaltExecution() { + impl->jit_state.halt_requested = true; +} + +std::array& Jit::Regs() { + return impl->jit_state.Reg; +} +const std::array& Jit::Regs() const { + return impl->jit_state.Reg; +} + +std::array& Jit::ExtRegs() { + return impl->jit_state.ExtReg; +} + +const std::array& Jit::ExtRegs() const { + return impl->jit_state.ExtReg; +} + +u32 Jit::Cpsr() const { + return impl->jit_state.Cpsr(); +} + +void Jit::SetCpsr(u32 value) { + return impl->jit_state.SetCpsr(value); +} + +u32 Jit::Fpscr() const { + return impl->jit_state.Fpscr(); +} + +void Jit::SetFpscr(u32 value) { + return impl->jit_state.SetFpscr(value); +} + +Context Jit::SaveContext() const { + Context ctx; + SaveContext(ctx); + return ctx; +} + +struct Context::Impl { + A32JitState jit_state; + size_t invalid_cache_generation; +}; + +Context::Context() : impl(std::make_unique()) { impl->jit_state.ResetRSB(); } +Context::~Context() = default; +Context::Context(const Context& ctx) : impl(std::make_unique(*ctx.impl)) {} +Context::Context(Context&& ctx) noexcept : 
impl(std::move(ctx.impl)) {} +Context& Context::operator=(const Context& ctx) { + *impl = *ctx.impl; + return *this; +} +Context& Context::operator=(Context&& ctx) noexcept { + impl = std::move(ctx.impl); + return *this; +} + +std::array& Context::Regs() { + return impl->jit_state.Reg; +} +const std::array& Context::Regs() const { + return impl->jit_state.Reg; +} +std::array& Context::ExtRegs() { + return impl->jit_state.ExtReg; +} +const std::array& Context::ExtRegs() const { + return impl->jit_state.ExtReg; +} + +std::uint32_t Context::Cpsr() const { + return impl->jit_state.Cpsr(); +} +void Context::SetCpsr(std::uint32_t value) { + impl->jit_state.SetCpsr(value); +} + +std::uint32_t Context::Fpscr() const { + return impl->jit_state.Fpscr(); +} +void Context::SetFpscr(std::uint32_t value) { + return impl->jit_state.SetFpscr(value); +} + +void Jit::SaveContext(Context& ctx) const { + ctx.impl->jit_state.TransferJitState(impl->jit_state, false); + ctx.impl->invalid_cache_generation = impl->invalid_cache_generation; +} + +void Jit::LoadContext(const Context& ctx) { + bool reset_rsb = ctx.impl->invalid_cache_generation != impl->invalid_cache_generation; + impl->jit_state.TransferJitState(ctx.impl->jit_state, reset_rsb); +} + +std::string Jit::Disassemble() const { + std::string result; +#ifdef DYNARMIC_USE_LLVM + for (const u32* pos = reinterpret_cast(impl->block_of_code.GetCodeBegin()); + reinterpret_cast(pos) < reinterpret_cast(impl->block_of_code.GetCodePtr()); pos += 1) { + fmt::print("0x{:02x} 0x{:02x} ", reinterpret_cast(pos), *pos); + fmt::print("{}", Common::DisassembleAArch64(*pos, reinterpret_cast(pos))); + result += Common::DisassembleAArch64(*pos, reinterpret_cast(pos)); + } +#endif + return result; +} + +} // namespace Dynarmic::A32 diff --git a/src/dynarmic/backend/A64/a32_jitstate.cpp b/src/dynarmic/backend/A64/a32_jitstate.cpp new file mode 100644 index 00000000..2ae33118 --- /dev/null +++ b/src/dynarmic/backend/A64/a32_jitstate.cpp @@ -0,0 +1,172 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. + */ + +#include "backend/A64/a32_jitstate.h" +#include "backend/A64/block_of_code.h" +#include "common/assert.h" +#include "common/bit_util.h" +#include "common/common_types.h" +#include "frontend/A32/location_descriptor.h" + +namespace Dynarmic::BackendA64 { + +/** + * CPSR Bits + * ========= + * + * ARM CPSR flags + * -------------- + * N bit 31 Negative flag + * Z bit 30 Zero flag + * C bit 29 Carry flag + * V bit 28 oVerflow flag + * Q bit 27 Saturation flag + * IT[1:0] bits 25-26 If-Then execution state (lower 2 bits) + * J bit 24 Jazelle instruction set flag + * GE bits 16-19 Greater than or Equal flags + * IT[7:2] bits 10-15 If-Then execution state (upper 6 bits) + * E bit 9 Data Endianness flag + * A bit 8 Disable imprecise Aborts + * I bit 7 Disable IRQ interrupts + * F bit 6 Disable FIQ interrupts + * T bit 5 Thumb instruction set flag + * M bits 0-4 Processor Mode bits + * + * A64 flags + * ------------------- + * N bit 31 Negative flag + * Z bit 30 Zero flag + * C bit 29 Carry flag + * V bit 28 oVerflow flag + */ + +u32 A32JitState::Cpsr() const { + DEBUG_ASSERT((cpsr_nzcv & ~0xF0000000) == 0); + DEBUG_ASSERT((cpsr_q & ~1) == 0); + DEBUG_ASSERT((cpsr_jaifm & ~0x010001DF) == 0); + + u32 cpsr = 0; + + // NZCV flags + cpsr |= cpsr_nzcv; + // Q flag + cpsr |= cpsr_q ? 
1 << 27 : 0; + // GE flags + cpsr |= Common::Bit<31>(cpsr_ge) ? 1 << 19 : 0; + cpsr |= Common::Bit<23>(cpsr_ge) ? 1 << 18 : 0; + cpsr |= Common::Bit<15>(cpsr_ge) ? 1 << 17 : 0; + cpsr |= Common::Bit<7>(cpsr_ge) ? 1 << 16 : 0; + // E flag, T flag + cpsr |= Common::Bit<1>(upper_location_descriptor) ? 1 << 9 : 0; + cpsr |= Common::Bit<0>(upper_location_descriptor) ? 1 << 5 : 0; + // IT state + cpsr |= static_cast(upper_location_descriptor & 0b11111100'00000000); + cpsr |= static_cast(upper_location_descriptor & 0b00000011'00000000) << 17; + // Other flags + cpsr |= cpsr_jaifm; + + return cpsr; +} + +void A32JitState::SetCpsr(u32 cpsr) { + // NZCV flags + cpsr_nzcv = cpsr & 0xF0000000; + // Q flag + cpsr_q = Common::Bit<27>(cpsr) ? 1 : 0; + // GE flags + cpsr_ge = 0; + cpsr_ge |= Common::Bit<19>(cpsr) ? 0xFF000000 : 0; + cpsr_ge |= Common::Bit<18>(cpsr) ? 0x00FF0000 : 0; + cpsr_ge |= Common::Bit<17>(cpsr) ? 0x0000FF00 : 0; + cpsr_ge |= Common::Bit<16>(cpsr) ? 0x000000FF : 0; + + upper_location_descriptor &= 0xFFFF0000; + // E flag, T flag + upper_location_descriptor |= Common::Bit<9>(cpsr) ? 2 : 0; + upper_location_descriptor |= Common::Bit<5>(cpsr) ? 1 : 0; + // IT state + upper_location_descriptor |= (cpsr >> 0) & 0b11111100'00000000; + upper_location_descriptor |= (cpsr >> 17) & 0b00000011'00000000; + + // Other flags + cpsr_jaifm = cpsr & 0x010001DF; +} + +void A32JitState::ResetRSB() { + rsb_location_descriptors.fill(0xFFFFFFFFFFFFFFFFull); + rsb_codeptrs.fill(0); +} + +/** + * FPSCR + * ========================= + * + * VFP FPSCR cumulative exception bits + * ----------------------------------- + * IDC bit 7 Input Denormal cumulative exception bit // Only ever set when FPSCR.FTZ = 1 + * IXC bit 4 Inexact cumulative exception bit + * UFC bit 3 Underflow cumulative exception bit + * OFC bit 2 Overflow cumulative exception bit + * DZC bit 1 Division by Zero cumulative exception bit + * IOC bit 0 Invalid Operation cumulative exception bit + * + * VFP FPSCR exception trap enables + * -------------------------------- + * IDE bit 15 Input Denormal exception trap enable + * IXE bit 12 Inexact exception trap enable + * UFE bit 11 Underflow exception trap enable + * OFE bit 10 Overflow exception trap enable + * DZE bit 9 Division by Zero exception trap enable + * IOE bit 8 Invalid Operation exception trap enable + * + * VFP FPSCR mode bits + * ------------------- + * AHP bit 26 Alternate half-precision + * DN bit 25 Default NaN + * FZ bit 24 Flush to Zero + * RMode bits 22-23 Round to {0 = Nearest, 1 = Positive, 2 = Negative, 3 = Zero} + * Stride bits 20-21 Vector stride + * Len bits 16-18 Vector length + */ + +// NZCV; QC (ASIMD only), AHP; DN, FZ, RMode, Stride; SBZP; Len; trap enables; cumulative bits +constexpr u32 FPSCR_MODE_MASK = A32::LocationDescriptor::FPSCR_MODE_MASK; +constexpr u32 FPSCR_NZCV_MASK = 0xF0000000; + +u32 A32JitState::Fpscr() const { + DEBUG_ASSERT((fpsr_nzcv & ~FPSCR_NZCV_MASK) == 0); + + const u32 fpcr_mode = static_cast(upper_location_descriptor) & FPSCR_MODE_MASK; + + u32 FPSCR = fpcr_mode | fpsr_nzcv; + FPSCR |= (guest_fpsr & 0x1F); + FPSCR |= fpsr_exc; + + return FPSCR; +} + +void A32JitState::SetFpscr(u32 FPSCR) { + // Ensure that only upper half of upper_location_descriptor is used for FPSCR bits. 
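Editorial aside: from the bit table above, the mode bits (AHP, DN, FZ, RMode, Stride, Len) occupy bits 16-26, so the mode mask works out to 0x07F70000 and fits entirely in the upper half of upper_location_descriptor; the static_assert that follows pins that invariant down. A small round-trip sketch of the packing (the constant and helper names here are illustrative; the real mask comes from A32::LocationDescriptor):

#include <cassert>
#include <cstdint>

// Derived from the FPSCR bit table above: AHP|DN|FZ|RMode|Stride|Len.
constexpr std::uint32_t kFpscrModeMask = 0x07F70000;
static_assert((kFpscrModeMask & 0xFFFF0000) == kFpscrModeMask,
              "mode bits must stay in the upper half");

// Mirrors the two statements that follow in SetFpscr.
std::uint32_t PackUpper(std::uint32_t upper_location_descriptor, std::uint32_t fpscr) {
    upper_location_descriptor &= 0x0000FFFF;             // keep T/E/IT state
    upper_location_descriptor |= fpscr & kFpscrModeMask; // overlay FPSCR mode bits
    return upper_location_descriptor;
}

int main() {
    // T flag set in the lower half, DN|FZ requested by the guest FPSCR.
    const std::uint32_t upper = PackUpper(/*upper_location_descriptor=*/0x00000001,
                                          /*fpscr=*/0x03000000);
    assert((upper >> 16) == 0x0300);    // mode bits landed in the top half
    assert((upper & 0xFFFF) == 0x0001); // T/E/IT bits untouched
    return 0;
}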
+ static_assert((FPSCR_MODE_MASK & 0xFFFF0000) == FPSCR_MODE_MASK); + + upper_location_descriptor &= 0x0000FFFF; + upper_location_descriptor |= FPSCR & FPSCR_MODE_MASK; + + fpsr_nzcv = FPSCR & FPSCR_NZCV_MASK; + guest_fpcr = 0; + guest_fpsr = 0; + + // Cumulative flags IDC, IOC, IXC, UFC, OFC, DZC + fpsr_exc = FPSCR & 0x9F; + + // Mode Bits + guest_fpcr |= FPSCR & 0x07C09F00; + + // Exceptions + guest_fpsr |= FPSCR & 0x9F; +} + +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/a32_jitstate.h b/src/dynarmic/backend/A64/a32_jitstate.h new file mode 100644 index 00000000..0783fb2b --- /dev/null +++ b/src/dynarmic/backend/A64/a32_jitstate.h @@ -0,0 +1,111 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. + */ + +#pragma once + +#include + +#include "common/common_types.h" + +namespace Dynarmic::BackendA64 { + +class BlockOfCode; + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable:4324) // Structure was padded due to alignment specifier +#endif + +struct A32JitState { + using ProgramCounterType = u32; + + A32JitState() { ResetRSB(); } + + std::array Reg{}; // Current register file. + // TODO: Mode-specific register sets unimplemented. + + u32 upper_location_descriptor = 0; + + u32 cpsr_ge = 0; + u32 cpsr_q = 0; + u32 cpsr_nzcv = 0; + u32 cpsr_jaifm = 0; + u32 Cpsr() const; + void SetCpsr(u32 cpsr); + + alignas(u64) std::array ExtReg{}; // Extension registers. + + static constexpr size_t SpillCount = 64; + std::array Spill{}; // Spill. + static size_t GetSpillLocationOffsetFromIndex(size_t i) { + return static_cast(offsetof(A32JitState, Spill) + i * sizeof(u64)); + } + + // For internal use (See: BlockOfCode::RunCode) + u64 guest_fpcr = 0; + u64 guest_fpsr = 0; + u64 save_host_FPCR = 0; + s64 cycles_to_run = 0; + s64 cycles_remaining = 0; + bool halt_requested = false; + bool check_bit = false; + + // Exclusive state + static constexpr u32 RESERVATION_GRANULE_MASK = 0xFFFFFFF8; + u32 exclusive_state = 0; + u32 exclusive_address = 0; + + static constexpr size_t RSBSize = 8; // MUST be a power of 2. 
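Editorial aside: RSBSize must be a power of two so that RSBPtrMask, declared next, can wrap the ring-buffer index with a single AND rather than a modulo; the RSB fast path in Jit::Impl::Execute earlier in this patch relies on the same trick. A self-contained sketch of that return-stack-buffer scheme (the types and two-function interface are illustrative, not dynarmic's real API):

#include <array>
#include <cassert>
#include <cstdint>

constexpr std::size_t kRsbSize = 8;            // must be a power of two
constexpr std::size_t kRsbMask = kRsbSize - 1; // single-AND wraparound

struct MiniRsb {
    std::array<std::uint64_t, kRsbSize> location{};  // unique hashes of call sites
    std::array<const void*, kRsbSize> code_ptr{};    // predicted host entry points
    std::size_t ptr = 0;

    void Push(std::uint64_t hash, const void* code) {
        location[ptr] = hash;
        code_ptr[ptr] = code;
        ptr = (ptr + 1) & kRsbMask;                  // wrap without division
    }

    // Pop-side check, as in the RSB fast path: only hit when the predicted
    // location matches the current unique hash.
    const void* TryPop(std::uint64_t current_hash) {
        const std::size_t prev = (ptr - 1) & kRsbMask;
        if (location[prev] != current_hash)
            return nullptr;                          // misprediction: fall back to lookup
        ptr = prev;
        return code_ptr[prev];
    }
};

int main() {
    MiniRsb rsb;
    int dummy = 0;
    rsb.Push(/*hash=*/0x1234, &dummy);
    assert(rsb.TryPop(0x1234) == &dummy);   // predicted return hits
    assert(rsb.TryPop(0x9999) == nullptr);  // mismatch falls back to a full lookup
}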
+ static constexpr size_t RSBPtrMask = RSBSize - 1; + u32 rsb_ptr = 0; + std::array rsb_location_descriptors; + std::array rsb_codeptrs; + void ResetRSB(); + + u32 fpsr_exc = 0; + u32 fpsr_qc = 0; // Dummy value + u32 fpsr_nzcv = 0; + u32 Fpscr() const; + void SetFpscr(u32 FPSCR); + + u64 GetUniqueHash() const noexcept { + return (static_cast(upper_location_descriptor) << 32) | (static_cast(Reg[15])); + } + + void TransferJitState(const A32JitState& src, bool reset_rsb) { + Reg = src.Reg; + upper_location_descriptor = src.upper_location_descriptor; + cpsr_ge = src.cpsr_ge; + cpsr_q = src.cpsr_q; + cpsr_nzcv = src.cpsr_nzcv; + cpsr_jaifm = src.cpsr_jaifm; + ExtReg = src.ExtReg; + guest_fpcr = src.guest_fpcr; + guest_fpsr = src.guest_fpsr; + fpsr_exc = src.fpsr_exc; + fpsr_qc = src.fpsr_qc; + fpsr_nzcv = src.fpsr_nzcv; + + exclusive_state = 0; + exclusive_address = 0; + + if (reset_rsb) { + ResetRSB(); + } else { + rsb_ptr = src.rsb_ptr; + rsb_location_descriptors = src.rsb_location_descriptors; + rsb_codeptrs = src.rsb_codeptrs; + } + } +}; + +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +using CodePtr = const void*; + +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/abi.cpp b/src/dynarmic/backend/A64/abi.cpp new file mode 100644 index 00000000..24fc1cb8 --- /dev/null +++ b/src/dynarmic/backend/A64/abi.cpp @@ -0,0 +1,87 @@ +// Copyright (C) 2003 Dolphin Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0 or later versions. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// 20th Sep 2018: This code was modified for Dynarmic. 
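Editorial aside: the helpers defined below walk a list of HostLoc values and fold them into two bitmasks, one for GPRs and one for FPRs, before handing them to the emitter's bulk push/pop routines. A hedged sketch of just that mask-building step; the HostLoc numbering is simplified here, only the 32-registers-per-bank split from the real code is kept:

#include <cstdint>
#include <cstdio>
#include <initializer_list>

// Simplified stand-in for HostLoc: 0-30 are GPRs X0-X30, 32-63 are FPRs Q0-Q31.
enum class MiniHostLoc : unsigned { X0 = 0, X1, X2, X19 = 19, Q8 = 40, Q9 = 41 };

struct RegMasks {
    std::uint32_t gprs = 0;
    std::uint32_t fprs = 0;
};

// Mirrors the loop in ABI_PushRegistersAndAdjustStack: one bit per register.
RegMasks BuildMasks(std::initializer_list<MiniHostLoc> regs) {
    RegMasks m;
    for (MiniHostLoc r : regs) {
        const unsigned index = static_cast<unsigned>(r);
        if (index < 32)
            m.gprs |= 1u << index;         // GPR bank
        else
            m.fprs |= 1u << (index - 32);  // FPR bank
    }
    return m;
}

int main() {
    const RegMasks m = BuildMasks({MiniHostLoc::X0, MiniHostLoc::X19, MiniHostLoc::Q8});
    std::printf("gprs=%08x fprs=%08x\n", (unsigned)m.gprs, (unsigned)m.fprs);
    // prints: gprs=00080001 fprs=00000100
}

Packing the selection into masks lets the emitter store register pairs in bulk instead of issuing one push per register, which matters because these sequences run on every host call.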
+ +#include +#include + +#include "backend/A64/abi.h" +#include "common/common_types.h" +#include "common/math_util.h" +#include "common/iterator_util.h" + +namespace Dynarmic::BackendA64 { + +template +void ABI_PushRegistersAndAdjustStack(BlockOfCode& code, const RegisterArrayT& regs) { + u32 gprs = 0 , fprs = 0; + + for (HostLoc reg : regs) { + if (HostLocIsGPR(reg)) { + gprs |= 0x1 << static_cast(DecodeReg(HostLocToReg64(reg))); + } else if (HostLocIsFPR(reg)) { + fprs |= 0x1 << static_cast(DecodeReg(HostLocToFpr(reg))); + } + } + + code.fp_emitter.ABI_PushRegisters(fprs); + code.ABI_PushRegisters(gprs); +} + +template +void ABI_PopRegistersAndAdjustStack(BlockOfCode& code, const RegisterArrayT& regs) { + u32 gprs = 0, fprs = 0; + + for (HostLoc reg : regs) { + if (HostLocIsGPR(reg)) { + gprs |= 0x1 << static_cast(DecodeReg(HostLocToReg64(reg))); + } else if (HostLocIsFPR(reg)) { + fprs |= 0x1 << static_cast(DecodeReg(HostLocToFpr(reg))); + } + } + + code.ABI_PopRegisters(gprs); + code.fp_emitter.ABI_PopRegisters(fprs); +} + +void ABI_PushCalleeSaveRegistersAndAdjustStack(BlockOfCode& code) { + ABI_PushRegistersAndAdjustStack(code, ABI_ALL_CALLEE_SAVE); +} + +void ABI_PopCalleeSaveRegistersAndAdjustStack(BlockOfCode& code) { + ABI_PopRegistersAndAdjustStack(code, ABI_ALL_CALLEE_SAVE); +} + +void ABI_PushCallerSaveRegistersAndAdjustStack(BlockOfCode& code) { + ABI_PushRegistersAndAdjustStack(code, ABI_ALL_CALLER_SAVE); +} + +void ABI_PopCallerSaveRegistersAndAdjustStack(BlockOfCode& code) { + ABI_PopRegistersAndAdjustStack(code, ABI_ALL_CALLER_SAVE); +} + +void ABI_PushCallerSaveRegistersAndAdjustStackExcept(BlockOfCode& code, HostLoc exception) { + std::vector regs; + std::remove_copy(ABI_ALL_CALLER_SAVE.begin(), ABI_ALL_CALLER_SAVE.end(), std::back_inserter(regs), exception); + ABI_PushRegistersAndAdjustStack(code, regs); +} + +void ABI_PopCallerSaveRegistersAndAdjustStackExcept(BlockOfCode& code, HostLoc exception) { + std::vector regs; + std::remove_copy(ABI_ALL_CALLER_SAVE.begin(), ABI_ALL_CALLER_SAVE.end(), std::back_inserter(regs), exception); + ABI_PopRegistersAndAdjustStack(code, regs); +} + +} // namespace Dynarmic::BackendX64 diff --git a/src/dynarmic/backend/A64/abi.h b/src/dynarmic/backend/A64/abi.h new file mode 100644 index 00000000..1bbff25a --- /dev/null +++ b/src/dynarmic/backend/A64/abi.h @@ -0,0 +1,110 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. 
+ */ +#pragma once + +#include + +#include "backend/A64/block_of_code.h" +#include "backend/A64/hostloc.h" + +namespace Dynarmic::BackendA64 { + +constexpr HostLoc ABI_RETURN = HostLoc::X0; + +constexpr HostLoc ABI_PARAM1 = HostLoc::X0; +constexpr HostLoc ABI_PARAM2 = HostLoc::X1; +constexpr HostLoc ABI_PARAM3 = HostLoc::X2; +constexpr HostLoc ABI_PARAM4 = HostLoc::X3; +constexpr HostLoc ABI_PARAM5 = HostLoc::X4; +constexpr HostLoc ABI_PARAM6 = HostLoc::X5; +constexpr HostLoc ABI_PARAM7 = HostLoc::X6; +constexpr HostLoc ABI_PARAM8 = HostLoc::X7; + +constexpr std::array ABI_ALL_CALLER_SAVE = { + HostLoc::X0, + HostLoc::X1, + HostLoc::X2, + HostLoc::X3, + HostLoc::X4, + HostLoc::X5, + HostLoc::X6, + HostLoc::X7, + HostLoc::X8, + HostLoc::X9, + HostLoc::X10, + HostLoc::X11, + HostLoc::X12, + HostLoc::X13, + HostLoc::X14, + HostLoc::X15, + HostLoc::X16, + HostLoc::X17, + HostLoc::X18, + + HostLoc::Q0, + HostLoc::Q1, + HostLoc::Q2, + HostLoc::Q3, + HostLoc::Q4, + HostLoc::Q5, + HostLoc::Q6, + HostLoc::Q7, + + HostLoc::Q16, + HostLoc::Q17, + HostLoc::Q18, + HostLoc::Q19, + HostLoc::Q20, + HostLoc::Q21, + HostLoc::Q22, + HostLoc::Q23, + HostLoc::Q24, + HostLoc::Q25, + HostLoc::Q26, + HostLoc::Q27, + HostLoc::Q28, + HostLoc::Q29, + HostLoc::Q30, + HostLoc::Q31, +}; + +constexpr std::array ABI_ALL_CALLEE_SAVE = { + HostLoc::X19, + HostLoc::X20, + HostLoc::X21, + HostLoc::X22, + HostLoc::X23, + HostLoc::X24, + HostLoc::X25, + HostLoc::X26, + HostLoc::X27, + HostLoc::X28, + HostLoc::X29, + HostLoc::X30, + + HostLoc::Q8, + HostLoc::Q9, + HostLoc::Q10, + HostLoc::Q11, + HostLoc::Q12, + HostLoc::Q13, + HostLoc::Q14, + HostLoc::Q15, +}; + +constexpr size_t ABI_SHADOW_SPACE = 0; // bytes + +static_assert(ABI_ALL_CALLER_SAVE.size() + ABI_ALL_CALLEE_SAVE.size() == 63, "Invalid total number of registers"); + +void ABI_PushCalleeSaveRegistersAndAdjustStack(BlockOfCode& code); +void ABI_PopCalleeSaveRegistersAndAdjustStack(BlockOfCode& code); +void ABI_PushCallerSaveRegistersAndAdjustStack(BlockOfCode& code); +void ABI_PopCallerSaveRegistersAndAdjustStack(BlockOfCode& code); + +void ABI_PushCallerSaveRegistersAndAdjustStackExcept(BlockOfCode& code, HostLoc exception); +void ABI_PopCallerSaveRegistersAndAdjustStackExcept(BlockOfCode& code, HostLoc exception); + +} // namespace Dynarmic::BackendX64 diff --git a/src/dynarmic/backend/A64/block_of_code.cpp b/src/dynarmic/backend/A64/block_of_code.cpp new file mode 100644 index 00000000..2c534fc1 --- /dev/null +++ b/src/dynarmic/backend/A64/block_of_code.cpp @@ -0,0 +1,336 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. 
+ */ + +#include +#include +#include + +#include "backend/A64/a32_jitstate.h" +#include "backend/A64/abi.h" +#include "backend/A64/block_of_code.h" +#include "backend/A64/perf_map.h" +#include "common/assert.h" + +#ifdef _WIN32 + #include +#else + #include +#endif + +#ifdef __APPLE__ +#include +#endif + +namespace Dynarmic::BackendA64 { + +const Arm64Gen::ARM64Reg BlockOfCode::ABI_RETURN = Arm64Gen::ARM64Reg::X0; +const Arm64Gen::ARM64Reg BlockOfCode::ABI_RETURN2 = Arm64Gen::ARM64Reg::X1; + +const Arm64Gen::ARM64Reg BlockOfCode::ABI_PARAM1 = Arm64Gen::ARM64Reg::X0; +const Arm64Gen::ARM64Reg BlockOfCode::ABI_PARAM2 = Arm64Gen::ARM64Reg::X1; +const Arm64Gen::ARM64Reg BlockOfCode::ABI_PARAM3 = Arm64Gen::ARM64Reg::X2; +const Arm64Gen::ARM64Reg BlockOfCode::ABI_PARAM4 = Arm64Gen::ARM64Reg::X3; +const Arm64Gen::ARM64Reg BlockOfCode::ABI_PARAM5 = Arm64Gen::ARM64Reg::X4; +const Arm64Gen::ARM64Reg BlockOfCode::ABI_PARAM6 = Arm64Gen::ARM64Reg::X5; +const Arm64Gen::ARM64Reg BlockOfCode::ABI_PARAM7 = Arm64Gen::ARM64Reg::X6; +const Arm64Gen::ARM64Reg BlockOfCode::ABI_PARAM8 = Arm64Gen::ARM64Reg::X7; + +const Arm64Gen::ARM64Reg BlockOfCode::ABI_SCRATCH1 = Arm64Gen::ARM64Reg::X30; + +const std::array BlockOfCode::ABI_PARAMS = {BlockOfCode::ABI_PARAM1, BlockOfCode::ABI_PARAM2, + BlockOfCode::ABI_PARAM3, BlockOfCode::ABI_PARAM4, + BlockOfCode::ABI_PARAM5, BlockOfCode::ABI_PARAM6, + BlockOfCode::ABI_PARAM7, BlockOfCode::ABI_PARAM8}; + +namespace { + +constexpr size_t TOTAL_CODE_SIZE = 128 * 1024 * 1024; +constexpr size_t FAR_CODE_OFFSET = 100 * 1024 * 1024; + +#ifdef DYNARMIC_ENABLE_NO_EXECUTE_SUPPORT +void ProtectMemory([[maybe_unused]] const void* base, [[maybe_unused]] size_t size, bool is_executable) { +#if defined(_WIN32) + DWORD oldProtect = 0; + VirtualProtect(const_cast(base), size, is_executable ? PAGE_EXECUTE_READ : PAGE_READWRITE, &oldProtect); +#elif defined(__APPLE__) + pthread_jit_write_protect_np(is_executable); +#else + static const size_t pageSize = sysconf(_SC_PAGESIZE); + const size_t iaddr = reinterpret_cast(base); + const size_t roundAddr = iaddr & ~(pageSize - static_cast(1)); + const int mode = is_executable ? (PROT_READ | PROT_EXEC) : (PROT_READ | PROT_WRITE); + mprotect(reinterpret_cast(roundAddr), size + (iaddr - roundAddr), mode); +#endif +} +#endif + +} // anonymous namespace + +BlockOfCode::BlockOfCode(RunCodeCallbacks cb, JitStateInfo jsi) + : fp_emitter(this) + , cb(std::move(cb)) + , jsi(jsi) + , constant_pool(*this) { + AllocCodeSpace(TOTAL_CODE_SIZE); + EnableWriting(); + GenRunCode(); +} + +void BlockOfCode::PreludeComplete() { + prelude_complete = true; + near_code_begin = GetCodePtr(); + far_code_begin = GetCodePtr() + FAR_CODE_OFFSET; + FlushIcache(); + ClearCache(); + DisableWriting(); +} + +void BlockOfCode::EnableWriting() { +#ifdef DYNARMIC_ENABLE_NO_EXECUTE_SUPPORT + ProtectMemory(GetCodePtr(), TOTAL_CODE_SIZE, false); +#endif +} + +void BlockOfCode::DisableWriting() { +#ifdef DYNARMIC_ENABLE_NO_EXECUTE_SUPPORT + ProtectMemory(GetCodePtr(), TOTAL_CODE_SIZE, true); +#endif +} + +void BlockOfCode::ClearCache() { + ASSERT(prelude_complete); + in_far_code = false; + near_code_ptr = near_code_begin; + far_code_ptr = far_code_begin; + SetCodePtr(near_code_begin); + constant_pool.Clear(); +} + +size_t BlockOfCode::SpaceRemaining() const { + ASSERT(prelude_complete); + // This function provides an underestimate of near-code-size but that's okay. + // (Why? The maximum size of near code should be measured from near_code_begin, not top_.) 
+ // These are offsets from Xbyak::CodeArray::top_. + std::size_t far_code_offset, near_code_offset; + if (in_far_code) { + near_code_offset = static_cast(near_code_ptr) - static_cast(region); + far_code_offset = GetCodePtr() - static_cast(region); + } else { + near_code_offset = GetCodePtr() - static_cast(region); + far_code_offset = static_cast(far_code_ptr) - static_cast(region); + } + if (far_code_offset > TOTAL_CODE_SIZE) + return 0; + if (near_code_offset > FAR_CODE_OFFSET) + return 0; + return std::min(TOTAL_CODE_SIZE - far_code_offset, FAR_CODE_OFFSET - near_code_offset); +} + +void BlockOfCode::RunCode(void* jit_state, CodePtr code_ptr) const { + run_code(jit_state, code_ptr); +} + +void BlockOfCode::StepCode(void* jit_state, CodePtr code_ptr) const { + step_code(jit_state, code_ptr); +} + +void BlockOfCode::ReturnFromRunCode(bool fpscr_already_exited) { + size_t index = 0; + if (fpscr_already_exited) + index |= FPSCR_ALREADY_EXITED; + B(return_from_run_code[index]); +} + +void BlockOfCode::ForceReturnFromRunCode(bool fpscr_already_exited) { + size_t index = FORCE_RETURN; + if (fpscr_already_exited) + index |= FPSCR_ALREADY_EXITED; + B(return_from_run_code[index]); +} + +void BlockOfCode::GenRunCode() { + const u8* loop, *enter_fpscr_then_loop; + + AlignCode16(); + run_code = reinterpret_cast(GetWritableCodePtr()); + + // This serves two purposes: + // 1. It saves all the registers we as a callee need to save. + // 2. It aligns the stack so that the code the JIT emits can assume + // that the stack is appropriately aligned for CALLs. + ABI_PushCalleeSaveRegistersAndAdjustStack(*this); + + MOV(Arm64Gen::X28, ABI_PARAM1); + MOVI2R(Arm64Gen::X27, cb.value_in_X27); + MOV(Arm64Gen::X25, ABI_PARAM2); // save temporarily in non-volatile register + + cb.GetTicksRemaining->EmitCall(*this); + STR(Arm64Gen::INDEX_UNSIGNED, ABI_RETURN, Arm64Gen::X28, jsi.offsetof_cycles_to_run); + MOV(Arm64Gen::X26, ABI_RETURN); + + SwitchFpscrOnEntry(); + BR(Arm64Gen::X25); + + AlignCode16(); + step_code = reinterpret_cast(GetWritableCodePtr()); + ABI_PushCalleeSaveRegistersAndAdjustStack(*this); + + MOV(Arm64Gen::X28, ABI_PARAM1); + + MOVI2R(Arm64Gen::X26, 1); + STR(Arm64Gen::INDEX_UNSIGNED, Arm64Gen::X26, Arm64Gen::X28, jsi.offsetof_cycles_to_run); + + SwitchFpscrOnEntry(); + BR(ABI_PARAM2); + + enter_fpscr_then_loop = GetCodePtr(); + SwitchFpscrOnEntry(); + loop = GetCodePtr(); + cb.LookupBlock->EmitCall(*this); + BR(ABI_RETURN); + + // Return from run code variants + const auto emit_return_from_run_code = [this, &loop, &enter_fpscr_then_loop](bool fpscr_already_exited, bool force_return){ + if (!force_return) { + CMP(Arm64Gen::X26, Arm64Gen::ZR); + B(CC_GT, fpscr_already_exited ? 
enter_fpscr_then_loop : loop); + } + + if (!fpscr_already_exited) { + SwitchFpscrOnExit(); + } + + cb.AddTicks->EmitCall(*this, [this](RegList param) { + LDR(Arm64Gen::INDEX_UNSIGNED, param[0], Arm64Gen::X28, jsi.offsetof_cycles_to_run); + SUB(param[0], param[0], Arm64Gen::X26); + }); + + ABI_PopCalleeSaveRegistersAndAdjustStack(*this); + RET(); + }; + + return_from_run_code[0] = AlignCode16(); + emit_return_from_run_code(false, false); + + return_from_run_code[FPSCR_ALREADY_EXITED] = AlignCode16(); + emit_return_from_run_code(true, false); + + return_from_run_code[FORCE_RETURN] = AlignCode16(); + emit_return_from_run_code(false, true); + + return_from_run_code[FPSCR_ALREADY_EXITED | FORCE_RETURN] = AlignCode16(); + emit_return_from_run_code(true, true); + + PerfMapRegister(run_code, GetCodePtr(), "dynarmic_dispatcher"); +} + +void BlockOfCode::SwitchFpscrOnEntry() { + MRS(ABI_SCRATCH1, Arm64Gen::FIELD_FPCR); + STR(Arm64Gen::INDEX_UNSIGNED, ABI_SCRATCH1, Arm64Gen::X28, jsi.offsetof_save_host_FPCR); + + LDR(Arm64Gen::INDEX_UNSIGNED, ABI_SCRATCH1, Arm64Gen::X28, jsi.offsetof_guest_fpcr); + _MSR(Arm64Gen::FIELD_FPCR, ABI_SCRATCH1); + LDR(Arm64Gen::INDEX_UNSIGNED, ABI_SCRATCH1, Arm64Gen::X28, jsi.offsetof_guest_fpsr); + _MSR(Arm64Gen::FIELD_FPSR, ABI_SCRATCH1); +} + +void BlockOfCode::SwitchFpscrOnExit() { + MRS(ABI_SCRATCH1, Arm64Gen::FIELD_FPCR); + STR(Arm64Gen::INDEX_UNSIGNED, ABI_SCRATCH1, Arm64Gen::X28, jsi.offsetof_guest_fpcr); + MRS(ABI_SCRATCH1, Arm64Gen::FIELD_FPSR); + STR(Arm64Gen::INDEX_UNSIGNED, ABI_SCRATCH1, Arm64Gen::X28, jsi.offsetof_guest_fpsr); + + LDR(Arm64Gen::INDEX_UNSIGNED, ABI_SCRATCH1, Arm64Gen::X28, jsi.offsetof_save_host_FPCR); + _MSR(Arm64Gen::FIELD_FPCR, ABI_SCRATCH1); +} + +void BlockOfCode::UpdateTicks() { + cb.AddTicks->EmitCall(*this, [this](RegList param) { + LDR(Arm64Gen::INDEX_UNSIGNED, param[0], Arm64Gen::X28, jsi.offsetof_cycles_to_run); + SUB(param[0], param[0], Arm64Gen::X26); + }); + + cb.GetTicksRemaining->EmitCall(*this); + STR(Arm64Gen::INDEX_UNSIGNED, ABI_RETURN, Arm64Gen::X28, jsi.offsetof_cycles_to_run); + MOV(Arm64Gen::X26, ABI_RETURN); +} + +void BlockOfCode::LookupBlock() { + cb.LookupBlock->EmitCall(*this); +} + +void BlockOfCode::EmitPatchLDR(Arm64Gen::ARM64Reg Rt, u64 lower, u64 upper) { + ASSERT_MSG(!in_far_code, "Can't patch when in far code, yet!"); + constant_pool.EmitPatchLDR(Rt, lower, upper); +} + +void BlockOfCode::PatchConstPool() { + constant_pool.PatchPool(); +} + +void BlockOfCode::SwitchToFarCode() { + ASSERT(prelude_complete); + ASSERT(!in_far_code); + in_far_code = true; + near_code_ptr = GetCodePtr(); + SetCodePtr(far_code_ptr); + + ASSERT_MSG(near_code_ptr < far_code_begin, "Near code has overwritten far code!"); +} + +void BlockOfCode::SwitchToNearCode() { + ASSERT(prelude_complete); + ASSERT(in_far_code); + in_far_code = false; + far_code_ptr = GetCodePtr(); + SetCodePtr(near_code_ptr); +} + +CodePtr BlockOfCode::GetCodeBegin() const { + return near_code_begin; +} + +u8* BlockOfCode::GetRegion() const { + return region; +} + +std::size_t BlockOfCode::GetRegionSize() const { + return total_region_size; +} + +void* BlockOfCode::AllocateFromCodeSpace(size_t alloc_size) { + ASSERT_MSG(GetSpaceLeft() >= alloc_size, "ERR_CODE_IS_TOO_BIG"); + + void* ret = GetWritableCodePtr(); + region_size += alloc_size; + SetCodePtr(GetCodePtr() + alloc_size); + memset(ret, 0, alloc_size); + return ret; +} + +void BlockOfCode::SetCodePtr(CodePtr code_ptr) { + u8* ptr = const_cast(reinterpret_cast(code_ptr)); + ARM64XEmitter::SetCodePtr(ptr); +} 
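Editorial aside: SwitchToFarCode/SwitchToNearCode above only swap the emitter's write cursor between the two regions, each time remembering where the other region stopped, so cold paths accumulate far away from the hot path. A minimal model of that bookkeeping (the state shown here is an assumption about what matters; the real class additionally carries the prelude and overlap asserts seen above):

#include <cassert>
#include <cstdint>

struct MiniCodeBlock {
    std::uintptr_t cursor;    // where the next instruction would be emitted
    std::uintptr_t near_ptr;  // saved near-code position
    std::uintptr_t far_ptr;   // saved far-code position
    bool in_far = false;

    MiniCodeBlock(std::uintptr_t near_begin, std::uintptr_t far_begin)
        : cursor(near_begin), near_ptr(near_begin), far_ptr(far_begin) {}

    void SwitchToFarCode() {
        assert(!in_far);
        in_far = true;
        near_ptr = cursor;  // remember where near code stopped
        cursor = far_ptr;   // continue emitting in the far region
    }

    void SwitchToNearCode() {
        assert(in_far);
        in_far = false;
        far_ptr = cursor;   // remember where far code stopped
        cursor = near_ptr;  // resume the hot path
    }
};

int main() {
    MiniCodeBlock code(/*near_begin=*/0x1000, /*far_begin=*/0x9000);
    code.cursor += 8;                // emit two near instructions
    code.SwitchToFarCode();
    code.cursor += 4;                // emit one cold instruction
    code.SwitchToNearCode();
    assert(code.cursor == 0x1008);   // near emission resumes where it left off
    assert(code.far_ptr == 0x9004);  // far cursor advanced past the cold path
}

This is the pattern the terminal emitters earlier in the patch use: emit the unlikely exit under far code, then switch back so the fall-through stays dense in the instruction cache.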
+ +void BlockOfCode::EnsurePatchLocationSize(CodePtr begin, size_t size) { + size_t current_size = GetCodePtr() - reinterpret_cast(begin); + ASSERT(current_size <= size); + for (u32 i = 0; i < (size - current_size) / 4; i++) { + HINT(Arm64Gen::HINT_NOP); + } +} + +//bool BlockOfCode::DoesCpuSupport(Xbyak::util::Cpu::Type type) const { +//#ifdef DYNARMIC_ENABLE_CPU_FEATURE_DETECTION +// return cpu_info.has(type); +//#else +// (void)type; +// return false; +//#endif +//} + +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/block_of_code.h b/src/dynarmic/backend/A64/block_of_code.h new file mode 100644 index 00000000..44f5c9a0 --- /dev/null +++ b/src/dynarmic/backend/A64/block_of_code.h @@ -0,0 +1,147 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. + */ + +#pragma once + +#include +#include +#include + +#include "backend/A64/callback.h" +#include "backend/A64/constant_pool.h" +#include "backend/A64/jitstate_info.h" +#include "backend/A64/emitter/a64_emitter.h" +#include "common/common_types.h" + +namespace Dynarmic::BackendA64 { + +using CodePtr = const void*; + +struct RunCodeCallbacks { + std::unique_ptr LookupBlock; + std::unique_ptr AddTicks; + std::unique_ptr GetTicksRemaining; + u64 value_in_X27; +}; + +class BlockOfCode final : public Arm64Gen::ARM64CodeBlock { +public: + BlockOfCode(RunCodeCallbacks cb, JitStateInfo jsi); + BlockOfCode(const BlockOfCode&) = delete; + + + /// Call when external emitters have finished emitting their preludes. + void PreludeComplete(); + + /// Change permissions to RW. This is required to support systems with W^X enforced. + void EnableWriting(); + /// Change permissions to RX. This is required to support systems with W^X enforced. + void DisableWriting(); + + /// Clears this block of code and resets code pointer to beginning. + void ClearCache(); + /// Calculates how much space is remaining to use. This is the minimum of near code and far code. + size_t SpaceRemaining() const; + + /// Runs emulated code from code_ptr. + void RunCode(void* jit_state, CodePtr code_ptr) const; + /// Runs emulated code from code_ptr for a single cycle. + void StepCode(void* jit_state, CodePtr code_ptr) const; + /// Code emitter: Returns to dispatcher + void ReturnFromRunCode(bool fpscr_already_exited = false); + /// Code emitter: Returns to dispatcher, forces return to host + void ForceReturnFromRunCode(bool fpscr_already_exited = false); + /// Code emitter: Makes guest FPSR and FPCR the current FPSR and FPCR + void SwitchFpscrOnEntry(); + /// Code emitter: Makes saved host FPCR the current FPCR + void SwitchFpscrOnExit(); + /// Code emitter: Updates cycles remaining my calling cb.AddTicks and cb.GetTicksRemaining + /// @note this clobbers ABI caller-save registers + void UpdateTicks(); + /// Code emitter: Performs a block lookup based on current state + /// @note this clobbers ABI caller-save registers + void LookupBlock(); + + u64 MConst(u64 lower, u64 upper = 0); + + void EmitPatchLDR(Arm64Gen::ARM64Reg Rt, u64 lower, u64 upper = 0); + + void PatchConstPool(); + + /// Far code sits far away from the near code. Execution remains primarily in near code. + /// "Cold" / Rarely executed instructions sit in far code, so the CPU doesn't fetch them unless necessary. 
+ void SwitchToFarCode(); + void SwitchToNearCode(); + + CodePtr GetCodeBegin() const; + u8* GetRegion() const; + std::size_t GetRegionSize() const; + + const void* GetReturnFromRunCodeAddress() const { + return return_from_run_code[0]; + } + + const void* GetForceReturnFromRunCodeAddress() const { + return return_from_run_code[FORCE_RETURN]; + } + + /// Allocate memory of `size` bytes from the same block of memory the code is in. + /// This is useful for objects that need to be placed close to or within code. + /// The lifetime of this memory is the same as the code around it. + void* AllocateFromCodeSpace(size_t size); + + void SetCodePtr(CodePtr code_ptr); + void EnsurePatchLocationSize(CodePtr begin, size_t size); + + Arm64Gen::ARM64FloatEmitter fp_emitter; + + // ABI registers + + static const Arm64Gen::ARM64Reg ABI_RETURN; + static const Arm64Gen::ARM64Reg ABI_RETURN2; + static const Arm64Gen::ARM64Reg ABI_PARAM1; + static const Arm64Gen::ARM64Reg ABI_PARAM2; + static const Arm64Gen::ARM64Reg ABI_PARAM3; + static const Arm64Gen::ARM64Reg ABI_PARAM4; + static const Arm64Gen::ARM64Reg ABI_PARAM5; + static const Arm64Gen::ARM64Reg ABI_PARAM6; + static const Arm64Gen::ARM64Reg ABI_PARAM7; + static const Arm64Gen::ARM64Reg ABI_PARAM8; + + static const Arm64Gen::ARM64Reg ABI_SCRATCH1; + + static const std::array ABI_PARAMS; + + // bool DoesCpuSupport(Xbyak::util::Cpu::Type type) const; + + JitStateInfo GetJitStateInfo() const { return jsi; } + +private: + RunCodeCallbacks cb; + JitStateInfo jsi; + + bool prelude_complete = false; + CodePtr near_code_begin; + CodePtr far_code_begin; + + ConstantPool constant_pool; + + bool in_far_code = false; + CodePtr near_code_ptr; + CodePtr far_code_ptr; + + using RunCodeFuncType = void(*)(void*, CodePtr); + RunCodeFuncType run_code = nullptr; + RunCodeFuncType step_code = nullptr; + static constexpr size_t FPSCR_ALREADY_EXITED = 1 << 0; + static constexpr size_t FORCE_RETURN = 1 << 1; + std::array return_from_run_code; + void GenRunCode(); + + //Xbyak::util::Cpu cpu_info; +}; + +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/block_range_information.cpp b/src/dynarmic/backend/A64/block_range_information.cpp new file mode 100644 index 00000000..1f85c861 --- /dev/null +++ b/src/dynarmic/backend/A64/block_range_information.cpp @@ -0,0 +1,45 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. + */ + +#include + +#include +#include + +#include "backend/A64/block_range_information.h" +#include "common/common_types.h" + +namespace Dynarmic::BackendA64 { + +template +void BlockRangeInformation::AddRange(boost::icl::discrete_interval range, IR::LocationDescriptor location) { + block_ranges.add(std::make_pair(range, std::set{location})); +} + +template +void BlockRangeInformation::ClearCache() { + block_ranges.clear(); +} + +template +std::unordered_set BlockRangeInformation::InvalidateRanges(const boost::icl::interval_set& ranges) { + std::unordered_set erase_locations; + for (auto invalidate_interval : ranges) { + auto pair = block_ranges.equal_range(invalidate_interval); + for (auto it = pair.first; it != pair.second; ++it) { + for (const auto &descriptor : it->second) { + erase_locations.insert(descriptor); + } + } + } + // TODO: EFFICIENCY: Remove ranges that are to be erased. 
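Editorial aside: the lookup that produced erase_locations (returned just below) leans on boost::icl's interval containers. block_ranges maps guest address ranges to the set of blocks compiled from them, and equal_range yields every stored segment overlapping an invalidated interval. A standalone sketch of the same pattern, with plain ints standing in for IR::LocationDescriptor:

#include <cstdint>
#include <iostream>
#include <set>

#include <boost/icl/interval_map.hpp>
#include <boost/icl/interval_set.hpp>

int main() {
    boost::icl::interval_map<std::uint32_t, std::set<int>> block_ranges;

    // Two compiled blocks covering [0x1000,0x1003] and [0x1002,0x1007].
    block_ranges.add(std::make_pair(
        boost::icl::discrete_interval<std::uint32_t>::closed(0x1000, 0x1003), std::set<int>{1}));
    block_ranges.add(std::make_pair(
        boost::icl::discrete_interval<std::uint32_t>::closed(0x1002, 0x1007), std::set<int>{2}));

    // A write to 0x1002 dirties both blocks.
    boost::icl::interval_set<std::uint32_t> dirty;
    dirty.add(boost::icl::discrete_interval<std::uint32_t>::closed(0x1002, 0x1002));

    std::set<int> to_erase;
    for (const auto& invalidate_interval : dirty) {
        auto pair = block_ranges.equal_range(invalidate_interval);
        for (auto it = pair.first; it != pair.second; ++it)
            to_erase.insert(it->second.begin(), it->second.end());
    }

    for (int loc : to_erase)
        std::cout << "invalidate block " << loc << '\n';  // prints blocks 1 and 2
}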
+ return erase_locations; +} + +template class BlockRangeInformation; +template class BlockRangeInformation; + +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/block_range_information.h b/src/dynarmic/backend/A64/block_range_information.h new file mode 100644 index 00000000..f9d94315 --- /dev/null +++ b/src/dynarmic/backend/A64/block_range_information.h @@ -0,0 +1,29 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. + */ + +#pragma once + +#include + +#include +#include + +#include "frontend/ir/location_descriptor.h" + +namespace Dynarmic::BackendA64 { + +template +class BlockRangeInformation { +public: + void AddRange(boost::icl::discrete_interval range, IR::LocationDescriptor location); + void ClearCache(); + std::unordered_set InvalidateRanges(const boost::icl::interval_set& ranges); + +private: + boost::icl::interval_map> block_ranges; +}; + +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/callback.cpp b/src/dynarmic/backend/A64/callback.cpp new file mode 100644 index 00000000..c7e19f64 --- /dev/null +++ b/src/dynarmic/backend/A64/callback.cpp @@ -0,0 +1,41 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. + */ + +#include "backend/A64/callback.h" +#include "backend/A64/block_of_code.h" + +namespace Dynarmic::BackendA64 { + +Callback::~Callback() = default; + +void SimpleCallback::EmitCall(BlockOfCode& code, std::function l) const { + l({code.ABI_PARAM1, code.ABI_PARAM2, code.ABI_PARAM3, code.ABI_PARAM4}); + code.QuickCallFunction(fn); +} + +void SimpleCallback::EmitCallWithReturnPointer(BlockOfCode& code, std::function l) const { + l(code.ABI_PARAM1, {code.ABI_PARAM2, code.ABI_PARAM3, code.ABI_PARAM4}); + code.QuickCallFunction(fn); +} + +void ArgCallback::EmitCall(BlockOfCode& code, std::function l) const { + l({code.ABI_PARAM2, code.ABI_PARAM3, code.ABI_PARAM4}); + code.MOVI2R(code.ABI_PARAM1, arg); + code.QuickCallFunction(fn); +} + +void ArgCallback::EmitCallWithReturnPointer(BlockOfCode& code, std::function l) const { +#if defined(WIN32) && !defined(__MINGW64__) + l(code.ABI_PARAM2, {code.ABI_PARAM3, code.ABI_PARAM4}); + code.MOVI2R(code.ABI_PARAM1, arg); +#else + l(code.ABI_PARAM1, {code.ABI_PARAM3, code.ABI_PARAM4}); + code.MOVI2R(code.ABI_PARAM2, arg); +#endif + code.QuickCallFunction(fn); +} + +} // namespace Dynarmic::BackendX64 diff --git a/src/dynarmic/backend/A64/callback.h b/src/dynarmic/backend/A64/callback.h new file mode 100644 index 00000000..c9d88db4 --- /dev/null +++ b/src/dynarmic/backend/A64/callback.h @@ -0,0 +1,54 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. 
+ */ + +#pragma once + +#include +#include + +#include "backend/A64/emitter/a64_emitter.h" +#include "common/common_types.h" + +namespace Dynarmic::BackendA64 { + +using RegList = std::vector; + +class BlockOfCode; + +class Callback { +public: + virtual ~Callback(); + + virtual void EmitCall(BlockOfCode& code, std::function fn = [](RegList) {}) const = 0; + virtual void EmitCallWithReturnPointer(BlockOfCode& code, std::function fn) const = 0; +}; + +class SimpleCallback final : public Callback { +public: + template + SimpleCallback(Function fn) : fn(reinterpret_cast(fn)) {} + + void EmitCall(BlockOfCode& code, std::function fn = [](RegList) {}) const override; + void EmitCallWithReturnPointer(BlockOfCode& code, std::function fn) const override; + +private: + void (*fn)(); +}; + +class ArgCallback final : public Callback { +public: + template + ArgCallback(Function fn, u64 arg) : fn(reinterpret_cast(fn)), arg(arg) {} + + void EmitCall(BlockOfCode& code, std::function fn = [](RegList) {}) const override; + void EmitCallWithReturnPointer(BlockOfCode& code, std::function fn) const override; + +private: + void (*fn)(); + u64 arg; +}; + +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/constant_pool.cpp b/src/dynarmic/backend/A64/constant_pool.cpp new file mode 100644 index 00000000..0b3a0f20 --- /dev/null +++ b/src/dynarmic/backend/A64/constant_pool.cpp @@ -0,0 +1,65 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. + */ + +#include + +#include "backend/A64/block_of_code.h" +#include "backend/A64/constant_pool.h" +#include "common/assert.h" + +namespace Dynarmic::BackendA64 { + +ConstantPool::ConstantPool(BlockOfCode& code) : code(code) {} + +void ConstantPool::EmitPatchLDR(Arm64Gen::ARM64Reg Rt, u64 lower, u64 upper) { + const auto constant = std::make_tuple(lower, upper); + auto iter = constant_info.find(constant); + if (iter == constant_info.end()) { + struct PatchInfo p = { code.GetCodePtr(), Rt, constant }; + patch_info.emplace_back(p); + code.BRK(0); + return; + } + + const s32 offset = reinterpret_cast(iter->second) - reinterpret_cast(code.GetCodePtr()); + + if (!(offset >= -0x40000 && offset <= 0x3FFFF)) { + constant_info.erase(constant); + struct PatchInfo p = { code.GetCodePtr(), Rt, constant }; + patch_info.emplace_back(p); + code.BRK(0x42); + return; + } + DEBUG_ASSERT((offset & 3) == 0); + code.LDR(Rt, offset / 4); +} + +void ConstantPool::PatchPool() { + u8* pool_ptr = code.GetWritableCodePtr(); + for (PatchInfo patch : patch_info) { + auto iter = constant_info.find(patch.constant); + if (iter == constant_info.end()) { + std::memcpy(pool_ptr, &std::get<0>(patch.constant), sizeof(u64)); + std::memcpy(pool_ptr + sizeof(u64), &std::get<1>(patch.constant), sizeof(u64)); + iter = constant_info.emplace(patch.constant, pool_ptr).first; + pool_ptr += align_size; + } + code.SetCodePtr(patch.ptr); + + const s32 offset = reinterpret_cast(iter->second) - reinterpret_cast(code.GetCodePtr()); + DEBUG_ASSERT((offset & 3) == 0); + code.LDR(patch.Rt, offset / 4); + } + patch_info.clear(); + code.SetCodePtr(pool_ptr); +} + +void ConstantPool::Clear() { + constant_info.clear(); + patch_info.clear(); +} + +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/constant_pool.h b/src/dynarmic/backend/A64/constant_pool.h new file mode 100644 index 00000000..ede34e5b --- /dev/null +++ 
b/src/dynarmic/backend/A64/constant_pool.h @@ -0,0 +1,47 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. + */ + +#pragma once + +#include + +#include "common/common_types.h" + +namespace Dynarmic::BackendA64 { + +class BlockOfCode; + +/// ConstantPool allocates a block of memory from BlockOfCode. +/// It places constants into this block of memory, returning the address +/// of the memory location where the constant is placed. If the constant +/// already exists, its memory location is reused. +class ConstantPool final { +public: + ConstantPool(BlockOfCode& code); + + void EmitPatchLDR(Arm64Gen::ARM64Reg Rt, u64 lower, u64 upper = 0); + + void PatchPool(); + + void Clear(); + +private: + static constexpr size_t align_size = 16; // bytes + + std::map, void*> constant_info; + + BlockOfCode& code; + + struct PatchInfo { + const void* ptr; + Arm64Gen::ARM64Reg Rt; + std::tuple constant; + }; + + std::vector patch_info; +}; + +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/devirtualize.h b/src/dynarmic/backend/A64/devirtualize.h new file mode 100644 index 00000000..caefc9b0 --- /dev/null +++ b/src/dynarmic/backend/A64/devirtualize.h @@ -0,0 +1,77 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. + */ + +#pragma once + +#include +#include + +#include + +#include "backend/A64/callback.h" +#include "common/assert.h" +#include "common/cast_util.h" +#include "common/common_types.h" + +namespace Dynarmic::BackendA64 { + +namespace impl { + +template +struct ThunkBuilder; + +template +struct ThunkBuilder { + static R Thunk(C* this_, Args... args) { + return (this_->*mfp)(std::forward(args)...); + } +}; + +} // namespace impl + +template +ArgCallback DevirtualizeGeneric(mp::class_type* this_) { + return ArgCallback{&impl::ThunkBuilder::Thunk, reinterpret_cast(this_)}; +} + +template +ArgCallback DevirtualizeWindows(mp::class_type* this_) { + static_assert(sizeof(mfp) == 8); + return ArgCallback{Common::BitCast(mfp), reinterpret_cast(this_)}; +} + +template +ArgCallback DevirtualizeAarch64(mp::class_type* this_) { + struct MemberFunctionPointer { + /// For a non-virtual function, this is a simple function pointer. + /// For a virtual function, it is virtual table offset in bytes. + u64 ptr; + /// Twice the required adjustment to `this`, plus 1 if the member function is virtual. 
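Editorial aside (the adj field that the comment above describes follows right after this note): on the AArch64 variant of the Itanium C++ ABI the virtual/non-virtual discriminator is the low bit of adj, whereas on x86-64 it is the low bit of ptr, which is why this fast path only applies to some hosts. A hedged sketch that inspects the same 16-byte layout with the BitCast-style trick used below; it is only meaningful under an Itanium-ABI toolchain (GCC/Clang), and the printed values are implementation-defined:

#include <cstdint>
#include <cstdio>
#include <cstring>

// Assumed Itanium layout of a pointer-to-member-function (not valid on MSVC).
struct ItaniumPmf {
    std::uint64_t ptr;  // code address, or vtable offset when the target is virtual
    std::uint64_t adj;  // this-adjustment (and, on AArch64, the virtual flag in bit 0)
};

struct Callbacks {
    virtual ~Callbacks() = default;
    virtual void OnTick() {}  // virtual: resolved through the vtable
    void Helper() {}          // non-virtual: resolved directly
};

template <typename Mfp>
ItaniumPmf Inspect(Mfp mfp) {
    static_assert(sizeof(Mfp) == sizeof(ItaniumPmf), "expects the 16-byte Itanium layout");
    ItaniumPmf out;
    std::memcpy(&out, &mfp, sizeof(out));  // same idea as Common::BitCast above
    return out;
}

int main() {
    const ItaniumPmf v = Inspect(&Callbacks::OnTick);
    const ItaniumPmf n = Inspect(&Callbacks::Helper);
    // On AArch64, expect v.adj to have its low bit set and v.ptr to be a vtable offset;
    // n.ptr should be the address of Callbacks::Helper with n.adj even.
    std::printf("virtual:     ptr=0x%llx adj=%llu\n",
                (unsigned long long)v.ptr, (unsigned long long)v.adj);
    std::printf("non-virtual: ptr=0x%llx adj=%llu\n",
                (unsigned long long)n.ptr, (unsigned long long)n.adj);
}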
+ u64 adj; + } mfp_struct = Common::BitCast(mfp); + + static_assert(sizeof(MemberFunctionPointer) == 16); + static_assert(sizeof(MemberFunctionPointer) == sizeof(mfp)); + + u64 fn_ptr = mfp_struct.ptr; + u64 this_ptr = reinterpret_cast(this_) + mfp_struct.adj / 2; + if (mfp_struct.adj & 1) { + u64 vtable = Common::BitCastPointee(this_ptr); + fn_ptr = Common::BitCastPointee(vtable + fn_ptr); + } + return ArgCallback{fn_ptr, this_ptr}; +} + +template +ArgCallback Devirtualize(mp::class_type* this_) { +#if defined(linux) || defined(__linux) || defined(__linux__) + return DevirtualizeAarch64(this_); +#else + return DevirtualizeGeneric(this_); +#endif +} + +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/emit_a64.cpp b/src/dynarmic/backend/A64/emit_a64.cpp new file mode 100644 index 00000000..c1a3070f --- /dev/null +++ b/src/dynarmic/backend/A64/emit_a64.cpp @@ -0,0 +1,286 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. + */ + +#include +#include + +#include "backend/A64/block_of_code.h" +#include "backend/A64/emit_a64.h" +#include "backend/A64/hostloc.h" +#include "backend/A64/perf_map.h" +#include "common/assert.h" +#include "common/bit_util.h" +#include "common/common_types.h" +#include "common/scope_exit.h" +#include "common/variant_util.h" +#include "frontend/ir/basic_block.h" +#include "frontend/ir/microinstruction.h" +#include "frontend/ir/opcodes.h" + +// TODO: Have ARM flags in host flags and not have them use up GPR registers unless necessary. +// TODO: Actually implement that proper instruction selector you've always wanted to sweetheart. + +namespace Dynarmic::BackendA64 { + +EmitContext::EmitContext(RegAlloc& reg_alloc, IR::Block& block) + : reg_alloc(reg_alloc), block(block) {} + +void EmitContext::EraseInstruction(IR::Inst* inst) { + block.Instructions().erase(inst); + inst->ClearArgs(); +} + +EmitA64::EmitA64(BlockOfCode& code) + : code(code) {} + +EmitA64::~EmitA64() = default; + +std::optional EmitA64::GetBasicBlock(IR::LocationDescriptor descriptor) const { + auto iter = block_descriptors.find(descriptor); + if (iter == block_descriptors.end()) + return std::nullopt; + return iter->second; +} + +void EmitA64::EmitVoid(EmitContext&, IR::Inst*) { +} + +void EmitA64::EmitBreakpoint(EmitContext&, IR::Inst*) { + code.BRK(0); +} + +void EmitA64::EmitIdentity(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + if (!args[0].IsImmediate()) { + ctx.reg_alloc.DefineValue(inst, args[0]); + } +} + +void EmitA64::PushRSBHelper(ARM64Reg loc_desc_reg, ARM64Reg index_reg, IR::LocationDescriptor target) { + auto iter = block_descriptors.find(target); + CodePtr target_code_ptr = iter != block_descriptors.end() + ? 
iter->second.entrypoint + : code.GetReturnFromRunCodeAddress(); + + code.LDR(INDEX_UNSIGNED, DecodeReg(index_reg), X28, code.GetJitStateInfo().offsetof_rsb_ptr); + + code.MOVI2R(loc_desc_reg, target.Value()); + + patch_information[target].mov_x0.emplace_back(code.GetCodePtr()); + EmitPatchMovX0(target_code_ptr); + + code.ADD(code.ABI_SCRATCH1, X28, DecodeReg(index_reg), ArithOption{index_reg, ST_LSL, 3}); + code.STR(INDEX_UNSIGNED, loc_desc_reg, code.ABI_SCRATCH1, code.GetJitStateInfo().offsetof_rsb_location_descriptors); + code.STR(INDEX_UNSIGNED, X0, code.ABI_SCRATCH1, code.GetJitStateInfo().offsetof_rsb_codeptrs); + + code.ADDI2R(DecodeReg(index_reg), DecodeReg(index_reg), 1); + code.ANDI2R(DecodeReg(index_reg), DecodeReg(index_reg), code.GetJitStateInfo().rsb_ptr_mask, code.ABI_SCRATCH1); + code.STR(INDEX_UNSIGNED, DecodeReg(index_reg), X28, code.GetJitStateInfo().offsetof_rsb_ptr); +} + +void EmitA64::EmitPushRSB(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ASSERT(args[0].IsImmediate()); + u64 unique_hash_of_target = args[0].GetImmediateU64(); + + ctx.reg_alloc.ScratchGpr({HostLoc::X0}); + Arm64Gen::ARM64Reg loc_desc_reg = ctx.reg_alloc.ScratchGpr(); + Arm64Gen::ARM64Reg index_reg = ctx.reg_alloc.ScratchGpr(); + + PushRSBHelper(loc_desc_reg, index_reg, IR::LocationDescriptor{unique_hash_of_target}); +} + +void EmitA64::EmitGetCarryFromOp(EmitContext&, IR::Inst*) { + ASSERT_FALSE("should never happen"); +} + +void EmitA64::EmitGetOverflowFromOp(EmitContext&, IR::Inst*) { + ASSERT_FALSE("should never happen"); +} + +void EmitA64::EmitGetGEFromOp(EmitContext&, IR::Inst*) { + ASSERT_FALSE("should never happen"); +} + +void EmitA64::EmitGetUpperFromOp(EmitContext&, IR::Inst*) { + ASSERT_FALSE("should never happen"); +} + +void EmitA64::EmitGetLowerFromOp(EmitContext&, IR::Inst*) { + ASSERT_FALSE("should never happen"); +} + +void EmitA64::EmitGetNZCVFromOp(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + Arm64Gen::ARM64Reg nzcv = ctx.reg_alloc.ScratchGpr(); + Arm64Gen::ARM64Reg value = ctx.reg_alloc.UseGpr(args[0]); + code.CMP(value, ZR); + code.MRS(nzcv, FIELD_NZCV); + ctx.reg_alloc.DefineValue(inst, nzcv); +} + +void EmitA64::EmitNZCVFromPackedFlags(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + if (args[0].IsImmediate()) { + Arm64Gen::ARM64Reg nzcv = DecodeReg(ctx.reg_alloc.ScratchGpr()); + u32 value = 0; + value |= Common::Bit<31>(args[0].GetImmediateU32()) ? (1 << 15) : 0; + value |= Common::Bit<30>(args[0].GetImmediateU32()) ? (1 << 14) : 0; + value |= Common::Bit<29>(args[0].GetImmediateU32()) ? (1 << 8) : 0; + value |= Common::Bit<28>(args[0].GetImmediateU32()) ? 
(1 << 0) : 0; + code.MOVI2R(nzcv, value); + ctx.reg_alloc.DefineValue(inst, nzcv); + } else { + Arm64Gen::ARM64Reg nzcv = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[0])); + Arm64Gen::ARM64Reg scratch = DecodeReg(ctx.reg_alloc.ScratchGpr()); + // TODO: Optimize + code.LSR(nzcv, nzcv, 28); + code.MOVI2R(scratch, 0b00010000'10000001); + code.MUL(nzcv, nzcv, scratch); + code.ANDI2R(nzcv, nzcv, 1, scratch); + ctx.reg_alloc.DefineValue(inst, nzcv); + } +} + +void EmitA64::EmitAddCycles(size_t cycles) { + ASSERT(cycles < std::numeric_limits::max()); + code.SUBI2R(X26, X26, static_cast(cycles)); +} + +FixupBranch EmitA64::EmitCond(IR::Cond cond) { + FixupBranch label; + + const Arm64Gen::ARM64Reg cpsr = code.ABI_SCRATCH1; + code.LDR(INDEX_UNSIGNED, DecodeReg(cpsr), X28, code.GetJitStateInfo().offsetof_cpsr_nzcv); + code._MSR(FIELD_NZCV, cpsr); + + switch (cond) { + case IR::Cond::EQ: //z + label = code.B(CC_EQ); + break; + case IR::Cond::NE: //!z + label = code.B(CC_NEQ); + break; + case IR::Cond::CS: //c + label = code.B(CC_CS); + break; + case IR::Cond::CC: //!c + label = code.B(CC_CC); + break; + case IR::Cond::MI: //n + label = code.B(CC_MI); + break; + case IR::Cond::PL: //!n + label = code.B(CC_PL); + break; + case IR::Cond::VS: //v + label = code.B(CC_VS); + break; + case IR::Cond::VC: //!v + label = code.B(CC_VC); + break; + case IR::Cond::HI: //c & !z + label = code.B(CC_HI); + break; + case IR::Cond::LS: //!c | z + label = code.B(CC_LS); + break; + case IR::Cond::GE: // n == v + label = code.B(CC_GE); + break; + case IR::Cond::LT: // n != v + label = code.B(CC_LT); + break; + case IR::Cond::GT: // !z & (n == v) + label = code.B(CC_GT); + break; + case IR::Cond::LE: // z | (n != v) + label = code.B(CC_LE); + break; + default: + ASSERT_MSG(false, "Unknown cond {}", static_cast(cond)); + break; + } + + return label; +} + +EmitA64::BlockDescriptor EmitA64::RegisterBlock(const IR::LocationDescriptor& descriptor, CodePtr entrypoint, size_t size) { + PerfMapRegister(entrypoint, code.GetCodePtr(), LocationDescriptorToFriendlyName(descriptor)); + Patch(descriptor, entrypoint); + BlockDescriptor block_desc{entrypoint, size}; + + block_descriptors.emplace(descriptor.Value(), block_desc); + return block_desc; +} + +void EmitA64::EmitTerminal(IR::Terminal terminal, IR::LocationDescriptor initial_location, bool is_single_step) { + Common::VisitVariant(terminal, [this, initial_location, is_single_step](auto x) { + using T = std::decay_t; + if constexpr (!std::is_same_v) { + this->EmitTerminalImpl(x, initial_location, is_single_step); + } else { + ASSERT_MSG(false, "Invalid terminal"); + } + }); +} + +void EmitA64::Patch(const IR::LocationDescriptor& desc, CodePtr bb) { + const CodePtr save_code_ptr = code.GetCodePtr(); + const PatchInformation& patch_info = patch_information[desc]; + + for (CodePtr location : patch_info.jg) { + code.SetCodePtr(location); + EmitPatchJg(desc, bb); + code.FlushIcache(); + } + + for (CodePtr location : patch_info.jmp) { + code.SetCodePtr(location); + EmitPatchJmp(desc, bb); + code.FlushIcache(); + } + + for (CodePtr location : patch_info.mov_x0) { + code.SetCodePtr(location); + EmitPatchMovX0(bb); + code.FlushIcache(); + } + + code.SetCodePtr(save_code_ptr); +} + +void EmitA64::Unpatch(const IR::LocationDescriptor& desc) { + Patch(desc, nullptr); +} + +void EmitA64::ClearCache() { + block_descriptors.clear(); + patch_information.clear(); + + PerfMapClear(); +} + +void EmitA64::InvalidateBasicBlocks(const std::unordered_set& locations) { + code.EnableWriting(); + SCOPE_EXIT 
{ code.DisableWriting(); }; + + for (const auto &descriptor : locations) { + auto it = block_descriptors.find(descriptor); + if (it == block_descriptors.end()) { + continue; + } + + if (patch_information.count(descriptor)) { + Unpatch(descriptor); + } + block_descriptors.erase(it); + } +} + +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/emit_a64.h b/src/dynarmic/backend/A64/emit_a64.h new file mode 100644 index 00000000..1716af6d --- /dev/null +++ b/src/dynarmic/backend/A64/emit_a64.h @@ -0,0 +1,124 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "backend/A64/reg_alloc.h" +#include "backend/A64/emitter/a64_emitter.h" +#include "common/bit_util.h" +#include "common/fp/rounding_mode.h" +#include "frontend/ir/location_descriptor.h" +#include "frontend/ir/terminal.h" + +namespace Dynarmic::IR { +class Block; +class Inst; +} // namespace Dynarmic::IR + +namespace Dynarmic::BackendA64 { + +class BlockOfCode; + +using namespace Arm64Gen; + +using A64FullVectorWidth = std::integral_constant; + +// Array alias that always sizes itself according to the given type T +// relative to the size of a vector register. e.g. T = u32 would result +// in a std::array. +template +using VectorArray = std::array()>; + +struct EmitContext { + EmitContext(RegAlloc& reg_alloc, IR::Block& block); + + void EraseInstruction(IR::Inst* inst); + + virtual FP::RoundingMode FPSCR_RMode() const = 0; + virtual u32 FPCR() const = 0; + virtual bool FPSCR_FTZ() const = 0; + virtual bool FPSCR_DN() const = 0; + virtual bool AccurateNaN() const { return true; } + + RegAlloc& reg_alloc; + IR::Block& block; +}; + +class EmitA64 { +public: + struct BlockDescriptor { + CodePtr entrypoint; // Entrypoint of emitted code + size_t size; // Length in bytes of emitted code + }; + + EmitA64(BlockOfCode& code); + virtual ~EmitA64(); + + /// Looks up an emitted host block in the cache. + std::optional GetBasicBlock(IR::LocationDescriptor descriptor) const; + + /// Empties the entire cache. + virtual void ClearCache(); + + /// Invalidates a selection of basic blocks. + void InvalidateBasicBlocks(const std::unordered_set& locations); + +protected: + // Microinstruction emitters +#define OPCODE(name, type, ...) void Emit##name(EmitContext& ctx, IR::Inst* inst); +#define A32OPC(...) +#define A64OPC(...) 
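Note on the three OPCODE/A32OPC/A64OPC macro definitions above: together with the `#include "backend/A64/opcodes.inc"` that follows immediately below, they form an X-macro. opcodes.inc is a list of OPCODE(...)/A32OPC(...)/A64OPC(...) entries, so redefining the macros before including it expands to one Emit<Name> declaration per generic IR opcode while the A32/A64-specific entries expand to nothing here. A minimal, self-contained sketch of the same pattern follows; the opcode names and list macro in it are invented for illustration, the real list lives in opcodes.inc.

    #include <cstdio>

    // Stand-in for opcodes.inc: each entry names one IR opcode.
    #define SKETCH_OPCODE_LIST(X) \
        X(Add32)                  \
        X(Sub32)

    class EmitterSketch {
    public:
        // Expands to: void EmitAdd32(); void EmitSub32();
    #define OPCODE(name) void Emit##name();
        SKETCH_OPCODE_LIST(OPCODE)
    #undef OPCODE
    };

    void EmitterSketch::EmitAdd32() { std::puts("emit Add32"); }
    void EmitterSketch::EmitSub32() { std::puts("emit Sub32"); }

    int main() {
        EmitterSketch{}.EmitAdd32();
        EmitterSketch{}.EmitSub32();
    }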
+#include "backend/A64/opcodes.inc" +#undef OPCODE +#undef A32OPC +#undef A64OPC + + // Helpers + virtual std::string LocationDescriptorToFriendlyName(const IR::LocationDescriptor&) const = 0; + void EmitAddCycles(size_t cycles); + FixupBranch EmitCond(IR::Cond cond); + BlockDescriptor RegisterBlock(const IR::LocationDescriptor& location_descriptor, CodePtr entrypoint, size_t size); + void PushRSBHelper(Arm64Gen::ARM64Reg loc_desc_reg, Arm64Gen::ARM64Reg index_reg, IR::LocationDescriptor target); + + // Terminal instruction emitters + void EmitTerminal(IR::Terminal terminal, IR::LocationDescriptor initial_location, bool is_single_step); + virtual void EmitTerminalImpl(IR::Term::Interpret terminal, IR::LocationDescriptor initial_location, bool is_single_step) = 0; + virtual void EmitTerminalImpl(IR::Term::ReturnToDispatch terminal, IR::LocationDescriptor initial_location, bool is_single_step) = 0; + virtual void EmitTerminalImpl(IR::Term::LinkBlock terminal, IR::LocationDescriptor initial_location, bool is_single_step) = 0; + virtual void EmitTerminalImpl(IR::Term::LinkBlockFast terminal, IR::LocationDescriptor initial_location, bool is_single_step) = 0; + virtual void EmitTerminalImpl(IR::Term::PopRSBHint terminal, IR::LocationDescriptor initial_location, bool is_single_step) = 0; + virtual void EmitTerminalImpl(IR::Term::FastDispatchHint terminal, IR::LocationDescriptor initial_location, bool is_single_step) = 0; + virtual void EmitTerminalImpl(IR::Term::If terminal, IR::LocationDescriptor initial_location, bool is_single_step) = 0; + virtual void EmitTerminalImpl(IR::Term::CheckBit terminal, IR::LocationDescriptor initial_location, bool is_single_step) = 0; + virtual void EmitTerminalImpl(IR::Term::CheckHalt terminal, IR::LocationDescriptor initial_location, bool is_single_step) = 0; + + // Patching + struct PatchInformation { + std::vector jg; + std::vector jmp; + std::vector mov_x0; + }; + void Patch(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr); + virtual void Unpatch(const IR::LocationDescriptor& target_desc); + virtual void EmitPatchJg(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr = nullptr) = 0; + virtual void EmitPatchJmp(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr = nullptr) = 0; + virtual void EmitPatchMovX0(CodePtr target_code_ptr = nullptr) = 0; + + // State + BlockOfCode& code; + std::unordered_map block_descriptors; + std::unordered_map patch_information; +}; + +} // namespace Dynarmic::BackendX64 diff --git a/src/dynarmic/backend/A64/emit_a64_data_processing.cpp b/src/dynarmic/backend/A64/emit_a64_data_processing.cpp new file mode 100644 index 00000000..07b49276 --- /dev/null +++ b/src/dynarmic/backend/A64/emit_a64_data_processing.cpp @@ -0,0 +1,1128 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. 
+ */ + +#include "backend/A64/block_of_code.h" +#include "backend/A64/emit_a64.h" +#include "common/assert.h" +#include "common/common_types.h" +#include "frontend/ir/basic_block.h" +#include "frontend/ir/microinstruction.h" +#include "frontend/ir/opcodes.h" + +namespace Dynarmic::BackendA64 { + +void EmitA64::EmitPack2x32To1x64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ARM64Reg lo = ctx.reg_alloc.UseScratchGpr(args[0]); + ARM64Reg hi = ctx.reg_alloc.UseScratchGpr(args[1]); + + code.ORR(lo, lo, hi, ArithOption{hi, ST_LSL, 32}); + + ctx.reg_alloc.DefineValue(inst, lo); +} + +//void EmitA64::EmitPack2x64To1x128(EmitContext& ctx, IR::Inst* inst) { +// auto args = ctx.reg_alloc.GetArgumentInfo(inst); +// Xbyak::Reg64 lo = ctx.reg_alloc.UseGpr(args[0]); +// Xbyak::Reg64 hi = ctx.reg_alloc.UseGpr(args[1]); +// Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); +// +// if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { +// code.movq(result, lo); +// code.pinsrq(result, hi, 1); +// } else { +// Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); +// code.movq(result, lo); +// code.movq(tmp, hi); +// code.punpcklqdq(result, tmp); +// } +// +// ctx.reg_alloc.DefineValue(inst, result); +//} + +void EmitA64::EmitLeastSignificantWord(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ctx.reg_alloc.DefineValue(inst, args[0]); +} + +void EmitA64::EmitMostSignificantWord(EmitContext& ctx, IR::Inst* inst) { + auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ARM64Reg result = ctx.reg_alloc.UseScratchGpr(args[0]); + + if (carry_inst) { + ARM64Reg carry = ctx.reg_alloc.ScratchGpr(); + code.UBFX(carry, result, 31, 1); + ctx.reg_alloc.DefineValue(carry_inst, carry); + ctx.EraseInstruction(carry_inst); + } + + code.LSR(result, result, 32); + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitLeastSignificantHalf(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ctx.reg_alloc.DefineValue(inst, args[0]); +} + +void EmitA64::EmitLeastSignificantByte(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ctx.reg_alloc.DefineValue(inst, args[0]); +} + +void EmitA64::EmitMostSignificantBit(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[0])); + // TODO: Flag optimization + code.LSR(result,result, 31); + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitIsZero32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[0])); + // TODO: Flag optimization + code.CMP(result, WZR); + code.CSET(result, CC_EQ); + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitIsZero64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ARM64Reg result = ctx.reg_alloc.UseScratchGpr(args[0]); + // TODO: Flag optimization + code.CMP(result, ZR); + code.CSET(result, CC_EQ); + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitTestBit(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ARM64Reg result = ctx.reg_alloc.UseScratchGpr(args[0]); + ASSERT(args[1].IsImmediate()); + // TODO: Flag optimization + code.UBFX(result, result, args[1].GetImmediateU8(), 1); + 
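+    // Reader's note: UBFX Rd, Rn, #lsb, #width extracts `width` bits starting at
+    // bit `lsb` into the low bits of Rd. For the width-1 case used here the result
+    // is equivalent to (operand >> bit) & 1, which is exactly what TestBit needs.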
ctx.reg_alloc.DefineValue(inst, result); +} + +static void EmitConditionalSelect(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, int bitsize) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + Arm64Gen::ARM64Reg nzcv = ctx.reg_alloc.ScratchGpr(); + Arm64Gen::ARM64Reg then_ = ctx.reg_alloc.UseGpr(args[1]); + Arm64Gen::ARM64Reg else_ = ctx.reg_alloc.UseScratchGpr(args[2]); + + then_ = bitsize == 64 ? then_ : DecodeReg(then_); + else_ = bitsize == 64 ? else_ : DecodeReg(else_); + + code.LDR(INDEX_UNSIGNED, DecodeReg(nzcv), X28, code.GetJitStateInfo().offsetof_cpsr_nzcv); + // TODO: Flag optimization + code._MSR(FIELD_NZCV, nzcv); + + switch (args[0].GetImmediateCond()) { + case IR::Cond::EQ: //z + code.CSEL(else_, else_, then_ , CC_EQ); + break; + case IR::Cond::NE: //!z + code.CSEL(else_, else_, then_, CC_NEQ); + break; + case IR::Cond::CS: //c + code.CSEL(else_, else_, then_, CC_CS); + break; + case IR::Cond::CC: //!c + code.CSEL(else_, else_, then_ , CC_CC); + break; + case IR::Cond::MI: //n + code.CSEL(else_, else_, then_, CC_MI); + break; + case IR::Cond::PL: //!n + code.CSEL(else_, else_, then_, CC_PL); + break; + case IR::Cond::VS: //v + code.CSEL(else_, else_, then_, CC_VS); + break; + case IR::Cond::VC: //!v + code.CSEL(else_, else_, then_, CC_VC); + break; + case IR::Cond::HI: //c & !z + code.CSEL(else_, else_, then_, CC_HI); + break; + case IR::Cond::LS: //!c | z + code.CSEL(else_, else_, then_, CC_LS); + break; + case IR::Cond::GE: // n == v + code.CSEL(else_, else_, then_, CC_GE); + break; + case IR::Cond::LT: // n != v + code.CSEL(else_, else_, then_, CC_LT); + break; + case IR::Cond::GT: // !z & (n == v) + code.CSEL(else_, else_, then_, CC_GT); + break; + case IR::Cond::LE: // z | (n != v) + code.CSEL(else_, else_, then_, CC_LE); + break; + case IR::Cond::AL: + case IR::Cond::NV: + code.MOV(else_, then_); + break; + default: + ASSERT_MSG(false, "Invalid cond {}", static_cast(args[0].GetImmediateCond())); + } + + ctx.reg_alloc.DefineValue(inst, else_); +} + +void EmitA64::EmitConditionalSelect32(EmitContext& ctx, IR::Inst* inst) { + EmitConditionalSelect(code, ctx, inst, 32); +} + +void EmitA64::EmitConditionalSelect64(EmitContext& ctx, IR::Inst* inst) { + EmitConditionalSelect(code, ctx, inst, 64); +} + +void EmitA64::EmitConditionalSelectNZCV(EmitContext& ctx, IR::Inst* inst) { + EmitConditionalSelect(code, ctx, inst, 32); +} + +void EmitA64::EmitLogicalShiftLeft32(EmitContext& ctx, IR::Inst* inst) { + auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto& operand_arg = args[0]; + auto& shift_arg = args[1]; + auto& carry_arg = args[2]; + + if (!carry_inst) { + if (shift_arg.IsImmediate()) { + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg)); + u8 shift = shift_arg.GetImmediateU8(); + + if (shift <= 31) { + code.LSL(result, result, shift); + } else { + code.MOV(result, WZR); + } + + ctx.reg_alloc.DefineValue(inst, result); + } else { + //ctx.reg_alloc.Use(shift_arg, HostLoc::X0); + Arm64Gen::ARM64Reg shift = DecodeReg(ctx.reg_alloc.UseScratchGpr(shift_arg)); + Arm64Gen::ARM64Reg result = ctx.reg_alloc.UseScratchGpr(operand_arg); + + code.ANDI2R(shift, shift, 0xFF); + code.LSLV(result, result, shift); + code.CMPI2R(shift, 32); + code.CSEL(result, WZR, DecodeReg(result), CC_GE); + ctx.reg_alloc.DefineValue(inst, DecodeReg(result)); + } + } else { + if (shift_arg.IsImmediate()) { + u8 shift = shift_arg.GetImmediateU8(); + Arm64Gen::ARM64Reg result = 
DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg)); + Arm64Gen::ARM64Reg carry = DecodeReg(ctx.reg_alloc.UseScratchGpr(carry_arg)); + + if (shift == 0) { + // There is nothing more to do. + } else if (shift < 32) { + code.UBFX(carry, result, 32 - shift, 1); + code.LSL(result, result, shift); + } else if (shift > 32) { + code.MOV(result, WZR); + code.MOV(carry, WZR); + } else { + code.ANDI2R(carry, result, 1); + code.MOV(result, WZR); + } + + ctx.reg_alloc.DefineValue(carry_inst, carry); + ctx.EraseInstruction(carry_inst); + ctx.reg_alloc.DefineValue(inst, result); + } else { + Arm64Gen::ARM64Reg shift = DecodeReg(ctx.reg_alloc.UseScratchGpr(shift_arg)); + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg)); + Arm64Gen::ARM64Reg carry = DecodeReg(ctx.reg_alloc.UseScratchGpr(carry_arg)); + + FixupBranch end; + + code.ANDSI2R(shift, shift, 0xFF); + // if (Rs & 0xFF == 0) goto end; + end = code.B(CC_EQ); + + code.CMPI2R(shift, 32); + code.SUBI2R(shift, shift, 1); // Subtract 1 to get the bit that is shiftedout, into the MSB. + code.LSLV(result, result, shift); + code.UBFX(carry, result, 31, 1); + code.LSL(result, result, 1); + + code.CSEL(result, result, WZR, CC_LT); + code.CSEL(carry, carry, WZR, CC_LE); + + code.SetJumpTarget(end); + + ctx.reg_alloc.DefineValue(carry_inst, carry); + ctx.EraseInstruction(carry_inst); + ctx.reg_alloc.DefineValue(inst, result); + } + } +} + +void EmitA64::EmitLogicalShiftLeft64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto& operand_arg = args[0]; + auto& shift_arg = args[1]; + + if (shift_arg.IsImmediate()) { + ARM64Reg result = ctx.reg_alloc.UseScratchGpr(operand_arg); + u8 shift = shift_arg.GetImmediateU8(); + + if (shift < 64) { + code.LSL(result, result, shift); + } else { + code.MOV(result, ZR); + } + + ctx.reg_alloc.DefineValue(inst, result); + } else { + ARM64Reg result = ctx.reg_alloc.UseScratchGpr(operand_arg); + ARM64Reg shift = ctx.reg_alloc.UseGpr(shift_arg); + + code.LSLV(result, result, shift); + + ctx.reg_alloc.DefineValue(inst, result); + } +} + +void EmitA64::EmitLogicalShiftRight32(EmitContext& ctx, IR::Inst* inst) { + auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto& operand_arg = args[0]; + auto& shift_arg = args[1]; + auto& carry_arg = args[2]; + + if (!carry_inst) { + if (shift_arg.IsImmediate()) { + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg)); + u8 shift = shift_arg.GetImmediateU8(); + + if (shift <= 31) { + code.LSR(result, result, shift); + } else { + code.MOVI2R(result, 0); + } + ctx.reg_alloc.DefineValue(inst, result); + } else { + Arm64Gen::ARM64Reg shift = DecodeReg(ctx.reg_alloc.UseScratchGpr(shift_arg)); + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg)); + + // The 32-bit A64 LSR instruction masks the shift count by 0x1F before performing the shift. + // ARM differs from the behaviour: It does not mask the count, so shifts above 31 result in zeros. 
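+            // The sequence below masks the guest count to its bottom byte (ARM uses only
+            // the least-significant byte of Rs), lets LSRV shift (LSRV itself masks the
+            // count by 0x1F), then uses CMP/CSEL to force a zero result whenever the
+            // count exceeds 31. Roughly: result = (count > 31) ? 0 : operand >> count;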
+ + code.ANDI2R(shift, shift, 0xFF); + code.LSRV(result, result, shift); + code.CMPI2R(shift, 31); + code.CSEL(result, WZR, result, CC_GT); + + ctx.reg_alloc.DefineValue(inst, result); + } + } else { + if (shift_arg.IsImmediate()) { + u8 shift = shift_arg.GetImmediateU8(); + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg)); + Arm64Gen::ARM64Reg carry = DecodeReg(ctx.reg_alloc.UseScratchGpr(carry_arg)); + + if (shift == 0) { + // There is nothing more to do. + } else if (shift < 32) { + code.LSR(carry, result, shift - 1); + code.ANDI2R(carry, carry, 1); + code.LSR(result,result, shift); + } else if (shift == 32) { + code.UBFX(carry, result, 31, 1); + code.MOV(result, WZR); + } else { + code.MOV(result, WZR); + code.MOV(carry, WZR); + } + + ctx.reg_alloc.DefineValue(carry_inst, carry); + ctx.EraseInstruction(carry_inst); + ctx.reg_alloc.DefineValue(inst, result); + } else { + Arm64Gen::ARM64Reg shift = DecodeReg(ctx.reg_alloc.UseScratchGpr(shift_arg)); + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg)); + Arm64Gen::ARM64Reg carry = DecodeReg(ctx.reg_alloc.UseScratchGpr(carry_arg)); + + // TODO: Optimize this. + FixupBranch end; + + code.ANDSI2R(shift, shift, 0xFF); + // if (Rs & 0xFF == 0) goto end; + end = code.B(CC_EQ); + + code.CMPI2R(shift, 32); + code.SUBI2R(shift, shift, 1); // Subtract 1 to get the bit that is shifted out to the carry. + code.LSRV(result, result, shift); + code.ANDI2R(carry, result, 1); + code.LSR(result, result, 1); + + code.CSEL(result, result, WZR, CC_LT); + code.CSEL(carry, carry, WZR, CC_LE); + + code.SetJumpTarget(end); + + ctx.reg_alloc.DefineValue(carry_inst, carry); + ctx.EraseInstruction(carry_inst); + ctx.reg_alloc.DefineValue(inst, result); + } + } +} + +void EmitA64::EmitLogicalShiftRight64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto& operand_arg = args[0]; + auto& shift_arg = args[1]; + + if (shift_arg.IsImmediate()) { + ARM64Reg result = ctx.reg_alloc.UseScratchGpr(operand_arg); + u8 shift = shift_arg.GetImmediateU8(); + + if (shift < 64) { + code.LSR(result, result, shift); + } else { + code.MOV(result, ZR); + } + + ctx.reg_alloc.DefineValue(inst, result); + } else { + ARM64Reg shift = ctx.reg_alloc.UseScratchGpr(shift_arg); + ARM64Reg result = ctx.reg_alloc.UseScratchGpr(operand_arg); + + code.ANDI2R(shift, shift, 0xFF); + code.LSRV(result, result, shift); + code.CMP(shift, 63); + code.CSEL(result, WZR, result, CC_GT); + + ctx.reg_alloc.DefineValue(inst, result); + } +} + +void EmitA64::EmitArithmeticShiftRight32(EmitContext& ctx, IR::Inst* inst) { + auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto& operand_arg = args[0]; + auto& shift_arg = args[1]; + auto& carry_arg = args[2]; + + if (!carry_inst) { + if (shift_arg.IsImmediate()) { + u8 shift = shift_arg.GetImmediateU8(); + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg)); + + code.ASR(result, result, u8(shift < 31 ? shift : 31)); + + ctx.reg_alloc.DefineValue(inst, result); + } else { + //ctx.reg_alloc.UseScratch(shift_arg, HostLoc::X0); + Arm64Gen::ARM64Reg shift = DecodeReg(ctx.reg_alloc.UseScratchGpr(shift_arg)); + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg)); + Arm64Gen::ARM64Reg const31 = DecodeReg(ctx.reg_alloc.ScratchGpr()); + + // The 32-bit arm64 ASR instruction masks the shift count by 0x1F before performing the shift. 
+ // ARM differs from the behaviour: It does not mask the count. + + // We note that all shift values above 31 have the same behaviour as 31 does, so we saturate `shift` to 31. + code.ANDI2R(shift, shift, 0xFF); + code.MOVI2R(const31, 31); + code.CMPI2R(shift, u32(31)); + code.CSEL(shift, shift, const31, CC_LE); + code.ASRV(result, result, shift); + + ctx.reg_alloc.DefineValue(inst, result); + } + } else { + if (shift_arg.IsImmediate()) { + u8 shift = shift_arg.GetImmediateU8(); + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg)); + Arm64Gen::ARM64Reg carry = DecodeReg(ctx.reg_alloc.UseScratchGpr(carry_arg)); + + if (shift == 0) { + // There is nothing more to do. + } else if (shift <= 31) { + code.ASR(result, result, shift - 1); + code.ANDI2R(carry, result, 1); + code.ASR(result, result, 1); + } else { + code.ASR(result, result, 31); + code.ANDI2R(carry, result, 1); + } + + ctx.reg_alloc.DefineValue(carry_inst, carry); + ctx.EraseInstruction(carry_inst); + ctx.reg_alloc.DefineValue(inst, result); + } else { + Arm64Gen::ARM64Reg shift = DecodeReg(ctx.reg_alloc.UseScratchGpr(shift_arg)); + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg)); + Arm64Gen::ARM64Reg carry = DecodeReg(ctx.reg_alloc.UseScratchGpr(carry_arg)); + + // TODO: Optimize this. + + FixupBranch end; + + code.ANDSI2R(shift, shift, 0xFF); + // if (Rs & 0xFF == 0) goto end; + end = code.B(CC_EQ); + // else { + code.MOVI2R(carry, 32); + code.CMPI2R(shift, u32(31)); + code.CSEL(shift, shift, carry, CC_LE); + code.SUBI2R(shift, shift, 1); + code.ASRV(result, result, shift); + code.ANDI2R(carry, result, 1); + code.ASR(result, result, 1); + // } + + code.SetJumpTarget(end); + + ctx.reg_alloc.DefineValue(carry_inst, carry); + ctx.EraseInstruction(carry_inst); + ctx.reg_alloc.DefineValue(inst, result); + } + } +} + +//void EmitA64::EmitArithmeticShiftRight64(EmitContext& ctx, IR::Inst* inst) { +// auto args = ctx.reg_alloc.GetArgumentInfo(inst); +// auto& operand_arg = args[0]; +// auto& shift_arg = args[1]; +// +// if (shift_arg.IsImmediate()) { +// u8 shift = shift_arg.GetImmediateU8(); +// Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(operand_arg); +// +// code.sar(result, u8(shift < 63 ? shift : 63)); +// +// ctx.reg_alloc.DefineValue(inst, result); +// } else { +// ctx.reg_alloc.UseScratch(shift_arg, HostLoc::RCX); +// Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(operand_arg); +// Xbyak::Reg64 const63 = ctx.reg_alloc.ScratchGpr(); +// +// // The 64-bit x64 SAR instruction masks the shift count by 0x3F before performing the shift. +// // ARM differs from the behaviour: It does not mask the count. +// +// // We note that all shift values above 63 have the same behaviour as 63 does, so we saturate `shift` to 63. 
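+// A possible A64 counterpart (untested sketch, not part of this patch) would mirror
+// the 32-bit version above: saturate the count to 63, then shift with ASRV, e.g.
+//     code.ANDI2R(shift, shift, 0xFF);
+//     code.MOVI2R(const63, 63);
+//     code.CMPI2R(shift, 63);
+//     code.CSEL(shift, shift, const63, CC_LE);
+//     code.ASRV(result, result, shift);
+// The x64 reference implementation kept in this comment block continues below.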
+// code.mov(const63, 63); +// code.movzx(code.ecx, code.cl); +// code.cmp(code.ecx, u32(63)); +// code.cmovg(code.ecx, const63); +// code.sar(result, code.cl); +// +// ctx.reg_alloc.DefineValue(inst, result); +// } +//} + +void EmitA64::EmitRotateRight32(EmitContext& ctx, IR::Inst* inst) { + auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto& operand_arg = args[0]; + auto& shift_arg = args[1]; + auto& carry_arg = args[2]; + + if (!carry_inst) { + if (shift_arg.IsImmediate()) { + u8 shift = shift_arg.GetImmediateU8(); + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg)); + + code.ROR(result, result, u8(shift & 0x1F)); + + ctx.reg_alloc.DefineValue(inst, result); + } else { + Arm64Gen::ARM64Reg shift = DecodeReg(ctx.reg_alloc.UseGpr(shift_arg)); + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg)); + + // aarch64 ROR instruction does (shift & 0x1F) for us. + code.RORV(result, result, shift); + + ctx.reg_alloc.DefineValue(inst, result); + } + } else { + if (shift_arg.IsImmediate()) { + u8 shift = shift_arg.GetImmediateU8(); + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg)); + Arm64Gen::ARM64Reg carry = DecodeReg(ctx.reg_alloc.UseScratchGpr(carry_arg)); + + if (shift == 0) { + // There is nothing more to do. + } else if ((shift & 0x1F) == 0) { + code.MOV(carry, result, ArithOption{result, ST_LSR, 31}); + } else { + code.ROR(result, result, (shift & 0x1F) - 1); + code.ANDI2R(carry, result, 1); + code.ROR(result, result, 1); + } + + ctx.reg_alloc.DefineValue(carry_inst, carry); + ctx.EraseInstruction(carry_inst); + ctx.reg_alloc.DefineValue(inst, result); + } else { + Arm64Gen::ARM64Reg shift = DecodeReg(ctx.reg_alloc.UseScratchGpr(shift_arg)); + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg)); + Arm64Gen::ARM64Reg carry = DecodeReg(ctx.reg_alloc.UseScratchGpr(carry_arg)); + + // TODO: Optimize + + std::vector end; + FixupBranch zero_1F; + + code.ANDSI2R(shift, shift, u32(0xFF)); + // if (Rs & 0xFF == 0) goto end; + end.push_back(code.B(CC_EQ)); + code.ANDSI2R(shift, shift, u32(0x1F)); + zero_1F = code.B(CC_EQ); + // if (Rs & 0x1F != 0) { + code.SUBI2R(shift, shift, 1); + code.RORV(result, result, shift); + code.ANDI2R(carry, result, 1); + code.ROR(result, result, 1); + end.push_back(code.B()); + // } else { + code.SetJumpTarget(zero_1F); + code.MOV(carry, result, ArithOption{result, ST_LSR, 31}); + // } + + for (FixupBranch e : end) { + code.SetJumpTarget(e); + } + + ctx.reg_alloc.DefineValue(carry_inst, carry); + ctx.EraseInstruction(carry_inst); + ctx.reg_alloc.DefineValue(inst, result); + } + } +} + +void EmitA64::EmitRotateRight64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto& operand_arg = args[0]; + auto& shift_arg = args[1]; + + if (shift_arg.IsImmediate()) { + u8 shift = shift_arg.GetImmediateU8(); + ARM64Reg result = ctx.reg_alloc.UseScratchGpr(operand_arg); + + code.ROR(result, result, u8(shift & 0x3F)); + + ctx.reg_alloc.DefineValue(inst, result); + } else { + ARM64Reg result = ctx.reg_alloc.UseScratchGpr(operand_arg); + ARM64Reg shift = ctx.reg_alloc.UseGpr(shift_arg); + + code.RORV(result, result, shift); + + ctx.reg_alloc.DefineValue(inst, result); + } +} + +void EmitA64::EmitRotateRightExtended(EmitContext& ctx, IR::Inst* inst) { + auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp); + + auto 
args = ctx.reg_alloc.GetArgumentInfo(inst); + + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[0])); + Arm64Gen::ARM64Reg carry = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[1])); + Arm64Gen::ARM64Reg temp = DecodeReg(ctx.reg_alloc.ScratchGpr()); + + if (carry_inst) { + code.MOV(temp, result); + } + + // Set carry to the LSB and perform ROR. + code.BFI(result, carry, 0, 1); + code.ROR(result, result, 1); + + if (carry_inst) { + code.ANDI2R(carry, temp, 1); + + ctx.reg_alloc.DefineValue(carry_inst, carry); + ctx.EraseInstruction(carry_inst); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +static Arm64Gen::ARM64Reg DoCarry(RegAlloc& reg_alloc, Argument& carry_in, IR::Inst* carry_out) { + if (carry_in.IsImmediate()) { + return carry_out ? reg_alloc.ScratchGpr() : INVALID_REG; + } else { + return carry_out ? reg_alloc.UseScratchGpr(carry_in) : reg_alloc.UseGpr(carry_in); + } +} + +static void EmitAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, int bitsize) { + auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp); + auto overflow_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetOverflowFromOp); + auto nzcv_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetNZCVFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto& carry_in = args[2]; + + Arm64Gen::ARM64Reg nzcv = nzcv_inst ? ctx.reg_alloc.ScratchGpr() : INVALID_REG; + Arm64Gen::ARM64Reg result = ctx.reg_alloc.UseScratchGpr(args[0]); + Arm64Gen::ARM64Reg carry = DecodeReg(DoCarry(ctx.reg_alloc, carry_in, carry_inst)); + Arm64Gen::ARM64Reg overflow = overflow_inst ? ctx.reg_alloc.ScratchGpr() : INVALID_REG; + + result = bitsize == 64 ? result : DecodeReg(result); + + if (args[1].IsImmediate() && args[1].GetType() == IR::Type::U32) { + if (carry_in.IsImmediate()) { + if (carry_in.GetImmediateU1()) { + Arm64Gen::ARM64Reg op_arg = ctx.reg_alloc.UseGpr(args[1]); + code.CMP(op_arg, op_arg); + code.ADCS(result, result, op_arg); + } else { + u32 op_arg = args[1].GetImmediateU32(); + code.ADDSI2R(result, result, op_arg, ctx.reg_alloc.ScratchGpr()); + } + } else { + Arm64Gen::ARM64Reg op_arg = ctx.reg_alloc.UseGpr(args[1]); + code.CMPI2R(carry, 1); + code.ADCS(result, result, op_arg); + } + } else { + Arm64Gen::ARM64Reg op_arg = ctx.reg_alloc.UseGpr(args[1]); + if (carry_in.IsImmediate()) { + if (carry_in.GetImmediateU1()) { + code.CMP(DecodeReg(op_arg), DecodeReg(op_arg)); + code.ADCS(result, result, op_arg); + } else { + code.ADDS(result,result, op_arg); + } + } else { + code.CMPI2R(DecodeReg(carry), 1); + code.ADCS(result, result, op_arg); + } + } + + if (nzcv_inst) { + code.MRS(nzcv, FIELD_NZCV); + ctx.reg_alloc.DefineValue(nzcv_inst, nzcv); + ctx.EraseInstruction(nzcv_inst); + } + if (carry_inst) { + code.CSET(carry, CC_CS); + ctx.reg_alloc.DefineValue(carry_inst, carry); + ctx.EraseInstruction(carry_inst); + } + if (overflow_inst) { + code.CSET(overflow, CC_VS); + ctx.reg_alloc.DefineValue(overflow_inst, overflow); + ctx.EraseInstruction(overflow_inst); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitAdd32(EmitContext& ctx, IR::Inst* inst) { + EmitAdd(code, ctx, inst, 32); +} + +void EmitA64::EmitAdd64(EmitContext& ctx, IR::Inst* inst) { + EmitAdd(code, ctx, inst, 64); +} + +static void EmitSub(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, int bitsize) { + auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp); + auto overflow_inst = 
inst->GetAssociatedPseudoOperation(IR::Opcode::GetOverflowFromOp); + auto nzcv_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetNZCVFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto& carry_in = args[2]; + + Arm64Gen::ARM64Reg nzcv = nzcv_inst ? ctx.reg_alloc.ScratchGpr() : INVALID_REG; + Arm64Gen::ARM64Reg result = ctx.reg_alloc.UseScratchGpr(args[0]); + Arm64Gen::ARM64Reg carry = DoCarry(ctx.reg_alloc, carry_in, carry_inst); + Arm64Gen::ARM64Reg overflow = overflow_inst ? ctx.reg_alloc.ScratchGpr() : INVALID_REG; + + // TODO: Consider using LEA. + // TODO: Optimize CMP case. + + result = bitsize == 64 ? result : DecodeReg(result); + + if (args[1].IsImmediate() && args[1].GetType() == IR::Type::U32) { + if (carry_in.IsImmediate()) { + if (carry_in.GetImmediateU1()) { + u32 op_arg = args[1].GetImmediateU32(); + code.SUBSI2R(result, result, op_arg, ctx.reg_alloc.ScratchGpr()); + } else { + Arm64Gen::ARM64Reg op_arg = ctx.reg_alloc.UseGpr(args[1]); + + code.ADDSI2R(op_arg, op_arg, 0); // Clear carry + code.SBCS(result, result, op_arg); + } + } else { + Arm64Gen::ARM64Reg op_arg = ctx.reg_alloc.UseGpr(args[1]); + code.CMPI2R(carry, 0x1); + code.SBCS(result, result, op_arg); + } + } else { + Arm64Gen::ARM64Reg op_arg = ctx.reg_alloc.UseGpr(args[1]); + if (carry_in.IsImmediate()) { + if (carry_in.GetImmediateU1()) { + code.SUBS(result, result, op_arg); + } else { + code.ADDSI2R(DecodeReg(op_arg), DecodeReg(op_arg), 0); // Clear carry + code.SBCS(result,result, op_arg); + } + } else { + code.CMPI2R(DecodeReg(carry), 0x1); + code.SBCS(result,result, op_arg); + } + } + + if (nzcv_inst) { + code.MRS(nzcv, FIELD_NZCV); + ctx.reg_alloc.DefineValue(nzcv_inst, nzcv); + ctx.EraseInstruction(nzcv_inst); + } + if (carry_inst) { + code.CSET(carry, CC_CS); + ctx.reg_alloc.DefineValue(carry_inst, carry); + ctx.EraseInstruction(carry_inst); + } + if (overflow_inst) { + code.CSET(overflow, CC_VS); + ctx.reg_alloc.DefineValue(overflow_inst, overflow); + ctx.EraseInstruction(overflow_inst); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitSub32(EmitContext& ctx, IR::Inst* inst) { + EmitSub(code, ctx, inst, 32); +} + +void EmitA64::EmitSub64(EmitContext& ctx, IR::Inst* inst) { + EmitSub(code, ctx, inst, 64); +} + +void EmitA64::EmitMul32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[0])); + ARM64Reg op_arg = DecodeReg(ctx.reg_alloc.UseGpr(args[1])); + + code.MUL(result, result, op_arg); + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitMul64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + ARM64Reg result = ctx.reg_alloc.UseScratchGpr(args[0]); + ARM64Reg op_arg = ctx.reg_alloc.UseGpr(args[1]); + + code.MUL(result, result, op_arg); + + ctx.reg_alloc.DefineValue(inst, result); +} + + +void EmitA64::EmitUnsignedDiv32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[0])); + const ARM64Reg divisor = DecodeReg(ctx.reg_alloc.UseGpr(args[1])); + + code.UDIV(result, result, divisor); + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitUnsignedDiv64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg result = ctx.reg_alloc.UseScratchGpr(args[0]); + const ARM64Reg divisor = ctx.reg_alloc.UseGpr(args[1]); + + code.UDIV(result, result, 
divisor); + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitSignedDiv32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[0])); + const ARM64Reg divisor = DecodeReg(ctx.reg_alloc.UseGpr(args[1])); + + code.SDIV(result, result, divisor); + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitSignedDiv64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg result = ctx.reg_alloc.UseScratchGpr(args[0]); + const ARM64Reg divisor = ctx.reg_alloc.UseGpr(args[1]); + + code.SDIV(result, result, divisor); + ctx.reg_alloc.DefineValue(inst, result); +} + + +void EmitA64::EmitAnd32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[0])); + + if (args[1].IsImmediate()) { + u32 op_arg = args[1].GetImmediateU32(); + code.ANDI2R(result, result, op_arg, ctx.reg_alloc.ScratchGpr()); + } else { + Arm64Gen::ARM64Reg op_arg = DecodeReg(ctx.reg_alloc.UseGpr(args[1])); + code.AND(result, result, op_arg); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitAnd64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + Arm64Gen::ARM64Reg result = ctx.reg_alloc.UseScratchGpr(args[0]); + + if (args[1].IsImmediate()) { + u32 op_arg = args[1].GetImmediateU32(); + code.ANDI2R(result, result, op_arg, ctx.reg_alloc.ScratchGpr()); + } + else { + Arm64Gen::ARM64Reg op_arg = ctx.reg_alloc.UseGpr(args[1]); + code.AND(result, result, op_arg); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitEor32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[0])); + + if (args[1].IsImmediate()) { + u32 op_arg = args[1].GetImmediateU32(); + code.EORI2R(result, result, op_arg, ctx.reg_alloc.ScratchGpr()); + } else { + Arm64Gen::ARM64Reg op_arg = DecodeReg(ctx.reg_alloc.UseGpr(args[1])); + code.EOR(result, result, op_arg); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitEor64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + Arm64Gen::ARM64Reg result = ctx.reg_alloc.UseScratchGpr(args[0]); + + if (args[1].IsImmediate()) { + u32 op_arg = args[1].GetImmediateU32(); + code.EORI2R(result, result, op_arg, ctx.reg_alloc.ScratchGpr()); + } + else { + Arm64Gen::ARM64Reg op_arg = ctx.reg_alloc.UseGpr(args[1]); + code.EOR(result, result, op_arg); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitOr32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[0])); + + if (args[1].IsImmediate()) { + u32 op_arg = args[1].GetImmediateU32(); + code.ORRI2R(result, result, op_arg, ctx.reg_alloc.ScratchGpr()); + } else { + Arm64Gen::ARM64Reg op_arg = DecodeReg(ctx.reg_alloc.UseGpr(args[1])); + code.ORR(result, result , op_arg); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitOr64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + Arm64Gen::ARM64Reg result = ctx.reg_alloc.UseScratchGpr(args[0]); + + if (args[1].IsImmediate()) { + u32 op_arg = args[1].GetImmediateU32(); + code.ORRI2R(result, result, op_arg, 
ctx.reg_alloc.ScratchGpr()); + } + else { + Arm64Gen::ARM64Reg op_arg = ctx.reg_alloc.UseGpr(args[1]); + code.ORR(result, result, op_arg); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitNot32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + Arm64Gen::ARM64Reg result; + if (args[0].IsImmediate()) { + result = DecodeReg(ctx.reg_alloc.ScratchGpr()); + code.MOVI2R(result, u32(~args[0].GetImmediateU32())); + } else { + result = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[0])); + code.MVN(result, result); + } + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitNot64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + Arm64Gen::ARM64Reg result; + if (args[0].IsImmediate()) { + result = ctx.reg_alloc.ScratchGpr(); + code.MOVI2R(result, u32(~args[0].GetImmediateU32())); + } + else { + result = ctx.reg_alloc.UseScratchGpr(args[0]); + code.MVN(result, result); + } + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitSignExtendByteToWord(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[0])); + code.SXTB(result, result); + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitSignExtendHalfToWord(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[0])); + code.SXTH(result, result); + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitSignExtendByteToLong(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ARM64Reg result = ctx.reg_alloc.UseScratchGpr(args[0]); + code.SXTB(result, result); + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitSignExtendHalfToLong(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ARM64Reg result = ctx.reg_alloc.UseScratchGpr(args[0]); + code.SXTH(result, result); + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitSignExtendWordToLong(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ARM64Reg result = ctx.reg_alloc.UseScratchGpr(args[0]); + code.SXTW(result, result); + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitZeroExtendByteToWord(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[0])); + code.UXTB(result, result); + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitZeroExtendHalfToWord(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[0])); + code.UXTH(result, result); + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitZeroExtendByteToLong(EmitContext& ctx, IR::Inst* inst) { + // a64 zeros upper 32 bits on a 32-bit move + EmitZeroExtendByteToWord(ctx, inst); +} + +void EmitA64::EmitZeroExtendHalfToLong(EmitContext& ctx, IR::Inst* inst) { + // a64 zeros upper 32 bits on a 32-bit move + EmitZeroExtendHalfToWord(ctx, inst); +} + +void EmitA64::EmitZeroExtendWordToLong(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ARM64Reg result = ctx.reg_alloc.UseScratchGpr(args[0]); + code.MOV(result, DecodeReg(result)); + ctx.reg_alloc.DefineValue(inst, result); +} 
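The zero-extension emitters above lean on an AArch64 property: any write to a W register clears the upper 32 bits of the corresponding X register, so `MOV Wd, Wn` on its own performs a 32-to-64-bit zero-extension, and the byte/half variants can simply forward to their 32-bit counterparts. As a reference model only (not emitted code), the semantics are:

    #include <cstdint>
    #include <cstdio>

    // Reference semantics of ZeroExtendWordToLong above: truncate to 32 bits,
    // then widen with zeros -- the same effect as MOV Wd, Wn on AArch64.
    static std::uint64_t ZeroExtendWordToLong(std::uint64_t x) {
        return static_cast<std::uint32_t>(x);
    }

    int main() {
        std::printf("%llx\n",
                    static_cast<unsigned long long>(ZeroExtendWordToLong(0xFFFFFFFF80000000ull)));
        // prints 80000000
    }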
+ +//void EmitA64::EmitZeroExtendLongToQuad(EmitContext& ctx, IR::Inst* inst) { +// auto args = ctx.reg_alloc.GetArgumentInfo(inst); +// if (args[0].IsInGpr()) { +// Xbyak::Reg64 source = ctx.reg_alloc.UseGpr(args[0]); +// Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); +// code.movq(result, source); +// ctx.reg_alloc.DefineValue(inst, result); +// } else { +// Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); +// code.movq(result, result); +// ctx.reg_alloc.DefineValue(inst, result); +// } +//} + +void EmitA64::EmitByteReverseWord(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[0])); + code.REV32(result, result); + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitByteReverseHalf(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[0])); + code.REV16(result, result); + ctx.reg_alloc.DefineValue(inst, result); +} + +//void EmitA64::EmitByteReverseDual(EmitContext& ctx, IR::Inst* inst) { +// auto args = ctx.reg_alloc.GetArgumentInfo(inst); +// Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(args[0]); +// code.bswap(result); +// ctx.reg_alloc.DefineValue(inst, result); +//} + +void EmitA64::EmitCountLeadingZeros32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ARM64Reg source = DecodeReg(ctx.reg_alloc.UseGpr(args[0])); + ARM64Reg result = DecodeReg(ctx.reg_alloc.ScratchGpr()); + + code.CLZ(result, source); + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitCountLeadingZeros64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ARM64Reg source = ctx.reg_alloc.UseGpr(args[0]); + ARM64Reg result = ctx.reg_alloc.ScratchGpr(); + + code.CLZ(result, source); + ctx.reg_alloc.DefineValue(inst, result); +} +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/emit_a64_floating_point.cpp b/src/dynarmic/backend/A64/emit_a64_floating_point.cpp new file mode 100644 index 00000000..be0b97a6 --- /dev/null +++ b/src/dynarmic/backend/A64/emit_a64_floating_point.cpp @@ -0,0 +1,471 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. 
+ */ + +#include +#include +#include + +#include "backend/A64/abi.h" +#include "backend/A64/block_of_code.h" +#include "backend/A64/emit_a64.h" +#include "common/assert.h" +#include "common/common_types.h" +#include "common/fp/fpcr.h" +#include "common/fp/fpsr.h" +#include "common/fp/info.h" +#include "common/fp/op.h" +#include "common/fp/rounding_mode.h" +#include "common/fp/util.h" +#include "frontend/ir/basic_block.h" +#include "frontend/ir/microinstruction.h" +#include "frontend/ir/opcodes.h" + +namespace Dynarmic::BackendA64 { + +namespace { + +Arm64Gen::RoundingMode ConvertRoundingModeToA64RoundingMode(FP::RoundingMode rounding_mode) { + switch (rounding_mode) { + case FP::RoundingMode::ToNearest_TieEven: + return RoundingMode::ROUND_N; + case FP::RoundingMode::TowardsPlusInfinity: + return RoundingMode::ROUND_P; + case FP::RoundingMode::TowardsMinusInfinity: + return RoundingMode::ROUND_M; + case FP::RoundingMode::TowardsZero: + return RoundingMode::ROUND_Z; + case FP::RoundingMode::ToNearest_TieAwayFromZero: + return RoundingMode::ROUND_A; + default: + UNREACHABLE(); + } +} + +template +void FPTwoOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + ARM64Reg result = ctx.reg_alloc.UseScratchFpr(args[0]); + result = fsize == 32 ? EncodeRegToSingle(result) : EncodeRegToDouble(result); + if constexpr (std::is_member_function_pointer_v) { + (code.fp_emitter.*fn)(result, result); + } else { + fn(result); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +template +void FPThreeOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + ARM64Reg result = ctx.reg_alloc.UseScratchFpr(args[0]); + ARM64Reg operand = ctx.reg_alloc.UseScratchFpr(args[1]); + result = fsize == 32 ? EncodeRegToSingle(result) : EncodeRegToDouble(result); + operand = fsize == 32 ? 
EncodeRegToSingle(operand) : EncodeRegToDouble(operand); + + if constexpr (std::is_member_function_pointer_v) { + (code.fp_emitter.*fn)(result, result, operand); + } + else { + fn(result, result, operand); + } + + ctx.reg_alloc.DefineValue(inst, result); +} +} // anonymous namespace + +//void EmitA64::EmitFPAbs16(EmitContext& ctx, IR::Inst* inst) { +// auto args = ctx.reg_alloc.GetArgumentInfo(inst); +// const ARM64Reg result = ctx.reg_alloc.UseScratchXmm(args[0]); +// +// code.pand(result, code.MConst(xword, f16_non_sign_mask)); +// +// ctx.reg_alloc.DefineValue(inst, result); +//} + +void EmitA64::EmitFPAbs32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const ARM64Reg result = EncodeRegToSingle(ctx.reg_alloc.UseScratchFpr(args[0])); + + code.fp_emitter.FABS(result, result); + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitFPAbs64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const ARM64Reg result = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + + code.fp_emitter.FABS(result, result); + + ctx.reg_alloc.DefineValue(inst, result); +} + +//void EmitA64::EmitFPNeg16(EmitContext& ctx, IR::Inst* inst) { +// auto args = ctx.reg_alloc.GetArgumentInfo(inst); +// const ARM64Reg result = ctx.reg_alloc.UseScratchXmm(args[0]); +// +// code.pxor(result, code.MConst(xword, f16_negative_zero)); +// +// ctx.reg_alloc.DefineValue(inst, result); +//} + +void EmitA64::EmitFPNeg32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const ARM64Reg result = EncodeRegToSingle(ctx.reg_alloc.UseScratchFpr(args[0])); + + code.fp_emitter.FNEG(result, result); + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitFPNeg64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const ARM64Reg result = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + + code.fp_emitter.FNEG(result, result); + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitFPAdd32(EmitContext& ctx, IR::Inst* inst) { + FPThreeOp<32, void(Arm64Gen::ARM64FloatEmitter::*)(ARM64Reg, ARM64Reg, ARM64Reg)>(code, ctx, inst, &Arm64Gen::ARM64FloatEmitter::FADD); +} + +void EmitA64::EmitFPAdd64(EmitContext& ctx, IR::Inst* inst) { + FPThreeOp<64, void(Arm64Gen::ARM64FloatEmitter::*)(ARM64Reg, ARM64Reg, ARM64Reg)>(code, ctx, inst, &Arm64Gen::ARM64FloatEmitter::FADD); +} + +void EmitA64::EmitFPDiv32(EmitContext& ctx, IR::Inst* inst) { + FPThreeOp<32, void(Arm64Gen::ARM64FloatEmitter::*)(ARM64Reg, ARM64Reg, ARM64Reg)>(code, ctx, inst, &Arm64Gen::ARM64FloatEmitter::FDIV); +} + +void EmitA64::EmitFPDiv64(EmitContext& ctx, IR::Inst* inst) { + FPThreeOp<64, void(Arm64Gen::ARM64FloatEmitter::*)(ARM64Reg, ARM64Reg, ARM64Reg)>(code, ctx, inst, &Arm64Gen::ARM64FloatEmitter::FDIV); +} + +void EmitA64::EmitFPMul32(EmitContext& ctx, IR::Inst* inst) { + FPThreeOp<32, void(Arm64Gen::ARM64FloatEmitter::*)(ARM64Reg, ARM64Reg, ARM64Reg)>(code, ctx, inst, &Arm64Gen::ARM64FloatEmitter::FMUL); +} + +void EmitA64::EmitFPMul64(EmitContext& ctx, IR::Inst* inst) { + FPThreeOp<64, void(Arm64Gen::ARM64FloatEmitter::*)(ARM64Reg, ARM64Reg, ARM64Reg)>(code, ctx, inst, &Arm64Gen::ARM64FloatEmitter::FMUL); +} +void EmitA64::EmitFPSqrt32(EmitContext& ctx, IR::Inst* inst) { + FPTwoOp<32>(code, ctx, inst, &Arm64Gen::ARM64FloatEmitter::FSQRT); +} + +void EmitA64::EmitFPSqrt64(EmitContext& ctx, IR::Inst* inst) { + FPTwoOp<64>(code, ctx, inst, 
&Arm64Gen::ARM64FloatEmitter::FSQRT); +} + +void EmitA64::EmitFPSub32(EmitContext& ctx, IR::Inst* inst) { + FPThreeOp<32, void(Arm64Gen::ARM64FloatEmitter::*)(ARM64Reg, ARM64Reg, ARM64Reg)>(code, ctx, inst, &Arm64Gen::ARM64FloatEmitter::FSUB); +} + +void EmitA64::EmitFPSub64(EmitContext& ctx, IR::Inst* inst) { + FPThreeOp<64, void(Arm64Gen::ARM64FloatEmitter::*)(ARM64Reg, ARM64Reg, ARM64Reg)>(code, ctx, inst, &Arm64Gen::ARM64FloatEmitter::FSUB); +} + +static ARM64Reg SetFpscrNzcvFromFlags(BlockOfCode& code, EmitContext& ctx) { + ARM64Reg nzcv = ctx.reg_alloc.ScratchGpr(); + // Fpsr's nzcv is copied across integer nzcv + code.MRS(nzcv, FIELD_NZCV); + return nzcv; +} + +void EmitA64::EmitFPCompare32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ARM64Reg reg_a = EncodeRegToSingle(ctx.reg_alloc.UseFpr(args[0])); + ARM64Reg reg_b = EncodeRegToSingle(ctx.reg_alloc.UseFpr(args[1])); + bool exc_on_qnan = args[2].GetImmediateU1(); + + if (exc_on_qnan) { + code.fp_emitter.FCMPE(reg_a, reg_b); + } else { + code.fp_emitter.FCMP(reg_a, reg_b); + } + + ARM64Reg nzcv = SetFpscrNzcvFromFlags(code, ctx); + ctx.reg_alloc.DefineValue(inst, nzcv); +} + +void EmitA64::EmitFPCompare64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const ARM64Reg reg_a = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[0])); + const ARM64Reg reg_b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + bool exc_on_qnan = args[2].GetImmediateU1(); + + if (exc_on_qnan) { + code.fp_emitter.FCMPE(reg_a, reg_b); + } else { + code.fp_emitter.FCMP(reg_a, reg_b); + } + + ARM64Reg nzcv = SetFpscrNzcvFromFlags(code, ctx); + ctx.reg_alloc.DefineValue(inst, nzcv); +} + +void EmitA64::EmitFPHalfToDouble(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const ARM64Reg result = EncodeRegToSingle(ctx.reg_alloc.UseScratchFpr(args[0])); + + code.fp_emitter.FCVT(64, 16, result, result); + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitFPHalfToSingle(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const ARM64Reg result = EncodeRegToSingle(ctx.reg_alloc.UseScratchFpr(args[0])); + code.fp_emitter.FCVT(32, 16, result, result); + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitFPSingleToDouble(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const ARM64Reg result = EncodeRegToSingle(ctx.reg_alloc.UseScratchFpr(args[0])); + + code.fp_emitter.FCVT(64, 32, result, result); + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitFPSingleToHalf(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const ARM64Reg result = EncodeRegToSingle(ctx.reg_alloc.UseScratchFpr(args[0])); + code.fp_emitter.FCVT(16, 32, result, result); + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitFPDoubleToHalf(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const ARM64Reg result = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + code.fp_emitter.FCVT(16, 64, result, result); + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitFPDoubleToSingle(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const ARM64Reg result = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + code.fp_emitter.FCVT(32, 64, result, result); + ctx.reg_alloc.DefineValue(inst, result); +} + +template +static 
void EmitFPToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const size_t fbits = args[1].GetImmediateU8(); + const auto rounding_mode = static_cast(args[2].GetImmediateU8()); + const auto round_imm = ConvertRoundingModeToA64RoundingMode(rounding_mode); + + ASSERT_MSG(fbits == 0, "fixed point conversions are not supported yet"); + + ARM64Reg src = ctx.reg_alloc.UseScratchFpr(args[0]); + ARM64Reg result = ctx.reg_alloc.ScratchGpr(); + src = fsize == 64 ? EncodeRegToDouble(src) : EncodeRegToSingle(src); + result = isize == 64 ? result : DecodeReg(result); + + if constexpr (unsigned_) { + code.fp_emitter.FCVTU(result, src, round_imm); + } + else { + code.fp_emitter.FCVTS(result, src, round_imm); + } + + ctx.reg_alloc.DefineValue(inst, result); + +} + +void EmitA64::EmitFPDoubleToFixedS32(EmitContext& ctx, IR::Inst* inst) { + EmitFPToFixed<64, false, 32>(code, ctx, inst); +} + +void EmitA64::EmitFPDoubleToFixedS64(EmitContext& ctx, IR::Inst* inst) { + EmitFPToFixed<64, false, 64>(code, ctx, inst); +} + +void EmitA64::EmitFPDoubleToFixedU32(EmitContext& ctx, IR::Inst* inst) { + EmitFPToFixed<64, true, 32>(code, ctx, inst); +} + +void EmitA64::EmitFPDoubleToFixedU64(EmitContext& ctx, IR::Inst* inst) { + EmitFPToFixed<64, true, 64>(code, ctx, inst); +} + +void EmitA64::EmitFPSingleToFixedS32(EmitContext& ctx, IR::Inst* inst) { + EmitFPToFixed<32, false, 32>(code, ctx, inst); +} + +void EmitA64::EmitFPSingleToFixedS64(EmitContext& ctx, IR::Inst* inst) { + EmitFPToFixed<32, false, 64>(code, ctx, inst); +} + +void EmitA64::EmitFPSingleToFixedU32(EmitContext& ctx, IR::Inst* inst) { + EmitFPToFixed<32, true, 32>(code, ctx, inst); +} + +void EmitA64::EmitFPSingleToFixedU64(EmitContext& ctx, IR::Inst* inst) { + EmitFPToFixed<32, true, 64>(code, ctx, inst); +} + +void EmitA64::EmitFPFixedS32ToSingle(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg from = DecodeReg(ctx.reg_alloc.UseGpr(args[0])); + const ARM64Reg result = EncodeRegToSingle(ctx.reg_alloc.ScratchFpr()); + const size_t fbits = args[1].GetImmediateU8(); + const FP::RoundingMode rounding_mode = static_cast(args[2].GetImmediateU8()); + ASSERT(rounding_mode == ctx.FPSCR_RMode()); + + if (fbits != 0) { + code.fp_emitter.SCVTF(result, from, fbits); + } + else { + code.fp_emitter.SCVTF(result, from); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitFPFixedU32ToSingle(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg from = DecodeReg(ctx.reg_alloc.UseGpr(args[0])); + const ARM64Reg result = EncodeRegToSingle(ctx.reg_alloc.ScratchFpr()); + const size_t fbits = args[1].GetImmediateU8(); + const FP::RoundingMode rounding_mode = static_cast(args[2].GetImmediateU8()); + ASSERT(rounding_mode == ctx.FPSCR_RMode()); + + if (fbits != 0) { + code.fp_emitter.UCVTF(result, from, fbits); + } + else { + code.fp_emitter.UCVTF(result, from); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitFPFixedS32ToDouble(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg from = DecodeReg(ctx.reg_alloc.UseGpr(args[0])); + const ARM64Reg result = EncodeRegToDouble(ctx.reg_alloc.ScratchFpr()); + const size_t fbits = args[1].GetImmediateU8(); + const FP::RoundingMode rounding_mode = static_cast(args[2].GetImmediateU8()); + ASSERT(rounding_mode == ctx.FPSCR_RMode()); + + if (fbits != 0) { + 
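+        // SCVTF with an immediate fbits operand performs the fixed-point form of the
+        // conversion: the source register is treated as having fbits fractional bits.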
code.fp_emitter.SCVTF(result, from, fbits); + } + else { + code.fp_emitter.SCVTF(result, from); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitFPFixedS64ToDouble(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg from = ctx.reg_alloc.UseGpr(args[0]); + const ARM64Reg result = EncodeRegToDouble(ctx.reg_alloc.ScratchFpr()); + const size_t fbits = args[1].GetImmediateU8(); + const FP::RoundingMode rounding_mode = static_cast(args[2].GetImmediateU8()); + ASSERT(rounding_mode == ctx.FPSCR_RMode()); + + if (fbits != 0) { + code.fp_emitter.SCVTF(result, from, fbits); + } + else { + code.fp_emitter.SCVTF(result, from); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitFPFixedS64ToSingle(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg from = ctx.reg_alloc.UseGpr(args[0]); + const ARM64Reg result = EncodeRegToSingle(ctx.reg_alloc.ScratchFpr()); + const size_t fbits = args[1].GetImmediateU8(); + const FP::RoundingMode rounding_mode = static_cast(args[2].GetImmediateU8()); + ASSERT(rounding_mode == ctx.FPSCR_RMode()); + + if (fbits != 0) { + code.fp_emitter.SCVTF(result, from, fbits); + } + else { + code.fp_emitter.SCVTF(result, from); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitFPFixedU32ToDouble(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg from = DecodeReg(ctx.reg_alloc.UseGpr(args[0])); + const ARM64Reg result = EncodeRegToDouble(ctx.reg_alloc.ScratchFpr()); + const size_t fbits = args[1].GetImmediateU8(); + const FP::RoundingMode rounding_mode = static_cast(args[2].GetImmediateU8()); + ASSERT(rounding_mode == ctx.FPSCR_RMode()); + + if (fbits != 0) { + code.fp_emitter.UCVTF(result, from, fbits); + } + else { + code.fp_emitter.UCVTF(result, from); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitFPFixedU64ToDouble(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + + const ARM64Reg from = ctx.reg_alloc.UseGpr(args[0]); + const ARM64Reg result = EncodeRegToDouble(ctx.reg_alloc.ScratchFpr()); + const size_t fbits = args[1].GetImmediateU8(); + const FP::RoundingMode rounding_mode = static_cast(args[2].GetImmediateU8()); + ASSERT(rounding_mode == ctx.FPSCR_RMode()); + + if (fbits != 0) { + code.fp_emitter.UCVTF(result, from, fbits); + } + else { + code.fp_emitter.UCVTF(result, from); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitFPFixedU64ToSingle(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + + const ARM64Reg from = ctx.reg_alloc.UseGpr(args[0]); + const ARM64Reg result = EncodeRegToSingle(ctx.reg_alloc.ScratchFpr()); + const size_t fbits = args[1].GetImmediateU8(); + const FP::RoundingMode rounding_mode = static_cast(args[2].GetImmediateU8()); + ASSERT(rounding_mode == ctx.FPSCR_RMode()); + + if (fbits != 0) { + code.fp_emitter.UCVTF(result, from, fbits); + } + else { + code.fp_emitter.UCVTF(result, from); + } + + ctx.reg_alloc.DefineValue(inst, result); +} +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/emit_a64_packed.cpp b/src/dynarmic/backend/A64/emit_a64_packed.cpp new file mode 100644 index 00000000..fb54361d --- /dev/null +++ b/src/dynarmic/backend/A64/emit_a64_packed.cpp @@ -0,0 +1,469 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2016 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. + */ + +#include "backend/A64/block_of_code.h" +#include "backend/A64/emit_a64.h" +#include "frontend/ir/microinstruction.h" +#include "frontend/ir/opcodes.h" + +namespace Dynarmic::BackendA64 { + +void EmitA64::EmitPackedAddU8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp); + + const ARM64Reg sum = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + code.fp_emitter.ADD(B, sum, sum, b); + + if (ge_inst) { + const ARM64Reg ge = EncodeRegToDouble(ctx.reg_alloc.ScratchFpr()); + + code.fp_emitter.CMHI(B, ge, b, sum); + + ctx.reg_alloc.DefineValue(ge_inst, ge); + ctx.EraseInstruction(ge_inst); + } + + ctx.reg_alloc.DefineValue(inst, sum); +} + +void EmitA64::EmitPackedAddS8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp); + + const ARM64Reg a = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + if (ge_inst) { + const ARM64Reg ge = EncodeRegToDouble(ctx.reg_alloc.ScratchFpr()); + + code.fp_emitter.SQADD(B, ge, a, b); + code.fp_emitter.CMGE_zero(B, ge, ge); + + ctx.reg_alloc.DefineValue(ge_inst, ge); + ctx.EraseInstruction(ge_inst); + } + + code.fp_emitter.ADD(B, a, a, b); + + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitA64::EmitPackedAddU16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp); + + const ARM64Reg sum = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + code.fp_emitter.ADD(H, sum, sum, b); + + if (ge_inst) { + const ARM64Reg ge = EncodeRegToDouble(ctx.reg_alloc.ScratchFpr()); + + code.fp_emitter.CMHI(H, ge, b, sum); + + ctx.reg_alloc.DefineValue(ge_inst, ge); + ctx.EraseInstruction(ge_inst); + } + + ctx.reg_alloc.DefineValue(inst, sum); +} + +void EmitA64::EmitPackedAddS16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp); + + const ARM64Reg a = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + if (ge_inst) { + const ARM64Reg ge = EncodeRegToDouble(ctx.reg_alloc.ScratchFpr()); + + code.fp_emitter.SQADD(H, ge, a, b); + code.fp_emitter.CMGE_zero(H, ge, ge); + + ctx.reg_alloc.DefineValue(ge_inst, ge); + ctx.EraseInstruction(ge_inst); + } + + code.fp_emitter.ADD(H, a, a, b); + + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitA64::EmitPackedSubU8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp); + + const ARM64Reg a = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + if (ge_inst) { + const ARM64Reg ge = EncodeRegToDouble(ctx.reg_alloc.ScratchFpr()); + + code.fp_emitter.CMHS(B, ge, a, b); + + 
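+        // CMHS sets each byte of ge to all-ones where a >= b (unsigned), i.e. where the
+        // per-byte subtraction does not borrow, matching the GE flag semantics.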
ctx.reg_alloc.DefineValue(ge_inst, ge); + ctx.EraseInstruction(ge_inst); + } + + code.fp_emitter.SUB(B, a, a, b); + + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitA64::EmitPackedSubS8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp); + + const ARM64Reg a = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + if (ge_inst) { + const ARM64Reg ge = EncodeRegToDouble(ctx.reg_alloc.ScratchFpr()); + + code.fp_emitter.SQSUB(B, ge, a, b); + code.fp_emitter.CMGE_zero(B, ge, ge); + + ctx.reg_alloc.DefineValue(ge_inst, ge); + ctx.EraseInstruction(ge_inst); + } + + code.fp_emitter.SUB(B, a, a, b); + + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitA64::EmitPackedSubU16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp); + + const ARM64Reg a = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + if (ge_inst) { + const ARM64Reg ge = EncodeRegToDouble(ctx.reg_alloc.ScratchFpr()); + + code.fp_emitter.CMHS(H, ge, a, b); + + ctx.reg_alloc.DefineValue(ge_inst, ge); + ctx.EraseInstruction(ge_inst); + } + + code.fp_emitter.SUB(H, a, a, b); + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitA64::EmitPackedSubS16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp); + + const ARM64Reg a = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + if (ge_inst) { + const ARM64Reg ge = EncodeRegToDouble(ctx.reg_alloc.ScratchFpr()); + + code.fp_emitter.SQSUB(H, ge, a, b); + code.fp_emitter.CMGE_zero(H, ge, ge); + + ctx.reg_alloc.DefineValue(ge_inst, ge); + ctx.EraseInstruction(ge_inst); + } + + code.fp_emitter.SUB(H, a, a, b); + + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitA64::EmitPackedHalvingAddU8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg a = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + code.fp_emitter.UHADD(B, a, a, b); + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitA64::EmitPackedHalvingAddU16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg a = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + code.fp_emitter.UHADD(H, a, a, b); + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitA64::EmitPackedHalvingAddS8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg a = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + code.fp_emitter.SHADD(B, a, a, b); + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitA64::EmitPackedHalvingAddS16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg a = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + code.fp_emitter.SHADD(H, a, 
a, b); + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitA64::EmitPackedHalvingSubU8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg a = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + code.fp_emitter.UHSUB(B, a, a, b); + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitA64::EmitPackedHalvingSubS8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg a = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + code.fp_emitter.SHSUB(B, a, a, b); + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitA64::EmitPackedHalvingSubU16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg a = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + code.fp_emitter.UHSUB(H, a, a, b); + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitA64::EmitPackedHalvingSubS16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg a = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + code.fp_emitter.SHSUB(H, a, a, b); + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitPackedSubAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, bool hi_is_sum, bool is_signed, bool is_halving) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp); + + const ARM64Reg reg_a_hi = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[0])); + const ARM64Reg reg_b_hi = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[1])); + const ARM64Reg reg_a_lo = DecodeReg(ctx.reg_alloc.ScratchGpr()); + const ARM64Reg reg_b_lo = DecodeReg(ctx.reg_alloc.ScratchGpr()); + ARM64Reg reg_sum, reg_diff; + + if (is_signed) { + code.SXTH(reg_a_lo, reg_a_hi); + code.SXTH(reg_b_lo, reg_b_hi); + code.ASR(reg_a_hi, reg_a_hi, 16); + code.ASR(reg_b_hi, reg_b_hi, 16); + } else { + code.UXTH(reg_a_lo, reg_a_hi); + code.UXTH(reg_b_lo, reg_b_hi); + code.LSR(reg_a_hi, reg_a_hi, 16); + code.LSR(reg_b_hi, reg_b_hi, 16); + } + + if (hi_is_sum) { + code.SUB(reg_a_lo, reg_a_lo, reg_b_hi); + code.ADD(reg_a_hi, reg_a_hi, reg_b_lo); + reg_diff = reg_a_lo; + reg_sum = reg_a_hi; + } else { + code.ADD(reg_a_lo, reg_a_lo, reg_b_hi); + code.SUB(reg_a_hi, reg_a_hi, reg_b_lo); + reg_diff = reg_a_hi; + reg_sum = reg_a_lo; + } + + if (ge_inst) { + // The reg_b registers are no longer required. + const ARM64Reg ge_sum = reg_b_hi; + const ARM64Reg ge_diff = reg_b_lo; + + if (!is_signed) { + code.LSL(ge_sum, reg_sum, 15); + code.ASR(ge_sum, ge_sum, 31); + } else { + code.MVN(ge_sum, reg_sum); + code.ASR(ge_sum, ge_sum, 31); + } + code.MVN(ge_diff, reg_diff); + code.ASR(ge_diff, ge_diff, 31); + code.ANDI2R(ge_sum, ge_sum, hi_is_sum ? 0xFFFF0000 : 0x0000FFFF); + code.ANDI2R(ge_diff, ge_diff, hi_is_sum ? 0x0000FFFF : 0xFFFF0000); + code.ORR(ge_sum, ge_sum, ge_diff); + + ctx.reg_alloc.DefineValue(ge_inst, ge_sum); + ctx.EraseInstruction(ge_inst); + } + + if (is_halving) { + code.LSR(reg_a_hi, reg_a_hi, 1); + code.LSR(reg_a_lo, reg_a_lo, 1); + } + + // reg_a_lo now contains the low word and reg_a_hi now contains the high word. + // Merge them. 
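+    // BFM with immr=16, imms=15 inserts bits [15:0] of reg_a_hi into bits [31:16] of reg_a_lo.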
+ code.BFM(reg_a_lo, reg_a_hi, 16, 15); + + ctx.reg_alloc.DefineValue(inst, reg_a_lo); +} + +void EmitA64::EmitPackedAddSubU16(EmitContext& ctx, IR::Inst* inst) { + EmitPackedSubAdd(code, ctx, inst, true, false, false); +} + +void EmitA64::EmitPackedAddSubS16(EmitContext& ctx, IR::Inst* inst) { + EmitPackedSubAdd(code, ctx, inst, true, true, false); +} + +void EmitA64::EmitPackedSubAddU16(EmitContext& ctx, IR::Inst* inst) { + EmitPackedSubAdd(code, ctx, inst, false, false, false); +} + +void EmitA64::EmitPackedSubAddS16(EmitContext& ctx, IR::Inst* inst) { + EmitPackedSubAdd(code, ctx, inst, false, true, false); +} + +void EmitA64::EmitPackedHalvingAddSubU16(EmitContext& ctx, IR::Inst* inst) { + EmitPackedSubAdd(code, ctx, inst, true, false, true); +} + +void EmitA64::EmitPackedHalvingAddSubS16(EmitContext& ctx, IR::Inst* inst) { + EmitPackedSubAdd(code, ctx, inst, true, true, true); +} + +void EmitA64::EmitPackedHalvingSubAddU16(EmitContext& ctx, IR::Inst* inst) { + EmitPackedSubAdd(code, ctx, inst, false, false, true); +} + +void EmitA64::EmitPackedHalvingSubAddS16(EmitContext& ctx, IR::Inst* inst) { + EmitPackedSubAdd(code, ctx, inst, false, true, true); +} + +void EmitA64::EmitPackedSaturatedAddU8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg a = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + code.fp_emitter.UQADD(B, a, a, b); + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitA64::EmitPackedSaturatedAddS8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg a = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + code.fp_emitter.SQADD(B, a, a, b); + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitA64::EmitPackedSaturatedSubU8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg a = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + code.fp_emitter.UQSUB(B, a, a, b); + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitA64::EmitPackedSaturatedSubS8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg a = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + code.fp_emitter.SQSUB(B, a, a, b); + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitA64::EmitPackedSaturatedAddU16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg a = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + code.fp_emitter.UQADD(H, a, a, b); + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitA64::EmitPackedSaturatedAddS16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg a = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + code.fp_emitter.SQADD(H, a, a, b); + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitA64::EmitPackedSaturatedSubU16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg a = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + 
const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + code.fp_emitter.UQSUB(H, a, a, b); + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitA64::EmitPackedSaturatedSubS16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg a = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + code.fp_emitter.SQSUB(H, a, a, b); + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitA64::EmitPackedAbsDiffSumS8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg a = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + code.fp_emitter.UABD(B, a, a, b); + code.fp_emitter.UADDLV(B, a, a); + + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitA64::EmitPackedSelect(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg ge = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + const ARM64Reg a = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[2])); + + code.fp_emitter.BSL(ge, b, a); + + ctx.reg_alloc.DefineValue(inst, ge); +} + +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/emit_a64_saturation.cpp b/src/dynarmic/backend/A64/emit_a64_saturation.cpp new file mode 100644 index 00000000..5462fba4 --- /dev/null +++ b/src/dynarmic/backend/A64/emit_a64_saturation.cpp @@ -0,0 +1,167 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. 
+ */ + +#include + +#include "backend/A64/block_of_code.h" +#include "backend/A64/emit_a64.h" +#include "common/assert.h" +#include "common/bit_util.h" +#include "common/common_types.h" +#include "frontend/ir/basic_block.h" +#include "frontend/ir/microinstruction.h" +#include "frontend/ir/opcodes.h" + +namespace Dynarmic::BackendA64 { + +namespace { + +enum class Op { + Add, + Sub, +}; + +template +void EmitSignedSaturatedOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { + const auto overflow_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetOverflowFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + ARM64Reg result = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + ARM64Reg addend = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + if constexpr (op == Op::Add) { + code.fp_emitter.SQADD(size, result, result, addend); + } + else { + code.fp_emitter.SQSUB(size, result, result, addend); + } + + if (overflow_inst) { + ARM64Reg overflow = ctx.reg_alloc.ScratchGpr(); + + code.MRS(overflow, FIELD_FPSR); + code.UBFX(overflow, overflow, 27, 1); + + ctx.reg_alloc.DefineValue(overflow_inst, overflow); + ctx.EraseInstruction(overflow_inst); + } + + ctx.reg_alloc.DefineValue(inst, result); +} +} // anonymous namespace + +void EmitA64::EmitSignedSaturatedAdd8(EmitContext& ctx, IR::Inst* inst) { + EmitSignedSaturatedOp(code, ctx, inst); +} + +void EmitA64::EmitSignedSaturatedAdd16(EmitContext& ctx, IR::Inst* inst) { + EmitSignedSaturatedOp(code, ctx, inst); +} + +void EmitA64::EmitSignedSaturatedAdd32(EmitContext& ctx, IR::Inst* inst) { + EmitSignedSaturatedOp(code, ctx, inst); +} + +void EmitA64::EmitSignedSaturatedAdd64(EmitContext& ctx, IR::Inst* inst) { + EmitSignedSaturatedOp(code, ctx, inst); +} + +void EmitA64::EmitSignedSaturatedSub8(EmitContext& ctx, IR::Inst* inst) { + EmitSignedSaturatedOp(code, ctx, inst); +} + +void EmitA64::EmitSignedSaturatedSub16(EmitContext& ctx, IR::Inst* inst) { + EmitSignedSaturatedOp(code, ctx, inst); +} + +void EmitA64::EmitSignedSaturatedSub32(EmitContext& ctx, IR::Inst* inst) { + EmitSignedSaturatedOp(code, ctx, inst); +} + +void EmitA64::EmitSignedSaturatedSub64(EmitContext& ctx, IR::Inst* inst) { + EmitSignedSaturatedOp(code, ctx, inst); +} + +void EmitA64::EmitSignedSaturation(EmitContext& ctx, IR::Inst* inst) { + const auto overflow_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetOverflowFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const size_t N = args[1].GetImmediateU8(); + ASSERT(N >= 1 && N <= 32); + + if (N == 32) { + if (overflow_inst) { + const auto no_overflow = IR::Value(false); + overflow_inst->ReplaceUsesWith(no_overflow); + } + ctx.reg_alloc.DefineValue(inst, args[0]); + return; + } + + const u32 mask = (1u << N) - 1; + const u32 positive_saturated_value = (1u << (N - 1)) - 1; + const u32 negative_saturated_value = 1u << (N - 1); + const u32 sext_negative_satured_value = Common::SignExtend(N, negative_saturated_value); + + const ARM64Reg result = DecodeReg(ctx.reg_alloc.ScratchGpr()); + const ARM64Reg reg_a = DecodeReg(ctx.reg_alloc.UseGpr(args[0])); + const ARM64Reg overflow = DecodeReg(ctx.reg_alloc.ScratchGpr()); + const ARM64Reg tmp = DecodeReg(ctx.reg_alloc.ScratchGpr()); + + // overflow now contains a value between 0 and mask if it was originally between {negative,positive}_saturated_value. 
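+    // Biasing reg_a by 2^(N-1) maps the representable range [-(2^(N-1)), 2^(N-1) - 1] onto [0, mask].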
+ code.ADDI2R(overflow, reg_a, negative_saturated_value, overflow); + + // Put the appropriate saturated value in result + code.MOVI2R(tmp, positive_saturated_value); + code.CMP(reg_a, tmp); + code.MOVI2R(result, sext_negative_satured_value); + code.CSEL(result, tmp, result, CC_GT); + + // Do the saturation + code.CMPI2R(overflow, mask, tmp); + code.CSEL(result, reg_a, result, CC_LS); + + if (overflow_inst) { + code.CSET(overflow, CC_HI); + + ctx.reg_alloc.DefineValue(overflow_inst, overflow); + ctx.EraseInstruction(overflow_inst); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitUnsignedSaturation(EmitContext& ctx, IR::Inst* inst) { + const auto overflow_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetOverflowFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const size_t N = args[1].GetImmediateU8(); + ASSERT(N <= 31); + + const u32 saturated_value = (1u << N) - 1; + + const ARM64Reg result = DecodeReg(ctx.reg_alloc.ScratchGpr()); + const ARM64Reg reg_a = DecodeReg(ctx.reg_alloc.UseGpr(args[0])); + const ARM64Reg overflow = DecodeReg(ctx.reg_alloc.ScratchGpr()); + + // Pseudocode: result = clamp(reg_a, 0, saturated_value); + code.MOVI2R(result, saturated_value); + code.CMP(reg_a, result); + code.CSEL(result, WZR, result, CC_LE); + code.CSEL(result, reg_a, result, CC_LS); + + if (overflow_inst) { + code.CSET(overflow, CC_HI); + + ctx.reg_alloc.DefineValue(overflow_inst, overflow); + ctx.EraseInstruction(overflow_inst); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/emitter/a64_emitter.cpp b/src/dynarmic/backend/A64/emitter/a64_emitter.cpp new file mode 100644 index 00000000..efbb4767 --- /dev/null +++ b/src/dynarmic/backend/A64/emitter/a64_emitter.cpp @@ -0,0 +1,3897 @@ +// Copyright 2015 Dolphin Emulator Project / 2018 dynarmic project +// Licensed under GPLv2+ +// Refer to the license.txt file included. + +#include +#include +#include +#include +#include + +#if defined(__APPLE__) +#include +#endif + +#include "a64_emitter.h" +#include "common/assert.h" +#include "common/bit_util.h" +#include "common/cast_util.h" +#include "common/common_types.h" +#include "common/math_util.h" + +namespace Dynarmic::BackendA64::Arm64Gen { + +namespace { +const int kWRegSizeInBits = 32; +const int kXRegSizeInBits = 64; + +// The below few functions are taken from V8. +int CountLeadingZeros(u64 value, int width) { +#ifdef _MSC_VER + if (width == 64) { + return _CountLeadingZeros64(value); + } +#else + if (width == 64) { + return __builtin_clzll(value); + } +#endif + // TODO(jbramley): Optimize this for ARM64 hosts. 
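+    // Portable fallback: walk down from the most significant bit of the given width,
+    // counting zero bits until the first set bit is found.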
+ int count = 0; + uint64_t bit_test = 1ULL << (width - 1); + while ((count < width) && ((bit_test & value) == 0)) { + count++; + bit_test >>= 1; + } + return count; +} + +uint64_t LargestPowerOf2Divisor(uint64_t value) { + return value & -(int64_t)value; +} + +// For ADD/SUB +bool IsImmArithmetic(uint64_t input, u32* val, bool* shift) { + if (input < 4096) { + *val = static_cast(input); + *shift = false; + return true; + } else if ((input & 0xFFF000) == input) { + *val = static_cast(input >> 12); + *shift = true; + return true; + } + return false; +} + +// For AND/TST/ORR/EOR etc +bool IsImmLogical(uint64_t value, unsigned int width, unsigned int* n, unsigned int* imm_s, + unsigned int* imm_r) { + bool negate = false; + + // Logical immediates are encoded using parameters n, imm_s and imm_r using + // the following table: + // + // N imms immr size S R + // 1 ssssss rrrrrr 64 UInt(ssssss) UInt(rrrrrr) + // 0 0sssss xrrrrr 32 UInt(sssss) UInt(rrrrr) + // 0 10ssss xxrrrr 16 UInt(ssss) UInt(rrrr) + // 0 110sss xxxrrr 8 UInt(sss) UInt(rrr) + // 0 1110ss xxxxrr 4 UInt(ss) UInt(rr) + // 0 11110s xxxxxr 2 UInt(s) UInt(r) + // (s bits must not be all set) + // + // A pattern is constructed of size bits, where the least significant S+1 bits + // are set. The pattern is rotated right by R, and repeated across a 32 or + // 64-bit value, depending on destination register width. + // + // Put another way: the basic format of a logical immediate is a single + // contiguous stretch of 1 bits, repeated across the whole word at intervals + // given by a power of 2. To identify them quickly, we first locate the + // lowest stretch of 1 bits, then the next 1 bit above that; that combination + // is different for every logical immediate, so it gives us all the + // information we need to identify the only logical immediate that our input + // could be, and then we simply check if that's the value we actually have. + // + // (The rotation parameter does give the possibility of the stretch of 1 bits + // going 'round the end' of the word. To deal with that, we observe that in + // any situation where that happens the bitwise NOT of the value is also a + // valid logical immediate. So we simply invert the input whenever its low bit + // is set, and then we know that the rotated case can't arise.) + + if (value & 1) { + // If the low bit is 1, negate the value, and set a flag to remember that we + // did (so that we can adjust the return values appropriately). + negate = true; + value = ~value; + } + + if (width == kWRegSizeInBits) { + // To handle 32-bit logical immediates, the very easiest thing is to repeat + // the input value twice to make a 64-bit word. The correct encoding of that + // as a logical immediate will also be the correct encoding of the 32-bit + // value. + + // The most-significant 32 bits may not be zero (ie. negate is true) so + // shift the value left before duplicating it. + value <<= kWRegSizeInBits; + value |= value >> kWRegSizeInBits; + } + + // The basic analysis idea: imagine our input word looks like this. + // + // 0011111000111110001111100011111000111110001111100011111000111110 + // c b a + // |<--d-->| + // + // We find the lowest set bit (as an actual power-of-2 value, not its index) + // and call it a. Then we add a to our original number, which wipes out the + // bottommost stretch of set bits and replaces it with a 1 carried into the + // next zero bit. 
Then we look for the new lowest set bit, which is in + // position b, and subtract it, so now our number is just like the original + // but with the lowest stretch of set bits completely gone. Now we find the + // lowest set bit again, which is position c in the diagram above. Then we'll + // measure the distance d between bit positions a and c (using CLZ), and that + // tells us that the only valid logical immediate that could possibly be equal + // to this number is the one in which a stretch of bits running from a to just + // below b is replicated every d bits. + uint64_t a = LargestPowerOf2Divisor(value); + uint64_t value_plus_a = value + a; + uint64_t b = LargestPowerOf2Divisor(value_plus_a); + uint64_t value_plus_a_minus_b = value_plus_a - b; + uint64_t c = LargestPowerOf2Divisor(value_plus_a_minus_b); + + int d, clz_a, out_n; + uint64_t mask; + + if (c != 0) { + // The general case, in which there is more than one stretch of set bits. + // Compute the repeat distance d, and set up a bitmask covering the basic + // unit of repetition (i.e. a word with the bottom d bits set). Also, in all + // of these cases the N bit of the output will be zero. + clz_a = CountLeadingZeros(a, kXRegSizeInBits); + int clz_c = CountLeadingZeros(c, kXRegSizeInBits); + d = clz_a - clz_c; + mask = ((UINT64_C(1) << d) - 1); + out_n = 0; + } else { + // Handle degenerate cases. + // + // If any of those 'find lowest set bit' operations didn't find a set bit at + // all, then the word will have been zero thereafter, so in particular the + // last lowest_set_bit operation will have returned zero. So we can test for + // all the special case conditions in one go by seeing if c is zero. + if (a == 0) { + // The input was zero (or all 1 bits, which will come to here too after we + // inverted it at the start of the function), for which we just return + // false. + return false; + } else { + // Otherwise, if c was zero but a was not, then there's just one stretch + // of set bits in our word, meaning that we have the trivial case of + // d == 64 and only one 'repetition'. Set up all the same variables as in + // the general case above, and set the N bit in the output. + clz_a = CountLeadingZeros(a, kXRegSizeInBits); + d = 64; + mask = ~UINT64_C(0); + out_n = 1; + } + } + + // If the repeat period d is not a power of two, it can't be encoded. + if (!Dynarmic::Common::IsPow2(d)) + return false; + + // If the bit stretch (b - a) does not fit within the mask derived from the + // repeat period, then fail. + if (((b - a) & ~mask) != 0) + return false; + + // The only possible option is b - a repeated every d bits. Now we're going to + // actually construct the valid logical immediate derived from that + // specification, and see if it equals our original input. + // + // To repeat a value every d bits, we multiply it by a number of the form + // (1 + 2^d + 2^(2d) + ...), i.e. 0x0001000100010001 or similar. These can + // be derived using a table lookup on CLZ(d). + static const std::array multipliers = {{ + 0x0000000000000001UL, + 0x0000000100000001UL, + 0x0001000100010001UL, + 0x0101010101010101UL, + 0x1111111111111111UL, + 0x5555555555555555UL, + }}; + + int multiplier_idx = CountLeadingZeros(d, kXRegSizeInBits) - 57; + + // Ensure that the index to the multipliers array is within bounds. 
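+    // Valid repeat periods d are powers of two between 2 and 64, so CLZ(d) lies in
+    // [57, 62] and multiplier_idx in [0, 5].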
+ DEBUG_ASSERT((multiplier_idx >= 0) && + (static_cast(multiplier_idx) < multipliers.size())); + + uint64_t multiplier = multipliers[multiplier_idx]; + uint64_t candidate = (b - a) * multiplier; + + // The candidate pattern doesn't match our input value, so fail. + if (value != candidate) + return false; + + // We have a match! This is a valid logical immediate, so now we have to + // construct the bits and pieces of the instruction encoding that generates + // it. + + // Count the set bits in our basic stretch. The special case of clz(0) == -1 + // makes the answer come out right for stretches that reach the very top of + // the word (e.g. numbers like 0xffffc00000000000). + int clz_b = (b == 0) ? -1 : CountLeadingZeros(b, kXRegSizeInBits); + int s = clz_a - clz_b; + + // Decide how many bits to rotate right by, to put the low bit of that basic + // stretch in position a. + int r; + if (negate) { + // If we inverted the input right at the start of this function, here's + // where we compensate: the number of set bits becomes the number of clear + // bits, and the rotation count is based on position b rather than position + // a (since b is the location of the 'lowest' 1 bit after inversion). + s = d - s; + r = (clz_b + 1) & (d - 1); + } else { + r = (clz_a + 1) & (d - 1); + } + + // Now we're done, except for having to encode the S output in such a way that + // it gives both the number of set bits and the length of the repeated + // segment. The s field is encoded like this: + // + // imms size S + // ssssss 64 UInt(ssssss) + // 0sssss 32 UInt(sssss) + // 10ssss 16 UInt(ssss) + // 110sss 8 UInt(sss) + // 1110ss 4 UInt(ss) + // 11110s 2 UInt(s) + // + // So we 'or' (-d << 1) with our computed s to form imms. + *n = out_n; + *imm_s = ((-d << 1) | (s - 1)) & 0x3f; + *imm_r = r; + + return true; +} + +float FPImm8ToFloat(u8 bits) { + const u32 sign = bits >> 7; + const u32 bit6 = (bits >> 6) & 1; + const u32 exp = ((!bit6) << 7) | (0x7C * bit6) | ((bits >> 4) & 3); + const u32 mantissa = (bits & 0xF) << 19; + const u32 f = (sign << 31) | (exp << 23) | mantissa; + + return Dynarmic::Common::BitCast(f); +} + +bool FPImm8FromFloat(float value, u8* imm_out) { + const u32 f = Dynarmic::Common::BitCast(value); + const u32 mantissa4 = (f & 0x7FFFFF) >> 19; + const u32 exponent = (f >> 23) & 0xFF; + const u32 sign = f >> 31; + + if ((exponent >> 7) == ((exponent >> 6) & 1)) + return false; + + const u8 imm8 = static_cast((sign << 7) | ((!(exponent >> 7)) << 6) | + ((exponent & 3) << 4) | mantissa4); + const float new_float = FPImm8ToFloat(imm8); + if (new_float == value) + *imm_out = imm8; + else + return false; + + return true; +} + +static constexpr bool IsInRangeImm19(s64 distance) { + return (distance >= -0x40000 && distance <= 0x3FFFF); +} + +static constexpr bool IsInRangeImm14(s64 distance) { + return (distance >= -0x2000 && distance <= 0x1FFF); +} + +static constexpr bool IsInRangeImm26(s64 distance) { + return (distance >= -0x2000000 && distance <= 0x1FFFFFF); +} + +static constexpr u32 MaskImm19(s64 distance) { + return distance & 0x7FFFF; +} + +static constexpr u32 MaskImm14(s64 distance) { + return distance & 0x3FFF; +} + +static constexpr u32 MaskImm26(s64 distance) { + return distance & 0x3FFFFFF; +} + +} // Anonymous namespace + +void ARM64XEmitter::SetCodePtrUnsafe(u8* ptr) { + m_code = ptr; +} + +void ARM64XEmitter::SetCodePtr(u8* ptr) { + SetCodePtrUnsafe(ptr); + m_lastCacheFlushEnd = ptr; +} + +const u8* ARM64XEmitter::GetCodePtr() const { + return m_code; +} + +u8* 
ARM64XEmitter::GetWritableCodePtr() { + return m_code; +} + +void ARM64XEmitter::ReserveCodeSpace(u32 bytes) { + for (u32 i = 0; i < bytes / 4; i++) + BRK(0); +} + +const u8* ARM64XEmitter::AlignCode16() { + int c = int((u64)m_code & 15); + if (c) + ReserveCodeSpace(16 - c); + return m_code; +} + +const u8* ARM64XEmitter::AlignCodePage() { + int c = int((u64)m_code & 4095); + if (c) + ReserveCodeSpace(4096 - c); + return m_code; +} + +void ARM64XEmitter::Write32(u32 value) { + std::memcpy(m_code, &value, sizeof(u32)); + m_code += sizeof(u32); +} + +void ARM64XEmitter::FlushIcache() { + FlushIcacheSection(m_lastCacheFlushEnd, m_code); + m_lastCacheFlushEnd = m_code; +} + +void ARM64XEmitter::FlushIcacheSection(const u8* start, const u8* end) { + if (start == end) + return; + +#if defined(__APPLE__) + // Header file says this is equivalent to: sys_icache_invalidate(start, end - + // start); + sys_cache_control(kCacheFunctionPrepareForExecution, const_cast(start), end - start); +#else + // Don't rely on GCC's __clear_cache implementation, as it caches + // icache/dcache cache line sizes, that can vary between cores on + // big.LITTLE architectures. + u64 addr, ctr_el0; + static size_t icache_line_size = 0xffff, dcache_line_size = 0xffff; + size_t isize, dsize; + + __asm__ volatile("mrs %0, ctr_el0" : "=r"(ctr_el0)); + isize = 4 << ((ctr_el0 >> 0) & 0xf); + dsize = 4 << ((ctr_el0 >> 16) & 0xf); + + // use the global minimum cache line size + icache_line_size = isize = icache_line_size < isize ? icache_line_size : isize; + dcache_line_size = dsize = dcache_line_size < dsize ? dcache_line_size : dsize; + + addr = reinterpret_cast(start) & ~static_cast(dsize - 1); + for (; addr < reinterpret_cast(end); addr += dsize) + // use "civac" instead of "cvau", as this is the suggested workaround for + // Cortex-A53 errata 819472, 826319, 827319 and 824069. 
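+    // Clean and invalidate each D-cache line covering [start, end) to the point of coherency.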
+ __asm__ volatile("dc civac, %0" : : "r"(addr) : "memory"); + __asm__ volatile("dsb ish" : : : "memory"); + + addr = reinterpret_cast(start) & ~static_cast(isize - 1); + for (; addr < reinterpret_cast(end); addr += isize) + __asm__ volatile("ic ivau, %0" : : "r"(addr) : "memory"); + + __asm__ volatile("dsb ish" : : : "memory"); + __asm__ volatile("isb" : : : "memory"); +#endif +} + +// Exception generation +static const u32 ExcEnc[][3] = { + {0, 0, 1}, // SVC + {0, 0, 2}, // HVC + {0, 0, 3}, // SMC + {1, 0, 0}, // BRK + {2, 0, 0}, // HLT + {5, 0, 1}, // DCPS1 + {5, 0, 2}, // DCPS2 + {5, 0, 3}, // DCPS3 +}; + +// Arithmetic generation +static const u32 ArithEnc[] = { + 0x058, // ADD + 0x258, // SUB +}; + +// Conditional Select +static const u32 CondSelectEnc[][2] = { + {0, 0}, // CSEL + {0, 1}, // CSINC + {1, 0}, // CSINV + {1, 1}, // CSNEG +}; + +// Data-Processing (1 source) +static const u32 Data1SrcEnc[][2] = { + {0, 0}, // RBIT + {0, 1}, // REV16 + {0, 2}, // REV32 + {0, 3}, // REV64 + {0, 4}, // CLZ + {0, 5}, // CLS +}; + +// Data-Processing (2 source) +static const u32 Data2SrcEnc[] = { + 0x02, // UDIV + 0x03, // SDIV + 0x08, // LSLV + 0x09, // LSRV + 0x0A, // ASRV + 0x0B, // RORV + 0x10, // CRC32B + 0x11, // CRC32H + 0x12, // CRC32W + 0x14, // CRC32CB + 0x15, // CRC32CH + 0x16, // CRC32CW + 0x13, // CRC32X (64bit Only) + 0x17, // XRC32CX (64bit Only) +}; + +// Data-Processing (3 source) +static const u32 Data3SrcEnc[][2] = { + {0, 0}, // MADD + {0, 1}, // MSUB + {1, 0}, // SMADDL (64Bit Only) + {1, 1}, // SMSUBL (64Bit Only) + {2, 0}, // SMULH (64Bit Only) + {5, 0}, // UMADDL (64Bit Only) + {5, 1}, // UMSUBL (64Bit Only) + {6, 0}, // UMULH (64Bit Only) +}; + +// Logical (shifted register) +static const u32 LogicalEnc[][2] = { + {0, 0}, // AND + {0, 1}, // BIC + {1, 0}, // OOR + {1, 1}, // ORN + {2, 0}, // EOR + {2, 1}, // EON + {3, 0}, // ANDS + {3, 1}, // BICS +}; + +// Load/Store Exclusive +static const u32 LoadStoreExcEnc[][5] = { + {0, 0, 0, 0, 0}, // STXRB + {0, 0, 0, 0, 1}, // STLXRB + {0, 0, 1, 0, 0}, // LDXRB + {0, 0, 1, 0, 1}, // LDAXRB + {0, 1, 0, 0, 1}, // STLRB + {0, 1, 1, 0, 1}, // LDARB + {1, 0, 0, 0, 0}, // STXRH + {1, 0, 0, 0, 1}, // STLXRH + {1, 0, 1, 0, 0}, // LDXRH + {1, 0, 1, 0, 1}, // LDAXRH + {1, 1, 0, 0, 1}, // STLRH + {1, 1, 1, 0, 1}, // LDARH + {2, 0, 0, 0, 0}, // STXR + {3, 0, 0, 0, 0}, // (64bit) STXR + {2, 0, 0, 0, 1}, // STLXR + {3, 0, 0, 0, 1}, // (64bit) STLXR + {2, 0, 0, 1, 0}, // STXP + {3, 0, 0, 1, 0}, // (64bit) STXP + {2, 0, 0, 1, 1}, // STLXP + {3, 0, 0, 1, 1}, // (64bit) STLXP + {2, 0, 1, 0, 0}, // LDXR + {3, 0, 1, 0, 0}, // (64bit) LDXR + {2, 0, 1, 0, 1}, // LDAXR + {3, 0, 1, 0, 1}, // (64bit) LDAXR + {2, 0, 1, 1, 0}, // LDXP + {3, 0, 1, 1, 0}, // (64bit) LDXP + {2, 0, 1, 1, 1}, // LDAXP + {3, 0, 1, 1, 1}, // (64bit) LDAXP + {2, 1, 0, 0, 1}, // STLR + {3, 1, 0, 0, 1}, // (64bit) STLR + {2, 1, 1, 0, 1}, // LDAR + {3, 1, 1, 0, 1}, // (64bit) LDAR +}; + +void ARM64XEmitter::EncodeCompareBranchInst(u32 op, ARM64Reg Rt, const void* ptr) { + bool b64Bit = Is64Bit(Rt); + s64 distance = reinterpret_cast(ptr) - reinterpret_cast(m_code); + + ASSERT_MSG(!(distance & 0x3), "%s: distance must be a multiple of 4: %" PRIx64, __func__, + distance); + + distance >>= 2; + + ASSERT_MSG(distance >= -0x40000 && distance <= 0x3FFFF, + "%s: Received too large distance: %" PRIx64, __func__, distance); + + Rt = DecodeReg(Rt); + Write32((b64Bit << 31) | (0x34 << 24) | (op << 24) | + ((static_cast(distance) << 5) & 0xFFFFE0) | Rt); +} + +void 
ARM64XEmitter::EncodeTestBranchInst(u32 op, ARM64Reg Rt, u8 bits, const void* ptr) { + bool b64Bit = Is64Bit(Rt); + s64 distance = reinterpret_cast(ptr) - reinterpret_cast(m_code); + + ASSERT_MSG(!(distance & 0x3), "%s: distance must be a multiple of 4: %" PRIx64, __func__, + distance); + + distance >>= 2; + + ASSERT_MSG(distance >= -0x3FFF && distance < 0x3FFF, + "%s: Received too large distance: %" PRIx64, __func__, distance); + + Rt = DecodeReg(Rt); + Write32((b64Bit << 31) | (0x36 << 24) | (op << 24) | (bits << 19) | + ((static_cast(distance) << 5) & 0x7FFE0) | Rt); +} + +void ARM64XEmitter::EncodeUnconditionalBranchInst(u32 op, const void* ptr) { + s64 distance = reinterpret_cast(ptr) - reinterpret_cast(m_code); + + ASSERT_MSG(!(distance & 0x3), "%s: distance must be a multiple of 4: %" PRIx64, __func__, + distance); + + distance >>= 2; + + ASSERT_MSG(distance >= -0x2000000LL && distance <= 0x1FFFFFFLL, + "%s: Received too large distance: %" PRIx64, __func__, distance); + + Write32((op << 31) | (0x5 << 26) | (distance & 0x3FFFFFF)); +} + +void ARM64XEmitter::EncodeUnconditionalBranchInst(u32 opc, u32 op2, u32 op3, u32 op4, ARM64Reg Rn) { + Rn = DecodeReg(Rn); + Write32((0x6B << 25) | (opc << 21) | (op2 << 16) | (op3 << 10) | (Rn << 5) | op4); +} + +void ARM64XEmitter::EncodeExceptionInst(u32 instenc, u32 imm) { + ASSERT_MSG(!(imm & ~0xFFFF), "%s: Exception instruction too large immediate: %d", __func__, + imm); + + Write32((0xD4 << 24) | (ExcEnc[instenc][0] << 21) | (imm << 5) | (ExcEnc[instenc][1] << 2) | + ExcEnc[instenc][2]); +} + +void ARM64XEmitter::EncodeSystemInst(u32 op0, u32 op1, u32 CRn, u32 CRm, u32 op2, ARM64Reg Rt) { + Write32((0x354 << 22) | (op0 << 19) | (op1 << 16) | (CRn << 12) | (CRm << 8) | (op2 << 5) | Rt); +} + +void ARM64XEmitter::EncodeArithmeticInst(u32 instenc, bool flags, ARM64Reg Rd, ARM64Reg Rn, + ARM64Reg Rm, ArithOption Option) { + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + Write32((b64Bit << 31) | (flags << 29) | (ArithEnc[instenc] << 21) | + (Option.GetType() == ArithOption::TYPE_EXTENDEDREG ? 
(1 << 21) : 0) | (Rm << 16) | + Option.GetData() | (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeArithmeticCarryInst(u32 op, bool flags, ARM64Reg Rd, ARM64Reg Rn, + ARM64Reg Rm) { + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rm = DecodeReg(Rm); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (op << 30) | (flags << 29) | (0xD0 << 21) | (Rm << 16) | (Rn << 5) | + Rd); +} + +void ARM64XEmitter::EncodeCondCompareImmInst(u32 op, ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond) { + bool b64Bit = Is64Bit(Rn); + + ASSERT_MSG(!(imm & ~0x1F), "%s: too large immediate: %d", __func__, imm); + ASSERT_MSG(!(nzcv & ~0xF), "%s: Flags out of range: %d", __func__, nzcv); + + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (op << 30) | (1 << 29) | (0xD2 << 21) | (imm << 16) | (cond << 12) | + (1 << 11) | (Rn << 5) | nzcv); +} + +void ARM64XEmitter::EncodeCondCompareRegInst(u32 op, ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, + CCFlags cond) { + bool b64Bit = Is64Bit(Rm); + + ASSERT_MSG(!(nzcv & ~0xF), "%s: Flags out of range: %d", __func__, nzcv); + + Rm = DecodeReg(Rm); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (op << 30) | (1 << 29) | (0xD2 << 21) | (Rm << 16) | (cond << 12) | + (Rn << 5) | nzcv); +} + +void ARM64XEmitter::EncodeCondSelectInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, + CCFlags cond) { + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rm = DecodeReg(Rm); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (CondSelectEnc[instenc][0] << 30) | (0xD4 << 21) | (Rm << 16) | + (cond << 12) | (CondSelectEnc[instenc][1] << 10) | (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeData1SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn) { + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (0x2D6 << 21) | (Data1SrcEnc[instenc][0] << 16) | + (Data1SrcEnc[instenc][1] << 10) | (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeData2SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rm = DecodeReg(Rm); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (0x0D6 << 21) | (Rm << 16) | (Data2SrcEnc[instenc] << 10) | (Rn << 5) | + Rd); +} + +void ARM64XEmitter::EncodeData3SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, + ARM64Reg Ra) { + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rm = DecodeReg(Rm); + Rn = DecodeReg(Rn); + Ra = DecodeReg(Ra); + Write32((b64Bit << 31) | (0xD8 << 21) | (Data3SrcEnc[instenc][0] << 21) | (Rm << 16) | + (Data3SrcEnc[instenc][1] << 15) | (Ra << 10) | (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeLogicalInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, + ArithOption Shift) { + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rm = DecodeReg(Rm); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (LogicalEnc[instenc][0] << 29) | (0x5 << 25) | + (LogicalEnc[instenc][1] << 21) | Shift.GetData() | (Rm << 16) | (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeLoadRegisterInst(u32 bitop, ARM64Reg Rt, s32 imm) { + bool b64Bit = Is64Bit(Rt); + bool bVec = IsVector(Rt); + + ASSERT_MSG(IsInRangeImm19(imm), "{}: offset too large {}", __func__, imm); + + Rt = DecodeReg(Rt); + if (b64Bit && bitop != 0x2) // LDRSW(0x2) uses 64bit reg, doesn't have 64bit bit set + bitop |= 0x1; + Write32((bitop << 30) | (bVec << 26) | (0x18 << 24) | (MaskImm19(imm) << 5) | Rt); +} + +void ARM64XEmitter::EncodeLoadStoreExcInst(u32 instenc, ARM64Reg Rs, ARM64Reg Rt2, ARM64Reg Rn, + ARM64Reg Rt) { + Rs = DecodeReg(Rs); + Rt2 = DecodeReg(Rt2); + Rn = 
DecodeReg(Rn); + Rt = DecodeReg(Rt); + Write32((LoadStoreExcEnc[instenc][0] << 30) | (0x8 << 24) | + (LoadStoreExcEnc[instenc][1] << 23) | (LoadStoreExcEnc[instenc][2] << 22) | + (LoadStoreExcEnc[instenc][3] << 21) | (Rs << 16) | (LoadStoreExcEnc[instenc][4] << 15) | + (Rt2 << 10) | (Rn << 5) | Rt); +} + +void ARM64XEmitter::EncodeLoadStorePairedInst(u32 op, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, + u32 imm) { + bool b64Bit = Is64Bit(Rt); + bool b128Bit = IsQuad(Rt); + bool bVec = IsVector(Rt); + + if (b128Bit) + imm >>= 4; + else if (b64Bit) + imm >>= 3; + else + imm >>= 2; + + ASSERT_MSG(!(imm & ~0xF), "%s: offset too large %d", __func__, imm); + + u32 opc = 0; + if (b128Bit) + opc = 2; + else if (b64Bit && bVec) + opc = 1; + else if (b64Bit && !bVec) + opc = 2; + + Rt = DecodeReg(Rt); + Rt2 = DecodeReg(Rt2); + Rn = DecodeReg(Rn); + Write32((opc << 30) | (bVec << 26) | (op << 22) | (imm << 15) | (Rt2 << 10) | (Rn << 5) | Rt); +} + +void ARM64XEmitter::EncodeLoadStoreIndexedInst(u32 op, u32 op2, ARM64Reg Rt, ARM64Reg Rn, s32 imm) { + bool b64Bit = Is64Bit(Rt); + bool bVec = IsVector(Rt); + + u32 offset = imm & 0x1FF; + + ASSERT_MSG(!(imm < -256 || imm > 255), "%s: offset too large %d", __func__, imm); + + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + Write32((b64Bit << 30) | (op << 22) | (bVec << 26) | (offset << 12) | (op2 << 10) | (Rn << 5) | + Rt); +} + +void ARM64XEmitter::EncodeLoadStoreIndexedInst(u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm, u8 size) { + bool b64Bit = Is64Bit(Rt); + bool bVec = IsVector(Rt); + + if (size == 64) + imm >>= 3; + else if (size == 32) + imm >>= 2; + else if (size == 16) + imm >>= 1; + + ASSERT_MSG(imm >= 0, "%s(INDEX_UNSIGNED): offset must be positive %d", __func__, imm); + ASSERT_MSG(!(imm & ~0xFFF), "%s(INDEX_UNSIGNED): offset too large %d", __func__, imm); + + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + Write32((b64Bit << 30) | (op << 22) | (bVec << 26) | (imm << 10) | (Rn << 5) | Rt); +} + +void ARM64XEmitter::EncodeMOVWideInst(u32 op, ARM64Reg Rd, u32 imm, ShiftAmount pos) { + bool b64Bit = Is64Bit(Rd); + + ASSERT_MSG(!(imm & ~0xFFFF), "%s: immediate out of range: %d", __func__, imm); + + Rd = DecodeReg(Rd); + Write32((b64Bit << 31) | (op << 29) | (0x25 << 23) | (pos << 21) | (imm << 5) | Rd); +} + +void ARM64XEmitter::EncodeBitfieldMOVInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms) { + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (op << 29) | (0x26 << 23) | (b64Bit << 22) | (immr << 16) | + (imms << 10) | (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeLoadStoreRegisterOffset(u32 size, u32 opc, ARM64Reg Rt, ARM64Reg Rn, + ArithOption Rm) { + ASSERT_MSG(Rm.GetType() == ArithOption::TYPE_EXTENDEDREG, "Shifted registers are not supported used Indexed registers"); + + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + ARM64Reg decoded_Rm = DecodeReg(Rm.GetReg()); + + Write32((size << 30) | (opc << 22) | (0x1C1 << 21) | (decoded_Rm << 16) | Rm.GetData() | + (1 << 11) | (Rn << 5) | Rt); +} + +void ARM64XEmitter::EncodeAddSubImmInst(u32 op, bool flags, u32 shift, u32 imm, ARM64Reg Rn, + ARM64Reg Rd) { + bool b64Bit = Is64Bit(Rd); + + ASSERT_MSG(!(imm & ~0xFFF), "%s: immediate too large: %x", __func__, imm); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (op << 30) | (flags << 29) | (0x11 << 24) | (shift << 22) | + (imm << 10) | (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeLogicalImmInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, + int n) { + // Sometimes Rd is 
fixed to SP, but can still be 32bit or 64bit. + // Use Rn to determine bitness here. + bool b64Bit = Is64Bit(Rn); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((b64Bit << 31) | (op << 29) | (0x24 << 23) | (n << 22) | (immr << 16) | (imms << 10) | + (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeLoadStorePair(u32 op, u32 load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, + ARM64Reg Rn, s32 imm) { + bool b64Bit = Is64Bit(Rt); + u32 type_encode = 0; + + switch (type) { + case INDEX_SIGNED: + type_encode = 0b010; + break; + case INDEX_POST: + type_encode = 0b001; + break; + case INDEX_PRE: + type_encode = 0b011; + break; + case INDEX_UNSIGNED: + ASSERT_MSG(false, "%s doesn't support INDEX_UNSIGNED!", __func__); + break; + } + + if (b64Bit) { + op |= 0b10; + imm >>= 3; + } else { + imm >>= 2; + } + + Rt = DecodeReg(Rt); + Rt2 = DecodeReg(Rt2); + Rn = DecodeReg(Rn); + + Write32((op << 30) | (0b101 << 27) | (type_encode << 23) | (load << 22) | ((imm & 0x7F) << 15) | + (Rt2 << 10) | (Rn << 5) | Rt); +} +void ARM64XEmitter::EncodeAddressInst(u32 op, ARM64Reg Rd, s32 imm) { + Rd = DecodeReg(Rd); + + Write32((op << 31) | ((imm & 0x3) << 29) | (0x10 << 24) | ((imm & 0x1FFFFC) << 3) | Rd); +} + +void ARM64XEmitter::EncodeLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm) { + ASSERT_MSG(!(imm < -256 || imm > 255), "%s received too large offset: %d", __func__, imm); + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + + Write32((size << 30) | (0b111 << 27) | (op << 22) | ((imm & 0x1FF) << 12) | (Rn << 5) | Rt); +} + +// FixupBranch branching +void ARM64XEmitter::SetJumpTarget(FixupBranch const& branch, u8* target) { + if(!target) + target = m_code; + bool Not = false; + u32 inst = 0; + s64 distance = static_cast(target - branch.ptr); + distance >>= 2; + + switch (branch.type) { + case 1: // CBNZ + Not = true; + [[fallthrough]]; + case 0: // CBZ + { + ASSERT_MSG(IsInRangeImm19(distance), "%s(%d): Received too large distance: %" PRIx64, + __func__, branch.type, distance); + bool b64Bit = Is64Bit(branch.reg); + ARM64Reg reg = DecodeReg(branch.reg); + inst = (b64Bit << 31) | (0x1A << 25) | (Not << 24) | (MaskImm19(distance) << 5) | reg; + } break; + case 2: // B (conditional) + ASSERT_MSG(IsInRangeImm19(distance), "%s(%d): Received too large distance: %" PRIx64, + __func__, branch.type, distance); + inst = (0x2A << 25) | (MaskImm19(distance) << 5) | branch.cond; + break; + case 4: // TBNZ + Not = true; + [[fallthrough]]; + case 3: // TBZ + { + ASSERT_MSG(IsInRangeImm14(distance), "%s(%d): Received too large distance: %" PRIx64, + __func__, branch.type, distance); + ARM64Reg reg = DecodeReg(branch.reg); + inst = ((branch.bit & 0x20) << 26) | (0x1B << 25) | (Not << 24) | + ((branch.bit & 0x1F) << 19) | (MaskImm14(distance) << 5) | reg; + } break; + case 5: // B (unconditional) + ASSERT_MSG(IsInRangeImm26(distance), "%s(%d): Received too large distance: %" PRIx64, + __func__, branch.type, distance); + inst = (0x5 << 26) | MaskImm26(distance); + break; + case 6: // BL (unconditional) + ASSERT_MSG(IsInRangeImm26(distance), "%s(%d): Received too large distance: %" PRIx64, + __func__, branch.type, distance); + inst = (0x25 << 26) | MaskImm26(distance); + break; + } + std::memcpy(branch.ptr, &inst, sizeof(inst)); +} + +FixupBranch ARM64XEmitter::CBZ(ARM64Reg Rt) { + FixupBranch branch; + branch.ptr = m_code; + branch.type = 0; + branch.reg = Rt; + HINT(HINT_NOP); + return branch; +} +FixupBranch ARM64XEmitter::CBNZ(ARM64Reg Rt) { + FixupBranch branch; + branch.ptr = m_code; + branch.type = 
1; + branch.reg = Rt; + HINT(HINT_NOP); + return branch; +} +FixupBranch ARM64XEmitter::B(CCFlags cond) { + FixupBranch branch; + branch.ptr = m_code; + branch.type = 2; + branch.cond = cond; + HINT(HINT_NOP); + return branch; +} +FixupBranch ARM64XEmitter::TBZ(ARM64Reg Rt, u8 bit) { + FixupBranch branch; + branch.ptr = m_code; + branch.type = 3; + branch.reg = Rt; + branch.bit = bit; + HINT(HINT_NOP); + return branch; +} +FixupBranch ARM64XEmitter::TBNZ(ARM64Reg Rt, u8 bit) { + FixupBranch branch; + branch.ptr = m_code; + branch.type = 4; + branch.reg = Rt; + branch.bit = bit; + HINT(HINT_NOP); + return branch; +} +FixupBranch ARM64XEmitter::B() { + FixupBranch branch; + branch.ptr = m_code; + branch.type = 5; + HINT(HINT_NOP); + return branch; +} +FixupBranch ARM64XEmitter::BL() { + FixupBranch branch; + branch.ptr = m_code; + branch.type = 6; + HINT(HINT_NOP); + return branch; +} + +// Compare and Branch +void ARM64XEmitter::CBZ(ARM64Reg Rt, const void* ptr) { + EncodeCompareBranchInst(0, Rt, ptr); +} +void ARM64XEmitter::CBNZ(ARM64Reg Rt, const void* ptr) { + EncodeCompareBranchInst(1, Rt, ptr); +} + +// Conditional Branch +void ARM64XEmitter::B(CCFlags cond, const void* ptr) { + s64 distance = reinterpret_cast(ptr) - reinterpret_cast(m_code); + + distance >>= 2; + + ASSERT_MSG(IsInRangeImm19(distance), + "%s: Received too large distance: %p->%p %" PRIi64 " %" PRIx64, __func__, m_code, + ptr, distance, distance); + Write32((0x54 << 24) | (MaskImm19(distance) << 5) | cond); +} + +// Test and Branch +void ARM64XEmitter::TBZ(ARM64Reg Rt, u8 bits, const void* ptr) { + EncodeTestBranchInst(0, Rt, bits, ptr); +} +void ARM64XEmitter::TBNZ(ARM64Reg Rt, u8 bits, const void* ptr) { + EncodeTestBranchInst(1, Rt, bits, ptr); +} + +// Unconditional Branch +void ARM64XEmitter::B(const void* ptr) { + EncodeUnconditionalBranchInst(0, ptr); +} +void ARM64XEmitter::BL(const void* ptr) { + EncodeUnconditionalBranchInst(1, ptr); +} + +void ARM64XEmitter::QuickCallFunction(const void* func, ARM64Reg scratchreg) { + s64 distance = reinterpret_cast(func) - reinterpret_cast(m_code); + distance >>= 2; // Can only branch to opcode-aligned (4) addresses + if (!IsInRangeImm26(distance)) { + // WARN_LOG( "Distance too far in function call (%p to %p)! 
Using scratch.", + // m_code, func); + MOVI2R(scratchreg, reinterpret_cast(func)); + BLR(scratchreg); + } else { + BL(func); + } +} + +// Unconditional Branch (register) +void ARM64XEmitter::BR(ARM64Reg Rn) { + EncodeUnconditionalBranchInst(0, 0x1F, 0, 0, Rn); +} +void ARM64XEmitter::BLR(ARM64Reg Rn) { + EncodeUnconditionalBranchInst(1, 0x1F, 0, 0, Rn); +} +void ARM64XEmitter::RET(ARM64Reg Rn) { + EncodeUnconditionalBranchInst(2, 0x1F, 0, 0, Rn); +} +void ARM64XEmitter::ERET() { + EncodeUnconditionalBranchInst(4, 0x1F, 0, 0, SP); +} +void ARM64XEmitter::DRPS() { + EncodeUnconditionalBranchInst(5, 0x1F, 0, 0, SP); +} + +// Exception generation +void ARM64XEmitter::SVC(u32 imm) { + EncodeExceptionInst(0, imm); +} + +void ARM64XEmitter::HVC(u32 imm) { + EncodeExceptionInst(1, imm); +} + +void ARM64XEmitter::SMC(u32 imm) { + EncodeExceptionInst(2, imm); +} + +void ARM64XEmitter::BRK(u32 imm) { + EncodeExceptionInst(3, imm); +} + +void ARM64XEmitter::HLT(u32 imm) { + EncodeExceptionInst(4, imm); +} + +void ARM64XEmitter::DCPS1(u32 imm) { + EncodeExceptionInst(5, imm); +} + +void ARM64XEmitter::DCPS2(u32 imm) { + EncodeExceptionInst(6, imm); +} + +void ARM64XEmitter::DCPS3(u32 imm) { + EncodeExceptionInst(7, imm); +} + +// System +void ARM64XEmitter::_MSR(PStateField field, u8 imm) { + u32 op1 = 0, op2 = 0; + switch (field) { + case FIELD_SPSel: + op1 = 0; + op2 = 5; + break; + case FIELD_DAIFSet: + op1 = 3; + op2 = 6; + break; + case FIELD_DAIFClr: + op1 = 3; + op2 = 7; + break; + default: + ASSERT_MSG(false, "Invalid PStateField to do a imm move to"); + break; + } + EncodeSystemInst(0, op1, 4, imm, op2, WSP); +} + +static void GetSystemReg(PStateField field, int& o0, int& op1, int& CRn, int& CRm, int& op2) { + switch (field) { + case FIELD_NZCV: + o0 = 3; + op1 = 3; + CRn = 4; + CRm = 2; + op2 = 0; + break; + case FIELD_FPCR: + o0 = 3; + op1 = 3; + CRn = 4; + CRm = 4; + op2 = 0; + break; + case FIELD_FPSR: + o0 = 3; + op1 = 3; + CRn = 4; + CRm = 4; + op2 = 1; + break; + case FIELD_PMCR_EL0: + o0 = 3; + op1 = 3; + CRn = 9; + CRm = 6; + op2 = 0; + break; + case FIELD_PMCCNTR_EL0: + o0 = 3; + op1 = 3; + CRn = 9; + CRm = 7; + op2 = 0; + break; + default: + ASSERT_MSG(false, "Invalid PStateField to do a register move from/to"); + break; + } +} + +void ARM64XEmitter::_MSR(PStateField field, ARM64Reg Rt) { + int o0 = 0, op1 = 0, CRn = 0, CRm = 0, op2 = 0; + ASSERT_MSG(Is64Bit(Rt), "MSR: Rt must be 64-bit"); + GetSystemReg(field, o0, op1, CRn, CRm, op2); + EncodeSystemInst(o0, op1, CRn, CRm, op2, DecodeReg(Rt)); +} + +void ARM64XEmitter::MRS(ARM64Reg Rt, PStateField field) { + int o0 = 0, op1 = 0, CRn = 0, CRm = 0, op2 = 0; + ASSERT_MSG(Is64Bit(Rt), "MRS: Rt must be 64-bit"); + GetSystemReg(field, o0, op1, CRn, CRm, op2); + EncodeSystemInst(o0 | 4, op1, CRn, CRm, op2, DecodeReg(Rt)); +} + +void ARM64XEmitter::CNTVCT(ARM64Reg Rt) { + ASSERT_MSG(Is64Bit(Rt), "CNTVCT: Rt must be 64-bit"); + + // MRS , CNTVCT_EL0 ; Read CNTVCT_EL0 into Xt + EncodeSystemInst(3 | 4, 3, 0xe, 0, 2, DecodeReg(Rt)); +} + +void ARM64XEmitter::HINT(SystemHint op) { + EncodeSystemInst(0, 3, 2, 0, op, WSP); +} +void ARM64XEmitter::CLREX() { + EncodeSystemInst(0, 3, 3, 0, 2, WSP); +} +void ARM64XEmitter::DSB(BarrierType type) { + EncodeSystemInst(0, 3, 3, type, 4, WSP); +} +void ARM64XEmitter::DMB(BarrierType type) { + EncodeSystemInst(0, 3, 3, type, 5, WSP); +} +void ARM64XEmitter::ISB(BarrierType type) { + EncodeSystemInst(0, 3, 3, type, 6, WSP); +} + +// Add/Subtract (extended register) +void ARM64XEmitter::ADD(ARM64Reg Rd, 
ARM64Reg Rn, ARM64Reg Rm) { + ADD(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); +} + +void ARM64XEmitter::ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option) { + EncodeArithmeticInst(0, false, Rd, Rn, Rm, Option); +} + +void ARM64XEmitter::ADDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EncodeArithmeticInst(0, true, Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); +} + +void ARM64XEmitter::ADDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option) { + EncodeArithmeticInst(0, true, Rd, Rn, Rm, Option); +} + +void ARM64XEmitter::SUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + SUB(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); +} + +void ARM64XEmitter::SUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option) { + EncodeArithmeticInst(1, false, Rd, Rn, Rm, Option); +} + +void ARM64XEmitter::SUBS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EncodeArithmeticInst(1, true, Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); +} + +void ARM64XEmitter::SUBS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option) { + EncodeArithmeticInst(1, true, Rd, Rn, Rm, Option); +} + +void ARM64XEmitter::CMN(ARM64Reg Rn, ARM64Reg Rm) { + CMN(Rn, Rm, ArithOption(Rn, ST_LSL, 0)); +} + +void ARM64XEmitter::CMN(ARM64Reg Rn, ARM64Reg Rm, ArithOption Option) { + EncodeArithmeticInst(0, true, Is64Bit(Rn) ? ZR : WZR, Rn, Rm, Option); +} + +void ARM64XEmitter::CMP(ARM64Reg Rn, ARM64Reg Rm) { + CMP(Rn, Rm, ArithOption(Rn, ST_LSL, 0)); +} + +void ARM64XEmitter::CMP(ARM64Reg Rn, ARM64Reg Rm, ArithOption Option) { + EncodeArithmeticInst(1, true, Is64Bit(Rn) ? ZR : WZR, Rn, Rm, Option); +} + +// Add/Subtract (with carry) +void ARM64XEmitter::ADC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EncodeArithmeticCarryInst(0, false, Rd, Rn, Rm); +} +void ARM64XEmitter::ADCS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EncodeArithmeticCarryInst(0, true, Rd, Rn, Rm); +} +void ARM64XEmitter::SBC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EncodeArithmeticCarryInst(1, false, Rd, Rn, Rm); +} +void ARM64XEmitter::SBCS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EncodeArithmeticCarryInst(1, true, Rd, Rn, Rm); +} + +// Conditional Compare (immediate) +void ARM64XEmitter::CCMN(ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond) { + EncodeCondCompareImmInst(0, Rn, imm, nzcv, cond); +} +void ARM64XEmitter::CCMP(ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond) { + EncodeCondCompareImmInst(1, Rn, imm, nzcv, cond); +} + +// Conditiona Compare (register) +void ARM64XEmitter::CCMN(ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond) { + EncodeCondCompareRegInst(0, Rn, Rm, nzcv, cond); +} +void ARM64XEmitter::CCMP(ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond) { + EncodeCondCompareRegInst(1, Rn, Rm, nzcv, cond); +} + +// Conditional Select +void ARM64XEmitter::CSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond) { + EncodeCondSelectInst(0, Rd, Rn, Rm, cond); +} +void ARM64XEmitter::CSINC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond) { + EncodeCondSelectInst(1, Rd, Rn, Rm, cond); +} +void ARM64XEmitter::CSINV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond) { + EncodeCondSelectInst(2, Rd, Rn, Rm, cond); +} +void ARM64XEmitter::CSNEG(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond) { + EncodeCondSelectInst(3, Rd, Rn, Rm, cond); +} + +// Data-Processing 1 source +void ARM64XEmitter::RBIT(ARM64Reg Rd, ARM64Reg Rn) { + EncodeData1SrcInst(0, Rd, Rn); +} +void ARM64XEmitter::REV16(ARM64Reg Rd, ARM64Reg Rn) { + EncodeData1SrcInst(1, Rd, Rn); +} +void ARM64XEmitter::REV32(ARM64Reg Rd, ARM64Reg Rn) { + EncodeData1SrcInst(2, Rd, Rn); +} +void 
ARM64XEmitter::REV64(ARM64Reg Rd, ARM64Reg Rn) { + EncodeData1SrcInst(3, Rd, Rn); +} +void ARM64XEmitter::CLZ(ARM64Reg Rd, ARM64Reg Rn) { + EncodeData1SrcInst(4, Rd, Rn); +} +void ARM64XEmitter::CLS(ARM64Reg Rd, ARM64Reg Rn) { + EncodeData1SrcInst(5, Rd, Rn); +} + +// Data-Processing 2 source +void ARM64XEmitter::UDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EncodeData2SrcInst(0, Rd, Rn, Rm); +} +void ARM64XEmitter::SDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EncodeData2SrcInst(1, Rd, Rn, Rm); +} +void ARM64XEmitter::LSLV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EncodeData2SrcInst(2, Rd, Rn, Rm); +} +void ARM64XEmitter::LSRV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EncodeData2SrcInst(3, Rd, Rn, Rm); +} +void ARM64XEmitter::ASRV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EncodeData2SrcInst(4, Rd, Rn, Rm); +} +void ARM64XEmitter::RORV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EncodeData2SrcInst(5, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32B(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EncodeData2SrcInst(6, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32H(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EncodeData2SrcInst(7, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32W(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EncodeData2SrcInst(8, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32CB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EncodeData2SrcInst(9, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32CH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EncodeData2SrcInst(10, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32CW(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EncodeData2SrcInst(11, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32X(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EncodeData2SrcInst(12, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32CX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EncodeData2SrcInst(13, Rd, Rn, Rm); +} + +// Data-Processing 3 source +void ARM64XEmitter::MADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) { + EncodeData3SrcInst(0, Rd, Rn, Rm, Ra); +} +void ARM64XEmitter::MSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) { + EncodeData3SrcInst(1, Rd, Rn, Rm, Ra); +} +void ARM64XEmitter::SMADDL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) { + EncodeData3SrcInst(2, Rd, Rn, Rm, Ra); +} +void ARM64XEmitter::SMULL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + SMADDL(Rd, Rn, Rm, SP); +} +void ARM64XEmitter::SMSUBL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) { + EncodeData3SrcInst(3, Rd, Rn, Rm, Ra); +} +void ARM64XEmitter::SMULH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EncodeData3SrcInst(4, Rd, Rn, Rm, SP); +} +void ARM64XEmitter::UMADDL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) { + EncodeData3SrcInst(5, Rd, Rn, Rm, Ra); +} +void ARM64XEmitter::UMULL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + UMADDL(Rd, Rn, Rm, SP); +} +void ARM64XEmitter::UMSUBL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) { + EncodeData3SrcInst(6, Rd, Rn, Rm, Ra); +} +void ARM64XEmitter::UMULH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EncodeData3SrcInst(7, Rd, Rn, Rm, SP); +} +void ARM64XEmitter::MUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EncodeData3SrcInst(0, Rd, Rn, Rm, SP); +} +void ARM64XEmitter::MNEG(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EncodeData3SrcInst(1, Rd, Rn, Rm, SP); +} + +// Logical (shifted register) +void ARM64XEmitter::AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) { + EncodeLogicalInst(0, Rd, Rn, Rm, Shift); +} +void ARM64XEmitter::BIC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) { + EncodeLogicalInst(1, Rd, Rn, Rm, Shift); +} +void 
ARM64XEmitter::ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) { + EncodeLogicalInst(2, Rd, Rn, Rm, Shift); +} +void ARM64XEmitter::ORN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) { + EncodeLogicalInst(3, Rd, Rn, Rm, Shift); +} +void ARM64XEmitter::EOR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) { + EncodeLogicalInst(4, Rd, Rn, Rm, Shift); +} +void ARM64XEmitter::EON(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) { + EncodeLogicalInst(5, Rd, Rn, Rm, Shift); +} +void ARM64XEmitter::ANDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) { + EncodeLogicalInst(6, Rd, Rn, Rm, Shift); +} +void ARM64XEmitter::BICS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) { + EncodeLogicalInst(7, Rd, Rn, Rm, Shift); +} + +void ARM64XEmitter::MOV(ARM64Reg Rd, ARM64Reg Rm, ArithOption Shift) { + ORR(Rd, Is64Bit(Rd) ? ZR : WZR, Rm, Shift); +} + +void ARM64XEmitter::MOV(ARM64Reg Rd, ARM64Reg Rm) { + if (IsGPR(Rd) && IsGPR(Rm)) + ORR(Rd, Is64Bit(Rd) ? ZR : WZR, Rm, ArithOption(Rm, ST_LSL, 0)); + else + ASSERT_MSG(false, "Non-GPRs not supported in MOV"); +} +void ARM64XEmitter::MVN(ARM64Reg Rd, ARM64Reg Rm) { + ORN(Rd, Is64Bit(Rd) ? ZR : WZR, Rm, ArithOption(Rm, ST_LSL, 0)); +} +void ARM64XEmitter::LSL(ARM64Reg Rd, ARM64Reg Rm, int shift) { + int bits = Is64Bit(Rd) ? 64 : 32; + UBFM(Rd, Rm, (bits - shift) & (bits - 1), bits - shift - 1); +} +void ARM64XEmitter::LSR(ARM64Reg Rd, ARM64Reg Rm, int shift) { + int bits = Is64Bit(Rd) ? 64 : 32; + UBFM(Rd, Rm, shift, bits - 1); +} +void ARM64XEmitter::ASR(ARM64Reg Rd, ARM64Reg Rm, int shift) { + int bits = Is64Bit(Rd) ? 64 : 32; + SBFM(Rd, Rm, shift, bits - 1); +} +void ARM64XEmitter::ROR(ARM64Reg Rd, ARM64Reg Rm, int shift) { + EXTR(Rd, Rm, Rm, shift); +} + +// Logical (immediate) +void ARM64XEmitter::AND(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert) { + EncodeLogicalImmInst(0, Rd, Rn, immr, imms, invert); +} +void ARM64XEmitter::ANDS(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert) { + EncodeLogicalImmInst(3, Rd, Rn, immr, imms, invert); +} +void ARM64XEmitter::EOR(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert) { + EncodeLogicalImmInst(2, Rd, Rn, immr, imms, invert); +} +void ARM64XEmitter::ORR(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert) { + EncodeLogicalImmInst(1, Rd, Rn, immr, imms, invert); +} +void ARM64XEmitter::TST(ARM64Reg Rn, u32 immr, u32 imms, bool invert) { + EncodeLogicalImmInst(3, Is64Bit(Rn) ? ZR : WZR, Rn, immr, imms, invert); +} + +// Add/subtract (immediate) +void ARM64XEmitter::ADD(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift) { + EncodeAddSubImmInst(0, false, shift, imm, Rn, Rd); +} +void ARM64XEmitter::ADDS(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift) { + EncodeAddSubImmInst(0, true, shift, imm, Rn, Rd); +} +void ARM64XEmitter::SUB(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift) { + EncodeAddSubImmInst(1, false, shift, imm, Rn, Rd); +} +void ARM64XEmitter::SUBS(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift) { + EncodeAddSubImmInst(1, true, shift, imm, Rn, Rd); +} +void ARM64XEmitter::CMP(ARM64Reg Rn, u32 imm, bool shift) { + EncodeAddSubImmInst(1, true, shift, imm, Rn, Is64Bit(Rn) ? 
ZR : WZR); +} + +// Data Processing (Immediate) +void ARM64XEmitter::MOVZ(ARM64Reg Rd, u32 imm, ShiftAmount pos) { + EncodeMOVWideInst(2, Rd, imm, pos); +} +void ARM64XEmitter::MOVN(ARM64Reg Rd, u32 imm, ShiftAmount pos) { + EncodeMOVWideInst(0, Rd, imm, pos); +} +void ARM64XEmitter::MOVK(ARM64Reg Rd, u32 imm, ShiftAmount pos) { + EncodeMOVWideInst(3, Rd, imm, pos); +} + +// Bitfield move +void ARM64XEmitter::BFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms) { + EncodeBitfieldMOVInst(1, Rd, Rn, immr, imms); +} +void ARM64XEmitter::SBFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms) { + EncodeBitfieldMOVInst(0, Rd, Rn, immr, imms); +} +void ARM64XEmitter::UBFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms) { + EncodeBitfieldMOVInst(2, Rd, Rn, immr, imms); +} + +void ARM64XEmitter::BFI(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width) { + u32 size = Is64Bit(Rn) ? 64 : 32; + ASSERT_MSG((lsb + width) <= size, + "%s passed lsb %d and width %d which is greater than the register size!", __func__, + lsb, width); + EncodeBitfieldMOVInst(1, Rd, Rn, (size - lsb) % size, width - 1); +} +void ARM64XEmitter::UBFIZ(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width) { + u32 size = Is64Bit(Rn) ? 64 : 32; + ASSERT_MSG((lsb + width) <= size, + "%s passed lsb %d and width %d which is greater than the register size!", __func__, + lsb, width); + EncodeBitfieldMOVInst(2, Rd, Rn, (size - lsb) % size, width - 1); +} +void ARM64XEmitter::EXTR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u32 shift) { + bool sf = Is64Bit(Rd); + bool N = sf; + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + Write32((sf << 31) | (0x27 << 23) | (N << 22) | (Rm << 16) | (shift << 10) | (Rm << 5) | Rd); +} +void ARM64XEmitter::SXTB(ARM64Reg Rd, ARM64Reg Rn) { + SBFM(Rd, Rn, 0, 7); +} +void ARM64XEmitter::SXTH(ARM64Reg Rd, ARM64Reg Rn) { + SBFM(Rd, Rn, 0, 15); +} +void ARM64XEmitter::SXTW(ARM64Reg Rd, ARM64Reg Rn) { + ASSERT_MSG(Is64Bit(Rd), "%s requires 64bit register as destination", __func__); + SBFM(Rd, Rn, 0, 31); +} +void ARM64XEmitter::UXTB(ARM64Reg Rd, ARM64Reg Rn) { + UBFM(Rd, Rn, 0, 7); +} +void ARM64XEmitter::UXTH(ARM64Reg Rd, ARM64Reg Rn) { + UBFM(Rd, Rn, 0, 15); +} + +// Load Register (Literal) +void ARM64XEmitter::LDR(ARM64Reg Rt, s32 imm) { + EncodeLoadRegisterInst(0, Rt, imm); +} +void ARM64XEmitter::LDRSW(ARM64Reg Rt, s32 imm) { + EncodeLoadRegisterInst(2, Rt, imm); +} +void ARM64XEmitter::PRFM(ARM64Reg Rt, s32 imm) { + EncodeLoadRegisterInst(3, Rt, imm); +} + +// Load/Store pair +void ARM64XEmitter::LDP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm) { + EncodeLoadStorePair(0, 1, type, Rt, Rt2, Rn, imm); +} +void ARM64XEmitter::LDPSW(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm) { + EncodeLoadStorePair(1, 1, type, Rt, Rt2, Rn, imm); +} +void ARM64XEmitter::STP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm) { + EncodeLoadStorePair(0, 0, type, Rt, Rt2, Rn, imm); +} + +// Load/Store Exclusive +void ARM64XEmitter::STXRB(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn) { + EncodeLoadStoreExcInst(0, Rs, SP, Rt, Rn); +} +void ARM64XEmitter::STLXRB(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn) { + EncodeLoadStoreExcInst(1, Rs, SP, Rt, Rn); +} +void ARM64XEmitter::LDXRB(ARM64Reg Rt, ARM64Reg Rn) { + EncodeLoadStoreExcInst(2, SP, SP, Rt, Rn); +} +void ARM64XEmitter::LDAXRB(ARM64Reg Rt, ARM64Reg Rn) { + EncodeLoadStoreExcInst(3, SP, SP, Rt, Rn); +} +void ARM64XEmitter::STLRB(ARM64Reg Rt, ARM64Reg Rn) { + EncodeLoadStoreExcInst(4, SP, SP, Rt, Rn); +} +void ARM64XEmitter::LDARB(ARM64Reg Rt, 
ARM64Reg Rn) { + EncodeLoadStoreExcInst(5, SP, SP, Rt, Rn); +} +void ARM64XEmitter::STXRH(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn) { + EncodeLoadStoreExcInst(6, Rs, SP, Rt, Rn); +} +void ARM64XEmitter::STLXRH(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn) { + EncodeLoadStoreExcInst(7, Rs, SP, Rt, Rn); +} +void ARM64XEmitter::LDXRH(ARM64Reg Rt, ARM64Reg Rn) { + EncodeLoadStoreExcInst(8, SP, SP, Rt, Rn); +} +void ARM64XEmitter::LDAXRH(ARM64Reg Rt, ARM64Reg Rn) { + EncodeLoadStoreExcInst(9, SP, SP, Rt, Rn); +} +void ARM64XEmitter::STLRH(ARM64Reg Rt, ARM64Reg Rn) { + EncodeLoadStoreExcInst(10, SP, SP, Rt, Rn); +} +void ARM64XEmitter::LDARH(ARM64Reg Rt, ARM64Reg Rn) { + EncodeLoadStoreExcInst(11, SP, SP, Rt, Rn); +} +void ARM64XEmitter::STXR(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn) { + EncodeLoadStoreExcInst(12 + Is64Bit(Rt), Rs, SP, Rt, Rn); +} +void ARM64XEmitter::STLXR(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn) { + EncodeLoadStoreExcInst(14 + Is64Bit(Rt), Rs, SP, Rt, Rn); +} +void ARM64XEmitter::STXP(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn) { + EncodeLoadStoreExcInst(16 + Is64Bit(Rt), Rs, Rt2, Rt, Rn); +} +void ARM64XEmitter::STLXP(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn) { + EncodeLoadStoreExcInst(18 + Is64Bit(Rt), Rs, Rt2, Rt, Rn); +} +void ARM64XEmitter::LDXR(ARM64Reg Rt, ARM64Reg Rn) { + EncodeLoadStoreExcInst(20 + Is64Bit(Rt), SP, SP, Rt, Rn); +} +void ARM64XEmitter::LDAXR(ARM64Reg Rt, ARM64Reg Rn) { + EncodeLoadStoreExcInst(22 + Is64Bit(Rt), SP, SP, Rt, Rn); +} +void ARM64XEmitter::LDXP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn) { + EncodeLoadStoreExcInst(24 + Is64Bit(Rt), SP, Rt2, Rt, Rn); +} +void ARM64XEmitter::LDAXP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn) { + EncodeLoadStoreExcInst(26 + Is64Bit(Rt), SP, Rt2, Rt, Rn); +} +void ARM64XEmitter::STLR(ARM64Reg Rt, ARM64Reg Rn) { + EncodeLoadStoreExcInst(28 + Is64Bit(Rt), SP, SP, Rt, Rn); +} +void ARM64XEmitter::LDAR(ARM64Reg Rt, ARM64Reg Rn) { + EncodeLoadStoreExcInst(30 + Is64Bit(Rt), SP, SP, Rt, Rn); +} + +// Load/Store no-allocate pair (offset) +void ARM64XEmitter::STNP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm) { + EncodeLoadStorePairedInst(0xA0, Rt, Rt2, Rn, imm); +} +void ARM64XEmitter::LDNP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm) { + EncodeLoadStorePairedInst(0xA1, Rt, Rt2, Rn, imm); +} + +// Load/Store register (immediate post-indexed) +// XXX: Most of these support vectors +void ARM64XEmitter::STRB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) { + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(0x0E4, Rt, Rn, imm, 8); + else + EncodeLoadStoreIndexedInst(0x0E0, type == INDEX_POST ? 1 : 3, Rt, Rn, imm); +} +void ARM64XEmitter::LDRB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) { + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(0x0E5, Rt, Rn, imm, 8); + else + EncodeLoadStoreIndexedInst(0x0E1, type == INDEX_POST ? 1 : 3, Rt, Rn, imm); +} +void ARM64XEmitter::LDRSB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) { + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x0E6 : 0x0E7, Rt, Rn, imm, 8); + else + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x0E2 : 0x0E3, type == INDEX_POST ? 1 : 3, Rt, Rn, + imm); +} +void ARM64XEmitter::STRH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) { + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(0x1E4, Rt, Rn, imm, 16); + else + EncodeLoadStoreIndexedInst(0x1E0, type == INDEX_POST ? 
1 : 3, Rt, Rn, imm); +} +void ARM64XEmitter::LDRH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) { + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(0x1E5, Rt, Rn, imm, 16); + else + EncodeLoadStoreIndexedInst(0x1E1, type == INDEX_POST ? 1 : 3, Rt, Rn, imm); +} +void ARM64XEmitter::LDRSH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) { + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x1E6 : 0x1E7, Rt, Rn, imm, 16); + else + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x1E2 : 0x1E3, type == INDEX_POST ? 1 : 3, Rt, Rn, + imm); +} +void ARM64XEmitter::STR(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) { + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x3E4 : 0x2E4, Rt, Rn, imm, Is64Bit(Rt) ? 64 : 32); + else + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x3E0 : 0x2E0, type == INDEX_POST ? 1 : 3, Rt, Rn, + imm); +} +void ARM64XEmitter::LDR(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) { + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x3E5 : 0x2E5, Rt, Rn, imm, Is64Bit(Rt) ? 64 : 32); + else + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x3E1 : 0x2E1, type == INDEX_POST ? 1 : 3, Rt, Rn, + imm); +} +void ARM64XEmitter::LDRSW(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) { + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(0x2E6, Rt, Rn, imm, 32); + else + EncodeLoadStoreIndexedInst(0x2E2, type == INDEX_POST ? 1 : 3, Rt, Rn, imm); +} + +// Load/Store register (register offset) +void ARM64XEmitter::STRB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) { + EncodeLoadStoreRegisterOffset(0, 0, Rt, Rn, Rm); +} +void ARM64XEmitter::LDRB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) { + EncodeLoadStoreRegisterOffset(0, 1, Rt, Rn, Rm); +} +void ARM64XEmitter::LDRSB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) { + bool b64Bit = Is64Bit(Rt); + EncodeLoadStoreRegisterOffset(0, 3 - b64Bit, Rt, Rn, Rm); +} +void ARM64XEmitter::STRH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) { + EncodeLoadStoreRegisterOffset(1, 0, Rt, Rn, Rm); +} +void ARM64XEmitter::LDRH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) { + EncodeLoadStoreRegisterOffset(1, 1, Rt, Rn, Rm); +} +void ARM64XEmitter::LDRSH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) { + bool b64Bit = Is64Bit(Rt); + EncodeLoadStoreRegisterOffset(1, 3 - b64Bit, Rt, Rn, Rm); +} +void ARM64XEmitter::STR(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) { + bool b64Bit = Is64Bit(Rt); + EncodeLoadStoreRegisterOffset(2 + b64Bit, 0, Rt, Rn, Rm); +} +void ARM64XEmitter::LDR(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) { + bool b64Bit = Is64Bit(Rt); + EncodeLoadStoreRegisterOffset(2 + b64Bit, 1, Rt, Rn, Rm); +} +void ARM64XEmitter::LDRSW(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) { + EncodeLoadStoreRegisterOffset(2, 2, Rt, Rn, Rm); +} +void ARM64XEmitter::PRFM(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) { + EncodeLoadStoreRegisterOffset(3, 2, Rt, Rn, Rm); +} + +// Load/Store register (unscaled offset) +void ARM64XEmitter::STURB(ARM64Reg Rt, ARM64Reg Rn, s32 imm) { + EncodeLoadStoreUnscaled(0, 0, Rt, Rn, imm); +} +void ARM64XEmitter::LDURB(ARM64Reg Rt, ARM64Reg Rn, s32 imm) { + EncodeLoadStoreUnscaled(0, 1, Rt, Rn, imm); +} +void ARM64XEmitter::LDURSB(ARM64Reg Rt, ARM64Reg Rn, s32 imm) { + EncodeLoadStoreUnscaled(0, Is64Bit(Rt) ? 
2 : 3, Rt, Rn, imm); +} +void ARM64XEmitter::STURH(ARM64Reg Rt, ARM64Reg Rn, s32 imm) { + EncodeLoadStoreUnscaled(1, 0, Rt, Rn, imm); +} +void ARM64XEmitter::LDURH(ARM64Reg Rt, ARM64Reg Rn, s32 imm) { + EncodeLoadStoreUnscaled(1, 1, Rt, Rn, imm); +} +void ARM64XEmitter::LDURSH(ARM64Reg Rt, ARM64Reg Rn, s32 imm) { + EncodeLoadStoreUnscaled(1, Is64Bit(Rt) ? 2 : 3, Rt, Rn, imm); +} +void ARM64XEmitter::STUR(ARM64Reg Rt, ARM64Reg Rn, s32 imm) { + EncodeLoadStoreUnscaled(Is64Bit(Rt) ? 3 : 2, 0, Rt, Rn, imm); +} +void ARM64XEmitter::LDUR(ARM64Reg Rt, ARM64Reg Rn, s32 imm) { + EncodeLoadStoreUnscaled(Is64Bit(Rt) ? 3 : 2, 1, Rt, Rn, imm); +} +void ARM64XEmitter::LDURSW(ARM64Reg Rt, ARM64Reg Rn, s32 imm) { + ASSERT_MSG(!Is64Bit(Rt), "%s must have a 64bit destination register!", __func__); + EncodeLoadStoreUnscaled(2, 2, Rt, Rn, imm); +} + +// Address of label/page PC-relative +void ARM64XEmitter::ADR(ARM64Reg Rd, s32 imm) { + EncodeAddressInst(0, Rd, imm); +} +void ARM64XEmitter::ADRP(ARM64Reg Rd, s32 imm) { + EncodeAddressInst(1, Rd, imm >> 12); +} + +// Wrapper around MOVZ+MOVK (and later MOVN) +void ARM64XEmitter::MOVI2R(ARM64Reg Rd, u64 imm, bool optimize) { + unsigned int parts = Is64Bit(Rd) ? 4 : 2; + std::bitset<32> upload_part(0); + + // Always start with a movz! Kills the dependency on the register. + bool use_movz = true; + + if (!imm) { + // Zero immediate, just clear the register. EOR is pointless when we have + // MOVZ, which looks clearer in disasm too. + MOVZ(Rd, 0, SHIFT_0); + return; + } + + if ((Is64Bit(Rd) && imm == std::numeric_limits::max()) || + (!Is64Bit(Rd) && imm == std::numeric_limits::max())) { + // Max unsigned value (or if signed, -1) + // Set to ~ZR + ARM64Reg ZR = Is64Bit(Rd) ? SP : WSP; + ORN(Rd, ZR, ZR, ArithOption(ZR, ST_LSL, 0)); + return; + } + + // TODO: Make some more systemic use of MOVN, but this will take care of most + // cases. Small negative integer. Use MOVN + if (!Is64Bit(Rd) && (imm | 0xFFFF0000) == imm) { + MOVN(Rd, static_cast(~imm), SHIFT_0); + return; + } + + // XXX: Use MOVN when possible. 
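+ // For reference, MOVI2R's fallback below materialises a 64-bit immediate as one MOVZ
+ // for the first non-zero 16-bit chunk plus one MOVK per remaining non-zero chunk,
+ // unless the PC-relative ADR/ADRP shortcut just below applies. A minimal standalone
+ // sketch of that chunking, assuming imm != 0 and a 64-bit destination (the function
+ // name here is illustrative only):
+ //
+ //   #include <cstdint>
+ //   #include <cstdio>
+ //
+ //   void sketch_movi2r(std::uint64_t imm) {
+ //       bool first = true;
+ //       for (unsigned i = 0; i < 4; ++i) {
+ //           const auto part = static_cast<std::uint16_t>(imm >> (i * 16));
+ //           if (part == 0)
+ //               continue;  // MOVZ already zeroes the chunks we skip
+ //           std::printf("%s x0, #0x%x, lsl #%u\n", first ? "movz" : "movk",
+ //                       static_cast<unsigned>(part), i * 16);
+ //           first = false;
+ //       }
+ //   }
+ //
+ //   // sketch_movi2r(0x0000123400005678) prints:
+ //   //   movz x0, #0x5678, lsl #0
+ //   //   movk x0, #0x1234, lsl #32
+ //   // which the formula in EncodeMOVWideInst above,
+ //   // (b64Bit << 31) | (op << 29) | (0x25 << 23) | (pos << 21) | (imm << 5) | Rd,
+ //   // turns into the words 0xD28ACF00 and 0xF2C24680.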
+ // XXX: Optimize more + // XXX: Support rotating immediates to save instructions + if (optimize) { + for (unsigned int i = 0; i < parts; ++i) { + if ((imm >> (i * 16)) & 0xFFFF) + upload_part[i] = 1; + } + } + + u64 aligned_pc = reinterpret_cast(GetCodePtr()) & ~0xFFF; + s64 aligned_offset = static_cast(imm) - static_cast(aligned_pc); + // The offset for ADR/ADRP is an s32, so make sure it can be represented in + // that + if (upload_part.count() > 1 && std::abs(aligned_offset) < 0x7FFFFFFFLL) { + // Immediate we are loading is within 4GB of our aligned range + // Most likely a address that we can load in one or two instructions + if (!(std::abs(aligned_offset) & 0xFFF)) { + // Aligned ADR + ADRP(Rd, static_cast(aligned_offset)); + return; + } else { + // If the address is within 1MB of PC we can load it in a single + // instruction still + s64 offset = static_cast(imm) - reinterpret_cast(GetCodePtr()); + if (offset >= -0xFFFFF && offset <= 0xFFFFF) { + ADR(Rd, static_cast(offset)); + return; + } else { + ADRP(Rd, static_cast(aligned_offset & ~0xFFF)); + ADD(Rd, Rd, imm & 0xFFF); + return; + } + } + } + + for (unsigned i = 0; i < parts; ++i) { + if (use_movz && upload_part[i]) { + MOVZ(Rd, (imm >> (i * 16)) & 0xFFFF, static_cast(i)); + use_movz = false; + } else { + if (upload_part[i] || !optimize) + MOVK(Rd, (imm >> (i * 16)) & 0xFFFF, static_cast(i)); + } + } +} + +bool ARM64XEmitter::MOVI2R2(ARM64Reg Rd, u64 imm1, u64 imm2) { + // TODO: Also optimize for performance, not just for code size. + u8* start_pointer = GetWritableCodePtr(); + + MOVI2R(Rd, imm1); + u64 size1 = GetCodePtr() - start_pointer; + + SetCodePtrUnsafe(start_pointer); + + MOVI2R(Rd, imm2); + u64 size2 = GetCodePtr() - start_pointer; + + SetCodePtrUnsafe(start_pointer); + + bool element = size1 > size2; + + MOVI2R(Rd, element ? imm2 : imm1); + + return element; +} + +void ARM64XEmitter::ABI_PushRegisters(u32 registers) { + int num_regs = Common::BitCount(registers); + int stack_size = (num_regs + (num_regs & 1)) * 8; + int it = 0; + + std::array gpr{}; + + if (!num_regs) + return; + + for (int i = 0; i < 32; ++i) { + if (Common::Bit(i, registers)) { + gpr[it++] = static_cast(X0 + i); + } + } + + // 8 byte per register, but 16 byte alignment, so we may have to padd one register. + // Only update the SP on the last write to avoid the dependency between those stores. + + // The first push must adjust the SP, else a context switch may invalidate everything below SP. + + it = 0; + if (num_regs & 1) { + STR(INDEX_PRE, gpr[0], SP, -stack_size); + it++; + } else { + STP(INDEX_PRE, gpr[0], gpr[1], SP, -stack_size); + it += 2; + } + + // Fast store for all other registers, this is always an even number. + for (int i = 0; i < (num_regs - 1) / 2; i++) { + STP(INDEX_SIGNED, gpr[it], gpr[it + 1], SP, 16 * (i + 1)); + it += 2; + } + + ASSERT_MSG(it == num_regs, "%s registers don't match.", __func__); +} + +void ARM64XEmitter::ABI_PopRegisters(u32 registers) { + u8 num_regs = static_cast(Common::BitCount(registers)); + int stack_size = (num_regs + (num_regs & 1)) * 8; + int it = 0; + + std::array gpr{}; + + if (!num_regs) + return; + + for (int i = 0; i < 32; ++i) { + if (Common::Bit(i, registers)) { + gpr[it++] = static_cast(X0 + i); + } + } + it = 0; + // We must adjust the SP in the end, so load the first (two) registers at least. + ARM64Reg first = gpr[it++]; + ARM64Reg second = INVALID_REG; + if (!(num_regs & 1)) + second = gpr[it++]; + + // 8 byte per register, but 16 byte alignment, so we may have to padd one register. 
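+ // For reference, the pop path mirrors ABI_PushRegisters above: one 8-byte slot per
+ // register, rounded up to the 16-byte SP alignment AAPCS64 requires, with the SP
+ // writeback folded into the final post-indexed LDR/LDP. A small standalone sketch of
+ // the sizing rule (the helper name is illustrative only):
+ //
+ //   constexpr int abi_stack_size(int num_regs) {
+ //       return (num_regs + (num_regs & 1)) * 8;  // pad odd counts to keep SP 16-byte aligned
+ //   }
+ //   static_assert(abi_stack_size(1) == 16 && abi_stack_size(5) == 48 && abi_stack_size(6) == 48);
+ //
+ //   // e.g. popping five registers X0..X4 emits:
+ //   //   LDP X1, X2, [SP, #16]
+ //   //   LDP X3, X4, [SP, #32]
+ //   //   LDR X0, [SP], #48   // the post-index restores SP in the same instruction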
+ // Only update the SP on the last load to avoid the dependency between those loads. + + // Fast load for all but the first (two) registers, this is always an even number. + + for (int i = 0; i < (num_regs - 1) / 2; ++i) { + LDP(INDEX_SIGNED, gpr[it], gpr[it + 1], SP, 16 * (i + 1)); + it += 2; + } + + // Post loading the first (two) registers. + if (num_regs & 1) + LDR(INDEX_POST, first, SP, stack_size); + else + LDP(INDEX_POST, first, second, SP, stack_size); + + ASSERT_MSG(it == num_regs, "%s registers don't match.", __func__); +} + +// Float Emitter +void ARM64FloatEmitter::EmitLoadStoreImmediate(u8 size, u32 opc, IndexType type, ARM64Reg Rt, + ARM64Reg Rn, s32 imm) { + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + u32 encoded_size = 0; + u32 encoded_imm = 0; + + if (size == 8) + encoded_size = 0; + else if (size == 16) + encoded_size = 1; + else if (size == 32) + encoded_size = 2; + else if (size == 64) + encoded_size = 3; + else if (size == 128) + encoded_size = 0; + + if (type == INDEX_UNSIGNED) { + ASSERT_MSG(!(imm & ((size - 1) >> 3)), + "%s(INDEX_UNSIGNED) immediate offset must be aligned to size! " + "(%d) (%p)", + __func__, imm, m_emit->GetCodePtr()); + ASSERT_MSG(imm >= 0, "%s(INDEX_UNSIGNED) immediate offset must be positive!", __func__); + if (size == 16) + imm >>= 1; + else if (size == 32) + imm >>= 2; + else if (size == 64) + imm >>= 3; + else if (size == 128) + imm >>= 4; + encoded_imm = (imm & 0xFFF); + } else { + ASSERT_MSG(!(imm < -256 || imm > 255), + "%s immediate offset must be within range of -256 to 256!", __func__); + encoded_imm = (imm & 0x1FF) << 2; + if (type == INDEX_POST) + encoded_imm |= 1; + else + encoded_imm |= 3; + } + + Write32((encoded_size << 30) | (0xF << 26) | (type == INDEX_UNSIGNED ? (1 << 24) : 0) | + (size == 128 ? 
(1 << 23) : 0) | (opc << 22) | (encoded_imm << 10) | (Rn << 5) | Rt); +} + +void ARM64FloatEmitter::EmitScalar2Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, + ARM64Reg Rn, ARM64Reg Rm) { + ASSERT_MSG(!IsQuad(Rd), "%s only supports double and single registers!", __func__); + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((M << 31) | (S << 29) | (0b11110001 << 21) | (type << 22) | (Rm << 16) | + (opcode << 12) | (1 << 11) | (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, + ARM64Reg Rm) { + ASSERT_MSG(!IsSingle(Rd), "%s doesn't support singles!", __func__); + bool quad = IsQuad(Rd); + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((quad << 30) | (U << 29) | (0b1110001 << 21) | (size << 22) | (Rm << 16) | + (opcode << 11) | (1 << 10) | (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitScalarThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, + ARM64Reg Rm) { + ASSERT_MSG(!IsQuad(Rd), "%s doesn't support quads!", __func__); + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + int esize = 0; + switch (size) { + case 8: + esize = 0; + break; + case 16: + esize = 1; + break; + case 32: + esize = 2; + break; + case 64: + esize = 3; + break; + default: + ASSERT_MSG(false, "Size must be 8, 16, 32, or 64"); + break; + } + + + Write32((U << 29) | (0b1011110001 << 21) | (esize << 22) | (Rm << 16) | + (opcode << 11) | (1 << 10) | (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitCopy(bool Q, u32 op, u32 imm5, u32 imm4, ARM64Reg Rd, ARM64Reg Rn) { + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((Q << 30) | (op << 29) | (0b111 << 25) | (imm5 << 16) | (imm4 << 11) | (1 << 10) | + (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::Emit2RegMisc(bool Q, bool U, u32 size, u32 opcode, ARM64Reg Rd, + ARM64Reg Rn) { + ASSERT_MSG(!IsSingle(Rd), "%s doesn't support singles!", __func__); + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((Q << 30) | (U << 29) | (0b1110001 << 21) | (size << 22) | (opcode << 12) | (1 << 11) | + (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, + ARM64Reg Rt, ARM64Reg Rn) { + ASSERT_MSG(!IsSingle(Rt), "%s doesn't support singles!", __func__); + bool quad = IsQuad(Rt); + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + + Write32((quad << 30) | (0b1101 << 24) | (L << 22) | (R << 21) | (opcode << 13) | (S << 12) | + (size << 10) | (Rn << 5) | Rt); +} + +void ARM64FloatEmitter::EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, + ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm) { + ASSERT_MSG(!IsSingle(Rt), "%s doesn't support singles!", __func__); + bool quad = IsQuad(Rt); + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((quad << 30) | (0x1B << 23) | (L << 22) | (R << 21) | (Rm << 16) | (opcode << 13) | + (S << 12) | (size << 10) | (Rn << 5) | Rt); +} + +void ARM64FloatEmitter::Emit1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, + ARM64Reg Rn) { + ASSERT_MSG(!IsQuad(Rd), "%s doesn't support vector!", __func__); + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((M << 31) | (S << 29) | (0xF1 << 21) | (type << 22) | (opcode << 15) | (1 << 14) | + (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitConversion(bool sf, bool S, u32 type, u32 rmode, u32 opcode, + ARM64Reg Rd, ARM64Reg Rn) { + ASSERT_MSG(Rn <= SP, "%s only supports GPR as source!", __func__); + Rd = DecodeReg(Rd); + Rn = 
DecodeReg(Rn); + + Write32((sf << 31) | (S << 29) | (0xF1 << 21) | (type << 22) | (rmode << 19) | (opcode << 16) | + (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitConvertScalarToInt(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round, + bool sign) { + DEBUG_ASSERT_MSG(IsScalar(Rn), "fcvts: Rn must be floating point"); + if (IsGPR(Rd)) { + // Use the encoding that transfers the result to a GPR. + bool sf = Is64Bit(Rd); + int type = IsDouble(Rn) ? 1 : 0; + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + int opcode = (sign ? 1 : 0); + int rmode = 0; + switch (round) { + case ROUND_A: + rmode = 0; + opcode |= 4; + break; + case ROUND_P: + rmode = 1; + break; + case ROUND_M: + rmode = 2; + break; + case ROUND_Z: + rmode = 3; + break; + case ROUND_N: + rmode = 0; + break; + } + EmitConversion2(sf, 0, true, type, rmode, opcode, 0, Rd, Rn); + } else { + // Use the encoding (vector, single) that keeps the result in the fp + // register. + int sz = IsDouble(Rn); + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + int opcode = 0; + switch (round) { + case ROUND_A: + opcode = 0x1C; + break; + case ROUND_N: + opcode = 0x1A; + break; + case ROUND_M: + opcode = 0x1B; + break; + case ROUND_P: + opcode = 0x1A; + sz |= 2; + break; + case ROUND_Z: + opcode = 0x1B; + sz |= 2; + break; + } + Write32((0x5E << 24) | (sign << 29) | (sz << 22) | (1 << 21) | (opcode << 12) | (2 << 10) | + (Rn << 5) | Rd); + } +} + +void ARM64FloatEmitter::FCVTS(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round) { + EmitConvertScalarToInt(Rd, Rn, round, false); +} + +void ARM64FloatEmitter::FCVTU(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round) { + EmitConvertScalarToInt(Rd, Rn, round, true); +} + +void ARM64FloatEmitter::EmitConversion2(bool sf, bool S, bool direction, u32 type, u32 rmode, + u32 opcode, int scale, ARM64Reg Rd, ARM64Reg Rn) { + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((sf << 31) | (S << 29) | (0xF0 << 21) | (direction << 21) | (type << 22) | + (rmode << 19) | (opcode << 16) | (scale << 10) | (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitCompare(bool M, bool S, u32 op, u32 opcode2, ARM64Reg Rn, ARM64Reg Rm) { + ASSERT_MSG(!IsQuad(Rn), "%s doesn't support vector!", __func__); + bool is_double = IsDouble(Rn); + + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((M << 31) | (S << 29) | (0xF1 << 21) | (is_double << 22) | (Rm << 16) | (op << 14) | + (1 << 13) | (Rn << 5) | opcode2); +} + +void ARM64FloatEmitter::EmitCondSelect(bool M, bool S, CCFlags cond, ARM64Reg Rd, ARM64Reg Rn, + ARM64Reg Rm) { + ASSERT_MSG(!IsQuad(Rd), "%s doesn't support vector!", __func__); + bool is_double = IsDouble(Rd); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((M << 31) | (S << 29) | (0xF1 << 21) | (is_double << 22) | (Rm << 16) | (cond << 12) | + (3 << 10) | (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitPermute(u32 size, u32 op, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + ASSERT_MSG(!IsSingle(Rd), "%s doesn't support singles!", __func__); + + bool quad = IsQuad(Rd); + + u32 encoded_size = 0; + if (size == 16) + encoded_size = 1; + else if (size == 32) + encoded_size = 2; + else if (size == 64) + encoded_size = 3; + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((quad << 30) | (7 << 25) | (encoded_size << 22) | (Rm << 16) | (op << 12) | (1 << 11) | + (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitScalarImm(bool M, bool S, u32 type, u32 imm5, ARM64Reg Rd, u32 imm8) { + ASSERT_MSG(!IsQuad(Rd), "%s doesn't support vector!", __func__); + + bool is_double = 
!IsSingle(Rd); + + Rd = DecodeReg(Rd); + + Write32((M << 31) | (S << 29) | (0xF1 << 21) | (is_double << 22) | (type << 22) | (imm8 << 13) | + (1 << 12) | (imm5 << 5) | Rd); +} + +void ARM64FloatEmitter::EmitShiftImm(bool Q, bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd, + ARM64Reg Rn) { + ASSERT_MSG(immh, "%s bad encoding! Can't have zero immh", __func__); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((Q << 30) | (U << 29) | (0xF << 24) | (immh << 19) | (immb << 16) | (opcode << 11) | + (1 << 10) | (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitScalarShiftImm(bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd, + ARM64Reg Rn) { + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((2 << 30) | (U << 29) | (0x3E << 23) | (immh << 19) | (immb << 16) | (opcode << 11) | + (1 << 10) | (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitLoadStoreMultipleStructure(u32 size, bool L, u32 opcode, ARM64Reg Rt, + ARM64Reg Rn) { + bool quad = IsQuad(Rt); + u32 encoded_size = 0; + + if (size == 16) + encoded_size = 1; + else if (size == 32) + encoded_size = 2; + else if (size == 64) + encoded_size = 3; + + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + + Write32((quad << 30) | (3 << 26) | (L << 22) | (opcode << 12) | (encoded_size << 10) | + (Rn << 5) | Rt); +} + +void ARM64FloatEmitter::EmitLoadStoreMultipleStructurePost(u32 size, bool L, u32 opcode, + ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm) { + bool quad = IsQuad(Rt); + u32 encoded_size = 0; + + if (size == 16) + encoded_size = 1; + else if (size == 32) + encoded_size = 2; + else if (size == 64) + encoded_size = 3; + + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((quad << 30) | (0b11001 << 23) | (L << 22) | (Rm << 16) | (opcode << 12) | + (encoded_size << 10) | (Rn << 5) | Rt); +} + +void ARM64FloatEmitter::EmitScalar1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, + ARM64Reg Rn) { + ASSERT_MSG(!IsQuad(Rd), "%s doesn't support vector!", __func__); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((M << 31) | (S << 29) | (0xF1 << 21) | (type << 22) | (opcode << 15) | (1 << 14) | + (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitVectorxElement(bool U, u32 size, bool L, u32 opcode, bool H, + ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + bool quad = IsQuad(Rd); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((quad << 30) | (U << 29) | (0xF << 24) | (size << 22) | (L << 21) | (Rm << 16) | + (opcode << 12) | (H << 11) | (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm) { + ASSERT_MSG(!(imm < -256 || imm > 255), "%s received too large offset: %d", __func__, imm); + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + + Write32((size << 30) | (0xF << 26) | (op << 22) | ((imm & 0x1FF) << 12) | (Rn << 5) | Rt); +} + +void ARM64FloatEmitter::EncodeLoadStorePair(u32 size, bool load, IndexType type, ARM64Reg Rt, + ARM64Reg Rt2, ARM64Reg Rn, s32 imm) { + u32 type_encode = 0; + u32 opc = 0; + + switch (type) { + case INDEX_SIGNED: + type_encode = 0b010; + break; + case INDEX_POST: + type_encode = 0b001; + break; + case INDEX_PRE: + type_encode = 0b011; + break; + case INDEX_UNSIGNED: + ASSERT_MSG(false, "%s doesn't support INDEX_UNSIGNED!", __func__); + break; + } + + if (size == 128) { + ASSERT_MSG(!(imm & 0xF), "%s received invalid offset 0x%x!", __func__, imm); + opc = 2; + imm >>= 4; + } else if (size == 64) { + ASSERT_MSG(!(imm & 0x7), "%s received invalid offset 0x%x!", __func__, imm); + opc = 1; + 
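+ // Note: the LDP/STP immediate is a signed 7-bit field scaled by the element size, so
+ // the byte offset passed in must be a multiple of that size and is stored as
+ // offset/size. For example, a 64-bit pair at byte offset 24 encodes imm7 = 24 >> 3 = 3;
+ // with imm7 in [-64, 63], 64-bit pairs can reach offsets from -512 to +504 bytes.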
imm >>= 3; + } else if (size == 32) { + ASSERT_MSG(!(imm & 0x3), "%s received invalid offset 0x%x!", __func__, imm); + opc = 0; + imm >>= 2; + } + + Rt = DecodeReg(Rt); + Rt2 = DecodeReg(Rt2); + Rn = DecodeReg(Rn); + + Write32((opc << 30) | (0b1011 << 26) | (type_encode << 23) | (load << 22) | + ((imm & 0x7F) << 15) | (Rt2 << 10) | (Rn << 5) | Rt); +} + +void ARM64FloatEmitter::EncodeLoadStoreRegisterOffset(u32 size, bool load, ARM64Reg Rt, ARM64Reg Rn, + ArithOption Rm) { + ASSERT_MSG(Rm.GetType() == ArithOption::TYPE_EXTENDEDREG, + "%s must contain an extended reg as Rm!", __func__); + + u32 encoded_size = 0; + u32 encoded_op = 0; + + if (size == 8) { + encoded_size = 0; + encoded_op = 0; + } else if (size == 16) { + encoded_size = 1; + encoded_op = 0; + } else if (size == 32) { + encoded_size = 2; + encoded_op = 0; + } else if (size == 64) { + encoded_size = 3; + encoded_op = 0; + } else if (size == 128) { + encoded_size = 0; + encoded_op = 2; + } + + if (load) + encoded_op |= 1; + + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + ARM64Reg decoded_Rm = DecodeReg(Rm.GetReg()); + + Write32((encoded_size << 30) | (encoded_op << 22) | (0b111100001 << 21) | (decoded_Rm << 16) | + Rm.GetData() | (1 << 11) | (Rn << 5) | Rt); +} + +void ARM64FloatEmitter::EncodeModImm(bool Q, u8 op, u8 cmode, u8 o2, ARM64Reg Rd, u8 abcdefgh) { + union V { + u8 hex; + struct InV { + unsigned defgh : 5; + unsigned abc : 3; + } in; + } v; + v.hex = abcdefgh; + Rd = DecodeReg(Rd); + Write32((Q << 30) | (op << 29) | (0xF << 24) | (v.in.abc << 16) | (cmode << 12) | (o2 << 11) | + (1 << 10) | (v.in.defgh << 5) | Rd); +} + +void ARM64FloatEmitter::LDR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) { + EmitLoadStoreImmediate(size, 1, type, Rt, Rn, imm); +} +void ARM64FloatEmitter::STR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) { + EmitLoadStoreImmediate(size, 0, type, Rt, Rn, imm); +} + +// Loadstore unscaled +void ARM64FloatEmitter::LDUR(u8 size, ARM64Reg Rt, ARM64Reg Rn, s32 imm) { + u32 encoded_size = 0; + u32 encoded_op = 0; + + if (size == 8) { + encoded_size = 0; + encoded_op = 1; + } else if (size == 16) { + encoded_size = 1; + encoded_op = 1; + } else if (size == 32) { + encoded_size = 2; + encoded_op = 1; + } else if (size == 64) { + encoded_size = 3; + encoded_op = 1; + } else if (size == 128) { + encoded_size = 0; + encoded_op = 3; + } + + EmitLoadStoreUnscaled(encoded_size, encoded_op, Rt, Rn, imm); +} +void ARM64FloatEmitter::STUR(u8 size, ARM64Reg Rt, ARM64Reg Rn, s32 imm) { + u32 encoded_size = 0; + u32 encoded_op = 0; + + if (size == 8) { + encoded_size = 0; + encoded_op = 0; + } else if (size == 16) { + encoded_size = 1; + encoded_op = 0; + } else if (size == 32) { + encoded_size = 2; + encoded_op = 0; + } else if (size == 64) { + encoded_size = 3; + encoded_op = 0; + } else if (size == 128) { + encoded_size = 0; + encoded_op = 2; + } + + EmitLoadStoreUnscaled(encoded_size, encoded_op, Rt, Rn, imm); +} + +// Loadstore single structure +void ARM64FloatEmitter::LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn) { + bool S = 0; + u32 opcode = 0; + u32 encoded_size = 0; + ARM64Reg encoded_reg = INVALID_REG; + + if (size == 8) { + S = (index & 4) != 0; + opcode = 0; + encoded_size = index & 3; + if (index & 8) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } else if (size == 16) { + S = (index & 2) != 0; + opcode = 2; + encoded_size = (index & 1) << 1; + if (index & 4) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = 
EncodeRegToDouble(Rt); + } else if (size == 32) { + S = (index & 1) != 0; + opcode = 4; + encoded_size = 0; + if (index & 2) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } else if (size == 64) { + S = 0; + opcode = 4; + encoded_size = 1; + if (index == 1) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + + EmitLoadStoreSingleStructure(1, 0, opcode, S, encoded_size, encoded_reg, Rn); +} + +void ARM64FloatEmitter::LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm) { + bool S = 0; + u32 opcode = 0; + u32 encoded_size = 0; + ARM64Reg encoded_reg = INVALID_REG; + + if (size == 8) { + S = (index & 4) != 0; + opcode = 0; + encoded_size = index & 3; + if (index & 8) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } else if (size == 16) { + S = (index & 2) != 0; + opcode = 2; + encoded_size = (index & 1) << 1; + if (index & 4) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } else if (size == 32) { + S = (index & 1) != 0; + opcode = 4; + encoded_size = 0; + if (index & 2) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } else if (size == 64) { + S = 0; + opcode = 4; + encoded_size = 1; + if (index == 1) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + + EmitLoadStoreSingleStructure(1, 0, opcode, S, encoded_size, encoded_reg, Rn, Rm); +} + +void ARM64FloatEmitter::LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn) { + EmitLoadStoreSingleStructure(1, 0, 6, 0, size >> 4, Rt, Rn); +} +void ARM64FloatEmitter::LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn) { + EmitLoadStoreSingleStructure(1, 1, 6, 0, size >> 4, Rt, Rn); +} +void ARM64FloatEmitter::LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm) { + EmitLoadStoreSingleStructure(1, 0, 6, 0, size >> 4, Rt, Rn, Rm); +} +void ARM64FloatEmitter::LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm) { + EmitLoadStoreSingleStructure(1, 1, 6, 0, size >> 4, Rt, Rn, Rm); +} + +void ARM64FloatEmitter::ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn) { + bool S = 0; + u32 opcode = 0; + u32 encoded_size = 0; + ARM64Reg encoded_reg = INVALID_REG; + + if (size == 8) { + S = (index & 4) != 0; + opcode = 0; + encoded_size = index & 3; + if (index & 8) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } else if (size == 16) { + S = (index & 2) != 0; + opcode = 2; + encoded_size = (index & 1) << 1; + if (index & 4) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } else if (size == 32) { + S = (index & 1) != 0; + opcode = 4; + encoded_size = 0; + if (index & 2) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } else if (size == 64) { + S = 0; + opcode = 4; + encoded_size = 1; + if (index == 1) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + + EmitLoadStoreSingleStructure(0, 0, opcode, S, encoded_size, encoded_reg, Rn); +} + +void ARM64FloatEmitter::ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm) { + bool S = 0; + u32 opcode = 0; + u32 encoded_size = 0; + ARM64Reg encoded_reg = INVALID_REG; + + if (size == 8) { + S = (index & 4) != 0; + opcode = 0; + encoded_size = index & 3; + if (index & 8) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } else if (size == 16) { + S = (index & 2) != 0; + opcode = 2; + encoded_size = (index & 1) << 1; + if (index & 4) + 
encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } else if (size == 32) { + S = (index & 1) != 0; + opcode = 4; + encoded_size = 0; + if (index & 2) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } else if (size == 64) { + S = 0; + opcode = 4; + encoded_size = 1; + if (index == 1) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + + EmitLoadStoreSingleStructure(0, 0, opcode, S, encoded_size, encoded_reg, Rn, Rm); +} + +// Loadstore multiple structure +void ARM64FloatEmitter::LD1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn) { + ASSERT_MSG(!(count == 0 || count > 4), "%s must have a count of 1 to 4 registers!", __func__); + u32 opcode = 0; + if (count == 1) + opcode = 0b111; + else if (count == 2) + opcode = 0b1010; + else if (count == 3) + opcode = 0b0110; + else if (count == 4) + opcode = 0b0010; + EmitLoadStoreMultipleStructure(size, 1, opcode, Rt, Rn); +} +void ARM64FloatEmitter::LD1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn, + ARM64Reg Rm) { + ASSERT_MSG(!(count == 0 || count > 4), "%s must have a count of 1 to 4 registers!", __func__); + ASSERT_MSG(type == INDEX_POST, "%s only supports post indexing!", __func__); + + u32 opcode = 0; + if (count == 1) + opcode = 0b111; + else if (count == 2) + opcode = 0b1010; + else if (count == 3) + opcode = 0b0110; + else if (count == 4) + opcode = 0b0010; + EmitLoadStoreMultipleStructurePost(size, 1, opcode, Rt, Rn, Rm); +} +void ARM64FloatEmitter::ST1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn) { + ASSERT_MSG(!(count == 0 || count > 4), "%s must have a count of 1 to 4 registers!", __func__); + u32 opcode = 0; + if (count == 1) + opcode = 0b111; + else if (count == 2) + opcode = 0b1010; + else if (count == 3) + opcode = 0b0110; + else if (count == 4) + opcode = 0b0010; + EmitLoadStoreMultipleStructure(size, 0, opcode, Rt, Rn); +} +void ARM64FloatEmitter::ST1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn, + ARM64Reg Rm) { + ASSERT_MSG(!(count == 0 || count > 4), "%s must have a count of 1 to 4 registers!", __func__); + ASSERT_MSG(type == INDEX_POST, "%s only supports post indexing!", __func__); + + u32 opcode = 0; + if (count == 1) + opcode = 0b111; + else if (count == 2) + opcode = 0b1010; + else if (count == 3) + opcode = 0b0110; + else if (count == 4) + opcode = 0b0010; + EmitLoadStoreMultipleStructurePost(size, 0, opcode, Rt, Rn, Rm); +} + +// Scalar - 1 Source +void ARM64FloatEmitter::FMOV(ARM64Reg Rd, ARM64Reg Rn, bool top) { + if (IsScalar(Rd) && IsScalar(Rn)) { + EmitScalar1Source(0, 0, IsDouble(Rd), 0, Rd, Rn); + } else { + ASSERT_MSG(!IsQuad(Rd) && !IsQuad(Rn), "FMOV can't move to/from quads"); + int rmode = 0; + int opcode = 6; + int encoded_size = 0; + int sf = 0; + if (IsSingle(Rd) && !Is64Bit(Rn) && !top) { + // GPR to scalar single + opcode |= 1; + } else if (!Is64Bit(Rd) && IsSingle(Rn) && !top) { + // Scalar single to GPR - defaults are correct + } else if (Is64Bit(Rd) && IsDouble(Rn) && !top) { + // Scalar double to GPR + sf = 1; + encoded_size = 1; + } else if (IsDouble(Rd) && Is64Bit(Rn) && !top) { + // GPR to Scalar double + sf = 1; + encoded_size = 1; + opcode |= 1; + } else { + // TODO + ASSERT_MSG(0, "FMOV: Unhandled case"); + } + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Write32((sf << 31) | (encoded_size << 22) | (0x1e2 << 20) | (rmode << 19) | (opcode << 16) | (Rn << 5) | Rd); + } +} + +// Loadstore paired +void ARM64FloatEmitter::LDP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg 
Rt2, ARM64Reg Rn, + s32 imm) { + EncodeLoadStorePair(size, true, type, Rt, Rt2, Rn, imm); +} +void ARM64FloatEmitter::STP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, + s32 imm) { + EncodeLoadStorePair(size, false, type, Rt, Rt2, Rn, imm); +} + +// Loadstore register offset +void ARM64FloatEmitter::STR(u8 size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) { + EncodeLoadStoreRegisterOffset(size, false, Rt, Rn, Rm); +} +void ARM64FloatEmitter::LDR(u8 size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) { + EncodeLoadStoreRegisterOffset(size, true, Rt, Rn, Rm); +} + +void ARM64FloatEmitter::FABS(ARM64Reg Rd, ARM64Reg Rn) { + EmitScalar1Source(0, 0, IsDouble(Rd), 1, Rd, Rn); +} +void ARM64FloatEmitter::FNEG(ARM64Reg Rd, ARM64Reg Rn) { + EmitScalar1Source(0, 0, IsDouble(Rd), 2, Rd, Rn); +} +void ARM64FloatEmitter::FSQRT(ARM64Reg Rd, ARM64Reg Rn) { + EmitScalar1Source(0, 0, IsDouble(Rd), 3, Rd, Rn); +} + +// Scalar - 2 Source +void ARM64FloatEmitter::FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitScalar2Source(0, 0, IsDouble(Rd), 2, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitScalar2Source(0, 0, IsDouble(Rd), 0, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitScalar2Source(0, 0, IsDouble(Rd), 3, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitScalar2Source(0, 0, IsDouble(Rd), 1, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMAX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitScalar2Source(0, 0, IsDouble(Rd), 4, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMIN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitScalar2Source(0, 0, IsDouble(Rd), 5, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMAXNM(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitScalar2Source(0, 0, IsDouble(Rd), 6, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMINNM(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitScalar2Source(0, 0, IsDouble(Rd), 7, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FNMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitScalar2Source(0, 0, IsDouble(Rd), 8, Rd, Rn, Rm); +} + +void ARM64FloatEmitter::FMADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) { + EmitScalar3Source(IsDouble(Rd), Rd, Rn, Rm, Ra, 0); +} +void ARM64FloatEmitter::FMSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) { + EmitScalar3Source(IsDouble(Rd), Rd, Rn, Rm, Ra, 1); +} +void ARM64FloatEmitter::FNMADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) { + EmitScalar3Source(IsDouble(Rd), Rd, Rn, Rm, Ra, 2); +} +void ARM64FloatEmitter::FNMSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) { + EmitScalar3Source(IsDouble(Rd), Rd, Rn, Rm, Ra, 3); +} + +void ARM64FloatEmitter::EmitScalar3Source(bool isDouble, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, + ARM64Reg Ra, int opcode) { + int type = isDouble ? 
1 : 0; + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + Ra = DecodeReg(Ra); + int o1 = opcode >> 1; + int o0 = opcode & 1; + m_emit->Write32((0x1F << 24) | (type << 22) | (o1 << 21) | (Rm << 16) | (o0 << 15) | + (Ra << 10) | (Rn << 5) | Rd); +} + +// Scalar three same +void ARM64FloatEmitter::SQADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitScalarThreeSame(0, size, 0b00001, Rd, Rn, Rm); +} +void ARM64FloatEmitter::SQSUB(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitScalarThreeSame(0, size, 0b00101, Rd, Rn, Rm); +} +void ARM64FloatEmitter::UQADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitScalarThreeSame(1, size, 0b00001, Rd, Rn, Rm); +} +void ARM64FloatEmitter::UQSUB(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitScalarThreeSame(1, size, 0b00101, Rd, Rn, Rm); +} + +// Scalar floating point immediate +void ARM64FloatEmitter::FMOV(ARM64Reg Rd, uint8_t imm8) { + EmitScalarImm(0, 0, 0, 0, Rd, imm8); +} + +// Vector +void ARM64FloatEmitter::ADD(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + ASSERT(!(IsDouble(Rd) && esize == D)); + EmitThreeSame(0, static_cast(esize), 0b10000, Rd, Rn, Rm); +} +void ARM64FloatEmitter::ADDV(ESize esize, ARM64Reg Rd, ARM64Reg Rn) { + ASSERT(esize != D); + Emit2RegMisc(IsQuad(Rd), 0, static_cast(esize), 0b100011011, Rd, Rn); +} +void ARM64FloatEmitter::SUB(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + ASSERT(!(IsDouble(Rd) && esize == D)); + EmitThreeSame(1, static_cast(esize), 0b10000, Rd, Rn, Rm); +} +void ARM64FloatEmitter::AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitThreeSame(0, 0, 3, Rd, Rn, Rm); +} +void ARM64FloatEmitter::BSL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitThreeSame(1, 1, 3, Rd, Rn, Rm); +} +void ARM64FloatEmitter::CMGE(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + ASSERT(!(IsDouble(Rd) && esize == D)); + EmitThreeSame(0, static_cast(esize), 0b00111, Rd, Rn, Rm); +} +void ARM64FloatEmitter::CMGE_zero(ESize esize, ARM64Reg Rd, ARM64Reg Rn) { + ASSERT(!(IsDouble(Rd) && esize == D)); + Emit2RegMisc(IsQuad(Rd), 1, static_cast(esize), 0b1000, Rd, Rn); +} +void ARM64FloatEmitter::CMGT(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + ASSERT(!(IsDouble(Rd) && esize == D)); + EmitThreeSame(0, static_cast(esize), 0b00110, Rd, Rn, Rm); +} +void ARM64FloatEmitter::CMHI(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + ASSERT(!(IsDouble(Rd) && esize == D)); + EmitThreeSame(1, static_cast(esize), 0b00110, Rd, Rn, Rm); +} +void ARM64FloatEmitter::CMHS(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + ASSERT(!(IsDouble(Rd) && esize == D)); + EmitThreeSame(1, static_cast(esize), 0b00111, Rd, Rn, Rm); +} +void ARM64FloatEmitter::DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index) { + u32 imm5 = 0; + + if (size == 8) { + imm5 = 1; + imm5 |= index << 1; + } else if (size == 16) { + imm5 = 2; + imm5 |= index << 2; + } else if (size == 32) { + imm5 = 4; + imm5 |= index << 3; + } else if (size == 64) { + imm5 = 8; + imm5 |= index << 4; + } + + EmitCopy(IsQuad(Rd), 0, imm5, 0, Rd, Rn); +} +void ARM64FloatEmitter::FABS(u8 size, ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0xF, Rd, Rn); +} +void ARM64FloatEmitter::FADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitThreeSame(0, size >> 6, 0x1A, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMAX(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitThreeSame(0, size >> 6, 0b11110, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMLA(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg 
Rm) { + EmitThreeSame(0, size >> 6, 0x19, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMIN(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitThreeSame(0, 2 | size >> 6, 0b11110, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FCVTL(u8 size, ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(false, 0, size >> 6, 0x17, Rd, Rn); +} +void ARM64FloatEmitter::FCVTL2(u8 size, ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(true, 0, size >> 6, 0x17, Rd, Rn); +} +void ARM64FloatEmitter::FCVTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(IsQuad(Rd), 0, dest_size >> 5, 0x16, Rd, Rn); +} +void ARM64FloatEmitter::FCVTZS(u8 size, ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0x1B, Rd, Rn); +} +void ARM64FloatEmitter::FCVTZU(u8 size, ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0x1B, Rd, Rn); +} +void ARM64FloatEmitter::FDIV(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitThreeSame(1, size >> 6, 0x1F, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitThreeSame(1, size >> 6, 0x1B, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FNEG(u8 size, ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0xF, Rd, Rn); +} +void ARM64FloatEmitter::FRECPE(u8 size, ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0x1D, Rd, Rn); +} +void ARM64FloatEmitter::FRSQRTE(u8 size, ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0x1D, Rd, Rn); +} +void ARM64FloatEmitter::FSUB(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitThreeSame(0, 2 | (size >> 6), 0x1A, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMLS(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitThreeSame(0, 2 | (size >> 6), 0x19, Rd, Rn, Rm); +} +void ARM64FloatEmitter::NOT(ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(IsQuad(Rd), 1, 0, 5, Rd, Rn); +} +void ARM64FloatEmitter::ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitThreeSame(0, 2, 3, Rd, Rn, Rm); +} +void ARM64FloatEmitter::REV16(u8 size, ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(IsQuad(Rd), 0, size >> 4, 1, Rd, Rn); +} +void ARM64FloatEmitter::REV32(u8 size, ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(IsQuad(Rd), 1, size >> 4, 0, Rd, Rn); +} +void ARM64FloatEmitter::REV64(u8 size, ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(IsQuad(Rd), 0, size >> 4, 0, Rd, Rn); +} +void ARM64FloatEmitter::SABD(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + ASSERT(esize != D); + EmitThreeSame(0, static_cast(esize), 0b01110, Rd, Rn, Rm); +} +void ARM64FloatEmitter::UABD(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + ASSERT(esize != D); + EmitThreeSame(1, static_cast(esize), 0b01110, Rd, Rn, Rm); +} +void ARM64FloatEmitter::SADDLV(ESize esize, ARM64Reg Rd, ARM64Reg Rn) { + ASSERT(esize != D); + Emit2RegMisc(IsQuad(Rd), 0, static_cast(esize), 0b100000011, Rd, Rn); +} +void ARM64FloatEmitter::UADDLV(ESize esize, ARM64Reg Rd, ARM64Reg Rn) { + ASSERT(esize != D); + Emit2RegMisc(IsQuad(Rd), 1, static_cast(esize), 0b100000011, Rd, Rn); +} +void ARM64FloatEmitter::SHADD(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + ASSERT(!(IsDouble(Rd) && esize == D)); + EmitThreeSame(0, static_cast(esize), 0b0, Rd, Rn, Rm); +} +void ARM64FloatEmitter::UHADD(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + ASSERT(!(IsDouble(Rd) && esize == D)); + EmitThreeSame(1, static_cast(esize), 0b0, Rd, Rn, Rm); +} +void ARM64FloatEmitter::SHSUB(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + ASSERT(!(IsDouble(Rd) && esize == D)); + 
EmitThreeSame(0, static_cast(esize), 0b00100, Rd, Rn, Rm); +} +void ARM64FloatEmitter::UHSUB(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + ASSERT(!(IsDouble(Rd) && esize == D)); + EmitThreeSame(1, static_cast(esize), 0b00100, Rd, Rn, Rm); +} +void ARM64FloatEmitter::SMIN(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + ASSERT(!(IsDouble(Rd) && esize == D)); + EmitThreeSame(0, static_cast(esize), 0b01101, Rd, Rn, Rm); +} +void ARM64FloatEmitter::UMIN(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + ASSERT(!(IsDouble(Rd) && esize == D)); + EmitThreeSame(1, static_cast(esize), 0b01101, Rd, Rn, Rm); +} +void ARM64FloatEmitter::SQADD(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + ASSERT(!(IsDouble(Rd) && esize == D)); + EmitThreeSame(0, static_cast(esize), 0b00001, Rd, Rn, Rm); +} +void ARM64FloatEmitter::SQSUB(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + ASSERT(!(IsDouble(Rd) && esize == D)); + EmitThreeSame(0, static_cast(esize), 0b00101, Rd, Rn, Rm); +} +void ARM64FloatEmitter::UQADD(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + ASSERT(!(IsDouble(Rd) && esize == D)); + EmitThreeSame(1, static_cast(esize), 0b00001, Rd, Rn, Rm); +} +void ARM64FloatEmitter::UQSUB(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + ASSERT(!(IsDouble(Rd) && esize == D)); + EmitThreeSame(1, static_cast(esize), 0b00101, Rd, Rn, Rm); +} +void ARM64FloatEmitter::SCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(IsQuad(Rd), 0, size >> 6, 0x1D, Rd, Rn); +} +void ARM64FloatEmitter::UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(IsQuad(Rd), 1, size >> 6, 0x1D, Rd, Rn); +} +void ARM64FloatEmitter::SCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale) { + int imm = size * 2 - scale; + EmitShiftImm(IsQuad(Rd), 0, imm >> 3, imm & 7, 0x1C, Rd, Rn); +} +void ARM64FloatEmitter::UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale) { + int imm = size * 2 - scale; + EmitShiftImm(IsQuad(Rd), 1, imm >> 3, imm & 7, 0x1C, Rd, Rn); +} +void ARM64FloatEmitter::SQXTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(false, 0, dest_size >> 4, 0b10100, Rd, Rn); +} +void ARM64FloatEmitter::SQXTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(true, 0, dest_size >> 4, 0b10100, Rd, Rn); +} +void ARM64FloatEmitter::UQXTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(false, 1, dest_size >> 4, 0b10100, Rd, Rn); +} +void ARM64FloatEmitter::UQXTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(true, 1, dest_size >> 4, 0b10100, Rd, Rn); +} +void ARM64FloatEmitter::XTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(false, 0, dest_size >> 4, 0b10010, Rd, Rn); +} +void ARM64FloatEmitter::XTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(true, 0, dest_size >> 4, 0b10010, Rd, Rn); +} + +// Move +void ARM64FloatEmitter::DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn) { + u32 imm5 = 0; + + if (size == 8) + imm5 = 1; + else if (size == 16) + imm5 = 2; + else if (size == 32) + imm5 = 4; + else if (size == 64) + imm5 = 8; + + EmitCopy(IsQuad(Rd), 0, imm5, 1, Rd, Rn); +} +void ARM64FloatEmitter::INS(u8 size, ARM64Reg Rd, u8 index, ARM64Reg Rn) { + u32 imm5 = 0; + + if (size == 8) { + imm5 = 1; + imm5 |= index << 1; + } else if (size == 16) { + imm5 = 2; + imm5 |= index << 2; + } else if (size == 32) { + imm5 = 4; + imm5 |= index << 3; + } else if (size == 64) { + imm5 = 8; + imm5 |= index << 4; + } + + EmitCopy(1, 0, imm5, 3, Rd, Rn); +} +void ARM64FloatEmitter::INS(u8 size, ARM64Reg Rd, u8 index1, ARM64Reg Rn, u8 index2) { + u32 imm5 = 
0, imm4 = 0; + + if (size == 8) { + imm5 = 1; + imm5 |= index1 << 1; + imm4 = index2; + } else if (size == 16) { + imm5 = 2; + imm5 |= index1 << 2; + imm4 = index2 << 1; + } else if (size == 32) { + imm5 = 4; + imm5 |= index1 << 3; + imm4 = index2 << 2; + } else if (size == 64) { + imm5 = 8; + imm5 |= index1 << 4; + imm4 = index2 << 3; + } + + EmitCopy(1, 1, imm5, imm4, Rd, Rn); +} + +void ARM64FloatEmitter::UMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index) { + bool b64Bit = Is64Bit(Rd); + ASSERT_MSG(Rd < SP, "%s destination must be a GPR!", __func__); + ASSERT_MSG(!(b64Bit && size != 64), "%s must have a size of 64 when destination is 64bit!", + __func__); + u32 imm5 = 0; + + if (size == 8) { + imm5 = 1; + imm5 |= index << 1; + } else if (size == 16) { + imm5 = 2; + imm5 |= index << 2; + } else if (size == 32) { + imm5 = 4; + imm5 |= index << 3; + } else if (size == 64) { + imm5 = 8; + imm5 |= index << 4; + } + + EmitCopy(b64Bit, 0, imm5, 7, Rd, Rn); +} +void ARM64FloatEmitter::SMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index) { + bool b64Bit = Is64Bit(Rd); + ASSERT_MSG(Rd < SP, "%s destination must be a GPR!", __func__); + ASSERT_MSG(size != 64, "%s doesn't support 64bit destination. Use UMOV!", __func__); + u32 imm5 = 0; + + if (size == 8) { + imm5 = 1; + imm5 |= index << 1; + } else if (size == 16) { + imm5 = 2; + imm5 |= index << 2; + } else if (size == 32) { + imm5 = 4; + imm5 |= index << 3; + } + + EmitCopy(b64Bit, 0, imm5, 5, Rd, Rn); +} + +// One source +void ARM64FloatEmitter::FCVT(u8 size_to, u8 size_from, ARM64Reg Rd, ARM64Reg Rn) { + u32 dst_encoding = 0; + u32 src_encoding = 0; + + if (size_to == 16) + dst_encoding = 3; + else if (size_to == 32) + dst_encoding = 0; + else if (size_to == 64) + dst_encoding = 1; + + if (size_from == 16) + src_encoding = 3; + else if (size_from == 32) + src_encoding = 0; + else if (size_from == 64) + src_encoding = 1; + + Emit1Source(0, 0, src_encoding, 4 | dst_encoding, Rd, Rn); +} + +void ARM64FloatEmitter::SCVTF(ARM64Reg Rd, ARM64Reg Rn) { + if (IsScalar(Rn)) { + // Source is in FP register (like destination!). We must use a vector + // encoding. + bool sign = false; + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + int sz = IsDouble(Rn); + Write32((0x5e << 24) | (sign << 29) | (sz << 22) | (0x876 << 10) | (Rn << 5) | Rd); + } else { + bool sf = Is64Bit(Rn); + u32 type = 0; + if (IsDouble(Rd)) + type = 1; + EmitConversion(sf, 0, type, 0, 2, Rd, Rn); + } +} + +void ARM64FloatEmitter::UCVTF(ARM64Reg Rd, ARM64Reg Rn) { + if (IsScalar(Rn)) { + // Source is in FP register (like destination!). We must use a vector + // encoding. 
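+        // Here "sign" is the unsigned-conversion flag (the bit shifted to
+        // position 29 below): it is set for UCVTF and left clear in SCVTF
+        // above, while "sz" picks double (1) or single (0) precision from Rn.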
+ bool sign = true; + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + int sz = IsDouble(Rn); + Write32((0x5e << 24) | (sign << 29) | (sz << 22) | (0x876 << 10) | (Rn << 5) | Rd); + } else { + bool sf = Is64Bit(Rn); + u32 type = 0; + if (IsDouble(Rd)) + type = 1; + + EmitConversion(sf, 0, type, 0, 3, Rd, Rn); + } +} + +void ARM64FloatEmitter::SCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale) { + bool sf = Is64Bit(Rn); + u32 type = 0; + if (IsDouble(Rd)) + type = 1; + + EmitConversion2(sf, 0, false, type, 0, 2, 64 - scale, Rd, Rn); +} + +void ARM64FloatEmitter::UCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale) { + bool sf = Is64Bit(Rn); + u32 type = 0; + if (IsDouble(Rd)) + type = 1; + + EmitConversion2(sf, 0, false, type, 0, 3, 64 - scale, Rd, Rn); +} + +void ARM64FloatEmitter::FCMP(ARM64Reg Rn, ARM64Reg Rm) { + EmitCompare(0, 0, 0, 0, Rn, Rm); +} +void ARM64FloatEmitter::FCMP(ARM64Reg Rn) { + EmitCompare(0, 0, 0, 8, Rn, static_cast(0)); +} +void ARM64FloatEmitter::FCMPE(ARM64Reg Rn, ARM64Reg Rm) { + EmitCompare(0, 0, 0, 0x10, Rn, Rm); +} +void ARM64FloatEmitter::FCMPE(ARM64Reg Rn) { + EmitCompare(0, 0, 0, 0x18, Rn, static_cast(0)); +} +void ARM64FloatEmitter::FCMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitThreeSame(0, size >> 6, 0x1C, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FCMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0xD, Rd, Rn); +} +void ARM64FloatEmitter::FCMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitThreeSame(1, size >> 6, 0x1C, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FCMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0x0C, Rd, Rn); +} +void ARM64FloatEmitter::FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitThreeSame(1, 2 | (size >> 6), 0x1C, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0x0C, Rd, Rn); +} +void ARM64FloatEmitter::FCMLE(u8 size, ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0xD, Rd, Rn); +} +void ARM64FloatEmitter::FCMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0xE, Rd, Rn); +} + +void ARM64FloatEmitter::FCSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond) { + EmitCondSelect(0, 0, cond, Rd, Rn, Rm); +} + +// Permute +void ARM64FloatEmitter::UZP1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitPermute(size, 0b001, Rd, Rn, Rm); +} +void ARM64FloatEmitter::TRN1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitPermute(size, 0b010, Rd, Rn, Rm); +} +void ARM64FloatEmitter::ZIP1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitPermute(size, 0b011, Rd, Rn, Rm); +} +void ARM64FloatEmitter::UZP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitPermute(size, 0b101, Rd, Rn, Rm); +} +void ARM64FloatEmitter::TRN2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitPermute(size, 0b110, Rd, Rn, Rm); +} +void ARM64FloatEmitter::ZIP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitPermute(size, 0b111, Rd, Rn, Rm); +} + +// Shift by immediate +void ARM64FloatEmitter::SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) { + SSHLL(src_size, Rd, Rn, shift, false); +} +void ARM64FloatEmitter::SSHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) { + SSHLL(src_size, Rd, Rn, shift, true); +} +void ARM64FloatEmitter::SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) { + SHRN(dest_size, Rd, Rn, shift, false); +} +void ARM64FloatEmitter::SHRN2(u8 dest_size, ARM64Reg Rd, 
ARM64Reg Rn, u32 shift) { + SHRN(dest_size, Rd, Rn, shift, true); +} +void ARM64FloatEmitter::USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) { + USHLL(src_size, Rd, Rn, shift, false); +} +void ARM64FloatEmitter::USHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) { + USHLL(src_size, Rd, Rn, shift, true); +} +void ARM64FloatEmitter::SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn) { + SXTL(src_size, Rd, Rn, false); +} +void ARM64FloatEmitter::SXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn) { + SXTL(src_size, Rd, Rn, true); +} +void ARM64FloatEmitter::UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn) { + UXTL(src_size, Rd, Rn, false); +} +void ARM64FloatEmitter::UXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn) { + UXTL(src_size, Rd, Rn, true); +} + +void ARM64FloatEmitter::SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper) { + ASSERT_MSG(shift < src_size, "%s shift amount must less than the element size!", __func__); + u32 immh = 0; + u32 immb = shift & 0xFFF; + + if (src_size == 8) { + immh = 1; + } else if (src_size == 16) { + immh = 2 | ((shift >> 3) & 1); + } else if (src_size == 32) { + immh = 4 | ((shift >> 3) & 3); + ; + } + EmitShiftImm(upper, 0, immh, immb, 0b10100, Rd, Rn); +} + +void ARM64FloatEmitter::USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper) { + ASSERT_MSG(shift < src_size, "%s shift amount must less than the element size!", __func__); + u32 immh = 0; + u32 immb = shift & 0xFFF; + + if (src_size == 8) { + immh = 1; + } else if (src_size == 16) { + immh = 2 | ((shift >> 3) & 1); + } else if (src_size == 32) { + immh = 4 | ((shift >> 3) & 3); + ; + } + EmitShiftImm(upper, 1, immh, immb, 0b10100, Rd, Rn); +} + +void ARM64FloatEmitter::SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper) { + ASSERT_MSG(shift < dest_size, "%s shift amount must less than the element size!", __func__); + u32 immh = 0; + u32 immb = shift & 0xFFF; + + if (dest_size == 8) { + immh = 1; + } else if (dest_size == 16) { + immh = 2 | ((shift >> 3) & 1); + } else if (dest_size == 32) { + immh = 4 | ((shift >> 3) & 3); + ; + } + EmitShiftImm(upper, 1, immh, immb, 0b10000, Rd, Rn); +} + +void ARM64FloatEmitter::SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper) { + SSHLL(src_size, Rd, Rn, 0, upper); +} + +void ARM64FloatEmitter::UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper) { + USHLL(src_size, Rd, Rn, 0, upper); +} + +// vector x indexed element +void ARM64FloatEmitter::FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index) { + ASSERT_MSG(size == 32 || size == 64, "%s only supports 32bit or 64bit size!", __func__); + + bool L = false; + bool H = false; + if (size == 32) { + L = index & 1; + H = (index >> 1) & 1; + } else if (size == 64) { + H = index == 1; + } + + EmitVectorxElement(0, 2 | (size >> 6), L, 0x9, H, Rd, Rn, Rm); +} + +void ARM64FloatEmitter::FMLA(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index) { + ASSERT_MSG(size == 32 || size == 64, "%s only supports 32bit or 64bit size!", __func__); + + bool L = false; + bool H = false; + if (size == 32) { + L = index & 1; + H = (index >> 1) & 1; + } else if (size == 64) { + H = index == 1; + } + + EmitVectorxElement(0, 2 | (size >> 6), L, 1, H, Rd, Rn, Rm); +} + +// Modified Immediate +void ARM64FloatEmitter::MOVI(u8 size, ARM64Reg Rd, u64 imm, u8 shift) { + bool Q = IsQuad(Rd); + u8 cmode = 0; + u8 op = 0; + u8 abcdefgh = imm & 0xFF; + if (size == 8) { + ASSERT_MSG(shift == 0, "%s(size8) doesn't support shift!", __func__); + ASSERT_MSG(!(imm & ~0xFFULL), "%s(size8) only 
supports 8bit values!", __func__); + } else if (size == 16) { + ASSERT_MSG(shift == 0 || shift == 8, "%s(size16) only supports shift of {0, 8}!", __func__); + ASSERT_MSG(!(imm & ~0xFFULL), "%s(size16) only supports 8bit values!", __func__); + + if (shift == 8) + cmode |= 2; + } else if (size == 32) { + ASSERT_MSG(shift == 0 || shift == 8 || shift == 16 || shift == 24, + "%s(size32) only supports shift of {0, 8, 16, 24}!", __func__); + // XXX: Implement support for MOVI - shifting ones variant + ASSERT_MSG(!(imm & ~0xFFULL), "%s(size32) only supports 8bit values!", __func__); + switch (shift) { + case 8: + cmode |= 2; + break; + case 16: + cmode |= 4; + break; + case 24: + cmode |= 6; + break; + default: + break; + } + } else // 64 + { + ASSERT_MSG(shift == 0, "%s(size64) doesn't support shift!", __func__); + + op = 1; + cmode = 0xE; + abcdefgh = 0; + for (int i = 0; i < 8; ++i) { + u8 tmp = (imm >> (i << 3)) & 0xFF; + ASSERT_MSG(tmp == 0xFF || tmp == 0, "%s(size64) Invalid immediate!", __func__); + if (tmp == 0xFF) + abcdefgh |= (1 << i); + } + } + EncodeModImm(Q, op, cmode, 0, Rd, abcdefgh); +} + +void ARM64FloatEmitter::BIC(u8 size, ARM64Reg Rd, u8 imm, u8 shift) { + bool Q = IsQuad(Rd); + u8 cmode = 1; + u8 op = 1; + if (size == 16) { + ASSERT_MSG(shift == 0 || shift == 8, "%s(size16) only supports shift of {0, 8}!", __func__); + + if (shift == 8) + cmode |= 2; + } else if (size == 32) { + ASSERT_MSG(shift == 0 || shift == 8 || shift == 16 || shift == 24, + "%s(size32) only supports shift of {0, 8, 16, 24}!", __func__); + // XXX: Implement support for MOVI - shifting ones variant + switch (shift) { + case 8: + cmode |= 2; + break; + case 16: + cmode |= 4; + break; + case 24: + cmode |= 6; + break; + default: + break; + } + } else { + ASSERT_MSG(false, "%s only supports size of {16, 32}!", __func__); + } + EncodeModImm(Q, op, cmode, 0, Rd, imm); +} + +void ARM64FloatEmitter::ABI_PushRegisters(u32 registers, ARM64Reg tmp) { + bool bundled_loadstore = false; + + for (int i = 0; i < 32; ++i) { + if (!Common::Bit(i, registers)) + continue; + + int count = 0; + while (++count < 4 && (i + count) < 32 && Common::Bit(i + count, registers)) { + } + if (count > 1) { + bundled_loadstore = true; + break; + } + } + + if (bundled_loadstore && tmp != INVALID_REG) { + int num_regs = Common::BitCount(registers); + m_emit->SUB(SP, SP, num_regs * 16); + m_emit->ADD(tmp, SP, 0); + std::vector island_regs; + for (int i = 0; i < 32; ++i) { + if (!Common::Bit(i, registers)) + continue; + + int count = 0; + + // 0 = true + // 1 < 4 && registers[i + 1] true! + // 2 < 4 && registers[i + 2] true! + // 3 < 4 && registers[i + 3] true! + // 4 < 4 && registers[i + 4] false! 
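+            // i.e. count how many consecutive registers (at most 4) are set
+            // starting at bit i, so they can be stored with a single ST1 below;
+            // lone registers are collected into island_regs and paired later.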
+ while (++count < 4 && (i + count) < 32 && Common::Bit(i + count, registers)) { + } + + if (count == 1) + island_regs.push_back((ARM64Reg)(Q0 + i)); + else + ST1(64, count, INDEX_POST, (ARM64Reg)(Q0 + i), tmp); + + i += count - 1; + } + + // Handle island registers + std::vector pair_regs; + for (auto& it : island_regs) { + pair_regs.push_back(it); + if (pair_regs.size() == 2) { + STP(128, INDEX_POST, pair_regs[0], pair_regs[1], tmp, 32); + pair_regs.clear(); + } + } + if (pair_regs.size()) + STR(128, INDEX_POST, pair_regs[0], tmp, 16); + } else { + std::vector pair_regs; + for (int i = 0; i < 32; ++i) { + if (Common::Bit(i, registers)) { + pair_regs.push_back((ARM64Reg)(Q0 + i)); + if (pair_regs.size() == 2) { + STP(128, INDEX_PRE, pair_regs[0], pair_regs[1], SP, -32); + pair_regs.clear(); + } + } + } + if (pair_regs.size()) + STR(128, INDEX_PRE, pair_regs[0], SP, -16); + } +} +void ARM64FloatEmitter::ABI_PopRegisters(u32 registers, ARM64Reg tmp) { + bool bundled_loadstore = false; + int num_regs = Common::BitCount(registers); + + for (int i = 0; i < 32; ++i) { + if (!Common::Bit(i, registers)) + continue; + + int count = 0; + while (++count < 4 && (i + count) < 32 && Common::Bit(i + count, registers)) { + } + if (count > 1) { + bundled_loadstore = true; + break; + } + } + + if (bundled_loadstore && tmp != INVALID_REG) { + // The temporary register is only used to indicate that we can use this code path + std::vector island_regs; + for (int i = 0; i < 32; ++i) { + if (!Common::Bit(i, registers)) + continue; + + u8 count = 0; + while (++count < 4 && (i + count) < 32 && Common::Bit(i + count, registers)) { + } + + if (count == 1) + island_regs.push_back(static_cast(Q0 + i)); + else + LD1(64, count, INDEX_POST, static_cast(Q0 + i), SP); + + i += count - 1; + } + + // Handle island registers + std::vector pair_regs; + for (auto& it : island_regs) { + pair_regs.push_back(it); + if (pair_regs.size() == 2) { + LDP(128, INDEX_POST, pair_regs[0], pair_regs[1], SP, 32); + pair_regs.clear(); + } + } + if (pair_regs.size()) + LDR(128, INDEX_POST, pair_regs[0], SP, 16); + } else { + bool odd = num_regs % 2; + std::vector pair_regs; + for (int i = 31; i >= 0; --i) { + if (!Common::Bit(i, registers)) + continue; + + if (odd) { + // First load must be a regular LDR if odd + odd = false; + LDR(128, INDEX_POST, static_cast(Q0 + i), SP, 16); + } else { + pair_regs.push_back(static_cast(Q0 + i)); + if (pair_regs.size() == 2) { + LDP(128, INDEX_POST, pair_regs[1], pair_regs[0], SP, 32); + pair_regs.clear(); + } + } + } + } +} + +void ARM64XEmitter::ANDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) { + unsigned int n, imm_s, imm_r; + if (!Is64Bit(Rn)) + imm &= 0xFFFFFFFF; + if (IsImmLogical(imm, Is64Bit(Rn) ? 64 : 32, &n, &imm_s, &imm_r)) { + AND(Rd, Rn, imm_r, imm_s, n != 0); + } else { + ASSERT_MSG(scratch != INVALID_REG, + "ANDI2R - failed to construct logical immediate value from " + "%08x, need scratch", + static_cast(imm)); + MOVI2R(scratch, imm); + AND(Rd, Rn, scratch); + } +} + +void ARM64XEmitter::ORRI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) { + unsigned int n, imm_s, imm_r; + if (IsImmLogical(imm, Is64Bit(Rn) ? 
64 : 32, &n, &imm_s, &imm_r)) { + ORR(Rd, Rn, imm_r, imm_s, n != 0); + } else { + ASSERT_MSG(scratch != INVALID_REG, + "ORRI2R - failed to construct logical immediate value from " + "%08x, need scratch", + static_cast(imm)); + MOVI2R(scratch, imm); + ORR(Rd, Rn, scratch); + } +} + +void ARM64XEmitter::EORI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) { + unsigned int n, imm_s, imm_r; + if (IsImmLogical(imm, Is64Bit(Rn) ? 64 : 32, &n, &imm_s, &imm_r)) { + EOR(Rd, Rn, imm_r, imm_s, n != 0); + } else { + ASSERT_MSG(scratch != INVALID_REG, + "EORI2R - failed to construct logical immediate value from " + "%08x, need scratch", + static_cast(imm)); + MOVI2R(scratch, imm); + EOR(Rd, Rn, scratch); + } +} + +void ARM64XEmitter::ANDSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) { + unsigned int n, imm_s, imm_r; + if (IsImmLogical(imm, Is64Bit(Rn) ? 64 : 32, &n, &imm_s, &imm_r)) { + ANDS(Rd, Rn, imm_r, imm_s, n != 0); + } else { + ASSERT_MSG(scratch != INVALID_REG, + "ANDSI2R - failed to construct logical immediate value from " + "%08x, need scratch", + static_cast(imm)); + MOVI2R(scratch, imm); + ANDS(Rd, Rn, scratch); + } +} + +void ARM64XEmitter::AddImmediate(ARM64Reg Rd, ARM64Reg Rn, u64 imm, bool shift, bool negative, + bool flags) { + switch ((negative << 1) | static_cast(flags)) { + case 0: + ADD(Rd, Rn, static_cast(imm), shift); + break; + case 1: + ADDS(Rd, Rn, static_cast(imm), shift); + break; + case 2: + SUB(Rd, Rn, static_cast(imm), shift); + break; + case 3: + SUBS(Rd, Rn, static_cast(imm), shift); + break; + } +} + +void ARM64XEmitter::ADDI2R_internal(ARM64Reg Rd, ARM64Reg Rn, u64 imm, bool negative, bool flags, + ARM64Reg scratch) { + bool has_scratch = scratch != INVALID_REG; + u64 imm_neg = Is64Bit(Rd) ? ~imm + 1 : (~imm + 1) & 0xFFFFFFFFuLL; + bool neg_neg = negative ? false : true; + + // Fast paths, aarch64 immediate instructions + // Try them all first + if (imm <= 0xFFF) { + AddImmediate(Rd, Rn, imm, false, negative, flags); + return; + } + if (imm <= 0xFFFFFF && (imm & 0xFFF) == 0) { + AddImmediate(Rd, Rn, imm >> 12, true, negative, flags); + return; + } + if (imm_neg <= 0xFFF) { + AddImmediate(Rd, Rn, imm_neg, false, neg_neg, flags); + return; + } + if (imm_neg <= 0xFFFFFF && (imm_neg & 0xFFF) == 0) { + AddImmediate(Rd, Rn, imm_neg >> 12, true, neg_neg, flags); + return; + } + + // ADD+ADD is slower than MOVK+ADD, but inplace. + // But it supports a few more bits, so use it to avoid MOVK+MOVK+ADD. + // As this splits the addition in two parts, this must not be done on setting + // flags. 
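+    // For example, imm = 0x123456 is emitted as ADD Rd, Rn, #0x456 followed
+    // by ADD Rd, Rd, #0x123, LSL #12, with no scratch register needed.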
+ if (!flags && (imm >= 0x10000u || !has_scratch) && imm < 0x1000000u) { + AddImmediate(Rd, Rn, imm & 0xFFF, false, negative, false); + AddImmediate(Rd, Rd, imm >> 12, true, negative, false); + return; + } + if (!flags && (imm_neg >= 0x10000u || !has_scratch) && imm_neg < 0x1000000u) { + AddImmediate(Rd, Rn, imm_neg & 0xFFF, false, neg_neg, false); + AddImmediate(Rd, Rd, imm_neg >> 12, true, neg_neg, false); + return; + } + + ASSERT_MSG(has_scratch, + "ADDI2R - failed to construct arithmetic immediate value from " + "%08x, need scratch", + static_cast(imm)); + + negative ^= MOVI2R2(scratch, imm, imm_neg); + switch ((negative << 1) | static_cast(flags)) { + case 0: + ADD(Rd, Rn, scratch); + break; + case 1: + ADDS(Rd, Rn, scratch); + break; + case 2: + SUB(Rd, Rn, scratch); + break; + case 3: + SUBS(Rd, Rn, scratch); + break; + } +} + +void ARM64XEmitter::ADDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) { + ADDI2R_internal(Rd, Rn, imm, false, false, scratch); +} + +void ARM64XEmitter::ADDSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) { + ADDI2R_internal(Rd, Rn, imm, false, true, scratch); +} + +void ARM64XEmitter::SUBI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) { + ADDI2R_internal(Rd, Rn, imm, true, false, scratch); +} + +void ARM64XEmitter::SUBSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) { + ADDI2R_internal(Rd, Rn, imm, true, true, scratch); +} + +void ARM64XEmitter::CMPI2R(ARM64Reg Rn, u64 imm, ARM64Reg scratch) { + ADDI2R_internal(Is64Bit(Rn) ? ZR : WZR, Rn, imm, true, true, scratch); +} + +bool ARM64XEmitter::TryADDI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm) { + u32 val; + bool shift; + if (IsImmArithmetic(imm, &val, &shift)) + ADD(Rd, Rn, val, shift); + else + return false; + + return true; +} + +bool ARM64XEmitter::TrySUBI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm) { + u32 val; + bool shift; + if (IsImmArithmetic(imm, &val, &shift)) + SUB(Rd, Rn, val, shift); + else + return false; + + return true; +} + +bool ARM64XEmitter::TryCMPI2R(ARM64Reg Rn, u32 imm) { + u32 val; + bool shift; + if (IsImmArithmetic(imm, &val, &shift)) + CMP(Rn, val, shift); + else + return false; + + return true; +} + +bool ARM64XEmitter::TryANDI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm) { + u32 n, imm_r, imm_s; + if (IsImmLogical(imm, 32, &n, &imm_s, &imm_r)) + AND(Rd, Rn, imm_r, imm_s, n != 0); + else + return false; + + return true; +} +bool ARM64XEmitter::TryORRI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm) { + u32 n, imm_r, imm_s; + if (IsImmLogical(imm, 32, &n, &imm_s, &imm_r)) + ORR(Rd, Rn, imm_r, imm_s, n != 0); + else + return false; + + return true; +} +bool ARM64XEmitter::TryEORI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm) { + u32 n, imm_r, imm_s; + if (IsImmLogical(imm, 32, &n, &imm_s, &imm_r)) + EOR(Rd, Rn, imm_r, imm_s, n != 0); + else + return false; + + return true; +} + +void ARM64FloatEmitter::MOVI2F(ARM64Reg Rd, float value, ARM64Reg scratch, bool negate) { + ASSERT_MSG(!IsDouble(Rd), "MOVI2F does not yet support double precision"); + uint8_t imm8; + if (value == 0.0) { + FMOV(Rd, IsDouble(Rd) ? ZR : WZR); + if (negate) + FNEG(Rd, Rd); + // TODO: There are some other values we could generate with the float-imm + // instruction, like 1.0... 
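+        // (FPImm8FromFloat below should only succeed for values that fit the
+        // 8-bit FMOV immediate format, roughly +/-(16..31)/16 * 2^-3..2^4,
+        // e.g. 0.5, 1.0 or 10.0; anything else falls back to MOVI2R + FMOV.)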
+ } else if (FPImm8FromFloat(value, &imm8)) { + FMOV(Rd, imm8); + } else { + ASSERT_MSG(scratch != INVALID_REG, + "Failed to find a way to generate FP immediate %f without scratch", value); + if (negate) + value = -value; + + const u32 ival = Dynarmic::Common::BitCast(value); + m_emit->MOVI2R(scratch, ival); + FMOV(Rd, scratch); + } +} + +// TODO: Quite a few values could be generated easily using the MOVI instruction +// and friends. +void ARM64FloatEmitter::MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch) { + // TODO: Make it work with more element sizes + // TODO: Optimize - there are shorter solution for many values + ARM64Reg s = static_cast(S0 + DecodeReg(Rd)); + MOVI2F(s, value, scratch); + DUP(32, Rd, Rd, 0); +} + +} // namespace Dynarmic::BackendA64::Arm64Gen diff --git a/src/dynarmic/backend/A64/emitter/a64_emitter.h b/src/dynarmic/backend/A64/emitter/a64_emitter.h new file mode 100644 index 00000000..e7d84638 --- /dev/null +++ b/src/dynarmic/backend/A64/emitter/a64_emitter.h @@ -0,0 +1,1172 @@ +// Copyright 2015 Dolphin Emulator Project / 2018 dynarmic project +// Licensed under GPLv2+ +// Refer to the license.txt file included. + +#pragma once + +#include +#include +#include + +#include "arm_common.h" +#include "code_block.h" +#include "common/assert.h" +#include "common/common_types.h" + +namespace Dynarmic::BackendA64::Arm64Gen { + +// X30 serves a dual purpose as a link register +// Encoded as +// Types: +// 000 - 32bit GPR +// 001 - 64bit GPR +// 010 - VFP single precision +// 100 - VFP double precision +// 110 - VFP quad precision +enum ARM64Reg { + // 32bit registers + W0 = 0, + W1, + W2, + W3, + W4, + W5, + W6, + W7, + W8, + W9, + W10, + W11, + W12, + W13, + W14, + W15, + W16, + W17, + W18, + W19, + W20, + W21, + W22, + W23, + W24, + W25, + W26, + W27, + W28, + W29, + W30, + + WSP, // 32bit stack pointer + + // 64bit registers + X0 = 0x20, + X1, + X2, + X3, + X4, + X5, + X6, + X7, + X8, + X9, + X10, + X11, + X12, + X13, + X14, + X15, + X16, + X17, + X18, + X19, + X20, + X21, + X22, + X23, + X24, + X25, + X26, + X27, + X28, + X29, + X30, + + SP, // 64bit stack pointer + + // VFP single precision registers + S0 = 0x40, + S1, + S2, + S3, + S4, + S5, + S6, + S7, + S8, + S9, + S10, + S11, + S12, + S13, + S14, + S15, + S16, + S17, + S18, + S19, + S20, + S21, + S22, + S23, + S24, + S25, + S26, + S27, + S28, + S29, + S30, + S31, + + // VFP Double Precision registers + D0 = 0x80, + D1, + D2, + D3, + D4, + D5, + D6, + D7, + D8, + D9, + D10, + D11, + D12, + D13, + D14, + D15, + D16, + D17, + D18, + D19, + D20, + D21, + D22, + D23, + D24, + D25, + D26, + D27, + D28, + D29, + D30, + D31, + + // ASIMD Quad-Word registers + Q0 = 0xC0, + Q1, + Q2, + Q3, + Q4, + Q5, + Q6, + Q7, + Q8, + Q9, + Q10, + Q11, + Q12, + Q13, + Q14, + Q15, + Q16, + Q17, + Q18, + Q19, + Q20, + Q21, + Q22, + Q23, + Q24, + Q25, + Q26, + Q27, + Q28, + Q29, + Q30, + Q31, + + // For PRFM(prefetch memory) encoding + // This is encoded in the Rt register + // Data preload + PLDL1KEEP = 0, + PLDL1STRM, + PLDL2KEEP, + PLDL2STRM, + PLDL3KEEP, + PLDL3STRM, + // Instruction preload + PLIL1KEEP = 8, + PLIL1STRM, + PLIL2KEEP, + PLIL2STRM, + PLIL3KEEP, + PLIL3STRM, + // Prepare for store + PLTL1KEEP = 16, + PLTL1STRM, + PLTL2KEEP, + PLTL2STRM, + PLTL3KEEP, + PLTL3STRM, + + WZR = WSP, + ZR = SP, + + INVALID_REG = 0xFFFFFFFF +}; + +constexpr bool Is64Bit(ARM64Reg reg) { + return (reg & 0x20) != 0; +} +constexpr bool IsSingle(ARM64Reg reg) { + return (reg & 0xC0) == 0x40; +} +constexpr bool IsDouble(ARM64Reg reg) { + return 
(reg & 0xC0) == 0x80; +} +constexpr bool IsScalar(ARM64Reg reg) { + return IsSingle(reg) || IsDouble(reg); +} +constexpr bool IsQuad(ARM64Reg reg) { + return (reg & 0xC0) == 0xC0; +} +constexpr bool IsVector(ARM64Reg reg) { + return (reg & 0xC0) != 0; +} +constexpr bool IsGPR(ARM64Reg reg) { + return static_cast(reg) < 0x40; +} + +constexpr ARM64Reg DecodeReg(ARM64Reg reg) { + return static_cast(reg & 0x1F); +} +constexpr ARM64Reg EncodeRegTo64(ARM64Reg reg) { + return static_cast(reg | 0x20); +} +constexpr ARM64Reg EncodeRegToSingle(ARM64Reg reg) { + return static_cast(DecodeReg(reg) + S0); +} +constexpr ARM64Reg EncodeRegToDouble(ARM64Reg reg) { + return static_cast((reg & ~0xC0) | 0x80); +} +constexpr ARM64Reg EncodeRegToQuad(ARM64Reg reg) { + return static_cast(reg | 0xC0); +} + +enum OpType { TYPE_IMM = 0, TYPE_REG, TYPE_IMMSREG, TYPE_RSR, TYPE_MEM }; + +enum ShiftType { + ST_LSL = 0, + ST_LSR = 1, + ST_ASR = 2, + ST_ROR = 3, +}; + +enum IndexType { + INDEX_UNSIGNED, + INDEX_POST, + INDEX_PRE, + INDEX_SIGNED, // used in LDP/STP +}; + +enum ShiftAmount { + SHIFT_0 = 0, + SHIFT_16 = 1, + SHIFT_32 = 2, + SHIFT_48 = 3, +}; + +enum RoundingMode { + ROUND_A, // round to nearest, ties to away + ROUND_M, // round towards -inf + ROUND_N, // round to nearest, ties to even + ROUND_P, // round towards +inf + ROUND_Z, // round towards zero +}; + +// Size of each element in the Vector +enum ESize { + B, // Byte + H, // Half Word + S, // Single Word + D, // Double Word +}; + +struct FixupBranch { + u8* ptr; + // Type defines + // 0 = CBZ (32bit) + // 1 = CBNZ (32bit) + // 2 = B (conditional) + // 3 = TBZ + // 4 = TBNZ + // 5 = B (unconditional) + // 6 = BL (unconditional) + u32 type; + + // Used with B.cond + CCFlags cond; + + // Used with TBZ/TBNZ + u8 bit; + + // Used with Test/Compare and Branch + ARM64Reg reg; +}; + +// The only system registers accessible from EL0 (user space) +enum PStateField { + FIELD_SPSel = 0, + FIELD_DAIFSet, + FIELD_DAIFClr, + FIELD_NZCV, + FIELD_PMCR_EL0, + FIELD_PMCCNTR_EL0, + FIELD_FPCR = 0x340, + FIELD_FPSR = 0x341, +}; + +enum SystemHint { + HINT_NOP = 0, + HINT_YIELD, + HINT_WFE, + HINT_WFI, + HINT_SEV, + HINT_SEVL, +}; + +enum BarrierType { + OSHLD = 1, + OSHST = 2, + OSH = 3, + NSHLD = 5, + NSHST = 6, + NSH = 7, + ISHLD = 9, + ISHST = 10, + ISH = 11, + LD = 13, + ST = 14, + SY = 15, +}; + +class ArithOption { +public: + enum WidthSpecifier { + WIDTH_DEFAULT, + WIDTH_32BIT, + WIDTH_64BIT, + }; + + enum ExtendSpecifier { + EXTEND_UXTB = 0x0, + EXTEND_UXTH = 0x1, + EXTEND_UXTW = 0x2, /* Also LSL on 32bit width */ + EXTEND_UXTX = 0x3, /* Also LSL on 64bit width */ + EXTEND_SXTB = 0x4, + EXTEND_SXTH = 0x5, + EXTEND_SXTW = 0x6, + EXTEND_SXTX = 0x7, + }; + + enum TypeSpecifier { + TYPE_EXTENDEDREG, + TYPE_IMM, + TYPE_SHIFTEDREG, + }; + +private: + ARM64Reg m_destReg; + WidthSpecifier m_width; + ExtendSpecifier m_extend; + TypeSpecifier m_type; + ShiftType m_shifttype; + u32 m_shift; + +public: + ArithOption(ARM64Reg Rd, bool index = false) { + // Indexed registers are a certain feature of AARch64 + // On Loadstore instructions that use a register offset + // We can have the register as an index + // If we are indexing then the offset register will + // be shifted to the left so we are indexing at intervals + // of the size of what we are loading + // 8-bit: Index does nothing + // 16-bit: Index LSL 1 + // 32-bit: Index LSL 2 + // 64-bit: Index LSL 3 + if (index) + m_shift = 4; + else + m_shift = 0; + + m_destReg = Rd; + m_type = TYPE_EXTENDEDREG; + if (Is64Bit(Rd)) { + 
m_width = WIDTH_64BIT; + m_extend = EXTEND_UXTX; + } else { + m_width = WIDTH_32BIT; + m_extend = EXTEND_UXTW; + } + m_shifttype = ST_LSL; + } + ArithOption(ARM64Reg Rd, ShiftType shift_type, u32 shift) { + m_destReg = Rd; + m_shift = shift; + m_shifttype = shift_type; + m_type = TYPE_SHIFTEDREG; + if (Is64Bit(Rd)) { + m_width = WIDTH_64BIT; + if (shift == 64) + m_shift = 0; + } else { + m_width = WIDTH_32BIT; + if (shift == 32) + m_shift = 0; + } + } + TypeSpecifier GetType() const { + return m_type; + } + ARM64Reg GetReg() const { + return m_destReg; + } + u32 GetData() const { + switch (m_type) { + case TYPE_EXTENDEDREG: + return (m_extend << 13) | (m_shift << 10); + break; + case TYPE_SHIFTEDREG: + return (m_shifttype << 22) | (m_shift << 10); + break; + default: + ASSERT_MSG(false, "Invalid type in GetData"); + break; + } + return 0; + } +}; + +class ARM64XEmitter { + friend class ARM64FloatEmitter; + +private: + u8* m_code; + u8* m_lastCacheFlushEnd; + + void AddImmediate(ARM64Reg Rd, ARM64Reg Rn, u64 imm, bool shift, bool negative, bool flags); + void EncodeCompareBranchInst(u32 op, ARM64Reg Rt, const void* ptr); + void EncodeTestBranchInst(u32 op, ARM64Reg Rt, u8 bits, const void* ptr); + void EncodeUnconditionalBranchInst(u32 op, const void* ptr); + void EncodeUnconditionalBranchInst(u32 opc, u32 op2, u32 op3, u32 op4, ARM64Reg Rn); + void EncodeExceptionInst(u32 instenc, u32 imm); + void EncodeSystemInst(u32 op0, u32 op1, u32 CRn, u32 CRm, u32 op2, ARM64Reg Rt); + void EncodeArithmeticInst(u32 instenc, bool flags, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, + ArithOption Option); + void EncodeArithmeticCarryInst(u32 op, bool flags, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void EncodeCondCompareImmInst(u32 op, ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond); + void EncodeCondCompareRegInst(u32 op, ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond); + void EncodeCondSelectInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond); + void EncodeData1SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn); + void EncodeData2SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void EncodeData3SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void EncodeLogicalInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void EncodeLoadRegisterInst(u32 bitop, ARM64Reg Rt, s32 imm); + void EncodeLoadStoreExcInst(u32 instenc, ARM64Reg Rs, ARM64Reg Rt2, ARM64Reg Rn, ARM64Reg Rt); + void EncodeLoadStorePairedInst(u32 op, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm); + void EncodeLoadStoreIndexedInst(u32 op, u32 op2, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void EncodeLoadStoreIndexedInst(u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm, u8 size); + void EncodeMOVWideInst(u32 op, ARM64Reg Rd, u32 imm, ShiftAmount pos); + void EncodeBitfieldMOVInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms); + void EncodeLoadStoreRegisterOffset(u32 size, u32 opc, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void EncodeAddSubImmInst(u32 op, bool flags, u32 shift, u32 imm, ARM64Reg Rn, ARM64Reg Rd); + void EncodeLogicalImmInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, int n); + void EncodeLoadStorePair(u32 op, u32 load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, + ARM64Reg Rn, s32 imm); + void EncodeAddressInst(u32 op, ARM64Reg Rd, s32 imm); + void EncodeLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + +protected: + void Write32(u32 value); + +public: + ARM64XEmitter() : m_code(nullptr), m_lastCacheFlushEnd(nullptr) { + 
} + + ARM64XEmitter(u8* code_ptr) { + m_code = code_ptr; + m_lastCacheFlushEnd = code_ptr; + } + + virtual ~ARM64XEmitter() { + } + + void SetCodePtr(u8* ptr); + void SetCodePtrUnsafe(u8* ptr); + void ReserveCodeSpace(u32 bytes); + const u8* AlignCode16(); + const u8* AlignCodePage(); + const u8* GetCodePtr() const; + void FlushIcache(); + void FlushIcacheSection(const u8* start, const u8* end); + u8* GetWritableCodePtr(); + + // FixupBranch branching + void SetJumpTarget(FixupBranch const& branch, u8* target = nullptr); + FixupBranch CBZ(ARM64Reg Rt); + FixupBranch CBNZ(ARM64Reg Rt); + FixupBranch B(CCFlags cond); + FixupBranch TBZ(ARM64Reg Rt, u8 bit); + FixupBranch TBNZ(ARM64Reg Rt, u8 bit); + FixupBranch B(); + FixupBranch BL(); + + // Compare and Branch + void CBZ(ARM64Reg Rt, const void* ptr); + void CBNZ(ARM64Reg Rt, const void* ptr); + + // Conditional Branch + void B(CCFlags cond, const void* ptr); + + // Test and Branch + void TBZ(ARM64Reg Rt, u8 bits, const void* ptr); + void TBNZ(ARM64Reg Rt, u8 bits, const void* ptr); + + // Unconditional Branch + void B(const void* ptr); + void BL(const void* ptr); + + // Unconditional Branch (register) + void BR(ARM64Reg Rn); + void BLR(ARM64Reg Rn); + void RET(ARM64Reg Rn = X30); + void ERET(); + void DRPS(); + + // Exception generation + void SVC(u32 imm); + void HVC(u32 imm); + void SMC(u32 imm); + void BRK(u32 imm); + void HLT(u32 imm); + void DCPS1(u32 imm); + void DCPS2(u32 imm); + void DCPS3(u32 imm); + + // System + void _MSR(PStateField field, u8 imm); + void _MSR(PStateField field, ARM64Reg Rt); + void MRS(ARM64Reg Rt, PStateField field); + void CNTVCT(ARM64Reg Rt); + + void HINT(SystemHint op); + void CLREX(); + void DSB(BarrierType type); + void DMB(BarrierType type); + void ISB(BarrierType type); + + // Add/Subtract (Extended/Shifted register) + void ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option); + void ADDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void ADDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option); + void SUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option); + void SUBS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SUBS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option); + void CMN(ARM64Reg Rn, ARM64Reg Rm); + void CMN(ARM64Reg Rn, ARM64Reg Rm, ArithOption Option); + void CMP(ARM64Reg Rn, ARM64Reg Rm); + void CMP(ARM64Reg Rn, ARM64Reg Rm, ArithOption Option); + + // Add/Subtract (with carry) + void ADC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void ADCS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SBC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SBCS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + + // Conditional Compare (immediate) + void CCMN(ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond); + void CCMP(ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond); + + // Conditional Compare (register) + void CCMN(ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond); + void CCMP(ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond); + + // Conditional Select + void CSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond); + void CSINC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond); + void CSINV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond); + void CSNEG(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond); + + // Aliases + void CSET(ARM64Reg Rd, CCFlags cond) { + ARM64Reg zr = Is64Bit(Rd) ? 
ZR : WZR; + CSINC(Rd, zr, zr, (CCFlags)((u32)cond ^ 1)); + } + void CSETM(ARM64Reg Rd, CCFlags cond) { + ARM64Reg zr = Is64Bit(Rd) ? ZR : WZR; + CSINV(Rd, zr, zr, (CCFlags)((u32)cond ^ 1)); + } + void NEG(ARM64Reg Rd, ARM64Reg Rs) { + SUB(Rd, Is64Bit(Rd) ? ZR : WZR, Rs); + } + // Data-Processing 1 source + void RBIT(ARM64Reg Rd, ARM64Reg Rn); + void REV16(ARM64Reg Rd, ARM64Reg Rn); + void REV32(ARM64Reg Rd, ARM64Reg Rn); + void REV64(ARM64Reg Rd, ARM64Reg Rn); + void CLZ(ARM64Reg Rd, ARM64Reg Rn); + void CLS(ARM64Reg Rd, ARM64Reg Rn); + + // Data-Processing 2 source + void UDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void LSLV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void LSRV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void ASRV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void RORV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32B(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32H(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32W(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32CB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32CH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32CW(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32X(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32CX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + + // Data-Processing 3 source + void MADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void MSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void SMADDL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void SMULL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SMSUBL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void SMULH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void UMADDL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void UMULL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void UMSUBL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void UMULH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void MUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void MNEG(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + + // Logical (shifted register) + void AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void BIC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void ORN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void EOR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void EON(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void ANDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void BICS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + + // Wrap the above for saner syntax + void AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + AND(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); + } + void BIC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + BIC(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); + } + void ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + ORR(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); + } + void ORN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + ORN(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); + } + void EOR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EOR(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); + } + void EON(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EON(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); + } + void ANDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + ANDS(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); + } + void BICS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + BICS(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); + } + + // Convenience 
wrappers around ORR. These match the official convenience + // syntax. + void MOV(ARM64Reg Rd, ARM64Reg Rm, ArithOption Shift); + void MOV(ARM64Reg Rd, ARM64Reg Rm); + void MVN(ARM64Reg Rd, ARM64Reg Rm); + + // Convenience wrappers around UBFM/EXTR. + void LSR(ARM64Reg Rd, ARM64Reg Rm, int shift); + void LSL(ARM64Reg Rd, ARM64Reg Rm, int shift); + void ASR(ARM64Reg Rd, ARM64Reg Rm, int shift); + void ROR(ARM64Reg Rd, ARM64Reg Rm, int shift); + + // Logical (immediate) + void AND(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert = false); + void ANDS(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert = false); + void EOR(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert = false); + void ORR(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert = false); + void TST(ARM64Reg Rn, u32 immr, u32 imms, bool invert = false); + void TST(ARM64Reg Rn, ARM64Reg Rm) { + ANDS(Is64Bit(Rn) ? ZR : WZR, Rn, Rm); + } + + // Add/subtract (immediate) + void ADD(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false); + void ADDS(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false); + void SUB(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false); + void SUBS(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false); + void CMP(ARM64Reg Rn, u32 imm, bool shift = false); + + // Data Processing (Immediate) + void MOVZ(ARM64Reg Rd, u32 imm, ShiftAmount pos = SHIFT_0); + void MOVN(ARM64Reg Rd, u32 imm, ShiftAmount pos = SHIFT_0); + void MOVK(ARM64Reg Rd, u32 imm, ShiftAmount pos = SHIFT_0); + + // Bitfield move + void BFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms); + void SBFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms); + void UBFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms); + void BFI(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width); + void UBFIZ(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width); + + // Extract register (ROR with two inputs, if same then faster on A67) + void EXTR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u32 shift); + + // Aliases + void SXTB(ARM64Reg Rd, ARM64Reg Rn); + void SXTH(ARM64Reg Rd, ARM64Reg Rn); + void SXTW(ARM64Reg Rd, ARM64Reg Rn); + void UXTB(ARM64Reg Rd, ARM64Reg Rn); + void UXTH(ARM64Reg Rd, ARM64Reg Rn); + void UBFX(ARM64Reg Rd, ARM64Reg Rn, int lsb, int width) { + UBFM(Rd, Rn, lsb, lsb + width - 1); + } + + // Load Register (Literal) + void LDR(ARM64Reg Rt, s32 imm); + void LDRSW(ARM64Reg Rt, s32 imm); + void PRFM(ARM64Reg Rt, s32 imm); + + // Load/Store Exclusive + void STXRB(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn); + void STLXRB(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn); + void LDXRB(ARM64Reg Rt, ARM64Reg Rn); + void LDAXRB(ARM64Reg Rt, ARM64Reg Rn); + void STLRB(ARM64Reg Rt, ARM64Reg Rn); + void LDARB(ARM64Reg Rt, ARM64Reg Rn); + void STXRH(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn); + void STLXRH(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn); + void LDXRH(ARM64Reg Rt, ARM64Reg Rn); + void LDAXRH(ARM64Reg Rt, ARM64Reg Rn); + void STLRH(ARM64Reg Rt, ARM64Reg Rn); + void LDARH(ARM64Reg Rt, ARM64Reg Rn); + void STXR(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn); + void STLXR(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn); + void STXP(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn); + void STLXP(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn); + void LDXR(ARM64Reg Rt, ARM64Reg Rn); + void LDAXR(ARM64Reg Rt, ARM64Reg Rn); + void LDXP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn); + void LDAXP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn); + void STLR(ARM64Reg Rt, ARM64Reg Rn); + void LDAR(ARM64Reg Rt, ARM64Reg Rn); + + // Load/Store no-allocate pair (offset) + void STNP(ARM64Reg Rt, 
ARM64Reg Rt2, ARM64Reg Rn, u32 imm); + void LDNP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm); + + // Load/Store register (immediate indexed) + void STRB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDRB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDRSB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void STRH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDRH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDRSH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void STR(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDR(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDRSW(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + + // Load/Store register (register offset) + void STRB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void LDRB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void LDRSB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void STRH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void LDRH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void LDRSH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void STR(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void LDR(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void LDRSW(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void PRFM(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + + // Load/Store register (unscaled offset) + void STURB(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDURB(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDURSB(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void STURH(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDURH(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDURSH(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void STUR(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDUR(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDURSW(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + + // Load/Store pair + void LDP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm); + void LDPSW(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm); + void STP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm); + + // Address of label/page PC-relative + void ADR(ARM64Reg Rd, s32 imm); + void ADRP(ARM64Reg Rd, s32 imm); + + // Wrapper around MOVZ+MOVK + void MOVI2R(ARM64Reg Rd, u64 imm, bool optimize = true); + bool MOVI2R2(ARM64Reg Rd, u64 imm1, u64 imm2); + template + void MOVP2R(ARM64Reg Rd, P* ptr) { + ASSERT_MSG(Is64Bit(Rd), "Can't store pointers in 32-bit registers"); + MOVI2R(Rd, (uintptr_t)ptr); + } + + // Wrapper around AND x, y, imm etc. If you are sure the imm will work, no + // need to pass a scratch register. + void ANDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + void ANDSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + void TSTI2R(ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG) { + ANDSI2R(Is64Bit(Rn) ? 
ZR : WZR, Rn, imm, scratch); + } + void ORRI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + void EORI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + void CMPI2R(ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + + void ADDI2R_internal(ARM64Reg Rd, ARM64Reg Rn, u64 imm, bool negative, bool flags, + ARM64Reg scratch); + void ADDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + void ADDSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + void SUBI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + void SUBSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + + bool TryADDI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm); + bool TrySUBI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm); + bool TryCMPI2R(ARM64Reg Rn, u32 imm); + + bool TryANDI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm); + bool TryORRI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm); + bool TryEORI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm); + + // ABI related + void ABI_PushRegisters(u32 registers); + void ABI_PopRegisters(u32 registers); + + // Utility to generate a call to a std::function object. + // + // Unfortunately, calling operator() directly is undefined behavior in C++ + // (this method might be a thunk in the case of multi-inheritance) so we + // have to go through a trampoline function. + template + static T CallLambdaTrampoline(const std::function* f, Args... args) { + return (*f)(args...); + } + + // This function expects you to have set up the state. + // Overwrites X0 and X30 + template + ARM64Reg ABI_SetupLambda(const std::function* f) { + auto trampoline = &ARM64XEmitter::CallLambdaTrampoline; + MOVI2R(X30, (uintptr_t)trampoline); + MOVI2R(X0, (uintptr_t) const_cast((const void*)f)); + return X30; + } + + // Plain function call + void QuickCallFunction(const void* func, ARM64Reg scratchreg = X16); + template + void QuickCallFunction(T func, ARM64Reg scratchreg = X16) { + QuickCallFunction((const void*)func, scratchreg); + } +}; + +class ARM64FloatEmitter { +public: + ARM64FloatEmitter(ARM64XEmitter* emit) : m_emit(emit) { + } + + void LDR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void STR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + + // Loadstore unscaled + void LDUR(u8 size, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void STUR(u8 size, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + + // Loadstore single structure + void LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn); + void LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm); + void LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn); + void LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn); + void LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm); + void LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm); + void ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn); + void ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm); + + // Loadstore multiple structure + void LD1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn); + void LD1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm = SP); + void ST1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn); + void ST1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm = SP); + + // Loadstore paired + void LDP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm); + void STP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm); + + // Loadstore register offset + void STR(u8 size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); 
+ void LDR(u8 size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + + // Scalar - 1 Source + void FABS(ARM64Reg Rd, ARM64Reg Rn); + void FNEG(ARM64Reg Rd, ARM64Reg Rn); + void FSQRT(ARM64Reg Rd, ARM64Reg Rn); + void FMOV(ARM64Reg Rd, ARM64Reg Rn, bool top = false); // Also generalized move between GPR/FP + + // Scalar - 2 Source + void FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMAX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMIN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMAXNM(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMINNM(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FNMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + + // Scalar - 3 Source. Note - the accumulator is last on ARM! + void FMADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void FMSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void FNMADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void FNMSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + + // Scalar three same + void SQADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void UQADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SQSUB(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void UQSUB(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + + // Scalar floating point immediate + void FMOV(ARM64Reg Rd, uint8_t imm8); + + // Vector + void ADD(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void ADDV(ESize esize, ARM64Reg Rd, ARM64Reg Rn); + void SUB(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void BSL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CMGE(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CMGE_zero(ESize esize, ARM64Reg Rd, ARM64Reg Rn); + void CMGT(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CMHI(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CMHS(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index); + void FABS(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMAX(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMLA(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMLS(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMIN(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FCVTL(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCVTL2(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCVTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + void FCVTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + void FCVTZS(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCVTZU(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FDIV(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FNEG(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FRECPE(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FRSQRTE(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FSUB(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void NOT(ARM64Reg Rd, ARM64Reg Rn); + void ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void MOV(ARM64Reg Rd, ARM64Reg Rn) { + ORR(Rd, Rn, Rn); + } + void REV16(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void REV32(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void REV64(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void SABD(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void UABD(ESize esize, ARM64Reg Rd, 
ARM64Reg Rn, ARM64Reg Rm); + void SADDLV(ESize esize, ARM64Reg Rd, ARM64Reg Rn); + void UADDLV(ESize esize, ARM64Reg Rd, ARM64Reg Rn); + void SHADD(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void UHADD(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SHSUB(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void UHSUB(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SMIN(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void UMIN(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SQADD(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SQSUB(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void UQADD(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void UQSUB(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void SCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale); + void UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale); + void SQXTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + void SQXTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + void UQXTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + void UQXTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + void XTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + void XTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + + // Move + void DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void INS(u8 size, ARM64Reg Rd, u8 index, ARM64Reg Rn); + void INS(u8 size, ARM64Reg Rd, u8 index1, ARM64Reg Rn, u8 index2); + void UMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index); + void SMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index); + + // One source + void FCVT(u8 size_to, u8 size_from, ARM64Reg Rd, ARM64Reg Rn); + + // Scalar convert float to int, in a lot of variants. + // Note that the scalar version of this operation has two encodings, one that + // goes to an integer register and one that outputs to a scalar fp register. + void FCVTS(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round); + void FCVTU(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round); + + // Scalar convert int to float. No rounding mode specifier necessary. + void SCVTF(ARM64Reg Rd, ARM64Reg Rn); + void UCVTF(ARM64Reg Rd, ARM64Reg Rn); + + // Scalar fixed point to float. scale is the number of fractional bits. 
+ void SCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale); + void UCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale); + + // Float comparison + void FCMP(ARM64Reg Rn, ARM64Reg Rm); + void FCMP(ARM64Reg Rn); + void FCMPE(ARM64Reg Rn, ARM64Reg Rm); + void FCMPE(ARM64Reg Rn); + void FCMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FCMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FCMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCMLE(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn); + + // Conditional select + void FCSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond); + + // Permute + void UZP1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void TRN1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void ZIP1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void UZP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void TRN2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void ZIP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + + // Shift by immediate + void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void SSHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void USHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void SHRN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn); + void SXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn); + void UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn); + void UXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn); + + // vector x indexed element + void FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index); + void FMLA(u8 esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index); + + // Modified Immediate + void MOVI(u8 size, ARM64Reg Rd, u64 imm, u8 shift = 0); + void BIC(u8 size, ARM64Reg Rd, u8 imm, u8 shift = 0); + + void MOVI2F(ARM64Reg Rd, float value, ARM64Reg scratch = INVALID_REG, bool negate = false); + void MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch = INVALID_REG); + + // ABI related + void ABI_PushRegisters(u32 registers, ARM64Reg tmp = INVALID_REG); + void ABI_PopRegisters(u32 registers, ARM64Reg tmp = INVALID_REG); + +private: + ARM64XEmitter* m_emit; + + inline void Write32(u32 value) { + m_emit->Write32(value); + } + + // Emitting functions + void EmitLoadStoreImmediate(u8 size, u32 opc, IndexType type, ARM64Reg Rt, ARM64Reg Rn, + s32 imm); + void EmitScalar2Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, + ARM64Reg Rm); + void EmitThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void EmitScalarThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void EmitCopy(bool Q, u32 op, u32 imm5, u32 imm4, ARM64Reg Rd, ARM64Reg Rn); + void Emit2RegMisc(bool Q, bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); + void EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, ARM64Reg Rt, + ARM64Reg Rn); + void EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, ARM64Reg Rt, + ARM64Reg Rn, ARM64Reg Rm); + void Emit1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); + void EmitConversion(bool sf, bool S, u32 type, u32 rmode, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); + void 
EmitConversion2(bool sf, bool S, bool direction, u32 type, u32 rmode, u32 opcode, + int scale, ARM64Reg Rd, ARM64Reg Rn); + void EmitCompare(bool M, bool S, u32 op, u32 opcode2, ARM64Reg Rn, ARM64Reg Rm); + void EmitCondSelect(bool M, bool S, CCFlags cond, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void EmitPermute(u32 size, u32 op, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void EmitScalarImm(bool M, bool S, u32 type, u32 imm5, ARM64Reg Rd, u32 imm8); + void EmitShiftImm(bool Q, bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); + void EmitScalarShiftImm(bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); + void EmitLoadStoreMultipleStructure(u32 size, bool L, u32 opcode, ARM64Reg Rt, ARM64Reg Rn); + void EmitLoadStoreMultipleStructurePost(u32 size, bool L, u32 opcode, ARM64Reg Rt, ARM64Reg Rn, + ARM64Reg Rm); + void EmitScalar1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); + void EmitVectorxElement(bool U, u32 size, bool L, u32 opcode, bool H, ARM64Reg Rd, ARM64Reg Rn, + ARM64Reg Rm); + void EmitLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void EmitConvertScalarToInt(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round, bool sign); + void EmitScalar3Source(bool isDouble, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra, + int opcode); + void EncodeLoadStorePair(u32 size, bool load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, + ARM64Reg Rn, s32 imm); + void EncodeLoadStoreRegisterOffset(u32 size, bool load, ARM64Reg Rt, ARM64Reg Rn, + ArithOption Rm); + void EncodeModImm(bool Q, u8 op, u8 cmode, u8 o2, ARM64Reg Rd, u8 abcdefgh); + + void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper); + void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper); + void SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper); + void SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper); + void UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper); +}; + +class ARM64CodeBlock : public CodeBlock { +private: + void PoisonMemory() override { + // If our memory isn't a multiple of u32 then this won't write the last + // remaining bytes with anything Less than optimal, but there would be + // nothing we could do but throw a runtime warning anyway. AArch64: + // 0xD4200000 = BRK 0 + constexpr u32 brk_0 = 0xD4200000; + + for (size_t i = 0; i < region_size; i += sizeof(u32)) { + std::memcpy(region + i, &brk_0, sizeof(u32)); + } + } +}; + +} // namespace Dynarmic::BackendA64::Arm64Gen diff --git a/src/dynarmic/backend/A64/emitter/arm_common.h b/src/dynarmic/backend/A64/emitter/arm_common.h new file mode 100644 index 00000000..257467a6 --- /dev/null +++ b/src/dynarmic/backend/A64/emitter/arm_common.h @@ -0,0 +1,28 @@ +// Copyright 2014 Dolphin Emulator Project / 2018 dynarmic project +// Licensed under GPLv2+ +// Refer to the license.txt file included. 
+ +#include "common/common_types.h" + +namespace Dynarmic::BackendA64 { +enum CCFlags { + CC_EQ = 0, // Equal + CC_NEQ, // Not equal + CC_CS, // Carry Set + CC_CC, // Carry Clear + CC_MI, // Minus (Negative) + CC_PL, // Plus + CC_VS, // Overflow + CC_VC, // No Overflow + CC_HI, // Unsigned higher + CC_LS, // Unsigned lower or same + CC_GE, // Signed greater than or equal + CC_LT, // Signed less than + CC_GT, // Signed greater than + CC_LE, // Signed less than or equal + CC_AL, // Always (unconditional) 14 + CC_HS = CC_CS, // Alias of CC_CS Unsigned higher or same + CC_LO = CC_CC, // Alias of CC_CC Unsigned lower +}; +const u32 NO_COND = 0xE0000000; +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/emitter/code_block.h b/src/dynarmic/backend/A64/emitter/code_block.h new file mode 100644 index 00000000..26b0ebbc --- /dev/null +++ b/src/dynarmic/backend/A64/emitter/code_block.h @@ -0,0 +1,139 @@ +// Copyright 2014 Dolphin Emulator Project / 2018 dynarmic project +// Licensed under GPLv2+ +// Refer to the license.txt file included. + +#pragma once + +#include +#include + +#ifdef _WIN32 +#include +#else +#include +#endif + +#include "common/assert.h" +#include "common/common_types.h" + +namespace Dynarmic::BackendA64 { +// Everything that needs to generate code should inherit from this. +// You get memory management for free, plus, you can use all emitter functions +// without having to prefix them with gen-> or something similar. Example +// implementation: class JIT : public CodeBlock {} +template +class CodeBlock : public T { +private: + // A privately used function to set the executable RAM space to something + // invalid. For debugging usefulness it should be used to set the RAM to a + // host specific breakpoint instruction + virtual void PoisonMemory() = 0; + +protected: + u8* region = nullptr; + // Size of region we can use. + size_t region_size = 0; + // Original size of the region we allocated. + size_t total_region_size = 0; + + bool m_is_child = false; + std::vector m_children; + +public: + CodeBlock() = default; + virtual ~CodeBlock() { + if (region) + FreeCodeSpace(); + } + CodeBlock(const CodeBlock&) = delete; + CodeBlock& operator=(const CodeBlock&) = delete; + CodeBlock(CodeBlock&&) = delete; + CodeBlock& operator=(CodeBlock&&) = delete; + + // Call this before you generate any code. + void AllocCodeSpace(size_t size) { + region_size = size; + total_region_size = size; +#if defined(_WIN32) + void* ptr = VirtualAlloc(nullptr, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE); +#else +#if defined(__APPLE__) + void* ptr = mmap(nullptr, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_ANON | MAP_PRIVATE | MAP_JIT, -1, 0); +#else + void* ptr = mmap(nullptr, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_ANON | MAP_PRIVATE, -1, 0); +#endif + + if (ptr == MAP_FAILED) + ptr = nullptr; +#endif + ASSERT_MSG(ptr != nullptr, "Failed to allocate executable memory"); + region = static_cast(ptr); + T::SetCodePtr(region); + } + + // Always clear code space with breakpoints, so that if someone accidentally + // executes uninitialized, it just breaks into the debugger. + void ClearCodeSpace() { + PoisonMemory(); + ResetCodePtr(); + } + + // Call this when shutting down. Don't rely on the destructor, even though + // it'll do the job. 
+ void FreeCodeSpace() { + ASSERT(!m_is_child); + ASSERT(munmap(region, total_region_size) == 0); + region = nullptr; + region_size = 0; + total_region_size = 0; + for (CodeBlock* child : m_children) { + child->region = nullptr; + child->region_size = 0; + child->total_region_size = 0; + } + } + + bool IsInSpace(const u8* ptr) const { + return ptr >= region && ptr < (region + region_size); + } + // Cannot currently be undone. Will write protect the entire code region. + // Start over if you need to change the code (call FreeCodeSpace(), + // AllocCodeSpace()). + void WriteProtect() { + ASSERT(mprotect(region, region_size, PROT_READ | PROT_EXEC) == 0); + } + void ResetCodePtr() { + T::SetCodePtr(region); + } + size_t GetSpaceLeft() const { + ASSERT(static_cast<size_t>(T::GetCodePtr() - region) < region_size); + return region_size - (T::GetCodePtr() - region); + } + + bool IsAlmostFull() const { + // This should be bigger than the biggest block ever. + return GetSpaceLeft() < 0x10000; + } + + bool HasChildren() const { + return region_size != total_region_size; + } + + u8* AllocChildCodeSpace(size_t child_size) { + ASSERT_MSG(child_size < GetSpaceLeft(), "Insufficient space for child allocation."); + u8* child_region = region + region_size - child_size; + region_size -= child_size; + return child_region; + } + + void AddChildCodeSpace(CodeBlock* child, size_t child_size) { + u8* child_region = AllocChildCodeSpace(child_size); + child->m_is_child = true; + child->region = child_region; + child->region_size = child_size; + child->total_region_size = child_size; + child->ResetCodePtr(); + m_children.emplace_back(child); + } +}; +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/exception_handler.h b/src/dynarmic/backend/A64/exception_handler.h new file mode 100644 index 00000000..04eb7d0c --- /dev/null +++ b/src/dynarmic/backend/A64/exception_handler.h @@ -0,0 +1,39 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. + */ + +#pragma once + +#include <array> +#include <functional> +#include <memory> + +#include "backend/A64/a32_jitstate.h" +#include "common/common_types.h" + +namespace Dynarmic::BackendA64 { + +class BlockOfCode; + +struct A64State { + std::array X; + std::array, 16> Q; +}; +static_assert(sizeof(A64State) == sizeof(A64State::X) + sizeof(A64State::Q)); + +class ExceptionHandler final { +public: + ExceptionHandler(); + ~ExceptionHandler(); + + void Register(BlockOfCode& code, std::function<void(CodePtr)> segv_callback = nullptr); + + bool SupportsFastmem() const; +private: + struct Impl; + std::unique_ptr<Impl> impl; +}; + +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/exception_handler_generic.cpp b/src/dynarmic/backend/A64/exception_handler_generic.cpp new file mode 100644 index 00000000..c5b17c07 --- /dev/null +++ b/src/dynarmic/backend/A64/exception_handler_generic.cpp @@ -0,0 +1,25 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version.
+ */ + +#include "backend/A64/exception_handler.h" + +namespace Dynarmic::BackendA64 { + +struct ExceptionHandler::Impl final { +}; + +ExceptionHandler::ExceptionHandler() = default; +ExceptionHandler::~ExceptionHandler() = default; + +void ExceptionHandler::Register(BlockOfCode&, std::function) { + // Do nothing +} + +bool ExceptionHandler::SupportsFastmem() const { + return false; +} + +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/exception_handler_posix.cpp b/src/dynarmic/backend/A64/exception_handler_posix.cpp new file mode 100644 index 00000000..0ddb55e4 --- /dev/null +++ b/src/dynarmic/backend/A64/exception_handler_posix.cpp @@ -0,0 +1,166 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2019 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. + */ + +// Copyright 2008 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license.txt file included. + +#include +#include + +#include +#ifdef __APPLE__ +#include +#else +#include +#endif + +#include "backend/A64/a32_jitstate.h" +#include "backend/A64/block_of_code.h" +#include "backend/A64/exception_handler.h" +#include "common/assert.h" +#include "common/cast_util.h" +#include "common/common_types.h" + +namespace Dynarmic::BackendA64 { + +namespace { + +struct CodeBlockInfo { + BlockOfCode* block; + std::function callback; +}; + +class SigHandler { +public: + SigHandler(); + + ~SigHandler(); + + void AddCodeBlock(CodeBlockInfo info); + + void RemoveCodeBlock(CodePtr PC); + +private: + auto FindCodeBlockInfo(CodePtr PC) { + return std::find_if(code_block_infos.begin(), code_block_infos.end(), + [&](const CodeBlockInfo& x) { return x.block->GetRegion() <= PC && x.block->GetRegion() + x.block->GetRegionSize() > PC; }); + } + + std::vector code_block_infos; + std::mutex code_block_infos_mutex; + + struct sigaction old_sa_segv; + struct sigaction old_sa_bus; + + static void SigAction(int sig, siginfo_t* info, void* raw_context); +}; + +SigHandler sig_handler; + +SigHandler::SigHandler() { + // Method below from dolphin. + + const size_t signal_stack_size = std::max(SIGSTKSZ, 2 * 1024 * 1024); + + stack_t signal_stack; + signal_stack.ss_sp = malloc(signal_stack_size); + signal_stack.ss_size = signal_stack_size; + signal_stack.ss_flags = 0; + ASSERT_MSG(sigaltstack(&signal_stack, nullptr) == 0, + "dynarmic: POSIX SigHandler: init failure at sigaltstack"); + + struct sigaction sa; + sa.sa_handler = nullptr; + sa.sa_sigaction = &SigHandler::SigAction; + sa.sa_flags = SA_SIGINFO | SA_ONSTACK | SA_RESTART; + sigemptyset(&sa.sa_mask); + sigaction(SIGSEGV, &sa, &old_sa_segv); +} + +SigHandler::~SigHandler() { + // No cleanup required. 
+} + +void SigHandler::AddCodeBlock(CodeBlockInfo cb) { + std::lock_guard<std::mutex> guard(code_block_infos_mutex); + ASSERT(FindCodeBlockInfo(cb.block->GetRegion()) == code_block_infos.end()); + code_block_infos.push_back(std::move(cb)); +} + +void SigHandler::RemoveCodeBlock(CodePtr PC) { + std::lock_guard<std::mutex> guard(code_block_infos_mutex); + const auto iter = FindCodeBlockInfo(PC); + ASSERT(iter != code_block_infos.end()); + code_block_infos.erase(iter); +} + +void SigHandler::SigAction(int sig, siginfo_t* info, void* raw_context) { + ASSERT(sig == SIGSEGV || sig == SIGBUS); + + std::lock_guard<std::mutex> guard(sig_handler.code_block_infos_mutex); +#ifdef __APPLE__ + auto PC = reinterpret_cast<CodePtr>(((ucontext_t*)raw_context)->uc_mcontext->__ss.__pc); +#else + auto PC = reinterpret_cast<CodePtr>(((ucontext_t*)raw_context)->uc_mcontext.pc); +#endif + const auto iter = sig_handler.FindCodeBlockInfo(PC); + if (iter != sig_handler.code_block_infos.end()) { + iter->callback(PC); + return; + } + + fmt::print( + stderr, + "dynarmic: POSIX SigHandler: Exception was not in registered code blocks (PC {})\n", + PC); + + struct sigaction* retry_sa = + sig == SIGSEGV ? &sig_handler.old_sa_segv : &sig_handler.old_sa_bus; + if (retry_sa->sa_flags & SA_SIGINFO) { + retry_sa->sa_sigaction(sig, info, raw_context); + return; + } + if (retry_sa->sa_handler == SIG_DFL) { + signal(sig, SIG_DFL); + return; + } + if (retry_sa->sa_handler == SIG_IGN) { + return; + } + retry_sa->sa_handler(sig); +} + +} // anonymous namespace + +struct ExceptionHandler::Impl final { + Impl(BlockOfCode& code, std::function<void(CodePtr)> cb) { + code_begin = code.GetRegion(); + sig_handler.AddCodeBlock({&code, std::move(cb)}); + } + + ~Impl() { + sig_handler.RemoveCodeBlock(code_begin); + } + +private: + CodePtr code_begin; +}; + +ExceptionHandler::ExceptionHandler() = default; + +ExceptionHandler::~ExceptionHandler() = default; + +void ExceptionHandler::Register(BlockOfCode& code, std::function<void(CodePtr)> cb) { + if (cb) + impl = std::make_unique<Impl>(code, std::move(cb)); +} + +bool ExceptionHandler::SupportsFastmem() const { + return static_cast<bool>(impl); +} + +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/hostloc.cpp b/src/dynarmic/backend/A64/hostloc.cpp new file mode 100644 index 00000000..8d1094ec --- /dev/null +++ b/src/dynarmic/backend/A64/hostloc.cpp @@ -0,0 +1,21 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. + */ + +#include "backend/A64/hostloc.h" + +namespace Dynarmic::BackendA64 { + +Arm64Gen::ARM64Reg HostLocToReg64(HostLoc loc) { + ASSERT(HostLocIsGPR(loc)); + return static_cast<Arm64Gen::ARM64Reg>(static_cast<int>(Arm64Gen::X0) + static_cast<int>(loc)); +} + +Arm64Gen::ARM64Reg HostLocToFpr(HostLoc loc) { + ASSERT(HostLocIsFPR(loc)); + return EncodeRegToQuad(static_cast<Arm64Gen::ARM64Reg>(static_cast<int>(loc) - static_cast<int>(HostLoc::Q0))); +} + +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/hostloc.h b/src/dynarmic/backend/A64/hostloc.h new file mode 100644 index 00000000..7183d0a8 --- /dev/null +++ b/src/dynarmic/backend/A64/hostloc.h @@ -0,0 +1,176 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version.
+ */ +#pragma once + +#include "backend/A64/emitter/a64_emitter.h" +#include "common/assert.h" +#include "common/common_types.h" + +namespace Dynarmic::BackendA64 { + +enum class HostLoc { + // Ordering of the registers is intentional. See also: HostLocToReg64 and HostLocToFpr. + + // 64bit GPR registers + X0, + X1, + X2, + X3, + X4, + X5, + X6, + X7, + X8, + X9, + X10, + X11, + X12, + X13, + X14, + X15, + X16, + X17, + X18, + X19, + X20, + X21, + X22, + X23, + X24, + X25, + X26, + X27, + X28, + X29, + X30, + + SP, // 64bit stack pointer + + // Qword FPR registers + Q0, + Q1, + Q2, + Q3, + Q4, + Q5, + Q6, + Q7, + Q8, + Q9, + Q10, + Q11, + Q12, + Q13, + Q14, + Q15, + Q16, + Q17, + Q18, + Q19, + Q20, + Q21, + Q22, + Q23, + Q24, + Q25, + Q26, + Q27, + Q28, + Q29, + Q30, + Q31, + + FirstSpill, +}; + +constexpr size_t NonSpillHostLocCount = static_cast<size_t>(HostLoc::FirstSpill); + +inline bool HostLocIsGPR(HostLoc reg) { + return reg >= HostLoc::X0 && reg <= HostLoc::X30; +} + +inline bool HostLocIsFPR(HostLoc reg) { + return reg >= HostLoc::Q0 && reg <= HostLoc::Q31; +} + +inline bool HostLocIsRegister(HostLoc reg) { + return HostLocIsGPR(reg) || HostLocIsFPR(reg); +} + +inline HostLoc HostLocRegIdx(int idx) { + ASSERT(idx >= 0 && idx <= 30); + return static_cast<HostLoc>(idx); +} + +inline HostLoc HostLocFprIdx(int idx) { + ASSERT(idx >= 0 && idx <= 31); + return static_cast<HostLoc>(static_cast<int>(HostLoc::Q0) + idx); +} + +inline HostLoc HostLocSpill(size_t i) { + return static_cast<HostLoc>(static_cast<size_t>(HostLoc::FirstSpill) + i); +} + +inline bool HostLocIsSpill(HostLoc reg) { + return reg >= HostLoc::FirstSpill; +} + +inline size_t HostLocBitWidth(HostLoc loc) { + if (HostLocIsGPR(loc)) + return 64; + if (HostLocIsFPR(loc)) + return 128; + if (HostLocIsSpill(loc)) + return 128; + UNREACHABLE(); +} + +using HostLocList = std::initializer_list<HostLoc>; + +// X18 may be reserved (Windows and iOS). +// X26 holds the cycle counter. +// X27 contains an emulated memory related pointer. +// X28 is used for holding the JitState. +// X30 is the link register.
+// In order of desirability, based first on the ABI +constexpr HostLocList any_gpr = { + HostLoc::X19, HostLoc::X20, HostLoc::X21, HostLoc::X22, HostLoc::X23, + HostLoc::X24, HostLoc::X25, + + HostLoc::X8, HostLoc::X9, HostLoc::X10, HostLoc::X11, HostLoc::X12, + HostLoc::X13, HostLoc::X14, HostLoc::X15, HostLoc::X16, HostLoc::X17, + + HostLoc::X7, HostLoc::X6, HostLoc::X5, HostLoc::X4, HostLoc::X3, + HostLoc::X2, HostLoc::X1, HostLoc::X0, +}; + +constexpr HostLocList any_fpr = { + HostLoc::Q8, HostLoc::Q9, HostLoc::Q10, HostLoc::Q11, HostLoc::Q12, HostLoc::Q13, + HostLoc::Q14, HostLoc::Q15, + + HostLoc::Q16, HostLoc::Q17, HostLoc::Q18, HostLoc::Q19, HostLoc::Q20, HostLoc::Q21, + HostLoc::Q22, HostLoc::Q23, HostLoc::Q24, HostLoc::Q25, HostLoc::Q26, HostLoc::Q27, + HostLoc::Q28, HostLoc::Q29, HostLoc::Q30, HostLoc::Q31, + + HostLoc::Q7, HostLoc::Q6, HostLoc::Q5, HostLoc::Q4, HostLoc::Q3, HostLoc::Q2, + HostLoc::Q1, HostLoc::Q0, +}; + +Arm64Gen::ARM64Reg HostLocToReg64(HostLoc loc); +Arm64Gen::ARM64Reg HostLocToFpr(HostLoc loc); + +template <typename JitStateType> +size_t SpillToOpArg(HostLoc loc) { + ASSERT(HostLocIsSpill(loc)); + + size_t i = static_cast<size_t>(loc) - static_cast<size_t>(HostLoc::FirstSpill); + ASSERT_MSG(i < JitStateType::SpillCount, + "Spill index greater than number of available spill locations"); + + return JitStateType::GetSpillLocationOffsetFromIndex(i); +} + +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/jitstate_info.h b/src/dynarmic/backend/A64/jitstate_info.h new file mode 100644 index 00000000..63336d79 --- /dev/null +++ b/src/dynarmic/backend/A64/jitstate_info.h @@ -0,0 +1,44 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. + */ + +#pragma once + +#include <cstddef> + +namespace Dynarmic::BackendA64 { + +struct JitStateInfo { + template <typename JitStateType> + JitStateInfo(const JitStateType&) + : offsetof_cycles_remaining(offsetof(JitStateType, cycles_remaining)) + , offsetof_cycles_to_run(offsetof(JitStateType, cycles_to_run)) + , offsetof_save_host_FPCR(offsetof(JitStateType, save_host_FPCR)) + , offsetof_guest_fpcr(offsetof(JitStateType, guest_fpcr)) + , offsetof_guest_fpsr(offsetof(JitStateType, guest_fpsr)) + , offsetof_rsb_ptr(offsetof(JitStateType, rsb_ptr)) + , rsb_ptr_mask(JitStateType::RSBPtrMask) + , offsetof_rsb_location_descriptors(offsetof(JitStateType, rsb_location_descriptors)) + , offsetof_rsb_codeptrs(offsetof(JitStateType, rsb_codeptrs)) + , offsetof_cpsr_nzcv(offsetof(JitStateType, cpsr_nzcv)) + , offsetof_fpsr_exc(offsetof(JitStateType, fpsr_exc)) + , offsetof_fpsr_qc(offsetof(JitStateType, fpsr_qc)) + {} + + const size_t offsetof_cycles_remaining; + const size_t offsetof_cycles_to_run; + const size_t offsetof_save_host_FPCR; + const size_t offsetof_guest_fpcr; + const size_t offsetof_guest_fpsr; + const size_t offsetof_rsb_ptr; + const size_t rsb_ptr_mask; + const size_t offsetof_rsb_location_descriptors; + const size_t offsetof_rsb_codeptrs; + const size_t offsetof_cpsr_nzcv; + const size_t offsetof_fpsr_exc; + const size_t offsetof_fpsr_qc; +}; + +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/opcodes.inc b/src/dynarmic/backend/A64/opcodes.inc new file mode 100644 index 00000000..8857cf78 --- /dev/null +++ b/src/dynarmic/backend/A64/opcodes.inc @@ -0,0 +1,651 @@ +// opcode name, return type, arg1 type, arg2 type, arg3 type, arg4 type, ...
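[Editor's note] opcodes.inc below is an X-macro table: each OPCODE(...)/A32OPC(...) line gives an IR instruction's name, its return type, and its argument types, and the commented-out entries are opcodes this backend does not handle yet. As a rough sketch of how such a table is typically consumed (a hypothetical consumer, not dynarmic's actual IR machinery), the same file is re-included under different macro definitions to stamp out, for example, an enum and a name table from one source of truth:

    // Hypothetical consumer of an X-macro table shaped like opcodes.inc.
    #include <array>
    #include <string_view>

    enum class Opcode {
    #define OPCODE(name, ret, ...) name,
    #define A32OPC(name, ret, ...) A32##name,
    #include "opcodes.inc"  // assumed include path for this sketch
    #undef A32OPC
    #undef OPCODE
        NUM_OPCODES,
    };

    constexpr std::array opcode_names{
    #define OPCODE(name, ret, ...) std::string_view{#name},
    #define A32OPC(name, ret, ...) std::string_view{"A32" #name},
    #include "opcodes.inc"
    #undef A32OPC
    #undef OPCODE
    };
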
+ +OPCODE(Void, Void, ) +OPCODE(Identity, Opaque, Opaque ) +OPCODE(Breakpoint, Void, ) + +// A32 Context getters/setters +A32OPC(SetCheckBit, Void, U1 ) +A32OPC(GetRegister, U32, A32Reg ) +A32OPC(GetExtendedRegister32, U32, A32ExtReg ) +A32OPC(GetExtendedRegister64, U64, A32ExtReg ) +A32OPC(SetRegister, Void, A32Reg, U32 ) +A32OPC(SetExtendedRegister32, Void, A32ExtReg, U32 ) +A32OPC(SetExtendedRegister64, Void, A32ExtReg, U64 ) +A32OPC(GetCpsr, U32, ) +A32OPC(SetCpsr, Void, U32 ) +A32OPC(SetCpsrNZCVRaw, Void, U32 ) +A32OPC(SetCpsrNZCV, Void, NZCV ) +A32OPC(SetCpsrNZCVQ, Void, U32 ) +A32OPC(GetNFlag, U1, ) +A32OPC(SetNFlag, Void, U1 ) +A32OPC(GetZFlag, U1, ) +A32OPC(SetZFlag, Void, U1 ) +A32OPC(GetCFlag, U1, ) +A32OPC(SetCFlag, Void, U1 ) +A32OPC(GetVFlag, U1, ) +A32OPC(SetVFlag, Void, U1 ) +A32OPC(OrQFlag, Void, U1 ) +A32OPC(GetGEFlags, U32, ) +A32OPC(SetGEFlags, Void, U32 ) +A32OPC(SetGEFlagsCompressed, Void, U32 ) +A32OPC(BXWritePC, Void, U32 ) +A32OPC(CallSupervisor, Void, U32 ) +A32OPC(ExceptionRaised, Void, U32, U64 ) +A32OPC(GetFpscr, U32, ) +A32OPC(SetFpscr, Void, U32, ) +A32OPC(GetFpscrNZCV, U32, ) +A32OPC(SetFpscrNZCV, Void, NZCV ) + +// A64 Context getters/setters +//A64OPC(SetCheckBit, Void, U1 ) +//A64OPC(GetCFlag, U1, ) +//A64OPC(GetNZCVRaw, U32, ) +//A64OPC(SetNZCVRaw, Void, U32 ) +//A64OPC(SetNZCV, Void, NZCV ) +//A64OPC(GetW, U32, A64Reg ) +//A64OPC(GetX, U64, A64Reg ) +//A64OPC(GetS, U128, A64Vec ) +//A64OPC(GetD, U128, A64Vec ) +//A64OPC(GetQ, U128, A64Vec ) +//A64OPC(GetSP, U64, ) +//A64OPC(GetFPCR, U32, ) +//A64OPC(GetFPSR, U32, ) +//A64OPC(SetW, Void, A64Reg, U32 ) +//A64OPC(SetX, Void, A64Reg, U64 ) +//A64OPC(SetS, Void, A64Vec, U128 ) +//A64OPC(SetD, Void, A64Vec, U128 ) +//A64OPC(SetQ, Void, A64Vec, U128 ) +//A64OPC(SetSP, Void, U64 ) +//A64OPC(SetFPCR, Void, U32 ) +//A64OPC(SetFPSR, Void, U32 ) +//A64OPC(OrQC, Void, U1 ) +//A64OPC(SetPC, Void, U64 ) +//A64OPC(CallSupervisor, Void, U32 ) +//A64OPC(ExceptionRaised, Void, U64, U64 ) +//A64OPC(DataCacheOperationRaised, Void, U64, U64 ) +//A64OPC(DataSynchronizationBarrier, Void, ) +//A64OPC(DataMemoryBarrier, Void, ) +//A64OPC(InstructionSynchronizationBarrier, Void, ) +//A64OPC(GetCNTFRQ, U32, ) +//A64OPC(GetCNTPCT, U64, ) +//A64OPC(GetCTR, U32, ) +//A64OPC(GetDCZID, U32, ) +//A64OPC(GetTPIDR, U64, ) +//A64OPC(GetTPIDRRO, U64, ) +//A64OPC(SetTPIDR, Void, U64 ) + +// Hints +OPCODE(PushRSB, Void, U64 ) + +// Pseudo-operation, handled specially at final emit +OPCODE(GetCarryFromOp, U1, Opaque ) +OPCODE(GetOverflowFromOp, U1, Opaque ) +OPCODE(GetGEFromOp, U32, Opaque ) +OPCODE(GetNZCVFromOp, NZCV, Opaque ) +OPCODE(GetUpperFromOp, U128, Opaque ) +OPCODE(GetLowerFromOp, U128, Opaque ) + +OPCODE(NZCVFromPackedFlags, NZCV, U32 ) + +// Calculations +OPCODE(Pack2x32To1x64, U64, U32, U32 ) +//OPCODE(Pack2x64To1x128, U128, U64, U64 ) +OPCODE(LeastSignificantWord, U32, U64 ) +OPCODE(MostSignificantWord, U32, U64 ) +OPCODE(LeastSignificantHalf, U16, U32 ) +OPCODE(LeastSignificantByte, U8, U32 ) +OPCODE(MostSignificantBit, U1, U32 ) +OPCODE(IsZero32, U1, U32 ) +OPCODE(IsZero64, U1, U64 ) +OPCODE(TestBit, U1, U64, U8 ) +OPCODE(ConditionalSelect32, U32, Cond, U32, U32 ) +OPCODE(ConditionalSelect64, U64, Cond, U64, U64 ) +OPCODE(ConditionalSelectNZCV, NZCV, Cond, NZCV, NZCV ) +OPCODE(LogicalShiftLeft32, U32, U32, U8, U1 ) +OPCODE(LogicalShiftLeft64, U64, U64, U8 ) +OPCODE(LogicalShiftRight32, U32, U32, U8, U1 ) +OPCODE(LogicalShiftRight64, U64, U64, U8 ) +OPCODE(ArithmeticShiftRight32, U32, U32, U8, U1 ) 
+//OPCODE(ArithmeticShiftRight64, U64, U64, U8 ) +OPCODE(RotateRight32, U32, U32, U8, U1 ) +OPCODE(RotateRight64, U64, U64, U8 ) +OPCODE(RotateRightExtended, U32, U32, U1 ) +OPCODE(Add32, U32, U32, U32, U1 ) +OPCODE(Add64, U64, U64, U64, U1 ) +OPCODE(Sub32, U32, U32, U32, U1 ) +OPCODE(Sub64, U64, U64, U64, U1 ) +OPCODE(Mul32, U32, U32, U32 ) +OPCODE(Mul64, U64, U64, U64 ) +//OPCODE(SignedMultiplyHigh64, U64, U64, U64 ) +//OPCODE(UnsignedMultiplyHigh64, U64, U64, U64 ) +OPCODE(UnsignedDiv32, U32, U32, U32 ) +OPCODE(UnsignedDiv64, U64, U64, U64 ) +OPCODE(SignedDiv32, U32, U32, U32 ) +OPCODE(SignedDiv64, U64, U64, U64 ) +OPCODE(And32, U32, U32, U32 ) +OPCODE(And64, U64, U64, U64 ) +OPCODE(Eor32, U32, U32, U32 ) +OPCODE(Eor64, U64, U64, U64 ) +OPCODE(Or32, U32, U32, U32 ) +OPCODE(Or64, U64, U64, U64 ) +OPCODE(Not32, U32, U32 ) +OPCODE(Not64, U64, U64 ) +OPCODE(SignExtendByteToWord, U32, U8 ) +OPCODE(SignExtendHalfToWord, U32, U16 ) +OPCODE(SignExtendByteToLong, U64, U8 ) +OPCODE(SignExtendHalfToLong, U64, U16 ) +OPCODE(SignExtendWordToLong, U64, U32 ) +OPCODE(ZeroExtendByteToWord, U32, U8 ) +OPCODE(ZeroExtendHalfToWord, U32, U16 ) +OPCODE(ZeroExtendByteToLong, U64, U8 ) +OPCODE(ZeroExtendHalfToLong, U64, U16 ) +OPCODE(ZeroExtendWordToLong, U64, U32 ) +//OPCODE(ZeroExtendLongToQuad, U128, U64 ) +//OPCODE(ByteReverseDual, U64, U64 ) +OPCODE(ByteReverseWord, U32, U32 ) +OPCODE(ByteReverseHalf, U16, U16 ) +OPCODE(CountLeadingZeros32, U32, U32 ) +OPCODE(CountLeadingZeros64, U64, U64 ) +//OPCODE(ExtractRegister32, U32, U32, U32, U8 ) +//OPCODE(ExtractRegister64, U64, U64, U64, U8 ) +//OPCODE(MaxSigned32, U32, U32, U32 ) +//OPCODE(MaxSigned64, U64, U64, U64 ) +//OPCODE(MaxUnsigned32, U32, U32, U32 ) +//OPCODE(MaxUnsigned64, U64, U64, U64 ) +//OPCODE(MinSigned32, U32, U32, U32 ) +//OPCODE(MinSigned64, U64, U64, U64 ) +//OPCODE(MinUnsigned32, U32, U32, U32 ) +//OPCODE(MinUnsigned64, U64, U64, U64 ) + +// Saturated instructions +OPCODE(SignedSaturatedAdd8, U8, U8, U8 ) +OPCODE(SignedSaturatedAdd16, U16, U16, U16 ) +OPCODE(SignedSaturatedAdd32, U32, U32, U32 ) +OPCODE(SignedSaturatedAdd64, U64, U64, U64 ) +//OPCODE(SignedSaturatedDoublingMultiplyReturnHigh16, U16, U16, U16 ) +//OPCODE(SignedSaturatedDoublingMultiplyReturnHigh32, U32, U32, U32 ) +OPCODE(SignedSaturatedSub8, U8, U8, U8 ) +OPCODE(SignedSaturatedSub16, U16, U16, U16 ) +OPCODE(SignedSaturatedSub32, U32, U32, U32 ) +OPCODE(SignedSaturatedSub64, U64, U64, U64 ) +OPCODE(SignedSaturation, U32, U32, U8 ) +//OPCODE(UnsignedSaturatedAdd8, U8, U8, U8 ) +//OPCODE(UnsignedSaturatedAdd16, U16, U16, U16 ) +//OPCODE(UnsignedSaturatedAdd32, U32, U32, U32 ) +//OPCODE(UnsignedSaturatedAdd64, U64, U64, U64 ) +//OPCODE(UnsignedSaturatedSub8, U8, U8, U8 ) +//OPCODE(UnsignedSaturatedSub16, U16, U16, U16 ) +//OPCODE(UnsignedSaturatedSub32, U32, U32, U32 ) +//OPCODE(UnsignedSaturatedSub64, U64, U64, U64 ) +OPCODE(UnsignedSaturation, U32, U32, U8 ) + +// Packed instructions +OPCODE(PackedAddU8, U32, U32, U32 ) +OPCODE(PackedAddS8, U32, U32, U32 ) +OPCODE(PackedSubU8, U32, U32, U32 ) +OPCODE(PackedSubS8, U32, U32, U32 ) +OPCODE(PackedAddU16, U32, U32, U32 ) +OPCODE(PackedAddS16, U32, U32, U32 ) +OPCODE(PackedSubU16, U32, U32, U32 ) +OPCODE(PackedSubS16, U32, U32, U32 ) +OPCODE(PackedAddSubU16, U32, U32, U32 ) +OPCODE(PackedAddSubS16, U32, U32, U32 ) +OPCODE(PackedSubAddU16, U32, U32, U32 ) +OPCODE(PackedSubAddS16, U32, U32, U32 ) +OPCODE(PackedHalvingAddU8, U32, U32, U32 ) +OPCODE(PackedHalvingAddS8, U32, U32, U32 ) +OPCODE(PackedHalvingSubU8, U32, U32, U32 ) 
+OPCODE(PackedHalvingSubS8, U32, U32, U32 ) +OPCODE(PackedHalvingAddU16, U32, U32, U32 ) +OPCODE(PackedHalvingAddS16, U32, U32, U32 ) +OPCODE(PackedHalvingSubU16, U32, U32, U32 ) +OPCODE(PackedHalvingSubS16, U32, U32, U32 ) +OPCODE(PackedHalvingAddSubU16, U32, U32, U32 ) +OPCODE(PackedHalvingAddSubS16, U32, U32, U32 ) +OPCODE(PackedHalvingSubAddU16, U32, U32, U32 ) +OPCODE(PackedHalvingSubAddS16, U32, U32, U32 ) +OPCODE(PackedSaturatedAddU8, U32, U32, U32 ) +OPCODE(PackedSaturatedAddS8, U32, U32, U32 ) +OPCODE(PackedSaturatedSubU8, U32, U32, U32 ) +OPCODE(PackedSaturatedSubS8, U32, U32, U32 ) +OPCODE(PackedSaturatedAddU16, U32, U32, U32 ) +OPCODE(PackedSaturatedAddS16, U32, U32, U32 ) +OPCODE(PackedSaturatedSubU16, U32, U32, U32 ) +OPCODE(PackedSaturatedSubS16, U32, U32, U32 ) +OPCODE(PackedAbsDiffSumS8, U32, U32, U32 ) +OPCODE(PackedSelect, U32, U32, U32, U32 ) + +// CRC instructions +//OPCODE(CRC32Castagnoli8, U32, U32, U32 ) +//OPCODE(CRC32Castagnoli16, U32, U32, U32 ) +//OPCODE(CRC32Castagnoli32, U32, U32, U32 ) +//OPCODE(CRC32Castagnoli64, U32, U32, U64 ) +//OPCODE(CRC32ISO8, U32, U32, U32 ) +//OPCODE(CRC32ISO16, U32, U32, U32 ) +//OPCODE(CRC32ISO32, U32, U32, U32 ) +//OPCODE(CRC32ISO64, U32, U32, U64 ) + +// AES instructions +//OPCODE(AESDecryptSingleRound, U128, U128 ) +//OPCODE(AESEncryptSingleRound, U128, U128 ) +//OPCODE(AESInverseMixColumns, U128, U128 ) +//OPCODE(AESMixColumns, U128, U128 ) + +// SM4 instructions +//OPCODE(SM4AccessSubstitutionBox, U8, U8 ) + +// Vector instructions +//OPCODE(VectorGetElement8, U8, U128, U8 ) +//OPCODE(VectorGetElement16, U16, U128, U8 ) +//OPCODE(VectorGetElement32, U32, U128, U8 ) +//OPCODE(VectorGetElement64, U64, U128, U8 ) +//OPCODE(VectorSetElement8, U128, U128, U8, U8 ) +//OPCODE(VectorSetElement16, U128, U128, U8, U16 ) +//OPCODE(VectorSetElement32, U128, U128, U8, U32 ) +//OPCODE(VectorSetElement64, U128, U128, U8, U64 ) +//OPCODE(VectorAbs8, U128, U128 ) +//OPCODE(VectorAbs16, U128, U128 ) +//OPCODE(VectorAbs32, U128, U128 ) +//OPCODE(VectorAbs64, U128, U128 ) +//OPCODE(VectorAdd8, U128, U128, U128 ) +//OPCODE(VectorAdd16, U128, U128, U128 ) +//OPCODE(VectorAdd32, U128, U128, U128 ) +//OPCODE(VectorAdd64, U128, U128, U128 ) +//OPCODE(VectorAnd, U128, U128, U128 ) +//OPCODE(VectorArithmeticShiftRight8, U128, U128, U8 ) +//OPCODE(VectorArithmeticShiftRight16, U128, U128, U8 ) +//OPCODE(VectorArithmeticShiftRight32, U128, U128, U8 ) +//OPCODE(VectorArithmeticShiftRight64, U128, U128, U8 ) +//OPCODE(VectorArithmeticVShift8, U128, U128, U128 ) +//OPCODE(VectorArithmeticVShift16, U128, U128, U128 ) +//OPCODE(VectorArithmeticVShift32, U128, U128, U128 ) +//OPCODE(VectorArithmeticVShift64, U128, U128, U128 ) +//OPCODE(VectorBroadcastLower8, U128, U8 ) +//OPCODE(VectorBroadcastLower16, U128, U16 ) +//OPCODE(VectorBroadcastLower32, U128, U32 ) +//OPCODE(VectorBroadcast8, U128, U8 ) +//OPCODE(VectorBroadcast16, U128, U16 ) +//OPCODE(VectorBroadcast32, U128, U32 ) +//OPCODE(VectorBroadcast64, U128, U64 ) +//OPCODE(VectorCountLeadingZeros8, U128, U128 ) +//OPCODE(VectorCountLeadingZeros16, U128, U128 ) +//OPCODE(VectorCountLeadingZeros32, U128, U128 ) +//OPCODE(VectorDeinterleaveEven8, U128, U128, U128 ) +//OPCODE(VectorDeinterleaveEven16, U128, U128, U128 ) +//OPCODE(VectorDeinterleaveEven32, U128, U128, U128 ) +//OPCODE(VectorDeinterleaveEven64, U128, U128, U128 ) +//OPCODE(VectorDeinterleaveOdd8, U128, U128, U128 ) +//OPCODE(VectorDeinterleaveOdd16, U128, U128, U128 ) +//OPCODE(VectorDeinterleaveOdd32, U128, U128, U128 ) 
+//OPCODE(VectorDeinterleaveOdd64, U128, U128, U128 ) +//OPCODE(VectorEor, U128, U128, U128 ) +//OPCODE(VectorEqual8, U128, U128, U128 ) +//OPCODE(VectorEqual16, U128, U128, U128 ) +//OPCODE(VectorEqual32, U128, U128, U128 ) +//OPCODE(VectorEqual64, U128, U128, U128 ) +//OPCODE(VectorEqual128, U128, U128, U128 ) +//OPCODE(VectorExtract, U128, U128, U128, U8 ) +//OPCODE(VectorExtractLower, U128, U128, U128, U8 ) +//OPCODE(VectorGreaterS8, U128, U128, U128 ) +//OPCODE(VectorGreaterS16, U128, U128, U128 ) +//OPCODE(VectorGreaterS32, U128, U128, U128 ) +//OPCODE(VectorGreaterS64, U128, U128, U128 ) +//OPCODE(VectorHalvingAddS8, U128, U128, U128 ) +//OPCODE(VectorHalvingAddS16, U128, U128, U128 ) +//OPCODE(VectorHalvingAddS32, U128, U128, U128 ) +//OPCODE(VectorHalvingAddU8, U128, U128, U128 ) +//OPCODE(VectorHalvingAddU16, U128, U128, U128 ) +//OPCODE(VectorHalvingAddU32, U128, U128, U128 ) +//OPCODE(VectorHalvingSubS8, U128, U128, U128 ) +//OPCODE(VectorHalvingSubS16, U128, U128, U128 ) +//OPCODE(VectorHalvingSubS32, U128, U128, U128 ) +//OPCODE(VectorHalvingSubU8, U128, U128, U128 ) +//OPCODE(VectorHalvingSubU16, U128, U128, U128 ) +//OPCODE(VectorHalvingSubU32, U128, U128, U128 ) +//OPCODE(VectorInterleaveLower8, U128, U128, U128 ) +//OPCODE(VectorInterleaveLower16, U128, U128, U128 ) +//OPCODE(VectorInterleaveLower32, U128, U128, U128 ) +//OPCODE(VectorInterleaveLower64, U128, U128, U128 ) +//OPCODE(VectorInterleaveUpper8, U128, U128, U128 ) +//OPCODE(VectorInterleaveUpper16, U128, U128, U128 ) +//OPCODE(VectorInterleaveUpper32, U128, U128, U128 ) +//OPCODE(VectorInterleaveUpper64, U128, U128, U128 ) +//OPCODE(VectorLogicalShiftLeft8, U128, U128, U8 ) +//OPCODE(VectorLogicalShiftLeft16, U128, U128, U8 ) +//OPCODE(VectorLogicalShiftLeft32, U128, U128, U8 ) +//OPCODE(VectorLogicalShiftLeft64, U128, U128, U8 ) +//OPCODE(VectorLogicalShiftRight8, U128, U128, U8 ) +//OPCODE(VectorLogicalShiftRight16, U128, U128, U8 ) +//OPCODE(VectorLogicalShiftRight32, U128, U128, U8 ) +//OPCODE(VectorLogicalShiftRight64, U128, U128, U8 ) +//OPCODE(VectorLogicalVShift8, U128, U128, U128 ) +//OPCODE(VectorLogicalVShift16, U128, U128, U128 ) +//OPCODE(VectorLogicalVShift32, U128, U128, U128 ) +//OPCODE(VectorLogicalVShift64, U128, U128, U128 ) +//OPCODE(VectorMaxS8, U128, U128, U128 ) +//OPCODE(VectorMaxS16, U128, U128, U128 ) +//OPCODE(VectorMaxS32, U128, U128, U128 ) +//OPCODE(VectorMaxS64, U128, U128, U128 ) +//OPCODE(VectorMaxU8, U128, U128, U128 ) +//OPCODE(VectorMaxU16, U128, U128, U128 ) +//OPCODE(VectorMaxU32, U128, U128, U128 ) +//OPCODE(VectorMaxU64, U128, U128, U128 ) +//OPCODE(VectorMinS8, U128, U128, U128 ) +//OPCODE(VectorMinS16, U128, U128, U128 ) +//OPCODE(VectorMinS32, U128, U128, U128 ) +//OPCODE(VectorMinS64, U128, U128, U128 ) +//OPCODE(VectorMinU8, U128, U128, U128 ) +//OPCODE(VectorMinU16, U128, U128, U128 ) +//OPCODE(VectorMinU32, U128, U128, U128 ) +//OPCODE(VectorMinU64, U128, U128, U128 ) +//OPCODE(VectorMultiply8, U128, U128, U128 ) +//OPCODE(VectorMultiply16, U128, U128, U128 ) +//OPCODE(VectorMultiply32, U128, U128, U128 ) +//OPCODE(VectorMultiply64, U128, U128, U128 ) +//OPCODE(VectorNarrow16, U128, U128 ) +//OPCODE(VectorNarrow32, U128, U128 ) +//OPCODE(VectorNarrow64, U128, U128 ) +//OPCODE(VectorNot, U128, U128 ) +//OPCODE(VectorOr, U128, U128, U128 ) +//OPCODE(VectorPairedAddLower8, U128, U128, U128 ) +//OPCODE(VectorPairedAddLower16, U128, U128, U128 ) +//OPCODE(VectorPairedAddLower32, U128, U128, U128 ) +//OPCODE(VectorPairedAddSignedWiden8, U128, U128 ) 
+//OPCODE(VectorPairedAddSignedWiden16, U128, U128 ) +//OPCODE(VectorPairedAddSignedWiden32, U128, U128 ) +//OPCODE(VectorPairedAddUnsignedWiden8, U128, U128 ) +//OPCODE(VectorPairedAddUnsignedWiden16, U128, U128 ) +//OPCODE(VectorPairedAddUnsignedWiden32, U128, U128 ) +//OPCODE(VectorPairedAdd8, U128, U128, U128 ) +//OPCODE(VectorPairedAdd16, U128, U128, U128 ) +//OPCODE(VectorPairedAdd32, U128, U128, U128 ) +//OPCODE(VectorPairedAdd64, U128, U128, U128 ) +//OPCODE(VectorPairedMaxS8, U128, U128, U128 ) +//OPCODE(VectorPairedMaxS16, U128, U128, U128 ) +//OPCODE(VectorPairedMaxS32, U128, U128, U128 ) +//OPCODE(VectorPairedMaxU8, U128, U128, U128 ) +//OPCODE(VectorPairedMaxU16, U128, U128, U128 ) +//OPCODE(VectorPairedMaxU32, U128, U128, U128 ) +//OPCODE(VectorPairedMinS8, U128, U128, U128 ) +//OPCODE(VectorPairedMinS16, U128, U128, U128 ) +//OPCODE(VectorPairedMinS32, U128, U128, U128 ) +//OPCODE(VectorPairedMinU8, U128, U128, U128 ) +//OPCODE(VectorPairedMinU16, U128, U128, U128 ) +//OPCODE(VectorPairedMinU32, U128, U128, U128 ) +//OPCODE(VectorPolynomialMultiply8, U128, U128, U128 ) +//OPCODE(VectorPolynomialMultiplyLong8, U128, U128, U128 ) +//OPCODE(VectorPolynomialMultiplyLong64, U128, U128, U128 ) +//OPCODE(VectorPopulationCount, U128, U128 ) +//OPCODE(VectorReverseBits, U128, U128 ) +//OPCODE(VectorRoundingHalvingAddS8, U128, U128, U128 ) +//OPCODE(VectorRoundingHalvingAddS16, U128, U128, U128 ) +//OPCODE(VectorRoundingHalvingAddS32, U128, U128, U128 ) +//OPCODE(VectorRoundingHalvingAddU8, U128, U128, U128 ) +//OPCODE(VectorRoundingHalvingAddU16, U128, U128, U128 ) +//OPCODE(VectorRoundingHalvingAddU32, U128, U128, U128 ) +//OPCODE(VectorRoundingShiftLeftS8, U128, U128, U128 ) +//OPCODE(VectorRoundingShiftLeftS16, U128, U128, U128 ) +//OPCODE(VectorRoundingShiftLeftS32, U128, U128, U128 ) +//OPCODE(VectorRoundingShiftLeftS64, U128, U128, U128 ) +//OPCODE(VectorRoundingShiftLeftU8, U128, U128, U128 ) +//OPCODE(VectorRoundingShiftLeftU16, U128, U128, U128 ) +//OPCODE(VectorRoundingShiftLeftU32, U128, U128, U128 ) +//OPCODE(VectorRoundingShiftLeftU64, U128, U128, U128 ) +//OPCODE(VectorShuffleHighHalfwords, U128, U128, U8 ) +//OPCODE(VectorShuffleLowHalfwords, U128, U128, U8 ) +//OPCODE(VectorShuffleWords, U128, U128, U8 ) +//OPCODE(VectorSignExtend8, U128, U128 ) +//OPCODE(VectorSignExtend16, U128, U128 ) +//OPCODE(VectorSignExtend32, U128, U128 ) +//OPCODE(VectorSignExtend64, U128, U128 ) +//OPCODE(VectorSignedAbsoluteDifference8, U128, U128, U128 ) +//OPCODE(VectorSignedAbsoluteDifference16, U128, U128, U128 ) +//OPCODE(VectorSignedAbsoluteDifference32, U128, U128, U128 ) +//OPCODE(VectorSignedMultiply16, Void, U128, U128 ) +//OPCODE(VectorSignedMultiply32, Void, U128, U128 ) +//OPCODE(VectorSignedSaturatedAbs8, U128, U128 ) +//OPCODE(VectorSignedSaturatedAbs16, U128, U128 ) +//OPCODE(VectorSignedSaturatedAbs32, U128, U128 ) +//OPCODE(VectorSignedSaturatedAbs64, U128, U128 ) +//OPCODE(VectorSignedSaturatedAccumulateUnsigned8, U128, U128, U128 ) +//OPCODE(VectorSignedSaturatedAccumulateUnsigned16, U128, U128, U128 ) +//OPCODE(VectorSignedSaturatedAccumulateUnsigned32, U128, U128, U128 ) +//OPCODE(VectorSignedSaturatedAccumulateUnsigned64, U128, U128, U128 ) +//OPCODE(VectorSignedSaturatedDoublingMultiply16, Void, U128, U128 ) +//OPCODE(VectorSignedSaturatedDoublingMultiply32, Void, U128, U128 ) +//OPCODE(VectorSignedSaturatedDoublingMultiplyLong16, U128, U128, U128 ) +//OPCODE(VectorSignedSaturatedDoublingMultiplyLong32, U128, U128, U128 ) 
+//OPCODE(VectorSignedSaturatedNarrowToSigned16, U128, U128 ) +//OPCODE(VectorSignedSaturatedNarrowToSigned32, U128, U128 ) +//OPCODE(VectorSignedSaturatedNarrowToSigned64, U128, U128 ) +//OPCODE(VectorSignedSaturatedNarrowToUnsigned16, U128, U128 ) +//OPCODE(VectorSignedSaturatedNarrowToUnsigned32, U128, U128 ) +//OPCODE(VectorSignedSaturatedNarrowToUnsigned64, U128, U128 ) +//OPCODE(VectorSignedSaturatedNeg8, U128, U128 ) +//OPCODE(VectorSignedSaturatedNeg16, U128, U128 ) +//OPCODE(VectorSignedSaturatedNeg32, U128, U128 ) +//OPCODE(VectorSignedSaturatedNeg64, U128, U128 ) +//OPCODE(VectorSignedSaturatedShiftLeft8, U128, U128, U128 ) +//OPCODE(VectorSignedSaturatedShiftLeft16, U128, U128, U128 ) +//OPCODE(VectorSignedSaturatedShiftLeft32, U128, U128, U128 ) +//OPCODE(VectorSignedSaturatedShiftLeft64, U128, U128, U128 ) +//OPCODE(VectorSignedSaturatedShiftLeftUnsigned8, U128, U128, U128 ) +//OPCODE(VectorSignedSaturatedShiftLeftUnsigned16, U128, U128, U128 ) +//OPCODE(VectorSignedSaturatedShiftLeftUnsigned32, U128, U128, U128 ) +//OPCODE(VectorSignedSaturatedShiftLeftUnsigned64, U128, U128, U128 ) +//OPCODE(VectorSub8, U128, U128, U128 ) +//OPCODE(VectorSub16, U128, U128, U128 ) +//OPCODE(VectorSub32, U128, U128, U128 ) +//OPCODE(VectorSub64, U128, U128, U128 ) +//OPCODE(VectorTable, Table, U128, Opaque, Opaque, Opaque ) +//OPCODE(VectorTableLookup, U128, U128, Table, U128 ) +//OPCODE(VectorUnsignedAbsoluteDifference8, U128, U128, U128 ) +//OPCODE(VectorUnsignedAbsoluteDifference16, U128, U128, U128 ) +//OPCODE(VectorUnsignedAbsoluteDifference32, U128, U128, U128 ) +//OPCODE(VectorUnsignedMultiply16, Void, U128, U128 ) +//OPCODE(VectorUnsignedMultiply32, Void, U128, U128 ) +//OPCODE(VectorUnsignedRecipEstimate, U128, U128 ) +//OPCODE(VectorUnsignedRecipSqrtEstimate, U128, U128 ) +//OPCODE(VectorUnsignedSaturatedAccumulateSigned8, U128, U128, U128 ) +//OPCODE(VectorUnsignedSaturatedAccumulateSigned16, U128, U128, U128 ) +//OPCODE(VectorUnsignedSaturatedAccumulateSigned32, U128, U128, U128 ) +//OPCODE(VectorUnsignedSaturatedAccumulateSigned64, U128, U128, U128 ) +//OPCODE(VectorUnsignedSaturatedNarrow16, U128, U128 ) +//OPCODE(VectorUnsignedSaturatedNarrow32, U128, U128 ) +//OPCODE(VectorUnsignedSaturatedNarrow64, U128, U128 ) +//OPCODE(VectorUnsignedSaturatedShiftLeft8, U128, U128, U128 ) +//OPCODE(VectorUnsignedSaturatedShiftLeft16, U128, U128, U128 ) +//OPCODE(VectorUnsignedSaturatedShiftLeft32, U128, U128, U128 ) +//OPCODE(VectorUnsignedSaturatedShiftLeft64, U128, U128, U128 ) +//OPCODE(VectorZeroExtend8, U128, U128 ) +//OPCODE(VectorZeroExtend16, U128, U128 ) +//OPCODE(VectorZeroExtend32, U128, U128 ) +//OPCODE(VectorZeroExtend64, U128, U128 ) +//OPCODE(VectorZeroUpper, U128, U128 ) +//OPCODE(ZeroVector, U128, ) + +// Floating-point operations +//OPCODE(FPAbs16, U16, U16 ) +OPCODE(FPAbs32, U32, U32 ) +OPCODE(FPAbs64, U64, U64 ) +OPCODE(FPAdd32, U32, U32, U32 ) +OPCODE(FPAdd64, U64, U64, U64 ) +OPCODE(FPCompare32, NZCV, U32, U32, U1 ) +OPCODE(FPCompare64, NZCV, U64, U64, U1 ) +OPCODE(FPDiv32, U32, U32, U32 ) +OPCODE(FPDiv64, U64, U64, U64 ) +//OPCODE(FPMax32, U32, U32, U32 ) +//OPCODE(FPMax64, U64, U64, U64 ) +//OPCODE(FPMaxNumeric32, U32, U32, U32 ) +//OPCODE(FPMaxNumeric64, U64, U64, U64 ) +//OPCODE(FPMin32, U32, U32, U32 ) +//OPCODE(FPMin64, U64, U64, U64 ) +//OPCODE(FPMinNumeric32, U32, U32, U32 ) +//OPCODE(FPMinNumeric64, U64, U64, U64 ) +OPCODE(FPMul32, U32, U32, U32 ) +OPCODE(FPMul64, U64, U64, U64 ) +//OPCODE(FPMulAdd16, U16, U16, U16, U16 ) +//OPCODE(FPMulAdd32, U32, U32, U32, 
U32 ) +//OPCODE(FPMulAdd64, U64, U64, U64, U64 ) +//OPCODE(FPMulX32, U32, U32, U32 ) +//OPCODE(FPMulX64, U64, U64, U64 ) +//OPCODE(FPNeg16, U16, U16 ) +OPCODE(FPNeg32, U32, U32 ) +OPCODE(FPNeg64, U64, U64 ) +//OPCODE(FPRecipEstimate16, U16, U16 ) +//OPCODE(FPRecipEstimate32, U32, U32 ) +//OPCODE(FPRecipEstimate64, U64, U64 ) +//OPCODE(FPRecipExponent16, U16, U16 ) +//OPCODE(FPRecipExponent32, U32, U32 ) +//OPCODE(FPRecipExponent64, U64, U64 ) +//OPCODE(FPRecipStepFused16, U16, U16, U16 ) +//OPCODE(FPRecipStepFused32, U32, U32, U32 ) +//OPCODE(FPRecipStepFused64, U64, U64, U64 ) +//OPCODE(FPRoundInt16, U16, U16, U8, U1 ) +//OPCODE(FPRoundInt32, U32, U32, U8, U1 ) +//OPCODE(FPRoundInt64, U64, U64, U8, U1 ) +//OPCODE(FPRSqrtEstimate16, U16, U16 ) +//OPCODE(FPRSqrtEstimate32, U32, U32 ) +//OPCODE(FPRSqrtEstimate64, U64, U64 ) +//OPCODE(FPRSqrtStepFused16, U16, U16, U16 ) +//OPCODE(FPRSqrtStepFused32, U32, U32, U32 ) +//OPCODE(FPRSqrtStepFused64, U64, U64, U64 ) +OPCODE(FPSqrt32, U32, U32 ) +OPCODE(FPSqrt64, U64, U64 ) +OPCODE(FPSub32, U32, U32, U32 ) +OPCODE(FPSub64, U64, U64, U64 ) + +// Floating-point conversions +OPCODE(FPHalfToDouble, U64, U16, U8 ) +OPCODE(FPHalfToSingle, U32, U16, U8 ) +OPCODE(FPSingleToDouble, U64, U32, U8 ) +OPCODE(FPSingleToHalf, U16, U32, U8 ) +OPCODE(FPDoubleToHalf, U16, U64, U8 ) +OPCODE(FPDoubleToSingle, U32, U64, U8 ) +OPCODE(FPDoubleToFixedS32, U32, U64, U8, U8 ) +OPCODE(FPDoubleToFixedS64, U64, U64, U8, U8 ) +OPCODE(FPDoubleToFixedU32, U32, U64, U8, U8 ) +OPCODE(FPDoubleToFixedU64, U64, U64, U8, U8 ) +//OPCODE(FPHalfToFixedS32, U32, U16, U8, U8 ) +//OPCODE(FPHalfToFixedS64, U64, U16, U8, U8 ) +//OPCODE(FPHalfToFixedU32, U32, U16, U8, U8 ) +//OPCODE(FPHalfToFixedU64, U64, U16, U8, U8 ) +OPCODE(FPSingleToFixedS32, U32, U32, U8, U8 ) +OPCODE(FPSingleToFixedS64, U64, U32, U8, U8 ) +OPCODE(FPSingleToFixedU32, U32, U32, U8, U8 ) +OPCODE(FPSingleToFixedU64, U64, U32, U8, U8 ) +OPCODE(FPFixedU32ToSingle, U32, U32, U8, U8 ) +OPCODE(FPFixedS32ToSingle, U32, U32, U8, U8 ) +OPCODE(FPFixedU32ToDouble, U64, U32, U8, U8 ) +OPCODE(FPFixedU64ToDouble, U64, U64, U8, U8 ) +OPCODE(FPFixedU64ToSingle, U32, U64, U8, U8 ) +OPCODE(FPFixedS32ToDouble, U64, U32, U8, U8 ) +OPCODE(FPFixedS64ToDouble, U64, U64, U8, U8 ) +OPCODE(FPFixedS64ToSingle, U32, U64, U8, U8 ) + +// Floating-point vector instructions +//OPCODE(FPVectorAbs16, U128, U128 ) +//OPCODE(FPVectorAbs32, U128, U128 ) +//OPCODE(FPVectorAbs64, U128, U128 ) +//OPCODE(FPVectorAdd32, U128, U128, U128 ) +//OPCODE(FPVectorAdd64, U128, U128, U128 ) +//OPCODE(FPVectorDiv32, U128, U128, U128 ) +//OPCODE(FPVectorDiv64, U128, U128, U128 ) +//OPCODE(FPVectorEqual32, U128, U128, U128 ) +//OPCODE(FPVectorEqual64, U128, U128, U128 ) +//OPCODE(FPVectorFromSignedFixed32, U128, U128, U8, U8 ) +//OPCODE(FPVectorFromSignedFixed64, U128, U128, U8, U8 ) +//OPCODE(FPVectorFromUnsignedFixed32, U128, U128, U8, U8 ) +//OPCODE(FPVectorFromUnsignedFixed64, U128, U128, U8, U8 ) +//OPCODE(FPVectorGreater32, U128, U128, U128 ) +//OPCODE(FPVectorGreater64, U128, U128, U128 ) +//OPCODE(FPVectorGreaterEqual32, U128, U128, U128 ) +//OPCODE(FPVectorGreaterEqual64, U128, U128, U128 ) +//OPCODE(FPVectorMax32, U128, U128, U128 ) +//OPCODE(FPVectorMax64, U128, U128, U128 ) +//OPCODE(FPVectorMin32, U128, U128, U128 ) +//OPCODE(FPVectorMin64, U128, U128, U128 ) +//OPCODE(FPVectorMul32, U128, U128, U128 ) +//OPCODE(FPVectorMul64, U128, U128, U128 ) +//OPCODE(FPVectorMulAdd16, U128, U128, U128, U128 ) +//OPCODE(FPVectorMulAdd32, U128, U128, U128, U128 ) 
+//OPCODE(FPVectorMulAdd64, U128, U128, U128, U128 ) +//OPCODE(FPVectorMulX32, U128, U128, U128 ) +//OPCODE(FPVectorMulX64, U128, U128, U128 ) +//OPCODE(FPVectorNeg16, U128, U128 ) +//OPCODE(FPVectorNeg32, U128, U128 ) +//OPCODE(FPVectorNeg64, U128, U128 ) +//OPCODE(FPVectorPairedAdd32, U128, U128, U128 ) +//OPCODE(FPVectorPairedAdd64, U128, U128, U128 ) +//OPCODE(FPVectorPairedAddLower32, U128, U128, U128 ) +//OPCODE(FPVectorPairedAddLower64, U128, U128, U128 ) +//OPCODE(FPVectorRecipEstimate16, U128, U128 ) +//OPCODE(FPVectorRecipEstimate32, U128, U128 ) +//OPCODE(FPVectorRecipEstimate64, U128, U128 ) +//OPCODE(FPVectorRecipStepFused16, U128, U128, U128 ) +//OPCODE(FPVectorRecipStepFused32, U128, U128, U128 ) +//OPCODE(FPVectorRecipStepFused64, U128, U128, U128 ) +//OPCODE(FPVectorRoundInt16, U128, U128, U8, U1 ) +//OPCODE(FPVectorRoundInt32, U128, U128, U8, U1 ) +//OPCODE(FPVectorRoundInt64, U128, U128, U8, U1 ) +//OPCODE(FPVectorRSqrtEstimate16, U128, U128 ) +//OPCODE(FPVectorRSqrtEstimate32, U128, U128 ) +//OPCODE(FPVectorRSqrtEstimate64, U128, U128 ) +//OPCODE(FPVectorRSqrtStepFused16, U128, U128, U128 ) +//OPCODE(FPVectorRSqrtStepFused32, U128, U128, U128 ) +//OPCODE(FPVectorRSqrtStepFused64, U128, U128, U128 ) +//OPCODE(FPVectorSqrt32, U128, U128 ) +//OPCODE(FPVectorSqrt64, U128, U128 ) +//OPCODE(FPVectorSub32, U128, U128, U128 ) +//OPCODE(FPVectorSub64, U128, U128, U128 ) +//OPCODE(FPVectorToSignedFixed16, U128, U128, U8, U8 ) +//OPCODE(FPVectorToSignedFixed32, U128, U128, U8, U8 ) +//OPCODE(FPVectorToSignedFixed64, U128, U128, U8, U8 ) +//OPCODE(FPVectorToUnsignedFixed16, U128, U128, U8, U8 ) +//OPCODE(FPVectorToUnsignedFixed32, U128, U128, U8, U8 ) +//OPCODE(FPVectorToUnsignedFixed64, U128, U128, U8, U8 ) + +// A32 Memory access +A32OPC(ClearExclusive, Void, ) +A32OPC(SetExclusive, Void, U32, U8 ) +A32OPC(ReadMemory8, U8, U32 ) +A32OPC(ReadMemory16, U16, U32 ) +A32OPC(ReadMemory32, U32, U32 ) +A32OPC(ReadMemory64, U64, U32 ) +A32OPC(WriteMemory8, Void, U32, U8 ) +A32OPC(WriteMemory16, Void, U32, U16 ) +A32OPC(WriteMemory32, Void, U32, U32 ) +A32OPC(WriteMemory64, Void, U32, U64 ) +A32OPC(ExclusiveWriteMemory8, U32, U32, U8 ) +A32OPC(ExclusiveWriteMemory16, U32, U32, U16 ) +A32OPC(ExclusiveWriteMemory32, U32, U32, U32 ) +A32OPC(ExclusiveWriteMemory64, U32, U32, U64 ) + +// A64 Memory access +//A64OPC(ClearExclusive, Void, ) +//A64OPC(SetExclusive, Void, U64, U8 ) +//A64OPC(ReadMemory8, U8, U64 ) +//A64OPC(ReadMemory16, U16, U64 ) +//A64OPC(ReadMemory32, U32, U64 ) +//A64OPC(ReadMemory64, U64, U64 ) +//A64OPC(ReadMemory128, U128, U64 ) +//A64OPC(WriteMemory8, Void, U64, U8 ) +//A64OPC(WriteMemory16, Void, U64, U16 ) +//A64OPC(WriteMemory32, Void, U64, U32 ) +//A64OPC(WriteMemory64, Void, U64, U64 ) +//A64OPC(WriteMemory128, Void, U64, U128 ) +//A64OPC(ExclusiveWriteMemory8, U32, U64, U8 ) +//A64OPC(ExclusiveWriteMemory16, U32, U64, U16 ) +//A64OPC(ExclusiveWriteMemory32, U32, U64, U32 ) +//A64OPC(ExclusiveWriteMemory64, U32, U64, U64 ) +//A64OPC(ExclusiveWriteMemory128, U32, U64, U128 ) + +// Coprocessor +A32OPC(CoprocInternalOperation, Void, CoprocInfo ) +A32OPC(CoprocSendOneWord, Void, CoprocInfo, U32 ) +A32OPC(CoprocSendTwoWords, Void, CoprocInfo, U32, U32 ) +A32OPC(CoprocGetOneWord, U32, CoprocInfo ) +A32OPC(CoprocGetTwoWords, U64, CoprocInfo ) +A32OPC(CoprocLoadWords, Void, CoprocInfo, U32 ) +A32OPC(CoprocStoreWords, Void, CoprocInfo, U32 ) diff --git a/src/dynarmic/backend/A64/perf_map.cpp b/src/dynarmic/backend/A64/perf_map.cpp new file mode 100644 index 00000000..af46fa08 
--- /dev/null +++ b/src/dynarmic/backend/A64/perf_map.cpp @@ -0,0 +1,89 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. + */ + +#include <cstddef> +#include <string> + +#ifdef __linux__ + +#include <cstdio> +#include <cstdlib> +#include <mutex> +#include <sys/types.h> +#include <unistd.h> + +#include <fmt/format.h> + +#include "common/common_types.h" + +namespace Dynarmic::BackendA64 { + +namespace { +std::mutex mutex; +std::FILE* file = nullptr; + +void OpenFile() { + const char* perf_dir = std::getenv("PERF_BUILDID_DIR"); + if (!perf_dir) { + file = nullptr; + return; + } + + const pid_t pid = getpid(); + const std::string filename = fmt::format("{:s}/perf-{:d}.map", perf_dir, pid); + + file = std::fopen(filename.c_str(), "w"); + if (!file) { + return; + } + + std::setvbuf(file, nullptr, _IONBF, 0); +} +} // anonymous namespace + +namespace detail { +void PerfMapRegister(const void* start, const void* end, const std::string& friendly_name) { + std::lock_guard<std::mutex> guard{mutex}; + + if (!file) { + OpenFile(); + if (!file) { + return; + } + } + + const std::string line = fmt::format("{:016x} {:016x} {:s}\n", reinterpret_cast<u64>(start), reinterpret_cast<u64>(end) - reinterpret_cast<u64>(start), friendly_name); + std::fwrite(line.data(), sizeof *line.data(), line.size(), file); +} +} // namespace detail + +void PerfMapClear() { + std::lock_guard<std::mutex> guard{mutex}; + + if (!file) { + return; + } + + std::fclose(file); + file = nullptr; + OpenFile(); +} + +} // namespace Dynarmic::BackendA64 + +#else + +namespace Dynarmic::BackendA64 { + +namespace detail { +void PerfMapRegister(const void*, const void*, const std::string&) {} +} // namespace detail + +void PerfMapClear() {} + +} // namespace Dynarmic::BackendA64 + +#endif diff --git a/src/dynarmic/backend/A64/perf_map.h b/src/dynarmic/backend/A64/perf_map.h new file mode 100644 index 00000000..0b563dd1 --- /dev/null +++ b/src/dynarmic/backend/A64/perf_map.h @@ -0,0 +1,27 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. + */ + +#pragma once + +#include <cstddef> +#include <string> + +#include "common/cast_util.h" + +namespace Dynarmic::BackendA64 { + +namespace detail { +void PerfMapRegister(const void* start, const void* end, const std::string& friendly_name); +} // namespace detail + +template<typename T> +void PerfMapRegister(T start, const void* end, const std::string& friendly_name) { + detail::PerfMapRegister(Common::BitCast<const void*>(start), end, friendly_name); +} + +void PerfMapClear(); + +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/reg_alloc.cpp b/src/dynarmic/backend/A64/reg_alloc.cpp new file mode 100644 index 00000000..353eecac --- /dev/null +++ b/src/dynarmic/backend/A64/reg_alloc.cpp @@ -0,0 +1,650 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version.
+ */ + +#include <algorithm> +#include <numeric> +#include <utility> + +#include <fmt/ostream.h> + +#include "backend/A64/abi.h" +#include "backend/A64/reg_alloc.h" +#include "common/assert.h" + +namespace Dynarmic::BackendA64 { + +static u64 ImmediateToU64(const IR::Value& imm) { + switch (imm.GetType()) { + case IR::Type::U1: + return u64(imm.GetU1()); + case IR::Type::U8: + return u64(imm.GetU8()); + case IR::Type::U16: + return u64(imm.GetU16()); + case IR::Type::U32: + return u64(imm.GetU32()); + case IR::Type::U64: + return u64(imm.GetU64()); + default: + ASSERT_FALSE("This should never happen."); + } +} + +static bool CanExchange(HostLoc a, HostLoc b) { + return HostLocIsGPR(a) && HostLocIsGPR(b); +} + +// Minimum number of bits required to represent a type +static size_t GetBitWidth(IR::Type type) { + switch (type) { + case IR::Type::A32Reg: + case IR::Type::A32ExtReg: + case IR::Type::A64Reg: + case IR::Type::A64Vec: + case IR::Type::CoprocInfo: + case IR::Type::Cond: + case IR::Type::Void: + case IR::Type::Table: + ASSERT_FALSE("Type {} cannot be represented at runtime", type); + return 0; + case IR::Type::Opaque: + ASSERT_FALSE("Not a concrete type"); + return 0; + case IR::Type::U1: + return 8; + case IR::Type::U8: + return 8; + case IR::Type::U16: + return 16; + case IR::Type::U32: + return 32; + case IR::Type::U64: + return 64; + case IR::Type::U128: + return 128; + case IR::Type::NZCVFlags: + return 32; // TODO: Update to 16 when flags optimization is done + } + UNREACHABLE(); + return 0; +} + +static bool IsValuelessType(IR::Type type) { + switch (type) { + case IR::Type::Table: + return true; + default: + return false; + } +} + +bool HostLocInfo::IsLocked() const { + return is_being_used_count > 0; +} + +bool HostLocInfo::IsEmpty() const { + return is_being_used_count == 0 && values.empty(); +} + +bool HostLocInfo::IsLastUse() const { + return is_being_used_count == 0 && current_references == 1 && accumulated_uses + 1 == total_uses; +} + +void HostLocInfo::ReadLock() { + ASSERT(!is_scratch); + is_being_used_count++; +} + +void HostLocInfo::WriteLock() { + ASSERT(is_being_used_count == 0); + is_being_used_count++; + is_scratch = true; +} + +void HostLocInfo::AddArgReference() { + current_references++; + ASSERT(accumulated_uses + current_references <= total_uses); +} + +void HostLocInfo::ReleaseOne() { + is_being_used_count--; + is_scratch = false; + + if (current_references == 0) + return; + + accumulated_uses++; + current_references--; + + if (current_references == 0) + ReleaseAll(); +} + +void HostLocInfo::ReleaseAll() { + accumulated_uses += current_references; + current_references = 0; + + ASSERT(total_uses == std::accumulate(values.begin(), values.end(), size_t(0), [](size_t sum, IR::Inst* inst) { return sum + inst->UseCount(); })); + + if (total_uses == accumulated_uses) { + values.clear(); + accumulated_uses = 0; + total_uses = 0; + max_bit_width = 0; + } + + is_being_used_count = 0; + is_scratch = false; +} + +bool HostLocInfo::ContainsValue(const IR::Inst* inst) const { + return std::find(values.begin(), values.end(), inst) != values.end(); +} + +size_t HostLocInfo::GetMaxBitWidth() const { + return max_bit_width; +} + +void HostLocInfo::AddValue(IR::Inst* inst) { + values.push_back(inst); + total_uses += inst->UseCount(); + max_bit_width = std::max(max_bit_width, GetBitWidth(inst->GetType())); +} + +IR::Type Argument::GetType() const { + return value.GetType(); +} + +bool Argument::IsImmediate() const { + return value.IsImmediate(); +} + +bool Argument::IsVoid() const { + return GetType() == IR::Type::Void; +}
+ +bool Argument::FitsInImmediateU32() const { + if (!IsImmediate()) + return false; + u64 imm = ImmediateToU64(value); + return imm < 0x100000000; +} + +bool Argument::FitsInImmediateS32() const { + if (!IsImmediate()) + return false; + s64 imm = static_cast<s64>(ImmediateToU64(value)); + return -s64(0x80000000) <= imm && imm <= s64(0x7FFFFFFF); +} + +bool Argument::GetImmediateU1() const { + return value.GetU1(); +} + +u8 Argument::GetImmediateU8() const { + u64 imm = ImmediateToU64(value); + ASSERT(imm < 0x100); + return u8(imm); +} + +u16 Argument::GetImmediateU16() const { + u64 imm = ImmediateToU64(value); + ASSERT(imm < 0x10000); + return u16(imm); +} + +u32 Argument::GetImmediateU32() const { + u64 imm = ImmediateToU64(value); + ASSERT(imm < 0x100000000); + return u32(imm); +} + +u64 Argument::GetImmediateS32() const { + ASSERT(FitsInImmediateS32()); + u64 imm = ImmediateToU64(value); + return imm; +} + +u64 Argument::GetImmediateU64() const { + return ImmediateToU64(value); +} + +IR::Cond Argument::GetImmediateCond() const { + ASSERT(IsImmediate() && GetType() == IR::Type::Cond); + return value.GetCond(); +} + +bool Argument::IsInGpr() const { + if (IsImmediate()) + return false; + return HostLocIsGPR(*reg_alloc.ValueLocation(value.GetInst())); +} + +bool Argument::IsInFpr() const { + if (IsImmediate()) + return false; + return HostLocIsFPR(*reg_alloc.ValueLocation(value.GetInst())); +} + +bool Argument::IsInMemory() const { + if (IsImmediate()) + return false; + return HostLocIsSpill(*reg_alloc.ValueLocation(value.GetInst())); +} + +RegAlloc::ArgumentInfo RegAlloc::GetArgumentInfo(IR::Inst* inst) { + ArgumentInfo ret = {Argument{*this}, Argument{*this}, Argument{*this}, Argument{*this}}; + for (size_t i = 0; i < inst->NumArgs(); i++) { + const IR::Value& arg = inst->GetArg(i); + ret[i].value = arg; + if (!arg.IsImmediate() && !IsValuelessType(arg.GetType())) { + ASSERT_MSG(ValueLocation(arg.GetInst()), "argument must already be defined"); + LocInfo(*ValueLocation(arg.GetInst())).AddArgReference(); + } + } + return ret; +} + +Arm64Gen::ARM64Reg RegAlloc::UseGpr(Argument& arg) { + ASSERT(!arg.allocated); + arg.allocated = true; + return HostLocToReg64(UseImpl(arg.value, any_gpr)); +} + +Arm64Gen::ARM64Reg RegAlloc::UseFpr(Argument& arg) { + ASSERT(!arg.allocated); + arg.allocated = true; + return HostLocToFpr(UseImpl(arg.value, any_fpr)); +} + +//OpArg RegAlloc::UseOpArg(Argument& arg) { +// return UseGpr(arg); +//} + +void RegAlloc::Use(Argument& arg, HostLoc host_loc) { + ASSERT(!arg.allocated); + arg.allocated = true; + UseImpl(arg.value, {host_loc}); +} + +Arm64Gen::ARM64Reg RegAlloc::UseScratchGpr(Argument& arg) { + ASSERT(!arg.allocated); + arg.allocated = true; + return HostLocToReg64(UseScratchImpl(arg.value, any_gpr)); +} + +Arm64Gen::ARM64Reg RegAlloc::UseScratchFpr(Argument& arg) { + ASSERT(!arg.allocated); + arg.allocated = true; + return HostLocToFpr(UseScratchImpl(arg.value, any_fpr)); +} + +void RegAlloc::UseScratch(Argument& arg, HostLoc host_loc) { + ASSERT(!arg.allocated); + arg.allocated = true; + UseScratchImpl(arg.value, {host_loc}); +} + +void RegAlloc::DefineValue(IR::Inst* inst, const Arm64Gen::ARM64Reg& reg) { + ASSERT(IsVector(reg) || IsGPR(reg)); + HostLoc hostloc = static_cast<HostLoc>(DecodeReg(reg) + static_cast<size_t>(IsVector(reg) ?
HostLoc::Q0 : HostLoc::X0)); + DefineValueImpl(inst, hostloc); +} + +void RegAlloc::DefineValue(IR::Inst* inst, Argument& arg) { + ASSERT(!arg.allocated); + arg.allocated = true; + DefineValueImpl(inst, arg.value); +} + +void RegAlloc::Release(const Arm64Gen::ARM64Reg& reg) { + ASSERT(IsVector(reg) || IsGPR(reg)); + const HostLoc hostloc = static_cast<HostLoc>(DecodeReg(reg) + static_cast<size_t>(IsVector(reg) ? HostLoc::Q0 : HostLoc::X0)); + LocInfo(hostloc).ReleaseOne(); +} + +Arm64Gen::ARM64Reg RegAlloc::ScratchGpr(HostLocList desired_locations) { + return HostLocToReg64(ScratchImpl(desired_locations)); +} + +Arm64Gen::ARM64Reg RegAlloc::ScratchFpr(HostLocList desired_locations) { + return HostLocToFpr(ScratchImpl(desired_locations)); +} + +HostLoc RegAlloc::UseImpl(IR::Value use_value, HostLocList desired_locations) { + if (use_value.IsImmediate()) { + return LoadImmediate(use_value, ScratchImpl(desired_locations)); + } + + const IR::Inst* use_inst = use_value.GetInst(); + const HostLoc current_location = *ValueLocation(use_inst); + const size_t max_bit_width = LocInfo(current_location).GetMaxBitWidth(); + + const bool can_use_current_location = std::find(desired_locations.begin(), desired_locations.end(), current_location) != desired_locations.end(); + if (can_use_current_location) { + LocInfo(current_location).ReadLock(); + return current_location; + } + + if (LocInfo(current_location).IsLocked()) { + return UseScratchImpl(use_value, desired_locations); + } + + const HostLoc destination_location = SelectARegister(desired_locations); + if (max_bit_width > HostLocBitWidth(destination_location)) { + return UseScratchImpl(use_value, desired_locations); + } else if (CanExchange(destination_location, current_location)) { + Exchange(destination_location, current_location); + } else { + MoveOutOfTheWay(destination_location); + Move(destination_location, current_location); + } + LocInfo(destination_location).ReadLock(); + return destination_location; +} + +HostLoc RegAlloc::UseScratchImpl(IR::Value use_value, HostLocList desired_locations) { + if (use_value.IsImmediate()) { + return LoadImmediate(use_value, ScratchImpl(desired_locations)); + } + + const IR::Inst* use_inst = use_value.GetInst(); + const HostLoc current_location = *ValueLocation(use_inst); + const size_t bit_width = GetBitWidth(use_inst->GetType()); + + const bool can_use_current_location = std::find(desired_locations.begin(), desired_locations.end(), current_location) != desired_locations.end(); + if (can_use_current_location && !LocInfo(current_location).IsLocked()) { + if (!LocInfo(current_location).IsLastUse()) { + MoveOutOfTheWay(current_location); + } + LocInfo(current_location).WriteLock(); + return current_location; + } + + const HostLoc destination_location = SelectARegister(desired_locations); + MoveOutOfTheWay(destination_location); + CopyToScratch(bit_width, destination_location, current_location); + LocInfo(destination_location).WriteLock(); + return destination_location; +} + +HostLoc RegAlloc::ScratchImpl(HostLocList desired_locations) { + HostLoc location = SelectARegister(desired_locations); + MoveOutOfTheWay(location); + LocInfo(location).WriteLock(); + return location; +} + +void RegAlloc::HostCall(IR::Inst* result_def, std::optional<Argument::copyable_reference> arg0, + std::optional<Argument::copyable_reference> arg1, + std::optional<Argument::copyable_reference> arg2, + std::optional<Argument::copyable_reference> arg3, + std::optional<Argument::copyable_reference> arg4, + std::optional<Argument::copyable_reference> arg5, + std::optional<Argument::copyable_reference> arg6, + std::optional<Argument::copyable_reference> arg7) { + constexpr size_t args_count = 8; + constexpr std::array<HostLoc, args_count> args_hostloc = { ABI_PARAM1, ABI_PARAM2, ABI_PARAM3, ABI_PARAM4, ABI_PARAM5,
ABI_PARAM6, ABI_PARAM7, ABI_PARAM8 }; + const std::array<std::optional<Argument::copyable_reference>, args_count> args = {arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; + + static const std::vector<HostLoc> other_caller_save = [args_hostloc]() { + std::vector<HostLoc> ret(ABI_ALL_CALLER_SAVE.begin(), ABI_ALL_CALLER_SAVE.end()); + + for (auto hostloc : args_hostloc) + ret.erase(std::find(ret.begin(), ret.end(), hostloc)); + + return ret; + }(); + + for (size_t i = 0; i < args_count; i++) { + if (args[i]) { + UseScratch(*args[i], args_hostloc[i]); + } + } + + for (size_t i = 0; i < args_count; i++) { + if (!args[i]) { + // TODO: Force spill + ScratchGpr({args_hostloc[i]}); + } + } + + for (HostLoc caller_saved : other_caller_save) { + ScratchImpl({caller_saved}); + } + + if (result_def) { + DefineValueImpl(result_def, ABI_RETURN); + } +} + +void RegAlloc::EndOfAllocScope() { + for (auto& iter : hostloc_info) { + iter.ReleaseAll(); + } +} + +void RegAlloc::AssertNoMoreUses() { + ASSERT(std::all_of(hostloc_info.begin(), hostloc_info.end(), [](const auto& i) { return i.IsEmpty(); })); +} + +HostLoc RegAlloc::SelectARegister(HostLocList desired_locations) const { + std::vector<HostLoc> candidates = desired_locations; + + // Find all locations that have not been allocated. + const auto allocated_locs = std::partition(candidates.begin(), candidates.end(), [this](auto loc){ + return !this->LocInfo(loc).IsLocked(); + }); + candidates.erase(allocated_locs, candidates.end()); + ASSERT_MSG(!candidates.empty(), "All candidate registers have already been allocated"); + + // Selects the best location out of the available locations. + // TODO: Actually do LRU or something. Currently we just try to pick something without a value if possible. + + std::partition(candidates.begin(), candidates.end(), [this](auto loc){ + return this->LocInfo(loc).IsEmpty(); + }); + + return candidates.front(); +} + +std::optional<HostLoc> RegAlloc::ValueLocation(const IR::Inst* value) const { + for (size_t i = 0; i < hostloc_info.size(); i++) + if (hostloc_info[i].ContainsValue(value)) + return static_cast<HostLoc>(i); + + return std::nullopt; +} + +void RegAlloc::DefineValueImpl(IR::Inst* def_inst, HostLoc host_loc) { + ASSERT_MSG(!ValueLocation(def_inst), "def_inst has already been defined"); + LocInfo(host_loc).AddValue(def_inst); +} + +void RegAlloc::DefineValueImpl(IR::Inst* def_inst, const IR::Value& use_inst) { + ASSERT_MSG(!ValueLocation(def_inst), "def_inst has already been defined"); + + if (use_inst.IsImmediate()) { + HostLoc location = ScratchImpl(any_gpr); + DefineValueImpl(def_inst, location); + LoadImmediate(use_inst, location); + return; + } + + ASSERT_MSG(ValueLocation(use_inst.GetInst()), "use_inst must already be defined"); + HostLoc location = *ValueLocation(use_inst.GetInst()); + DefineValueImpl(def_inst, location); +} + +HostLoc RegAlloc::LoadImmediate(IR::Value imm, HostLoc host_loc) { + ASSERT_MSG(imm.IsImmediate(), "imm is not an immediate"); + + if (HostLocIsGPR(host_loc)) { + Arm64Gen::ARM64Reg reg = HostLocToReg64(host_loc); + u64 imm_value = ImmediateToU64(imm); + code.MOVI2R(reg, imm_value); + return host_loc; + } + + if (HostLocIsFPR(host_loc)) { + Arm64Gen::ARM64Reg reg = Arm64Gen::EncodeRegToDouble(HostLocToFpr(host_loc)); + u64 imm_value = ImmediateToU64(imm); + if (imm_value == 0) + code.fp_emitter.FMOV(reg, 0); + else { + code.EmitPatchLDR(reg, imm_value); + } + return host_loc; + } + + UNREACHABLE(); +} + +void RegAlloc::Move(HostLoc to, HostLoc from) { + const size_t bit_width = LocInfo(from).GetMaxBitWidth(); + + ASSERT(LocInfo(to).IsEmpty() && !LocInfo(from).IsLocked()); +
ASSERT(bit_width <= HostLocBitWidth(to)); + + if (LocInfo(from).IsEmpty()) { + return; + } + + EmitMove(bit_width, to, from); + + LocInfo(to) = std::exchange(LocInfo(from), {}); +} + +void RegAlloc::CopyToScratch(size_t bit_width, HostLoc to, HostLoc from) { + ASSERT(LocInfo(to).IsEmpty() && !LocInfo(from).IsEmpty()); + + EmitMove(bit_width, to, from); +} + +void RegAlloc::Exchange(HostLoc a, HostLoc b) { + ASSERT(!LocInfo(a).IsLocked() && !LocInfo(b).IsLocked()); + ASSERT(LocInfo(a).GetMaxBitWidth() <= HostLocBitWidth(b)); + ASSERT(LocInfo(b).GetMaxBitWidth() <= HostLocBitWidth(a)); + + if (LocInfo(a).IsEmpty()) { + Move(a, b); + return; + } + + if (LocInfo(b).IsEmpty()) { + Move(b, a); + return; + } + + EmitExchange(a, b); + + std::swap(LocInfo(a), LocInfo(b)); +} + +void RegAlloc::MoveOutOfTheWay(HostLoc reg) { + ASSERT(!LocInfo(reg).IsLocked()); + if (!LocInfo(reg).IsEmpty()) { + SpillRegister(reg); + } +} + +void RegAlloc::SpillRegister(HostLoc loc) { + ASSERT_MSG(HostLocIsRegister(loc), "Only registers can be spilled"); + ASSERT_MSG(!LocInfo(loc).IsEmpty(), "There is no need to spill unoccupied registers"); + ASSERT_MSG(!LocInfo(loc).IsLocked(), "Registers that have been allocated must not be spilt"); + + HostLoc new_loc = FindFreeSpill(); + Move(new_loc, loc); +} + +HostLoc RegAlloc::FindFreeSpill() const { + for (size_t i = static_cast<size_t>(HostLoc::FirstSpill); i < hostloc_info.size(); i++) { + HostLoc loc = static_cast<HostLoc>(i); + if (LocInfo(loc).IsEmpty()) + return loc; + } + + ASSERT_FALSE("All spill locations are full"); +} + +HostLocInfo& RegAlloc::LocInfo(HostLoc loc) { + ASSERT(loc != HostLoc::SP && loc != HostLoc::X28 && loc != HostLoc::X29 && loc != HostLoc::X30); + return hostloc_info[static_cast<size_t>(loc)]; +} + +const HostLocInfo& RegAlloc::LocInfo(HostLoc loc) const { + ASSERT(loc != HostLoc::SP && loc != HostLoc::X28 && loc != HostLoc::X29 && loc != HostLoc::X30); + return hostloc_info[static_cast<size_t>(loc)]; +} + +void RegAlloc::EmitMove(size_t bit_width, HostLoc to, HostLoc from) { + if (HostLocIsFPR(to) && HostLocIsFPR(from)) { + // bit_width == 128 + //mov(HostLocToFpr(to), HostLocToFpr(from)); + + ASSERT_FALSE("Unimplemented"); + } else if (HostLocIsGPR(to) && HostLocIsGPR(from)) { + ASSERT(bit_width != 128); + if (bit_width == 64) { + code.MOV(HostLocToReg64(to), HostLocToReg64(from)); + } else { + code.MOV(DecodeReg(HostLocToReg64(to)), DecodeReg(HostLocToReg64(from))); + } + } else if (HostLocIsFPR(to) && HostLocIsGPR(from)) { + ASSERT(bit_width != 128); + if (bit_width == 64) { + code.fp_emitter.FMOV(EncodeRegToDouble(HostLocToFpr(to)), HostLocToReg64(from)); + } else { + code.fp_emitter.FMOV(EncodeRegToSingle(HostLocToFpr(to)), DecodeReg(HostLocToReg64(from))); + } + } else if (HostLocIsGPR(to) && HostLocIsFPR(from)) { + ASSERT(bit_width != 128); + if (bit_width == 64) { + code.fp_emitter.FMOV(HostLocToReg64(to), EncodeRegToDouble(HostLocToFpr(from))); + } else { + code.fp_emitter.FMOV(DecodeReg(HostLocToReg64(to)), EncodeRegToSingle(HostLocToFpr(from))); + } + } else if (HostLocIsFPR(to) && HostLocIsSpill(from)) { + s32 spill_addr = spill_to_addr(from); + // ASSERT(spill_addr.getBit() >= bit_width); + code.fp_emitter.LDR(bit_width, Arm64Gen::INDEX_UNSIGNED, HostLocToFpr(to), Arm64Gen::X28, spill_addr); + } else if (HostLocIsSpill(to) && HostLocIsFPR(from)) { + s32 spill_addr = spill_to_addr(to); + // ASSERT(spill_addr.getBit() >= bit_width); + code.fp_emitter.STR(bit_width, Arm64Gen::INDEX_UNSIGNED, HostLocToFpr(from), Arm64Gen::X28, spill_addr); + } else if (HostLocIsGPR(to)
&& HostLocIsSpill(from)) { + ASSERT(bit_width != 128); + if (bit_width == 64) { + code.LDR(Arm64Gen::INDEX_UNSIGNED, HostLocToReg64(to), Arm64Gen::X28, spill_to_addr(from)); + } else { + code.LDR(Arm64Gen::INDEX_UNSIGNED, DecodeReg(HostLocToReg64(to)), Arm64Gen::X28, spill_to_addr(from)); + } + } else if (HostLocIsSpill(to) && HostLocIsGPR(from)) { + ASSERT(bit_width != 128); + if (bit_width == 64) { + code.STR(Arm64Gen::INDEX_UNSIGNED, HostLocToReg64(from), Arm64Gen::X28, spill_to_addr(to)); + } else { + code.STR(Arm64Gen::INDEX_UNSIGNED, DecodeReg(HostLocToReg64(from)), Arm64Gen::X28, spill_to_addr(to)); + } + } else { + ASSERT_FALSE("Invalid RegAlloc::EmitMove"); + } +} + +void RegAlloc::EmitExchange(HostLoc a, HostLoc b) { + if (HostLocIsGPR(a) && HostLocIsGPR(b)) { + // Is this the best way to do it? + code.EOR(HostLocToReg64(a), HostLocToReg64(a), HostLocToReg64(b)); + code.EOR(HostLocToReg64(b), HostLocToReg64(a), HostLocToReg64(b)); + code.EOR(HostLocToReg64(a), HostLocToReg64(a), HostLocToReg64(b)); + } else if (HostLocIsFPR(a) && HostLocIsFPR(b)) { + ASSERT_FALSE("Check your code: Exchanging vector registers is unnecessary"); + } else { + ASSERT_FALSE("Invalid RegAlloc::EmitExchange"); + } +} + +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/reg_alloc.h b/src/dynarmic/backend/A64/reg_alloc.h new file mode 100644 index 00000000..3eec7fa6 --- /dev/null +++ b/src/dynarmic/backend/A64/reg_alloc.h @@ -0,0 +1,167 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. + */ + +#pragma once + +#include <array> +#include <functional> +#include <optional> +#include <vector> + +#include <utility> + +#include "backend/A64/block_of_code.h" +#include "backend/A64/hostloc.h" +//#include "backend/A64/oparg.h" +#include "common/common_types.h" +#include "frontend/ir/cond.h" +#include "frontend/ir/microinstruction.h" +#include "frontend/ir/value.h" + +namespace Dynarmic::BackendA64 { + +class RegAlloc; + +struct HostLocInfo { +public: + bool IsLocked() const; + bool IsEmpty() const; + bool IsLastUse() const; + + void ReadLock(); + void WriteLock(); + void AddArgReference(); + void ReleaseOne(); + void ReleaseAll(); + + bool ContainsValue(const IR::Inst* inst) const; + size_t GetMaxBitWidth() const; + + void AddValue(IR::Inst* inst); + +private: + // Current instruction state + size_t is_being_used_count = 0; + bool is_scratch = false; + + // Block state + size_t current_references = 0; + size_t accumulated_uses = 0; + size_t total_uses = 0; + + // Value state + std::vector<IR::Inst*> values; + size_t max_bit_width = 0; +}; + +struct Argument { +public: + using copyable_reference = std::reference_wrapper<Argument>; + + IR::Type GetType() const; + bool IsImmediate() const; + bool IsVoid() const; + + bool FitsInImmediateU32() const; + bool FitsInImmediateS32() const; + + bool GetImmediateU1() const; + u8 GetImmediateU8() const; + u16 GetImmediateU16() const; + u32 GetImmediateU32() const; + u64 GetImmediateS32() const; + u64 GetImmediateU64() const; + IR::Cond GetImmediateCond() const; + + /// Is this value currently in a GPR? + bool IsInGpr() const; + /// Is this value currently in a FPR? + bool IsInFpr() const; + /// Is this value currently in memory?
+ bool IsInMemory() const; + +private: + friend class RegAlloc; + explicit Argument(RegAlloc& reg_alloc) : reg_alloc(reg_alloc) {} + + bool allocated = false; + RegAlloc& reg_alloc; + IR::Value value; +}; + +class RegAlloc final { +public: + using ArgumentInfo = std::array<Argument, IR::max_arg_count>; + + explicit RegAlloc(BlockOfCode& code, size_t num_spills, std::function<s32(HostLoc)> spill_to_addr) + : hostloc_info(NonSpillHostLocCount + num_spills), code(code), spill_to_addr(std::move(spill_to_addr)) {} + + ArgumentInfo GetArgumentInfo(IR::Inst* inst); + + Arm64Gen::ARM64Reg UseGpr(Argument& arg); + Arm64Gen::ARM64Reg UseFpr(Argument& arg); + //OpArg UseOpArg(Argument& arg); + void Use(Argument& arg, HostLoc host_loc); + + Arm64Gen::ARM64Reg UseScratchGpr(Argument& arg); + Arm64Gen::ARM64Reg UseScratchFpr(Argument& arg); + void UseScratch(Argument& arg, HostLoc host_loc); + + void DefineValue(IR::Inst* inst, const Arm64Gen::ARM64Reg& reg); + void DefineValue(IR::Inst* inst, Argument& arg); + + void Release(const Arm64Gen::ARM64Reg& reg); + + Arm64Gen::ARM64Reg ScratchGpr(HostLocList desired_locations = any_gpr); + Arm64Gen::ARM64Reg ScratchFpr(HostLocList desired_locations = any_fpr); + + void HostCall(IR::Inst* result_def = nullptr, std::optional<Argument::copyable_reference> arg0 = {}, + std::optional<Argument::copyable_reference> arg1 = {}, + std::optional<Argument::copyable_reference> arg2 = {}, + std::optional<Argument::copyable_reference> arg3 = {}, + std::optional<Argument::copyable_reference> arg4 = {}, + std::optional<Argument::copyable_reference> arg5 = {}, + std::optional<Argument::copyable_reference> arg6 = {}, + std::optional<Argument::copyable_reference> arg7 = {}); + + // TODO: Values in host flags + + void EndOfAllocScope(); + + void AssertNoMoreUses(); + +private: + friend struct Argument; + + HostLoc SelectARegister(HostLocList desired_locations) const; + std::optional<HostLoc> ValueLocation(const IR::Inst* value) const; + + HostLoc UseImpl(IR::Value use_value, HostLocList desired_locations); + HostLoc UseScratchImpl(IR::Value use_value, HostLocList desired_locations); + HostLoc ScratchImpl(HostLocList desired_locations); + void DefineValueImpl(IR::Inst* def_inst, HostLoc host_loc); + void DefineValueImpl(IR::Inst* def_inst, const IR::Value& use_inst); + + HostLoc LoadImmediate(IR::Value imm, HostLoc reg); + void Move(HostLoc to, HostLoc from); + void CopyToScratch(size_t bit_width, HostLoc to, HostLoc from); + void Exchange(HostLoc a, HostLoc b); + void MoveOutOfTheWay(HostLoc reg); + + void SpillRegister(HostLoc loc); + HostLoc FindFreeSpill() const; + + std::vector<HostLocInfo> hostloc_info; + HostLocInfo& LocInfo(HostLoc loc); + const HostLocInfo& LocInfo(HostLoc loc) const; + + BlockOfCode& code; + std::function<s32(HostLoc)> spill_to_addr; + void EmitMove(size_t bit_width, HostLoc to, HostLoc from); + void EmitExchange(HostLoc a, HostLoc b); +}; + +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/common/math_util.h b/src/dynarmic/common/math_util.h index 5c1f784c..3b278031 100644 --- a/src/dynarmic/common/math_util.h +++ b/src/dynarmic/common/math_util.h @@ -44,4 +44,9 @@ u8 RecipEstimate(u64 a); */ u8 RecipSqrtEstimate(u64 a); +template <typename T> +constexpr bool IsPow2(T imm){ + return imm > 0 && (imm & (imm - 1)) == 0; +} + } // namespace Dynarmic::Common