diff --git a/CMakeLists.txt b/CMakeLists.txt index adfea8cf..48339efc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,9 +8,25 @@ if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR) set(MASTER_PROJECT ON) endif() +# Add the module directory to the list of paths +list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/CMakeModules") + +# Arch detection +include(DetectArchitecture) +if (NOT DEFINED ARCHITECTURE) + message(FATAL_ERROR "Unsupported architecture encountered. Ending CMake generation.") +endif() +message(STATUS "Target architecture: ${ARCHITECTURE}") + +set(REQUIRES_NO_EXECUTE_SUPPORT OFF) +# Apple Silicon chips require W^X +if(APPLE AND ARCHITECTURE STREQUAL "arm64") + set(REQUIRES_NO_EXECUTE_SUPPORT ON) +endif() + # Dynarmic project options option(DYNARMIC_ENABLE_CPU_FEATURE_DETECTION "Turning this off causes dynarmic to assume the host CPU doesn't support anything later than SSE3" ON) -option(DYNARMIC_ENABLE_NO_EXECUTE_SUPPORT "Enables support for systems that require W^X" OFF) +option(DYNARMIC_ENABLE_NO_EXECUTE_SUPPORT "Enables support for systems that require W^X" ${REQUIRES_NO_EXECUTE_SUPPORT}) option(DYNARMIC_FATAL_ERRORS "Errors are fatal" OFF) option(DYNARMIC_IGNORE_ASSERTS "Ignore asserts" OFF) option(DYNARMIC_TESTS "Build tests" ${MASTER_PROJECT}) @@ -39,9 +55,6 @@ if ("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_BINARY_DIR}") message(SEND_ERROR "In-source builds are not allowed.") endif() -# Add the module directory to the list of paths -list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/CMakeModules") - # Compiler flags if (MSVC) set(DYNARMIC_CXX_FLAGS @@ -105,13 +118,6 @@ else() endif() endif() -# Arch detection -include(DetectArchitecture) -if (NOT DEFINED ARCHITECTURE) - message(FATAL_ERROR "Unsupported architecture encountered. 
Ending CMake generation.") -endif() -message(STATUS "Target architecture: ${ARCHITECTURE}") - # Include Boost if (NOT TARGET boost) if (NOT Boost_INCLUDE_DIRS) diff --git a/src/dynarmic/CMakeLists.txt b/src/dynarmic/CMakeLists.txt index 8ae02cf3..6dd194cf 100644 --- a/src/dynarmic/CMakeLists.txt +++ b/src/dynarmic/CMakeLists.txt @@ -365,55 +365,66 @@ if (ARCHITECTURE STREQUAL "x86_64") else() target_sources(dynarmic PRIVATE backend/x64/exception_handler_generic.cpp) endif() + elseif(ARCHITECTURE STREQUAL "arm64") - target_link_libraries(dynarmic PRIVATE $) - target_sources(dynarmic PRIVATE - backend/arm64/a32_jitstate.cpp - backend/arm64/a32_jitstate.h - backend/arm64/abi.cpp - backend/arm64/abi.h - backend/arm64/devirtualize.h - backend/arm64/emit_arm64.cpp - backend/arm64/emit_arm64.h - backend/arm64/emit_arm64_a32.cpp - backend/arm64/emit_arm64_a32_coprocessor.cpp - backend/arm64/emit_arm64_a32_memory.cpp - backend/arm64/emit_arm64_a64.cpp - backend/arm64/emit_arm64_a64_memory.cpp - backend/arm64/emit_arm64_cryptography.cpp - backend/arm64/emit_arm64_data_processing.cpp - backend/arm64/emit_arm64_floating_point.cpp - backend/arm64/emit_arm64_packed.cpp - backend/arm64/emit_arm64_saturation.cpp - backend/arm64/emit_arm64_vector.cpp - backend/arm64/emit_arm64_vector_floating_point.cpp - backend/arm64/emit_arm64_vector_saturation.cpp - backend/arm64/emit_context.h - backend/arm64/exclusive_monitor.cpp - backend/arm64/fpsr_manager.cpp - backend/arm64/fpsr_manager.h - backend/arm64/reg_alloc.cpp - backend/arm64/reg_alloc.h - backend/arm64/stack_layout.h - common/spin_lock_arm64.cpp - common/spin_lock_arm64.h + backend/A64/emitter/a64_emitter.cpp + backend/A64/emitter/a64_emitter.h + backend/A64/emitter/arm_common.h + backend/A64/emitter/code_block.h + # backend/A64/a64_emit_a64.cpp + # backend/A64/a64_emit_a64.h + # backend/A64/a64_exclusive_monitor.cpp + # backend/A64/a64_interface.cpp + # backend/A64/a64_jitstate.cpp + # backend/A64/a64_jitstate.h + backend/A64/abi.cpp + backend/A64/abi.h + backend/A64/block_of_code.cpp + backend/A64/block_of_code.h + backend/A64/block_range_information.cpp + backend/A64/block_range_information.h + backend/A64/callback.cpp + backend/A64/callback.h + backend/A64/constant_pool.cpp + backend/A64/constant_pool.h + backend/A64/devirtualize.h + backend/A64/emit_a64.cpp + backend/A64/emit_a64.h + # backend/A64/emit_a64_aes.cpp + # backend/A64/emit_a64_crc32.cpp + backend/A64/emit_a64_data_processing.cpp + backend/A64/emit_a64_floating_point.cpp + backend/A64/emit_a64_packed.cpp + backend/A64/emit_a64_saturation.cpp + # backend/A64/emit_a64_sm4.cpp + # backend/A64/emit_a64_vector.cpp + # backend/A64/emit_a64_vector_floating_point.cpp + backend/A64/exception_handler.h + backend/A64/hostloc.cpp + backend/A64/hostloc.h + backend/A64/jitstate_info.h + backend/A64/opcodes.inc + backend/A64/perf_map.cpp + backend/A64/perf_map.h + backend/A64/reg_alloc.cpp + backend/A64/reg_alloc.h ) - + if ("A32" IN_LIST DYNARMIC_FRONTENDS) target_sources(dynarmic PRIVATE - backend/arm64/a32_address_space.cpp - backend/arm64/a32_address_space.h - backend/arm64/a32_core.h - backend/arm64/a32_interface.cpp - - # Move this to the list below when implemented - backend/arm64/a64_interface.cpp + backend/A64/a32_emit_a64.cpp + backend/A64/a32_emit_a64.h + backend/A64/a32_interface.cpp + backend/A64/a32_jitstate.cpp + backend/A64/a32_jitstate.h ) endif() - - if ("A64" IN_LIST DYNARMIC_FRONTENDS) - message(FATAL_ERROR "TODO: Unimplemented frontend for this host architecture") + + if (UNIX) + 
target_sources(dynarmic PRIVATE backend/A64/exception_handler_posix.cpp) + else() + target_sources(dynarmic PRIVATE backend/A64/exception_handler_generic.cpp) endif() else() message(FATAL_ERROR "Unsupported architecture") diff --git a/src/dynarmic/backend/A64/a32_emit_a64.cpp b/src/dynarmic/backend/A64/a32_emit_a64.cpp new file mode 100644 index 00000000..192f0838 --- /dev/null +++ b/src/dynarmic/backend/A64/a32_emit_a64.cpp @@ -0,0 +1,1594 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. + */ + +#include +#include +#include +#include + +#include +#include + +#include + +#include "backend/A64/a32_emit_a64.h" +#include "backend/A64/a32_jitstate.h" +#include "backend/A64/abi.h" +#include "backend/A64/block_of_code.h" +#include "backend/A64/devirtualize.h" +#include "backend/A64/emit_a64.h" +#include "backend/A64/emitter/a64_emitter.h" +#include "backend/A64/perf_map.h" +#include "common/assert.h" +#include "common/bit_util.h" +#include "common/common_types.h" +#include "common/scope_exit.h" +#include "common/variant_util.h" +#include "frontend/A32/location_descriptor.h" +#include "frontend/A32/types.h" +#include "frontend/ir/basic_block.h" +#include "frontend/ir/microinstruction.h" +#include "frontend/ir/opcodes.h" + +// TODO: Have ARM flags in host flags and not have them use up GPR registers unless necessary. +// TODO: Actually implement that proper instruction selector you've always wanted to sweetheart. + +namespace Dynarmic::BackendA64 { + +// Note that unlike the x64 backend these only returns ONLY the offset to register and not the address! +static size_t MJitStateReg(A32::Reg reg) { + return offsetof(A32JitState, Reg) + sizeof(u32) * static_cast(reg); +} + +static size_t MJitStateExtReg(A32::ExtReg reg) { + if (A32::IsSingleExtReg(reg)) { + size_t index = static_cast(reg) - static_cast(A32::ExtReg::S0); + return offsetof(A32JitState, ExtReg) + sizeof(u32) * index; + } + if (A32::IsDoubleExtReg(reg)) { + size_t index = static_cast(reg) - static_cast(A32::ExtReg::D0); + return offsetof(A32JitState, ExtReg) + sizeof(u64) * index; + } + ASSERT_FALSE("Should never happen."); +} + +A32EmitContext::A32EmitContext(RegAlloc& reg_alloc, IR::Block& block) : EmitContext(reg_alloc, block) {} + +A32::LocationDescriptor A32EmitContext::Location() const { + return A32::LocationDescriptor{block.Location()}; +} + +bool A32EmitContext::IsSingleStep() const { + return A32::LocationDescriptor{block.Location()}.SingleStepping(); +} + +FP::RoundingMode A32EmitContext::FPSCR_RMode() const { + return Location().FPSCR().RMode(); +} + +u32 A32EmitContext::FPCR() const { + return Location().FPSCR().Value(); +} + +bool A32EmitContext::FPSCR_FTZ() const { + return Location().FPSCR().FTZ(); +} + +bool A32EmitContext::FPSCR_DN() const { + return Location().FPSCR().DN(); +} + +std::ptrdiff_t A32EmitContext::GetInstOffset(IR::Inst* inst) const { + return std::distance(block.begin(), IR::Block::iterator(inst)); +} + +A32EmitA64::A32EmitA64(BlockOfCode& code, A32::UserConfig config, A32::Jit* jit_interface) + : EmitA64(code), config(std::move(config)), jit_interface(jit_interface) { + exception_handler.Register(code, [this](CodePtr PC){FastmemCallback(PC);}); + GenMemoryAccessors(); + GenTerminalHandlers(); + code.PreludeComplete(); + ClearFastDispatchTable(); + fastmem_patch_info.clear(); +} + +A32EmitA64::~A32EmitA64() = default; + 
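// Usage note: generated code keeps the A32JitState pointer in X28, so the offsets returned by
// MJitStateReg / MJitStateExtReg are consumed directly as unsigned-offset loads and stores,
// e.g. EmitA32GetRegister further down emits
//     code.LDR(INDEX_UNSIGNED, result, X28, MJitStateReg(reg));
// whereas the x64 backend's equivalent helper returns a complete address expression.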
+A32EmitA64::BlockDescriptor A32EmitA64::Emit(IR::Block& block) { + code.EnableWriting(); + SCOPE_EXIT { + code.DisableWriting(); + }; + + RegAlloc reg_alloc{code, A32JitState::SpillCount, SpillToOpArg}; + A32EmitContext ctx{reg_alloc, block}; + + const u8* entrypoint = code.AlignCode16(); + + // Start emitting. + EmitCondPrelude(ctx); + + for (auto iter = block.begin(); iter != block.end(); ++iter) { + IR::Inst* inst = &*iter; + + // Call the relevant Emit* member function. + switch (inst->GetOpcode()) { + +#define OPCODE(name, type, ...) \ + case IR::Opcode::name: \ + A32EmitA64::Emit##name(ctx, inst); \ + break; +#define A32OPC(name, type, ...) \ + case IR::Opcode::A32##name: \ + A32EmitA64::EmitA32##name(ctx, inst); \ + break; +#define A64OPC(...) +#include "backend/A64/opcodes.inc" +#undef OPCODE +#undef A32OPC +#undef A64OPC + + default: + ASSERT_FALSE("Invalid opcode: {}", inst->GetOpcode()); + break; + } + + reg_alloc.EndOfAllocScope(); + } + + reg_alloc.AssertNoMoreUses(); + + EmitAddCycles(block.CycleCount()); + EmitA64::EmitTerminal(block.GetTerminal(), ctx.Location().SetSingleStepping(false), ctx.IsSingleStep()); + code.BRK(0); + code.PatchConstPool(); + code.FlushIcacheSection(entrypoint, code.GetCodePtr()); + + const size_t size = static_cast(code.GetCodePtr() - entrypoint); + + const A32::LocationDescriptor descriptor{block.Location()}; + const A32::LocationDescriptor end_location{block.EndLocation()}; + + const auto range = boost::icl::discrete_interval::closed(descriptor.PC(), end_location.PC() - 1); + block_ranges.AddRange(range, descriptor); + + return RegisterBlock(descriptor, entrypoint, size); +} + +void A32EmitA64::ClearCache() { + EmitA64::ClearCache(); + block_ranges.ClearCache(); + ClearFastDispatchTable(); + fastmem_patch_info.clear(); +} + +void A32EmitA64::InvalidateCacheRanges(const boost::icl::interval_set& ranges) { + InvalidateBasicBlocks(block_ranges.InvalidateRanges(ranges)); +} + +void A32EmitA64::EmitCondPrelude(const A32EmitContext& ctx) { + if (ctx.block.GetCondition() == IR::Cond::AL) { + ASSERT(!ctx.block.HasConditionFailedLocation()); + return; + } + + ASSERT(ctx.block.HasConditionFailedLocation()); + + FixupBranch pass = EmitCond(ctx.block.GetCondition()); + EmitAddCycles(ctx.block.ConditionFailedCycleCount()); + EmitTerminal(IR::Term::LinkBlock{ctx.block.ConditionFailedLocation()}, ctx.block.Location(), ctx.IsSingleStep()); + code.SetJumpTarget(pass); +} + +void A32EmitA64::ClearFastDispatchTable() { + if (config.enable_fast_dispatch) { + fast_dispatch_table.fill({}); + } +} + +void A32EmitA64::GenMemoryAccessors() { + code.AlignCode16(); + read_memory_8 = code.GetCodePtr(); + // Push lr and fp onto the stack + code.ABI_PushRegisters(0x60000000); + code.ADD(X29, SP, 0); + ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, ABI_RETURN); + Devirtualize<&A32::UserCallbacks::MemoryRead8>(config.callbacks).EmitCall(code); + ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, ABI_RETURN); + code.ABI_PopRegisters(0x60000000); + code.RET(); + PerfMapRegister(read_memory_8, code.GetCodePtr(), "a32_read_memory_8"); + + code.AlignCode16(); + read_memory_16 = code.GetCodePtr(); + // Push lr and fp onto the stack + code.ABI_PushRegisters(0x60000000); + code.ADD(X29, SP, 0); + ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, ABI_RETURN); + Devirtualize<&A32::UserCallbacks::MemoryRead16>(config.callbacks).EmitCall(code); + ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, ABI_RETURN); + code.ABI_PopRegisters(0x60000000); + code.RET(); + 
PerfMapRegister(read_memory_16, code.GetCodePtr(), "a32_read_memory_16"); + + code.AlignCode16(); + read_memory_32 = code.GetCodePtr(); + // Push lr and fp onto the stack + code.ABI_PushRegisters(0x60000000); + code.ADD(X29, SP, 0); + ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, ABI_RETURN); + Devirtualize<&A32::UserCallbacks::MemoryRead32>(config.callbacks).EmitCall(code); + ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, ABI_RETURN); + code.ABI_PopRegisters(0x60000000); + code.RET(); + PerfMapRegister(read_memory_32, code.GetCodePtr(), "a32_read_memory_32"); + + code.AlignCode16(); + read_memory_64 = code.GetCodePtr(); + // Push lr and fp onto the stack + code.ABI_PushRegisters(0x60000000); + code.ADD(X29, SP, 0); + ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, ABI_RETURN); + Devirtualize<&A32::UserCallbacks::MemoryRead64>(config.callbacks).EmitCall(code); + ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, ABI_RETURN); + code.ABI_PopRegisters(0x60000000); + code.RET(); + PerfMapRegister(read_memory_64, code.GetCodePtr(), "a32_read_memory_64"); + + code.AlignCode16(); + write_memory_8 = code.GetCodePtr(); + // Push lr and fp onto the stack + code.ABI_PushRegisters(0x60000000); + code.ADD(X29, SP, 0); + ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, ABI_RETURN); + Devirtualize<&A32::UserCallbacks::MemoryWrite8>(config.callbacks).EmitCall(code); + ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, ABI_RETURN); + code.ABI_PopRegisters(0x60000000); + code.RET(); + PerfMapRegister(write_memory_8, code.GetCodePtr(), "a32_write_memory_8"); + + code.AlignCode16(); + write_memory_16 = code.GetCodePtr(); + // Push lr and fp onto the stack + code.ABI_PushRegisters(0x60000000); + code.ADD(X29, SP, 0); + ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, ABI_RETURN); + Devirtualize<&A32::UserCallbacks::MemoryWrite16>(config.callbacks).EmitCall(code); + ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, ABI_RETURN); + code.ABI_PopRegisters(0x60000000); + code.RET(); + PerfMapRegister(write_memory_16, code.GetCodePtr(), "a32_write_memory_16"); + + code.AlignCode16(); + write_memory_32 = code.GetCodePtr(); + // Push lr and fp onto the stack + code.ABI_PushRegisters(0x60000000); + code.ADD(X29, SP, 0); + ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, ABI_RETURN); + Devirtualize<&A32::UserCallbacks::MemoryWrite32>(config.callbacks).EmitCall(code); + ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, ABI_RETURN); + code.ABI_PopRegisters(0x60000000); + code.RET(); + PerfMapRegister(write_memory_32, code.GetCodePtr(), "a32_write_memory_32"); + + code.AlignCode16(); + write_memory_64 = code.GetCodePtr(); + // Push lr and fp onto the stack + code.ABI_PushRegisters(0x60000000); + code.ADD(X29, SP, 0); + ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, ABI_RETURN); + Devirtualize<&A32::UserCallbacks::MemoryWrite64>(config.callbacks).EmitCall(code); + ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, ABI_RETURN); + code.ABI_PopRegisters(0x60000000); + code.RET(); + PerfMapRegister(write_memory_64, code.GetCodePtr(), "a32_write_memory_64"); +} + +void A32EmitA64::GenTerminalHandlers() { + const ARM64Reg fast_dispatch_entry_reg = X19; + const ARM64Reg location_descriptor_reg = X20; + + // PC ends up in fast_dispatch_entry_reg, location_descriptor ends up in location_descriptor_reg. 
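// The lambda below rebuilds the 64-bit location descriptor the same way IREmitter::PushRSB
// packs it: upper_location_descriptor in the high 32 bits ORed with the current PC in the
// low 32 bits, i.e. descriptor = (u64(upper) << 32) | PC.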
+ const auto calculate_location_descriptor = [this](ARM64Reg fast_dispatch_entry_reg_, ARM64Reg location_descriptor_reg_) { + // This calculation has to match up with IREmitter::PushRSB + code.LDR(INDEX_UNSIGNED, DecodeReg(location_descriptor_reg_), X28, offsetof(A32JitState, upper_location_descriptor)); + code.LDR(INDEX_UNSIGNED, DecodeReg(fast_dispatch_entry_reg_), X28, MJitStateReg(A32::Reg::PC)); + code.ORR(location_descriptor_reg_, fast_dispatch_entry_reg_, location_descriptor_reg_, ArithOption{location_descriptor_reg_, ST_LSL, 32}); + }; + + FixupBranch fast_dispatch_cache_miss, rsb_cache_miss; + + code.AlignCode16(); + terminal_handler_pop_rsb_hint = code.GetCodePtr(); + calculate_location_descriptor(fast_dispatch_entry_reg, location_descriptor_reg); + code.LDR(INDEX_UNSIGNED, DecodeReg(code.ABI_SCRATCH1), X28, offsetof(A32JitState, rsb_ptr)); + code.SUBI2R(code.ABI_SCRATCH1, DecodeReg(code.ABI_SCRATCH1), 1); + code.ANDI2R(code.ABI_SCRATCH1, DecodeReg(code.ABI_SCRATCH1), u32(A32JitState::RSBPtrMask)); + code.STR(INDEX_UNSIGNED, DecodeReg(code.ABI_SCRATCH1), X28, offsetof(A32JitState, rsb_ptr)); + + // cmp(location_descriptor_reg, qword[r15 + offsetof(A32JitState, rsb_location_descriptors) + rsb_ptr * sizeof(u64)]); + code.ADD(code.ABI_SCRATCH1, X28, code.ABI_SCRATCH1, ArithOption{code.ABI_SCRATCH1, ST_LSL, 3}); + code.LDR(INDEX_UNSIGNED, X8, code.ABI_SCRATCH1, offsetof(A32JitState, rsb_location_descriptors)); + code.CMP(location_descriptor_reg, X8); + if (config.enable_fast_dispatch) { + rsb_cache_miss = code.B(CC_NEQ); + } else { + code.B(CC_NEQ, code.GetReturnFromRunCodeAddress()); + } + code.LDR(INDEX_UNSIGNED, code.ABI_SCRATCH1, code.ABI_SCRATCH1, offsetof(A32JitState, rsb_codeptrs)); + code.BR(code.ABI_SCRATCH1); + PerfMapRegister(terminal_handler_pop_rsb_hint, code.GetCodePtr(), "a32_terminal_handler_pop_rsb_hint"); + + if (config.enable_fast_dispatch) { + terminal_handler_fast_dispatch_hint = code.AlignCode16(); + calculate_location_descriptor(fast_dispatch_entry_reg, location_descriptor_reg); + code.SetJumpTarget(rsb_cache_miss); + code.MOVI2R(code.ABI_SCRATCH1, reinterpret_cast(fast_dispatch_table.data())); + code.CRC32CW(DecodeReg(fast_dispatch_entry_reg), DecodeReg(fast_dispatch_entry_reg), DecodeReg(code.ABI_SCRATCH1)); + code.ANDI2R(fast_dispatch_entry_reg, fast_dispatch_entry_reg, fast_dispatch_table_mask); + code.ADD(fast_dispatch_entry_reg, fast_dispatch_entry_reg, code.ABI_SCRATCH1); + + code.LDR(INDEX_UNSIGNED, code.ABI_SCRATCH1, fast_dispatch_entry_reg, offsetof(FastDispatchEntry, location_descriptor)); + code.CMP(location_descriptor_reg, code.ABI_SCRATCH1); + fast_dispatch_cache_miss = code.B(CC_NEQ); + code.LDR(INDEX_UNSIGNED, code.ABI_SCRATCH1, fast_dispatch_entry_reg, offsetof(FastDispatchEntry, code_ptr)); + code.BR(code.ABI_SCRATCH1); + + code.SetJumpTarget(fast_dispatch_cache_miss); + code.STR(INDEX_UNSIGNED, location_descriptor_reg, fast_dispatch_entry_reg, offsetof(FastDispatchEntry, location_descriptor) ); + code.LookupBlock(); + code.STR(INDEX_UNSIGNED, code.ABI_RETURN, fast_dispatch_entry_reg, offsetof(FastDispatchEntry, code_ptr)); + code.BR(code.ABI_RETURN); + PerfMapRegister(terminal_handler_fast_dispatch_hint, code.GetCodePtr(), "a32_terminal_handler_fast_dispatch_hint"); + + code.AlignCode16(); + fast_dispatch_table_lookup = reinterpret_cast(code.GetWritableCodePtr()); + code.MOVI2R(code.ABI_PARAM2, reinterpret_cast(fast_dispatch_table.data())); + code.CRC32CW(DecodeReg(code.ABI_PARAM1), DecodeReg(code.ABI_PARAM1), DecodeReg(code.ABI_PARAM2)); + 
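// As in the fast-dispatch terminal handler above, the CRC32C hash is masked with
// fast_dispatch_table_mask and added to the table base to form the address of the
// FastDispatchEntry slot for this location descriptor.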
code.ANDI2R(DecodeReg(code.ABI_PARAM1), DecodeReg(code.ABI_PARAM1), fast_dispatch_table_mask); + code.ADD(code.ABI_RETURN, code.ABI_PARAM1, code.ABI_PARAM2); + code.RET(); + } +} + + +void A32EmitA64::EmitA32GetRegister(A32EmitContext& ctx, IR::Inst* inst) { + A32::Reg reg = inst->GetArg(0).GetA32RegRef(); + + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.ScratchGpr()); + code.LDR(INDEX_UNSIGNED, result, X28, MJitStateReg(reg)); + ctx.reg_alloc.DefineValue(inst, result); +} + +void A32EmitA64::EmitA32GetExtendedRegister32(A32EmitContext& ctx, IR::Inst* inst) { + A32::ExtReg reg = inst->GetArg(0).GetA32ExtRegRef(); + ASSERT(A32::IsSingleExtReg(reg)); + + ARM64Reg result = ctx.reg_alloc.ScratchFpr(); + code.fp_emitter.LDR(32, INDEX_UNSIGNED, result, X28, MJitStateExtReg(reg)); + ctx.reg_alloc.DefineValue(inst, result); +} + +void A32EmitA64::EmitA32GetExtendedRegister64(A32EmitContext& ctx, IR::Inst* inst) { + A32::ExtReg reg = inst->GetArg(0).GetA32ExtRegRef(); + ASSERT(A32::IsDoubleExtReg(reg)); + + ARM64Reg result = ctx.reg_alloc.ScratchFpr(); + code.fp_emitter.LDR(64, INDEX_UNSIGNED, result, X28, MJitStateExtReg(reg)); + ctx.reg_alloc.DefineValue(inst, result); +} + +void A32EmitA64::EmitA32SetRegister(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + A32::Reg reg = inst->GetArg(0).GetA32RegRef(); + if (args[1].IsInFpr()) { + Arm64Gen::ARM64Reg to_store = ctx.reg_alloc.UseFpr(args[1]); + code.fp_emitter.STR(32, INDEX_UNSIGNED, to_store, X28, MJitStateReg(reg)); + } else { + Arm64Gen::ARM64Reg to_store = DecodeReg(ctx.reg_alloc.UseGpr(args[1])); + code.STR(INDEX_UNSIGNED, to_store, X28, MJitStateReg(reg)); + } +} + +void A32EmitA64::EmitA32SetExtendedRegister32(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + A32::ExtReg reg = inst->GetArg(0).GetA32ExtRegRef(); + ASSERT(A32::IsSingleExtReg(reg)); + if (args[1].IsInFpr()) { + ARM64Reg to_store = ctx.reg_alloc.UseFpr(args[1]); + code.fp_emitter.STR(32, INDEX_UNSIGNED, to_store, X28, MJitStateExtReg(reg)); + } else { + ARM64Reg to_store = DecodeReg(ctx.reg_alloc.UseGpr(args[1])); + code.STR(INDEX_UNSIGNED, to_store, X28, MJitStateExtReg(reg)); + } +} + +void A32EmitA64::EmitA32SetExtendedRegister64(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + A32::ExtReg reg = inst->GetArg(0).GetA32ExtRegRef(); + ASSERT(A32::IsDoubleExtReg(reg)); + if (args[1].IsInFpr()) { + ARM64Reg to_store = ctx.reg_alloc.UseFpr(args[1]); + code.fp_emitter.STR(64, INDEX_UNSIGNED, to_store, X28, MJitStateExtReg(reg)); + } + else { + ARM64Reg to_store = ctx.reg_alloc.UseGpr(args[1]); + code.STR(INDEX_UNSIGNED, to_store, X28, MJitStateExtReg(reg)); + } +} + +static u32 GetCpsrImpl(A32JitState* jit_state) { + return jit_state->Cpsr(); +} + +void A32EmitA64::EmitA32GetCpsr(A32EmitContext& ctx, IR::Inst* inst) { + // TODO:Inline + ctx.reg_alloc.HostCall(inst); + code.MOV(code.ABI_PARAM1, X28); + code.QuickCallFunction(&GetCpsrImpl); +} + +static void SetCpsrImpl(u32 value, A32JitState* jit_state) { + jit_state->SetCpsr(value); +} + +void A32EmitA64::EmitA32SetCpsr(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + // TODO:Inline + ctx.reg_alloc.HostCall(nullptr, args[0]); + + // Use an unused HostCall register + ARM64Reg host_fpsr = X9; + + if (config.always_little_endian) { + code.ANDI2R(code.ABI_PARAM1, code.ABI_PARAM1, 0xFFFFFDFF, ctx.reg_alloc.ScratchGpr()); + } + + // Since this is one of the 
only places where the ~sticky~ + // guest's Q flag can be cleared it is also a great place to clear the host's Q flag + code.MRS(host_fpsr, FIELD_FPSR); + code.ANDI2R(host_fpsr, host_fpsr, ~(1 << 27)); + code._MSR(FIELD_FPSR, host_fpsr); + + code.MOV(code.ABI_PARAM2, X28); + code.QuickCallFunction(&SetCpsrImpl); +} + +void A32EmitA64::EmitA32SetCpsrNZCVRaw(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ARM64Reg a = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[0])); + + code.ANDI2R(a, a, 0xF0000000); + code.STR(INDEX_UNSIGNED, a, X28, offsetof(A32JitState, cpsr_nzcv)); +} + +void A32EmitA64::EmitA32SetCpsrNZCV(A32EmitContext& ctx, IR::Inst* inst) { + EmitA32SetCpsrNZCVRaw(ctx, inst); +} + +void A32EmitA64::EmitA32SetCpsrNZCVQ(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ARM64Reg host_fpsr = ctx.reg_alloc.ScratchGpr(); + if (args[0].IsImmediate()) { + u32 imm = args[0].GetImmediateU32(); + ARM64Reg a = DecodeReg(ctx.reg_alloc.ScratchGpr()); + + code.MOVI2R(a, u32(imm & 0xF0000000)); + code.STR(INDEX_UNSIGNED, a, X28, offsetof(A32JitState, cpsr_nzcv)); + code.MOVI2R(a, u8((imm & 0x08000000) != 0 ? 1 : 0)); + code.STR(INDEX_UNSIGNED, a, X28, offsetof(A32JitState, cpsr_q)); + } else { + ARM64Reg a = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[0])); + ARM64Reg q = DecodeReg(ctx.reg_alloc.ScratchGpr()); + + code.UBFX(q, a, 27, 1); + code.STR(INDEX_UNSIGNED, q, X28, offsetof(A32JitState, cpsr_q)); + code.ANDI2R(a, a, 0xF0000000); + code.STR(INDEX_UNSIGNED, a, X28, offsetof(A32JitState, cpsr_nzcv)); + } + + // Since this is one of the only places where the ~sticky~ + // guest's Q flag can be cleared it is also a great place to clear the host's Q flag. + // TODO : possibly a better job at explaining. 
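// On AArch64 the cumulative saturation (sticky) flag is FPSR.QC, bit 27, which is what the
// ~(1 << 27) mask below clears; this keeps the host's sticky state in step with the guest
// Q flag that was just written above.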
+ code.MRS(host_fpsr, FIELD_FPSR); + code.ANDI2R(host_fpsr, host_fpsr, ~(1 << 27)); + code._MSR(FIELD_FPSR, host_fpsr); +} + +void A32EmitA64::EmitA32GetNFlag(A32EmitContext& ctx, IR::Inst* inst) { + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.ScratchGpr()); + code.LDR(INDEX_UNSIGNED, result, X28, offsetof(A32JitState, cpsr_nzcv)); + code.UBFX(result, result, 31, 1); + ctx.reg_alloc.DefineValue(inst, result); +} + +void A32EmitA64::EmitA32SetNFlag(A32EmitContext& ctx, IR::Inst* inst) { + constexpr size_t flag_bit = 31; + constexpr u32 flag_mask = 1u << flag_bit; + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + Arm64Gen::ARM64Reg nzcv = DecodeReg(ctx.reg_alloc.ScratchGpr()); + + code.LDR(INDEX_UNSIGNED, nzcv, X28, offsetof(A32JitState, cpsr_nzcv)); + if (args[0].IsImmediate()) { + if (args[0].GetImmediateU1()) { + code.ORRI2R(nzcv, nzcv, flag_mask); + } else { + code.ANDI2R(nzcv, nzcv, ~flag_mask); + } + } else { + Arm64Gen::ARM64Reg to_store = DecodeReg(ctx.reg_alloc.UseGpr(args[0])); + + code.BFI(nzcv, to_store, flag_bit, 1); + } + code.STR(INDEX_UNSIGNED, nzcv, X28, offsetof(A32JitState, cpsr_nzcv)); +} + +void A32EmitA64::EmitA32GetZFlag(A32EmitContext& ctx, IR::Inst* inst) { + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.ScratchGpr()); + code.LDR(INDEX_UNSIGNED, result, X28, offsetof(A32JitState, cpsr_nzcv)); + code.UBFX(result, result, 30, 1); + ctx.reg_alloc.DefineValue(inst, result); +} + +void A32EmitA64::EmitA32SetZFlag(A32EmitContext& ctx, IR::Inst* inst) { + constexpr size_t flag_bit = 30; + constexpr u32 flag_mask = 1u << flag_bit; + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + Arm64Gen::ARM64Reg nzcv = DecodeReg(ctx.reg_alloc.ScratchGpr()); + + code.LDR(INDEX_UNSIGNED, nzcv, X28, offsetof(A32JitState, cpsr_nzcv)); + if (args[0].IsImmediate()) { + if (args[0].GetImmediateU1()) { + code.ORRI2R(nzcv, nzcv, flag_mask); + } else { + code.ANDI2R(nzcv, nzcv, ~flag_mask); + } + } else { + Arm64Gen::ARM64Reg to_store = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[0])); + + code.BFI(nzcv, to_store, flag_bit, 1); + } + code.STR(INDEX_UNSIGNED, nzcv, X28, offsetof(A32JitState, cpsr_nzcv)); +} + +void A32EmitA64::EmitA32SetCheckBit(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const ARM64Reg to_store = DecodeReg(ctx.reg_alloc.UseGpr(args[0])); + code.STRB(INDEX_UNSIGNED, to_store, X28, offsetof(A32JitState, check_bit)); +} + +void A32EmitA64::EmitA32GetCFlag(A32EmitContext& ctx, IR::Inst* inst) { + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.ScratchGpr()); + code.LDR(INDEX_UNSIGNED, result, X28, offsetof(A32JitState, cpsr_nzcv)); + code.UBFX(result, result, 29, 1); + ctx.reg_alloc.DefineValue(inst, result); +} + +void A32EmitA64::EmitA32SetCFlag(A32EmitContext& ctx, IR::Inst* inst) { + constexpr size_t flag_bit = 29; + constexpr u32 flag_mask = 1u << flag_bit; + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + Arm64Gen::ARM64Reg nzcv = DecodeReg(ctx.reg_alloc.ScratchGpr()); + + code.LDR(INDEX_UNSIGNED, nzcv, X28, offsetof(A32JitState, cpsr_nzcv)); + if (args[0].IsImmediate()) { + if (args[0].GetImmediateU1()) { + code.ORRI2R(nzcv, nzcv, flag_mask); + } else { + code.ANDI2R(nzcv, nzcv, ~flag_mask); + } + } else { + Arm64Gen::ARM64Reg to_store = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[0])); + code.BFI(nzcv, to_store, flag_bit, 1); + } + code.STR(INDEX_UNSIGNED, nzcv, X28, offsetof(A32JitState, cpsr_nzcv)); +} + +void A32EmitA64::EmitA32GetVFlag(A32EmitContext& ctx, IR::Inst* inst) { + Arm64Gen::ARM64Reg result 
= DecodeReg(ctx.reg_alloc.ScratchGpr()); + code.LDR(INDEX_UNSIGNED, result, X28, offsetof(A32JitState, cpsr_nzcv)); + code.UBFX(result, result, 28, 1); + ctx.reg_alloc.DefineValue(inst, result); +} + +void A32EmitA64::EmitA32SetVFlag(A32EmitContext& ctx, IR::Inst* inst) { + constexpr size_t flag_bit = 28; + constexpr u32 flag_mask = 1u << flag_bit; + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + Arm64Gen::ARM64Reg nzcv = DecodeReg(ctx.reg_alloc.ScratchGpr()); + + code.LDR(INDEX_UNSIGNED, nzcv, X28, offsetof(A32JitState, cpsr_nzcv)); + if (args[0].IsImmediate()) { + if (args[0].GetImmediateU1()) { + code.ORRI2R(nzcv, nzcv, flag_mask); + } else { + code.ANDI2R(nzcv, nzcv, ~flag_mask); + } + } else { + Arm64Gen::ARM64Reg to_store = DecodeReg(ctx.reg_alloc.UseGpr(args[0])); + + code.BFI(nzcv, to_store, flag_bit, 1); + } + code.STR(INDEX_UNSIGNED, nzcv, X28, offsetof(A32JitState, cpsr_nzcv)); +} + +void A32EmitA64::EmitA32OrQFlag(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + if (args[0].IsImmediate()) { + if (args[0].GetImmediateU1()) { + ARM64Reg to_store = DecodeReg(ctx.reg_alloc.UseGpr(args[0])); + code.STR(INDEX_UNSIGNED, to_store, X28, offsetof(A32JitState, cpsr_q)); + } + } else { + ARM64Reg to_store = ctx.reg_alloc.UseGpr(args[0]); + ARM64Reg scratch = DecodeReg(ctx.reg_alloc.ScratchGpr()); + + code.LDR(INDEX_UNSIGNED, scratch, X28, offsetof(A32JitState, cpsr_q)); + code.ORR(scratch, scratch, to_store); + code.STR(INDEX_UNSIGNED, scratch, X28, offsetof(A32JitState, cpsr_q)); + } +} + +void A32EmitA64::EmitA32GetGEFlags(A32EmitContext& ctx, IR::Inst* inst) { + ARM64Reg result = EncodeRegToSingle(ctx.reg_alloc.ScratchFpr()); + code.LDR(INDEX_UNSIGNED, result, X28, offsetof(A32JitState, cpsr_ge)); + ctx.reg_alloc.DefineValue(inst, result); +} + +void A32EmitA64::EmitA32SetGEFlags(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ASSERT(!args[0].IsImmediate()); + ARM64Reg to_store = INVALID_REG; + if (args[0].IsInFpr()) { + to_store = EncodeRegToSingle(ctx.reg_alloc.UseFpr(args[0])); + } else { + to_store = DecodeReg(ctx.reg_alloc.UseGpr(args[0])); + } + code.STR(INDEX_UNSIGNED, to_store, X28, offsetof(A32JitState, cpsr_ge)); +} + +void A32EmitA64::EmitA32SetGEFlagsCompressed(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + if (args[0].IsImmediate()) { + ARM64Reg to_store = DecodeReg(ctx.reg_alloc.ScratchGpr()); + u32 imm = args[0].GetImmediateU32(); + u32 ge = 0; + ge |= Common::Bit<19>(imm) ? 0xFF000000 : 0; + ge |= Common::Bit<18>(imm) ? 0x00FF0000 : 0; + ge |= Common::Bit<17>(imm) ? 0x0000FF00 : 0; + ge |= Common::Bit<16>(imm) ? 
0x000000FF : 0; + + code.MOVI2R(to_store, ge); + code.STR(INDEX_UNSIGNED, to_store, X28, offsetof(A32JitState, cpsr_ge)); + } else { + ARM64Reg a = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[0])); + ARM64Reg scratch = DecodeReg(ctx.reg_alloc.ScratchGpr()); + + code.UBFX(a, a, 16, 4); + code.MOVI2R(scratch, 0x00204081); + code.MUL(a, a, scratch); + code.ANDI2R(a, a, 0x01010101); + code.ORR(a, a, a, ArithOption{a, ST_LSL, 1}); + code.STR(INDEX_UNSIGNED, a, X28, offsetof(A32JitState, cpsr_ge)); + } +} + +void A32EmitA64::EmitA32BXWritePC(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto& arg = args[0]; + + const u32 upper_without_t = (ctx.Location().SetSingleStepping(false).UniqueHash() >> 32) & 0xFFFFFFFE; + + // Pseudocode: + // if (new_pc & 1) { + // new_pc &= 0xFFFFFFFE; + // cpsr.T = true; + // } else { + // new_pc &= 0xFFFFFFFC; + // cpsr.T = false; + // } + // We rely on the fact we disallow EFlag from changing within a block. + + if (arg.IsImmediate()) { + const ARM64Reg scratch = DecodeReg(ctx.reg_alloc.ScratchGpr()); + u32 new_pc = arg.GetImmediateU32(); + const u32 mask = Common::Bit<0>(new_pc) ? 0xFFFFFFFE : 0xFFFFFFFC; + const u32 new_upper = upper_without_t | (Common::Bit<0>(new_pc) ? 1 : 0); + + code.MOVI2R(scratch, new_pc & mask); + code.STR(INDEX_UNSIGNED, scratch, X28, MJitStateReg(A32::Reg::PC)); + code.MOVI2R(scratch, new_upper); + code.STR(INDEX_UNSIGNED, scratch, X28, offsetof(A32JitState, upper_location_descriptor)); + } else { + const ARM64Reg new_pc = DecodeReg(ctx.reg_alloc.UseScratchGpr(arg)); + const ARM64Reg mask = DecodeReg(ctx.reg_alloc.ScratchGpr()); + const ARM64Reg new_upper = DecodeReg(ctx.reg_alloc.ScratchGpr()); + + code.ANDI2R(mask, new_pc, 1); + code.MOVI2R(new_upper, upper_without_t); + code.ADD(new_upper, new_upper, mask); + code.STR(INDEX_UNSIGNED, new_upper, X28, offsetof(A32JitState, upper_location_descriptor)); + code.LSL(mask, mask, 1); + code.SUBI2R(mask, mask, 4); // mask = pc & 1 ? 
0xFFFFFFFE : 0xFFFFFFFC + code.AND(new_pc, new_pc, mask); + code.STR(INDEX_UNSIGNED, new_pc, X28, MJitStateReg(A32::Reg::PC)); + } +} + +void A32EmitA64::EmitA32CallSupervisor(A32EmitContext& ctx, IR::Inst* inst) { + ctx.reg_alloc.HostCall(nullptr); + + code.SwitchFpscrOnExit(); + code.LDR(INDEX_UNSIGNED, code.ABI_PARAM2, X28, offsetof(A32JitState, cycles_to_run)); + code.SUB(code.ABI_PARAM2, code.ABI_PARAM2, X26); + + Devirtualize<&A32::UserCallbacks::AddTicks>(config.callbacks).EmitCall(code); + ctx.reg_alloc.EndOfAllocScope(); + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ctx.reg_alloc.HostCall(nullptr, {}, args[0]); + Devirtualize<&A32::UserCallbacks::CallSVC>(config.callbacks).EmitCall(code); + Devirtualize<&A32::UserCallbacks::GetTicksRemaining>(config.callbacks).EmitCall(code); + code.STR(INDEX_UNSIGNED, code.ABI_RETURN, X28, offsetof(A32JitState, cycles_to_run)); + code.MOV(X26, code.ABI_RETURN); + code.SwitchFpscrOnEntry(); +} + +void A32EmitA64::EmitA32ExceptionRaised(A32EmitContext& ctx, IR::Inst* inst) { + ctx.reg_alloc.HostCall(nullptr); + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ASSERT(args[0].IsImmediate() && args[1].IsImmediate()); + u32 pc = args[0].GetImmediateU32(); + u64 exception = args[1].GetImmediateU64(); + Devirtualize<&A32::UserCallbacks::ExceptionRaised>(config.callbacks).EmitCall(code, [&](RegList param) { + code.MOVI2R(param[0], pc); + code.MOVI2R(param[1], exception); + }); +} + +static u32 GetFpscrImpl(A32JitState* jit_state) { + return jit_state->Fpscr(); +} + +void A32EmitA64::EmitA32GetFpscr(A32EmitContext& ctx, IR::Inst* inst) { + ctx.reg_alloc.HostCall(inst); + // Use an unused HostCall register + const ARM64Reg fpsr = X9; + const ARM64Reg fpcr = X10; + code.MOV(code.ABI_PARAM1, X28); + + code.MRS(fpsr, FIELD_FPSR); + code.MRS(fpcr, FIELD_FPCR); + code.STR(INDEX_UNSIGNED, fpsr, X28, offsetof(A32JitState, guest_fpsr)); + code.STR(INDEX_UNSIGNED, fpcr, X28, offsetof(A32JitState, guest_fpcr)); + code.QuickCallFunction(&GetFpscrImpl); +} + +static void SetFpscrImpl(u32 value, A32JitState* jit_state) { + jit_state->SetFpscr(value); +} + +void A32EmitA64::EmitA32SetFpscr(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ctx.reg_alloc.HostCall(nullptr, args[0]); + // Use an unused HostCall register + const ARM64Reg fpsr = X9; + const ARM64Reg fpcr = X10; + + code.MOV(code.ABI_PARAM2, X28); + + code.QuickCallFunction(&SetFpscrImpl); + + code.LDR(INDEX_UNSIGNED, fpsr, X28, offsetof(A32JitState, guest_fpsr)); + code.LDR(INDEX_UNSIGNED, fpcr, X28, offsetof(A32JitState, guest_fpcr)); + code._MSR(FIELD_FPSR, fpsr); + code._MSR(FIELD_FPCR, fpcr); +} + +void A32EmitA64::EmitA32GetFpscrNZCV(A32EmitContext& ctx, IR::Inst* inst) { + ARM64Reg result = DecodeReg(ctx.reg_alloc.ScratchGpr()); + code.LDR(INDEX_UNSIGNED, result, X28, offsetof(A32JitState, fpsr_nzcv)); + ctx.reg_alloc.DefineValue(inst, result); +} + +void A32EmitA64::EmitA32SetFpscrNZCV(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ARM64Reg value = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[0])); + + code.ANDI2R(value, value, 0xF0000000); + + code.STR(INDEX_UNSIGNED, value, X28, offsetof(A32JitState, fpsr_nzcv)); +} + +void A32EmitA64::EmitA32ClearExclusive(A32EmitContext&, IR::Inst*) { + code.STR(INDEX_UNSIGNED, WZR, X28, offsetof(A32JitState, exclusive_state)); +} + +void A32EmitA64::EmitA32SetExclusive(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + 
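// Arms the exclusive monitor: exclusive_state is set to 1 and the address recorded, so that
// ExclusiveWrite further down commits its store only if exclusive_state is still set and the
// address matches under A32JitState::RESERVATION_GRANULE_MASK.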
ASSERT(args[1].IsImmediate()); + Arm64Gen::ARM64Reg address = DecodeReg(ctx.reg_alloc.UseGpr(args[0])); + Arm64Gen::ARM64Reg state = DecodeReg(ctx.reg_alloc.ScratchGpr()); + + code.MOVI2R(state, u8(1)); + code.STR(INDEX_UNSIGNED, state, X28, offsetof(A32JitState, exclusive_state)); + code.STR(INDEX_UNSIGNED, address, X28, offsetof(A32JitState, exclusive_address)); +} + +A32EmitA64::DoNotFastmemMarker A32EmitA64::GenerateDoNotFastmemMarker(A32EmitContext& ctx, IR::Inst* inst) { + return std::make_tuple(ctx.Location(), ctx.GetInstOffset(inst)); +} + +bool A32EmitA64::ShouldFastmem(const DoNotFastmemMarker& marker) const { + return config.fastmem_pointer && exception_handler.SupportsFastmem() && do_not_fastmem.count(marker) == 0; +} + +void A32EmitA64::DoNotFastmem(const DoNotFastmemMarker& marker) { + do_not_fastmem.emplace(marker); + InvalidateBasicBlocks({std::get<0>(marker)}); +} + +template +void A32EmitA64::ReadMemory(A32EmitContext& ctx, IR::Inst* inst, const CodePtr callback_fn) { + constexpr size_t bit_size = Common::BitSize(); + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + ctx.reg_alloc.UseScratch(args[0], ABI_PARAM2); + ctx.reg_alloc.ScratchGpr({ABI_RETURN}); + + ARM64Reg result = ctx.reg_alloc.ScratchGpr(); + ARM64Reg vaddr = DecodeReg(code.ABI_PARAM2); + ARM64Reg tmp = code.ABI_RETURN; + + const auto do_not_fastmem_marker = GenerateDoNotFastmemMarker(ctx, inst); + + const auto page_table_lookup = [this, result, vaddr, tmp, callback_fn](FixupBranch& end) { + constexpr size_t bit_size = Common::BitSize(); + + code.MOVP2R(result, config.page_table); + code.MOV(tmp, vaddr, ArithOption{vaddr, ST_LSR, 12}); + code.LDR(result, result, ArithOption{tmp, true}); + FixupBranch abort = code.CBZ(result); + code.ANDI2R(vaddr, vaddr, 4095); + switch (bit_size) { + case 8: + code.LDRB(DecodeReg(result), result, vaddr); + break; + case 16: + code.LDRH(DecodeReg(result), result, vaddr); + break; + case 32: + code.LDR(DecodeReg(result), result, vaddr); + break; + case 64: + code.LDR(result, result, vaddr); + break; + default: + ASSERT_FALSE("Invalid bit_size"); + break; + } + end = code.B(); + code.SetJumpTarget(abort); + code.BL(callback_fn); + code.MOV(result, code.ABI_RETURN); + }; + + + if (ShouldFastmem(do_not_fastmem_marker)) { + const CodePtr patch_location = code.GetCodePtr(); + switch (bit_size) { + case 8: + code.LDRB(DecodeReg(result), X27, vaddr); + break; + case 16: + code.LDRH(DecodeReg(result), X27, vaddr); + break; + case 32: + code.LDR(DecodeReg(result), X27, vaddr); + break; + case 64: + code.LDR(result, X27, vaddr); + break; + default: + ASSERT_FALSE("Invalid bit_size"); + break; + } + + fastmem_patch_info.emplace( + patch_location, + FastmemPatchInfo{ + [this, patch_location, page_table_lookup, callback_fn, result, do_not_fastmem_marker]{ + CodePtr save_code_ptr = code.GetCodePtr(); + code.SetCodePtr(patch_location); + FixupBranch thunk = code.B(); + u8* end_ptr = code.GetWritableCodePtr(); + code.FlushIcacheSection(reinterpret_cast(patch_location), end_ptr); + code.SetCodePtr(save_code_ptr); + code.SwitchToFarCode(); + code.SetJumpTarget(thunk); + if (config.page_table) { + FixupBranch end{}; + page_table_lookup(end); + code.SetJumpTarget(end, end_ptr); + } else { + code.BL(callback_fn); + code.MOV(result, code.ABI_RETURN); + } + code.B(end_ptr); + code.FlushIcache(); + code.SwitchToNearCode(); + + DoNotFastmem(do_not_fastmem_marker); + } + }); + + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + if (!config.page_table) { + code.BL(callback_fn); + 
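// No page table configured: the read goes through the memory-read thunk, which leaves the
// loaded value in ABI_RETURN; it is then moved into the register allocated for the result.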
code.MOV(result, code.ABI_RETURN); + ctx.reg_alloc.DefineValue(inst, result); + return; + } + + FixupBranch end{}; + page_table_lookup(end); + code.SetJumpTarget(end); + + ctx.reg_alloc.DefineValue(inst, result); +} + +template +void A32EmitA64::WriteMemory(A32EmitContext& ctx, IR::Inst* inst, const CodePtr callback_fn) { + constexpr size_t bit_size = Common::BitSize(); + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + ctx.reg_alloc.ScratchGpr({ABI_RETURN}); + ctx.reg_alloc.UseScratch(args[0], ABI_PARAM2); + ctx.reg_alloc.UseScratch(args[1], ABI_PARAM3); + + ARM64Reg vaddr = DecodeReg(code.ABI_PARAM2); + ARM64Reg value = code.ABI_PARAM3; + ARM64Reg page_index = ctx.reg_alloc.ScratchGpr(); + ARM64Reg addr = ctx.reg_alloc.ScratchGpr(); + + const auto do_not_fastmem_marker = GenerateDoNotFastmemMarker(ctx, inst); + + const auto page_table_lookup = [this, vaddr, value, page_index, addr, callback_fn](FixupBranch& end) { + constexpr size_t bit_size = Common::BitSize(); + + code.MOVP2R(addr, config.page_table); + code.MOV(DecodeReg(page_index), vaddr, ArithOption{vaddr, ST_LSR, 12}); + code.LDR(addr, addr, ArithOption{page_index, true}); + FixupBranch abort = code.CBZ(addr); + code.ANDI2R(vaddr, vaddr, 4095); + switch (bit_size) { + case 8: + code.STRB(DecodeReg(value), addr, vaddr); + break; + case 16: + code.STRH(DecodeReg(value), addr, vaddr); + break; + case 32: + code.STR(DecodeReg(value), addr, vaddr);; + break; + case 64: + code.STR(value, addr, vaddr); + break; + default: + ASSERT_FALSE("Invalid bit_size"); + break; + } + end = code.B(); + code.SetJumpTarget(abort); + code.BL(callback_fn); + }; + + if (ShouldFastmem(do_not_fastmem_marker)) { + const CodePtr patch_location = code.GetCodePtr(); + switch (bit_size) { + case 8: + code.STRB(DecodeReg(value), X27, vaddr); + break; + case 16: + code.STRH(DecodeReg(value), X27, vaddr); + break; + case 32: + code.STR(DecodeReg(value), X27, vaddr); + break; + case 64: + code.STR(value, X27, vaddr); + break; + default: + ASSERT_FALSE("Invalid bit_size"); + break; + } + + fastmem_patch_info.emplace( + patch_location, + FastmemPatchInfo{ + [this, patch_location, page_table_lookup, callback_fn, do_not_fastmem_marker]{ + CodePtr save_code_ptr = code.GetCodePtr(); + code.SetCodePtr(patch_location); + FixupBranch thunk = code.B(); + u8* end_ptr = code.GetWritableCodePtr(); + code.FlushIcacheSection(reinterpret_cast(patch_location), end_ptr); + code.SetCodePtr(save_code_ptr); + code.SwitchToFarCode(); + code.SetJumpTarget(thunk); + if (config.page_table) { + FixupBranch end{}; + page_table_lookup(end); + code.SetJumpTarget(end, end_ptr); + } else { + code.BL(callback_fn); + } + code.B(end_ptr); + code.FlushIcache(); + code.SwitchToNearCode(); + + DoNotFastmem(do_not_fastmem_marker); + } + }); + return; + } + + if (!config.page_table) { + code.BL(callback_fn); + return; + } + + FixupBranch end{}; + page_table_lookup(end); + code.SetJumpTarget(end); +} + +void A32EmitA64::EmitA32ReadMemory8(A32EmitContext& ctx, IR::Inst* inst) { + ReadMemory(ctx, inst, read_memory_8); +} + +void A32EmitA64::EmitA32ReadMemory16(A32EmitContext& ctx, IR::Inst* inst) { + ReadMemory(ctx, inst, read_memory_16); +} + +void A32EmitA64::EmitA32ReadMemory32(A32EmitContext& ctx, IR::Inst* inst) { + ReadMemory(ctx, inst, read_memory_32); +} + +void A32EmitA64::EmitA32ReadMemory64(A32EmitContext& ctx, IR::Inst* inst) { + ReadMemory(ctx, inst, read_memory_64); +} + +void A32EmitA64::EmitA32WriteMemory8(A32EmitContext& ctx, IR::Inst* inst) { + WriteMemory(ctx, inst, write_memory_8); 
+} + +void A32EmitA64::EmitA32WriteMemory16(A32EmitContext& ctx, IR::Inst* inst) { + WriteMemory(ctx, inst, write_memory_16); +} + +void A32EmitA64::EmitA32WriteMemory32(A32EmitContext& ctx, IR::Inst* inst) { + WriteMemory(ctx, inst, write_memory_32); +} + +void A32EmitA64::EmitA32WriteMemory64(A32EmitContext& ctx, IR::Inst* inst) { + WriteMemory(ctx, inst, write_memory_64); +} + +template +static void ExclusiveWrite(BlockOfCode& code, RegAlloc& reg_alloc, IR::Inst* inst, const A32::UserConfig& config) { + auto args = reg_alloc.GetArgumentInfo(inst); + reg_alloc.HostCall(nullptr, {}, args[0], args[1]); + + // Use unused HostCall registers + ARM64Reg passed = W9; + ARM64Reg tmp = W10; + + std::vector end; + + code.MOVI2R(passed, u32(1)); + code.LDR(INDEX_UNSIGNED, tmp, X28, offsetof(A32JitState, exclusive_state)); + end.push_back(code.CBZ(tmp)); + code.LDR(INDEX_UNSIGNED, tmp, X28, offsetof(A32JitState, exclusive_address)); + code.EOR(tmp, code.ABI_PARAM2, tmp); + code.TSTI2R(tmp, A32JitState::RESERVATION_GRANULE_MASK, reg_alloc.ScratchGpr()); + end.push_back(code.B(CC_NEQ)); + code.STR(INDEX_UNSIGNED, WZR, X28, offsetof(A32JitState, exclusive_state)); + + Devirtualize(config.callbacks).EmitCall(code); + code.MOVI2R(passed, 0); + + for (FixupBranch e : end) { + code.SetJumpTarget(e); + } + + reg_alloc.DefineValue(inst, passed); +} + +void A32EmitA64::EmitA32ExclusiveWriteMemory8(A32EmitContext& ctx, IR::Inst* inst) { + ExclusiveWrite(code, ctx.reg_alloc, inst, config); +} + +void A32EmitA64::EmitA32ExclusiveWriteMemory16(A32EmitContext& ctx, IR::Inst* inst) { + ExclusiveWrite(code, ctx.reg_alloc, inst, config); +} + +void A32EmitA64::EmitA32ExclusiveWriteMemory32(A32EmitContext& ctx, IR::Inst* inst) { + ExclusiveWrite(code, ctx.reg_alloc, inst, config); +} + +void A32EmitA64::EmitA32ExclusiveWriteMemory64(A32EmitContext& ctx, IR::Inst* inst) { + ExclusiveWrite(code, ctx.reg_alloc, inst, config); +} + +static void EmitCoprocessorException() { + ASSERT_FALSE("Should raise coproc exception here"); +} + +static void CallCoprocCallback(BlockOfCode& code, RegAlloc& reg_alloc, A32::Jit* jit_interface, A32::Coprocessor::Callback callback, + IR::Inst* inst = nullptr, std::optional arg0 = {}, std::optional arg1 = {}) { + reg_alloc.HostCall(inst, {}, {}, arg0, arg1); + + code.MOVP2R(code.ABI_PARAM1, jit_interface); + if (callback.user_arg) { + code.MOVP2R(code.ABI_PARAM2, *callback.user_arg); + } + + code.QuickCallFunction(callback.function); +} + +void A32EmitA64::EmitA32CoprocInternalOperation(A32EmitContext& ctx, IR::Inst* inst) { + auto coproc_info = inst->GetArg(0).GetCoprocInfo(); + + size_t coproc_num = coproc_info[0]; + bool two = coproc_info[1] != 0; + unsigned opc1 = static_cast(coproc_info[2]); + A32::CoprocReg CRd = static_cast(coproc_info[3]); + A32::CoprocReg CRn = static_cast(coproc_info[4]); + A32::CoprocReg CRm = static_cast(coproc_info[5]); + unsigned opc2 = static_cast(coproc_info[6]); + + std::shared_ptr coproc = config.coprocessors[coproc_num]; + if (!coproc) { + EmitCoprocessorException(); + return; + } + + auto action = coproc->CompileInternalOperation(two, opc1, CRd, CRn, CRm, opc2); + if (!action) { + EmitCoprocessorException(); + return; + } + + CallCoprocCallback(code, ctx.reg_alloc, jit_interface, *action); +} + +void A32EmitA64::EmitA32CoprocSendOneWord(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto coproc_info = inst->GetArg(0).GetCoprocInfo(); + + size_t coproc_num = coproc_info[0]; + bool two = coproc_info[1] != 0; + 
unsigned opc1 = static_cast(coproc_info[2]); + A32::CoprocReg CRn = static_cast(coproc_info[3]); + A32::CoprocReg CRm = static_cast(coproc_info[4]); + unsigned opc2 = static_cast(coproc_info[5]); + + std::shared_ptr coproc = config.coprocessors[coproc_num]; + if (!coproc) { + EmitCoprocessorException(); + return; + } + + auto action = coproc->CompileSendOneWord(two, opc1, CRn, CRm, opc2); + switch (action.index()) { + case 0: + EmitCoprocessorException(); + return; + case 1: + CallCoprocCallback(code, ctx.reg_alloc, jit_interface, std::get(action), nullptr, args[1]); + return; + case 2: { + u32* destination_ptr = std::get(action); + + ARM64Reg reg_word = DecodeReg(ctx.reg_alloc.UseGpr(args[1])); + ARM64Reg reg_destination_addr = ctx.reg_alloc.ScratchGpr(); + + code.MOVP2R(reg_destination_addr, destination_ptr); + code.STR(INDEX_UNSIGNED, reg_word, reg_destination_addr, 0); + + return; + } + default: + UNREACHABLE(); + } +} + +void A32EmitA64::EmitA32CoprocSendTwoWords(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto coproc_info = inst->GetArg(0).GetCoprocInfo(); + + size_t coproc_num = coproc_info[0]; + bool two = coproc_info[1] != 0; + unsigned opc = static_cast(coproc_info[2]); + A32::CoprocReg CRm = static_cast(coproc_info[3]); + + std::shared_ptr coproc = config.coprocessors[coproc_num]; + if (!coproc) { + EmitCoprocessorException(); + return; + } + + auto action = coproc->CompileSendTwoWords(two, opc, CRm); + switch (action.index()) { + case 0: + EmitCoprocessorException(); + return; + case 1: + CallCoprocCallback(code, ctx.reg_alloc, jit_interface, std::get(action), nullptr, args[1], args[2]); + return; + case 2: { + auto destination_ptrs = std::get>(action); + + ARM64Reg reg_word1 = DecodeReg(ctx.reg_alloc.UseGpr(args[1])); + ARM64Reg reg_word2 = DecodeReg(ctx.reg_alloc.UseGpr(args[2])); + ARM64Reg reg_destination_addr = ctx.reg_alloc.ScratchGpr(); + + code.MOVP2R(reg_destination_addr, destination_ptrs[0]); + code.STR(INDEX_UNSIGNED, reg_word1, reg_destination_addr, 0); + code.MOVP2R(reg_destination_addr, destination_ptrs[1]); + code.STR(INDEX_UNSIGNED, reg_word2, reg_destination_addr, 0); + + return; + } + default: + UNREACHABLE(); + } +} + +void A32EmitA64::EmitA32CoprocGetOneWord(A32EmitContext& ctx, IR::Inst* inst) { + auto coproc_info = inst->GetArg(0).GetCoprocInfo(); + + size_t coproc_num = coproc_info[0]; + bool two = coproc_info[1] != 0; + unsigned opc1 = static_cast(coproc_info[2]); + A32::CoprocReg CRn = static_cast(coproc_info[3]); + A32::CoprocReg CRm = static_cast(coproc_info[4]); + unsigned opc2 = static_cast(coproc_info[5]); + + std::shared_ptr coproc = config.coprocessors[coproc_num]; + if (!coproc) { + EmitCoprocessorException(); + return; + } + + auto action = coproc->CompileGetOneWord(two, opc1, CRn, CRm, opc2); + switch (action.index()) { + case 0: + EmitCoprocessorException(); + return; + case 1: + CallCoprocCallback(code, ctx.reg_alloc, jit_interface, std::get(action), inst); + return; + case 2: { + u32* source_ptr = std::get(action); + + ARM64Reg result = ctx.reg_alloc.ScratchGpr(); + + code.MOVP2R(result, source_ptr); + code.LDR(INDEX_UNSIGNED, DecodeReg(result), result, 0); + + ctx.reg_alloc.DefineValue(inst, result); + + return; + } + default: + UNREACHABLE(); + } +} + +void A32EmitA64::EmitA32CoprocGetTwoWords(A32EmitContext& ctx, IR::Inst* inst) { + auto coproc_info = inst->GetArg(0).GetCoprocInfo(); + + size_t coproc_num = coproc_info[0]; + bool two = coproc_info[1] != 0; + unsigned opc = coproc_info[2]; + 
A32::CoprocReg CRm = static_cast(coproc_info[3]); + + std::shared_ptr coproc = config.coprocessors[coproc_num]; + if (!coproc) { + EmitCoprocessorException(); + return; + } + + auto action = coproc->CompileGetTwoWords(two, opc, CRm); + switch (action.index()) { + case 0: + EmitCoprocessorException(); + return; + case 1: + CallCoprocCallback(code, ctx.reg_alloc, jit_interface, std::get(action), inst); + return; + case 2: { + auto source_ptrs = std::get>(action); + + ARM64Reg reg_result = ctx.reg_alloc.ScratchGpr(); + ARM64Reg reg_tmp = ctx.reg_alloc.ScratchGpr(); + + code.MOVP2R(reg_tmp, source_ptrs[1]); + code.LDR(INDEX_UNSIGNED, DecodeReg(reg_result), reg_tmp, 0); + code.MOVP2R(reg_tmp, source_ptrs[0]); + code.LDR(INDEX_UNSIGNED, DecodeReg(reg_tmp), reg_tmp, 0); + code.ORR(reg_result, reg_tmp, reg_result, ArithOption{ reg_result , ST_LSL, 32}); + + ctx.reg_alloc.DefineValue(inst, reg_result); + + return; + } + default: + UNREACHABLE(); + } +} + +void A32EmitA64::EmitA32CoprocLoadWords(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto coproc_info = inst->GetArg(0).GetCoprocInfo(); + + size_t coproc_num = coproc_info[0]; + bool two = coproc_info[1] != 0; + bool long_transfer = coproc_info[2] != 0; + A32::CoprocReg CRd = static_cast(coproc_info[3]); + bool has_option = coproc_info[4] != 0; + std::optional option = std::nullopt; + if (has_option) { + option = coproc_info[5]; + } + + + std::shared_ptr coproc = config.coprocessors[coproc_num]; + if (!coproc) { + EmitCoprocessorException(); + return; + } + + auto action = coproc->CompileLoadWords(two, long_transfer, CRd, option); + if (!action) { + EmitCoprocessorException(); + return; + } + + CallCoprocCallback(code, ctx.reg_alloc, jit_interface, *action, nullptr, args[1]); +} + +void A32EmitA64::EmitA32CoprocStoreWords(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto coproc_info = inst->GetArg(0).GetCoprocInfo(); + + size_t coproc_num = coproc_info[0]; + bool two = coproc_info[1] != 0; + bool long_transfer = coproc_info[2] != 0; + A32::CoprocReg CRd = static_cast(coproc_info[3]); + bool has_option = coproc_info[4] != 0; + std::optional option = std::nullopt; + if (has_option) { + option = coproc_info[5]; + } + + std::shared_ptr coproc = config.coprocessors[coproc_num]; + if (!coproc) { + EmitCoprocessorException(); + return; + } + + auto action = coproc->CompileStoreWords(two, long_transfer, CRd, option); + if (!action) { + EmitCoprocessorException(); + return; + } + + CallCoprocCallback(code, ctx.reg_alloc, jit_interface, *action, nullptr, args[1]); +} + + +std::string A32EmitA64::LocationDescriptorToFriendlyName(const IR::LocationDescriptor& ir_descriptor) const { + const A32::LocationDescriptor descriptor{ir_descriptor}; + return fmt::format("a32_{}{:08X}_{}_fpcr{:08X}", descriptor.TFlag() ? "t" : "a", descriptor.PC(), descriptor.EFlag() ? 
"be" : "le", + descriptor.FPSCR().Value()); +} + +void A32EmitA64::FastmemCallback(CodePtr PC) { + const auto iter = fastmem_patch_info.find(PC); + ASSERT(iter != fastmem_patch_info.end()); + iter->second.callback(); + fastmem_patch_info.erase(iter); +} + +void A32EmitA64::EmitTerminalImpl(IR::Term::Interpret terminal, IR::LocationDescriptor initial_location, bool) { + ASSERT_MSG(A32::LocationDescriptor{terminal.next}.TFlag() == A32::LocationDescriptor{initial_location}.TFlag(), "Unimplemented"); + ASSERT_MSG(A32::LocationDescriptor{terminal.next}.EFlag() == A32::LocationDescriptor{initial_location}.EFlag(), "Unimplemented"); + + code.MOVI2R(DecodeReg(code.ABI_PARAM2), A32::LocationDescriptor{terminal.next}.PC()); + code.MOVI2R(DecodeReg(code.ABI_PARAM3), terminal.num_instructions); + code.STR(INDEX_UNSIGNED, DecodeReg(code.ABI_PARAM2), X28, MJitStateReg(A32::Reg::PC)); + code.SwitchFpscrOnExit(); + Devirtualize<&A32::UserCallbacks::InterpreterFallback>(config.callbacks).EmitCall(code); + code.ReturnFromRunCode(true); // TODO: Check cycles +} + +void A32EmitA64::EmitTerminalImpl(IR::Term::ReturnToDispatch, IR::LocationDescriptor, bool) { + code.ReturnFromRunCode(); +} + +void A32EmitA64::EmitSetUpperLocationDescriptor(IR::LocationDescriptor new_location, IR::LocationDescriptor old_location) { + auto get_upper = [](const IR::LocationDescriptor &desc) -> u32 { + return static_cast(A32::LocationDescriptor{desc}.SetSingleStepping(false).UniqueHash() >> 32); + }; + + const u32 old_upper = get_upper(old_location); + const u32 new_upper = [&] { + const u32 mask = ~u32(config.always_little_endian ? 0x2 : 0); + return get_upper(new_location) & mask; + }(); + + if (old_upper != new_upper) { + code.MOVI2R(DecodeReg(code.ABI_SCRATCH1), new_upper); + code.STR(INDEX_UNSIGNED, DecodeReg(code.ABI_SCRATCH1), X28, offsetof(A32JitState, upper_location_descriptor)); + } +} + +void A32EmitA64::EmitTerminalImpl(IR::Term::LinkBlock terminal, IR::LocationDescriptor initial_location, bool is_single_step) { + EmitSetUpperLocationDescriptor(terminal.next, initial_location); + + if (!config.enable_optimizations || is_single_step) { + code.MOVI2R(DecodeReg(code.ABI_SCRATCH1), A32::LocationDescriptor{terminal.next}.PC()); + code.STR(INDEX_UNSIGNED, DecodeReg(code.ABI_SCRATCH1), X28, MJitStateReg(A32::Reg::PC)); + code.ReturnFromRunCode(); + return; + } + + code.CMP(X26, ZR); + + patch_information[terminal.next].jg.emplace_back(code.GetCodePtr()); + if (auto next_bb = GetBasicBlock(terminal.next)) { + EmitPatchJg(terminal.next, next_bb->entrypoint); + } else { + EmitPatchJg(terminal.next); + } + FixupBranch dest = code.B(); + + code.SwitchToFarCode(); + code.AlignCode16(); + code.SetJumpTarget(dest); + code.MOVI2R(DecodeReg(code.ABI_SCRATCH1), A32::LocationDescriptor{terminal.next}.PC()); + code.STR(INDEX_UNSIGNED, DecodeReg(code.ABI_SCRATCH1), X28, MJitStateReg(A32::Reg::PC)); + PushRSBHelper(X1, X2, terminal.next); + code.ForceReturnFromRunCode(); + + //Todo: find a better/generic place to FlushIcache when switching between + // far code and near code + code.FlushIcache(); + code.SwitchToNearCode(); +} + +void A32EmitA64::EmitTerminalImpl(IR::Term::LinkBlockFast terminal, IR::LocationDescriptor initial_location, bool is_single_step) { + EmitSetUpperLocationDescriptor(terminal.next, initial_location); + + if (!config.enable_optimizations || is_single_step) { + code.MOVI2R(DecodeReg(code.ABI_SCRATCH1), A32::LocationDescriptor{terminal.next}.PC()); + code.STR(INDEX_UNSIGNED, DecodeReg(code.ABI_SCRATCH1), X28, 
MJitStateReg(A32::Reg::PC)); + code.ReturnFromRunCode(); + return; + } + + patch_information[terminal.next].jmp.emplace_back(code.GetCodePtr()); + if (auto next_bb = GetBasicBlock(terminal.next)) { + EmitPatchJmp(terminal.next, next_bb->entrypoint); + } else { + EmitPatchJmp(terminal.next); + } +} + +void A32EmitA64::EmitTerminalImpl(IR::Term::PopRSBHint, IR::LocationDescriptor, bool is_single_step) { + if (!config.enable_optimizations || is_single_step) { + code.ReturnFromRunCode(); + return; + } + code.B(terminal_handler_pop_rsb_hint); +} + +void A32EmitA64::EmitTerminalImpl(IR::Term::FastDispatchHint, IR::LocationDescriptor, bool is_single_step) { + if (config.enable_fast_dispatch && !is_single_step) { + code.B(terminal_handler_fast_dispatch_hint); + } else { + code.ReturnFromRunCode(); + } +} + +void A32EmitA64::EmitTerminalImpl(IR::Term::If terminal, IR::LocationDescriptor initial_location, bool is_single_step) { + FixupBranch pass = EmitCond(terminal.if_); + EmitTerminal(terminal.else_, initial_location, is_single_step); + code.SetJumpTarget(pass); + EmitTerminal(terminal.then_, initial_location, is_single_step); +} + +void A32EmitA64::EmitTerminalImpl(IR::Term::CheckBit terminal, IR::LocationDescriptor initial_location, bool is_single_step) { + FixupBranch fail; + code.LDRB(INDEX_UNSIGNED, DecodeReg(code.ABI_SCRATCH1), X28, offsetof(A32JitState, check_bit)); + fail = code.CBZ(DecodeReg(code.ABI_SCRATCH1)); + EmitTerminal(terminal.then_, initial_location, is_single_step); + code.SetJumpTarget(fail); + EmitTerminal(terminal.else_, initial_location, is_single_step); +} + +void A32EmitA64::EmitTerminalImpl(IR::Term::CheckHalt terminal, IR::LocationDescriptor initial_location, bool is_single_step) { + code.LDRB(INDEX_UNSIGNED, DecodeReg(code.ABI_SCRATCH1), X28, offsetof(A32JitState, halt_requested)); + // Conditional branch only gives +/- 1MB of branch distance + FixupBranch zero = code.CBZ(DecodeReg(code.ABI_SCRATCH1)); + code.B(code.GetForceReturnFromRunCodeAddress()); + code.SetJumpTarget(zero); + EmitTerminal(terminal.else_, initial_location, is_single_step); +} + +void A32EmitA64::EmitPatchJg(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr) { + const CodePtr patch_location = code.GetCodePtr(); + + auto long_branch_gt = [this](CodePtr ptr){ + const s64 distance = reinterpret_cast(ptr) - reinterpret_cast(code.GetCodePtr()); + + if((distance >> 2) >= -0x40000 && (distance >> 2) <= 0x3FFFF) { + code.B(CC_GT, ptr); + return; + } + + FixupBranch cc_le = code.B(CC_LE); + code.B(ptr); + code.SetJumpTarget(cc_le); + }; + + if (target_code_ptr) { + long_branch_gt(target_code_ptr); + } else { + code.MOVI2R(DecodeReg(code.ABI_SCRATCH1), A32::LocationDescriptor{target_desc}.PC()); + code.STR(INDEX_UNSIGNED, DecodeReg(code.ABI_SCRATCH1), X28, MJitStateReg(A32::Reg::PC)); + long_branch_gt(code.GetReturnFromRunCodeAddress()); + } + code.EnsurePatchLocationSize(patch_location, 24); +} + +void A32EmitA64::EmitPatchJmp(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr) { + const CodePtr patch_location = code.GetCodePtr(); + if (target_code_ptr) { + code.B(target_code_ptr); + } else { + code.MOVI2R(DecodeReg(code.ABI_SCRATCH1), A32::LocationDescriptor{target_desc}.PC()); + code.STR(INDEX_UNSIGNED, DecodeReg(code.ABI_SCRATCH1), X28, MJitStateReg(A32::Reg::PC)); + code.B(code.GetReturnFromRunCodeAddress()); + } + code.EnsurePatchLocationSize(patch_location, 20); +} + +void A32EmitA64::EmitPatchMovX0(CodePtr target_code_ptr) { + if (!target_code_ptr) { + target_code_ptr 
= code.GetReturnFromRunCodeAddress(); + } + const CodePtr patch_location = code.GetCodePtr(); + code.MOVP2R(X0, target_code_ptr); + code.EnsurePatchLocationSize(patch_location, 16); +} + +void A32EmitA64::Unpatch(const IR::LocationDescriptor& location) { + EmitA64::Unpatch(location); + if (config.enable_fast_dispatch) { + code.DisableWriting(); + SCOPE_EXIT { code.EnableWriting(); }; + + (*fast_dispatch_table_lookup)(location.Value()) = {}; + } +} + +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/a32_emit_a64.h b/src/dynarmic/backend/A64/a32_emit_a64.h new file mode 100644 index 00000000..4989cccb --- /dev/null +++ b/src/dynarmic/backend/A64/a32_emit_a64.h @@ -0,0 +1,138 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "backend/A64/a32_jitstate.h" +#include "backend/A64/block_range_information.h" +#include "backend/A64/emit_a64.h" +#include "backend/A64/exception_handler.h" +#include "dynarmic/A32/a32.h" +#include "dynarmic/A32/config.h" +#include "frontend/A32/location_descriptor.h" +#include "frontend/ir/terminal.h" + +namespace Dynarmic::BackendA64 { + +struct A64State; +class RegAlloc; + +struct A32EmitContext final : public EmitContext { + A32EmitContext(RegAlloc& reg_alloc, IR::Block& block); + A32::LocationDescriptor Location() const; + bool IsSingleStep() const; + FP::RoundingMode FPSCR_RMode() const override; + u32 FPCR() const override; + bool FPSCR_FTZ() const override; + bool FPSCR_DN() const override; + std::ptrdiff_t GetInstOffset(IR::Inst* inst) const; +}; + +class A32EmitA64 final : public EmitA64 { +public: + A32EmitA64(BlockOfCode& code, A32::UserConfig config, A32::Jit* jit_interface); + ~A32EmitA64() override; + + /** + * Emit host machine code for a basic block with intermediate representation `ir`. + * @note ir is modified. 
+ */ + BlockDescriptor Emit(IR::Block& ir); + + void ClearCache() override; + + void InvalidateCacheRanges(const boost::icl::interval_set& ranges); + + void FastmemCallback(CodePtr PC); + +protected: + const A32::UserConfig config; + A32::Jit* jit_interface; + BlockRangeInformation block_ranges; + ExceptionHandler exception_handler; + + void EmitCondPrelude(const A32EmitContext& ctx); + + struct FastDispatchEntry { + u64 location_descriptor = 0xFFFF'FFFF'FFFF'FFFFull; + const void* code_ptr = nullptr; + }; + static_assert(sizeof(FastDispatchEntry) == 0x10); + static constexpr u64 fast_dispatch_table_mask = 0xFFFF0; + static constexpr size_t fast_dispatch_table_size = 0x10000; + std::array fast_dispatch_table; + void ClearFastDispatchTable(); + + using DoNotFastmemMarker = std::tuple; + std::set do_not_fastmem; + DoNotFastmemMarker GenerateDoNotFastmemMarker(A32EmitContext& ctx, IR::Inst* inst); + void DoNotFastmem(const DoNotFastmemMarker& marker); + bool ShouldFastmem(const DoNotFastmemMarker& marker) const; + + const void* read_memory_8; + const void* read_memory_16; + const void* read_memory_32; + const void* read_memory_64; + const void* write_memory_8; + const void* write_memory_16; + const void* write_memory_32; + const void* write_memory_64; + void GenMemoryAccessors(); + template + void ReadMemory(A32EmitContext& ctx, IR::Inst* inst, const CodePtr callback_fn); + template + void WriteMemory(A32EmitContext& ctx, IR::Inst* inst, const CodePtr callback_fn); + + const void* terminal_handler_pop_rsb_hint; + const void* terminal_handler_fast_dispatch_hint = nullptr; + FastDispatchEntry& (*fast_dispatch_table_lookup)(u64) = nullptr; + void GenTerminalHandlers(); + + // Microinstruction emitters +#define OPCODE(...) +#define A32OPC(name, type, ...) void EmitA32##name(A32EmitContext& ctx, IR::Inst* inst); +#define A64OPC(...) 
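Editorial aside: the include on the next line expands the three macros just defined over the IR opcode table, so the class picks up one EmitA32<Name> declaration per A32 opcode without listing them by hand. A minimal, self-contained sketch of the same X-macro technique, with made-up opcode names and an illustrative Emitter type (dynarmic's real table lives in opcodes.inc and its A32OPC macro takes more parameters):

#include <cstdio>

// Stand-in for frontend/ir/opcodes.inc: each entry names one opcode.
#define OPCODE_LIST(X) \
    X(GetRegister)     \
    X(SetRegister)     \
    X(UpdateFlags)

struct Emitter {
    // First expansion: declare one member function per opcode.
#define A32OPC(name) void EmitA32##name();
    OPCODE_LIST(A32OPC)
#undef A32OPC
};

// Second expansion: trivial bodies, just for the sketch.
#define A32OPC(name) \
    void Emitter::EmitA32##name() { std::puts("EmitA32" #name); }
OPCODE_LIST(A32OPC)
#undef A32OPC

int main() {
    Emitter e;
    e.EmitA32GetRegister();  // prints "EmitA32GetRegister"
}

The same list drives both the declarations here and the dispatch table elsewhere, which is the point of the idiom: one source of truth for the opcode set.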
+#include "frontend/ir/opcodes.inc" +#undef OPCODE +#undef A32OPC +#undef A64OPC + + // Helpers + std::string LocationDescriptorToFriendlyName(const IR::LocationDescriptor&) const override; + + // Fastmem + struct FastmemPatchInfo { + std::function callback; + }; + std::unordered_map fastmem_patch_info; + + // Terminal instruction emitters + void EmitSetUpperLocationDescriptor(IR::LocationDescriptor new_location, IR::LocationDescriptor old_location); + void EmitTerminalImpl(IR::Term::Interpret terminal, IR::LocationDescriptor initial_location, bool is_single_step) override; + void EmitTerminalImpl(IR::Term::ReturnToDispatch terminal, IR::LocationDescriptor initial_location, bool is_single_step) override; + void EmitTerminalImpl(IR::Term::LinkBlock terminal, IR::LocationDescriptor initial_location, bool is_single_step) override; + void EmitTerminalImpl(IR::Term::LinkBlockFast terminal, IR::LocationDescriptor initial_location, bool is_single_step) override; + void EmitTerminalImpl(IR::Term::PopRSBHint terminal, IR::LocationDescriptor initial_location, bool is_single_step) override; + void EmitTerminalImpl(IR::Term::FastDispatchHint terminal, IR::LocationDescriptor initial_location, bool is_single_step) override; + void EmitTerminalImpl(IR::Term::If terminal, IR::LocationDescriptor initial_location, bool is_single_step) override; + void EmitTerminalImpl(IR::Term::CheckBit terminal, IR::LocationDescriptor initial_location, bool is_single_step) override; + void EmitTerminalImpl(IR::Term::CheckHalt terminal, IR::LocationDescriptor initial_location, bool is_single_step) override; + + // Patching + void Unpatch(const IR::LocationDescriptor& target_desc) override; + void EmitPatchJg(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr = nullptr) override; + void EmitPatchJmp(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr = nullptr) override; + void EmitPatchMovX0(CodePtr target_code_ptr = nullptr) override; +}; + +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/a32_interface.cpp b/src/dynarmic/backend/A64/a32_interface.cpp new file mode 100644 index 00000000..ce2b0225 --- /dev/null +++ b/src/dynarmic/backend/A64/a32_interface.cpp @@ -0,0 +1,323 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. 
+ */ + +#include + +#include +#include + +#include +#include + +#include "backend/A64/a32_emit_a64.h" +#include "backend/A64/a32_jitstate.h" +#include "backend/A64/block_of_code.h" +#include "backend/A64/callback.h" +#include "backend/A64/devirtualize.h" +#include "backend/A64/jitstate_info.h" +#include "common/assert.h" +#include "common/common_types.h" +#include "common/llvm_disassemble.h" +#include "common/scope_exit.h" +#include "frontend/A32/translate/translate.h" +#include "frontend/ir/basic_block.h" +#include "frontend/ir/location_descriptor.h" +#include "ir_opt/passes.h" + +namespace Dynarmic::A32 { + +using namespace BackendA64; + +static RunCodeCallbacks GenRunCodeCallbacks(const A32::UserConfig& config, CodePtr (*LookupBlock)(void* lookup_block_arg), void* arg) { + return RunCodeCallbacks{ + std::make_unique(LookupBlock, reinterpret_cast(arg)), + std::make_unique(Devirtualize<&A32::UserCallbacks::AddTicks>(config.callbacks)), + std::make_unique(Devirtualize<&A32::UserCallbacks::GetTicksRemaining>(config.callbacks)), + reinterpret_cast(config.fastmem_pointer), + }; +} + +struct Jit::Impl { + Impl(Jit* jit, A32::UserConfig config) + : block_of_code(GenRunCodeCallbacks(config, &GetCurrentBlockThunk, this), JitStateInfo{jit_state}) + , emitter(block_of_code, config, jit) + , config(std::move(config)) + , jit_interface(jit) + {} + + A32JitState jit_state; + BlockOfCode block_of_code; + A32EmitA64 emitter; + + const A32::UserConfig config; + + // Requests made during execution to invalidate the cache are queued up here. + size_t invalid_cache_generation = 0; + boost::icl::interval_set invalid_cache_ranges; + bool invalidate_entire_cache = false; + + void Execute() { + const CodePtr current_codeptr = [this]{ + // RSB optimization + const u32 new_rsb_ptr = (jit_state.rsb_ptr - 1) & A32JitState::RSBPtrMask; + if (jit_state.GetUniqueHash() == jit_state.rsb_location_descriptors[new_rsb_ptr]) { + jit_state.rsb_ptr = new_rsb_ptr; + return reinterpret_cast(jit_state.rsb_codeptrs[new_rsb_ptr]); + } + + return GetCurrentBlock(); + }(); + + block_of_code.RunCode(&jit_state, current_codeptr); + } + + void Step() { + block_of_code.StepCode(&jit_state, GetCurrentSingleStep()); + } + + std::string Disassemble(const IR::LocationDescriptor& descriptor) { + auto block = GetBasicBlock(descriptor); + std::string result = fmt::format("address: {}\nsize: {} bytes\n", block.entrypoint, block.size); +#ifdef DYNARMIC_USE_LLVM + for (const u32* pos = reinterpret_cast(block.entrypoint); + reinterpret_cast(pos) < reinterpret_cast(block.entrypoint) + block.size; pos += 1) { + fmt::print("0x{:02x} 0x{:02x} ", reinterpret_cast(pos), *pos); + fmt::print("{}", Common::DisassembleAArch64(*pos, reinterpret_cast(pos))); + result += Common::DisassembleAArch64(*pos, reinterpret_cast(pos)); + } +#endif + return result; + } + + void PerformCacheInvalidation() { + if (invalidate_entire_cache) { + jit_state.ResetRSB(); + block_of_code.ClearCache(); + emitter.ClearCache(); + + invalid_cache_ranges.clear(); + invalidate_entire_cache = false; + invalid_cache_generation++; + return; + } + + if (invalid_cache_ranges.empty()) { + return; + } + + jit_state.ResetRSB(); + emitter.InvalidateCacheRanges(invalid_cache_ranges); + invalid_cache_ranges.clear(); + invalid_cache_generation++; + } + + void RequestCacheInvalidation() { + if (jit_interface->is_executing) { + jit_state.halt_requested = true; + return; + } + + PerformCacheInvalidation(); + } + +private: + Jit* jit_interface; + + static CodePtr GetCurrentBlockThunk(void* 
this_voidptr) { + Jit::Impl& this_ = *static_cast(this_voidptr); + return this_.GetCurrentBlock(); + } + + IR::LocationDescriptor GetCurrentLocation() const { + return IR::LocationDescriptor{jit_state.GetUniqueHash()}; + } + + CodePtr GetCurrentBlock() { + return GetBasicBlock(GetCurrentLocation()).entrypoint; + } + + CodePtr GetCurrentSingleStep() { + return GetBasicBlock(A32::LocationDescriptor{GetCurrentLocation()}.SetSingleStepping(true)).entrypoint; + } + + A32EmitA64::BlockDescriptor GetBasicBlock(IR::LocationDescriptor descriptor) { + auto block = emitter.GetBasicBlock(descriptor); + if (block) + return *block; + + constexpr size_t MINIMUM_REMAINING_CODESIZE = 1 * 1024 * 1024; + if (block_of_code.SpaceRemaining() < MINIMUM_REMAINING_CODESIZE) { + invalidate_entire_cache = true; + PerformCacheInvalidation(); + } + + IR::Block ir_block = A32::Translate(A32::LocationDescriptor{descriptor}, [this](u32 vaddr) { return config.callbacks->MemoryReadCode(vaddr); }, {config.define_unpredictable_behaviour, config.hook_hint_instructions}); + if (config.enable_optimizations) { + Optimization::A32GetSetElimination(ir_block); + Optimization::DeadCodeElimination(ir_block); + Optimization::A32ConstantMemoryReads(ir_block, config.callbacks); + Optimization::ConstantPropagation(ir_block); + Optimization::DeadCodeElimination(ir_block); + Optimization::A32MergeInterpretBlocksPass(ir_block, config.callbacks); + } + Optimization::VerificationPass(ir_block); + return emitter.Emit(ir_block); + } +}; + +Jit::Jit(UserConfig config) : impl(std::make_unique(this, std::move(config))) {} + +Jit::~Jit() = default; + +void Jit::Run() { + ASSERT(!is_executing); + is_executing = true; + SCOPE_EXIT { this->is_executing = false; }; + + impl->jit_state.halt_requested = false; + + impl->Execute(); + + impl->PerformCacheInvalidation(); +} + +void Jit::Step() { + ASSERT(!is_executing); + is_executing = true; + SCOPE_EXIT { this->is_executing = false; }; + + impl->jit_state.halt_requested = true; + + impl->Step(); + + impl->PerformCacheInvalidation(); +} + +void Jit::ClearCache() { + impl->invalidate_entire_cache = true; + impl->RequestCacheInvalidation(); +} + +void Jit::InvalidateCacheRange(std::uint32_t start_address, std::size_t length) { + impl->invalid_cache_ranges.add(boost::icl::discrete_interval::closed(start_address, static_cast(start_address + length - 1))); + impl->RequestCacheInvalidation(); +} + +void Jit::Reset() { + ASSERT(!is_executing); + impl->jit_state = {}; +} + +void Jit::HaltExecution() { + impl->jit_state.halt_requested = true; +} + +std::array& Jit::Regs() { + return impl->jit_state.Reg; +} +const std::array& Jit::Regs() const { + return impl->jit_state.Reg; +} + +std::array& Jit::ExtRegs() { + return impl->jit_state.ExtReg; +} + +const std::array& Jit::ExtRegs() const { + return impl->jit_state.ExtReg; +} + +u32 Jit::Cpsr() const { + return impl->jit_state.Cpsr(); +} + +void Jit::SetCpsr(u32 value) { + return impl->jit_state.SetCpsr(value); +} + +u32 Jit::Fpscr() const { + return impl->jit_state.Fpscr(); +} + +void Jit::SetFpscr(u32 value) { + return impl->jit_state.SetFpscr(value); +} + +Context Jit::SaveContext() const { + Context ctx; + SaveContext(ctx); + return ctx; +} + +struct Context::Impl { + A32JitState jit_state; + size_t invalid_cache_generation; +}; + +Context::Context() : impl(std::make_unique()) { impl->jit_state.ResetRSB(); } +Context::~Context() = default; +Context::Context(const Context& ctx) : impl(std::make_unique(*ctx.impl)) {} +Context::Context(Context&& ctx) noexcept : 
impl(std::move(ctx.impl)) {} +Context& Context::operator=(const Context& ctx) { + *impl = *ctx.impl; + return *this; +} +Context& Context::operator=(Context&& ctx) noexcept { + impl = std::move(ctx.impl); + return *this; +} + +std::array& Context::Regs() { + return impl->jit_state.Reg; +} +const std::array& Context::Regs() const { + return impl->jit_state.Reg; +} +std::array& Context::ExtRegs() { + return impl->jit_state.ExtReg; +} +const std::array& Context::ExtRegs() const { + return impl->jit_state.ExtReg; +} + +std::uint32_t Context::Cpsr() const { + return impl->jit_state.Cpsr(); +} +void Context::SetCpsr(std::uint32_t value) { + impl->jit_state.SetCpsr(value); +} + +std::uint32_t Context::Fpscr() const { + return impl->jit_state.Fpscr(); +} +void Context::SetFpscr(std::uint32_t value) { + return impl->jit_state.SetFpscr(value); +} + +void Jit::SaveContext(Context& ctx) const { + ctx.impl->jit_state.TransferJitState(impl->jit_state, false); + ctx.impl->invalid_cache_generation = impl->invalid_cache_generation; +} + +void Jit::LoadContext(const Context& ctx) { + bool reset_rsb = ctx.impl->invalid_cache_generation != impl->invalid_cache_generation; + impl->jit_state.TransferJitState(ctx.impl->jit_state, reset_rsb); +} + +std::string Jit::Disassemble() const { + std::string result; +#ifdef DYNARMIC_USE_LLVM + for (const u32* pos = reinterpret_cast(impl->block_of_code.GetCodeBegin()); + reinterpret_cast(pos) < reinterpret_cast(impl->block_of_code.GetCodePtr()); pos += 1) { + fmt::print("0x{:02x} 0x{:02x} ", reinterpret_cast(pos), *pos); + fmt::print("{}", Common::DisassembleAArch64(*pos, reinterpret_cast(pos))); + result += Common::DisassembleAArch64(*pos, reinterpret_cast(pos)); + } +#endif + return result; +} + +} // namespace Dynarmic::A32 diff --git a/src/dynarmic/backend/A64/a32_jitstate.cpp b/src/dynarmic/backend/A64/a32_jitstate.cpp new file mode 100644 index 00000000..2ae33118 --- /dev/null +++ b/src/dynarmic/backend/A64/a32_jitstate.cpp @@ -0,0 +1,172 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. + */ + +#include "backend/A64/a32_jitstate.h" +#include "backend/A64/block_of_code.h" +#include "common/assert.h" +#include "common/bit_util.h" +#include "common/common_types.h" +#include "frontend/A32/location_descriptor.h" + +namespace Dynarmic::BackendA64 { + +/** + * CPSR Bits + * ========= + * + * ARM CPSR flags + * -------------- + * N bit 31 Negative flag + * Z bit 30 Zero flag + * C bit 29 Carry flag + * V bit 28 oVerflow flag + * Q bit 27 Saturation flag + * IT[1:0] bits 25-26 If-Then execution state (lower 2 bits) + * J bit 24 Jazelle instruction set flag + * GE bits 16-19 Greater than or Equal flags + * IT[7:2] bits 10-15 If-Then execution state (upper 6 bits) + * E bit 9 Data Endianness flag + * A bit 8 Disable imprecise Aborts + * I bit 7 Disable IRQ interrupts + * F bit 6 Disable FIQ interrupts + * T bit 5 Thumb instruction set flag + * M bits 0-4 Processor Mode bits + * + * A64 flags + * ------------------- + * N bit 31 Negative flag + * Z bit 30 Zero flag + * C bit 29 Carry flag + * V bit 28 oVerflow flag + */ + +u32 A32JitState::Cpsr() const { + DEBUG_ASSERT((cpsr_nzcv & ~0xF0000000) == 0); + DEBUG_ASSERT((cpsr_q & ~1) == 0); + DEBUG_ASSERT((cpsr_jaifm & ~0x010001DF) == 0); + + u32 cpsr = 0; + + // NZCV flags + cpsr |= cpsr_nzcv; + // Q flag + cpsr |= cpsr_q ? 
1 << 27 : 0; + // GE flags + cpsr |= Common::Bit<31>(cpsr_ge) ? 1 << 19 : 0; + cpsr |= Common::Bit<23>(cpsr_ge) ? 1 << 18 : 0; + cpsr |= Common::Bit<15>(cpsr_ge) ? 1 << 17 : 0; + cpsr |= Common::Bit<7>(cpsr_ge) ? 1 << 16 : 0; + // E flag, T flag + cpsr |= Common::Bit<1>(upper_location_descriptor) ? 1 << 9 : 0; + cpsr |= Common::Bit<0>(upper_location_descriptor) ? 1 << 5 : 0; + // IT state + cpsr |= static_cast(upper_location_descriptor & 0b11111100'00000000); + cpsr |= static_cast(upper_location_descriptor & 0b00000011'00000000) << 17; + // Other flags + cpsr |= cpsr_jaifm; + + return cpsr; +} + +void A32JitState::SetCpsr(u32 cpsr) { + // NZCV flags + cpsr_nzcv = cpsr & 0xF0000000; + // Q flag + cpsr_q = Common::Bit<27>(cpsr) ? 1 : 0; + // GE flags + cpsr_ge = 0; + cpsr_ge |= Common::Bit<19>(cpsr) ? 0xFF000000 : 0; + cpsr_ge |= Common::Bit<18>(cpsr) ? 0x00FF0000 : 0; + cpsr_ge |= Common::Bit<17>(cpsr) ? 0x0000FF00 : 0; + cpsr_ge |= Common::Bit<16>(cpsr) ? 0x000000FF : 0; + + upper_location_descriptor &= 0xFFFF0000; + // E flag, T flag + upper_location_descriptor |= Common::Bit<9>(cpsr) ? 2 : 0; + upper_location_descriptor |= Common::Bit<5>(cpsr) ? 1 : 0; + // IT state + upper_location_descriptor |= (cpsr >> 0) & 0b11111100'00000000; + upper_location_descriptor |= (cpsr >> 17) & 0b00000011'00000000; + + // Other flags + cpsr_jaifm = cpsr & 0x010001DF; +} + +void A32JitState::ResetRSB() { + rsb_location_descriptors.fill(0xFFFFFFFFFFFFFFFFull); + rsb_codeptrs.fill(0); +} + +/** + * FPSCR + * ========================= + * + * VFP FPSCR cumulative exception bits + * ----------------------------------- + * IDC bit 7 Input Denormal cumulative exception bit // Only ever set when FPSCR.FTZ = 1 + * IXC bit 4 Inexact cumulative exception bit + * UFC bit 3 Underflow cumulative exception bit + * OFC bit 2 Overflow cumulative exception bit + * DZC bit 1 Division by Zero cumulative exception bit + * IOC bit 0 Invalid Operation cumulative exception bit + * + * VFP FPSCR exception trap enables + * -------------------------------- + * IDE bit 15 Input Denormal exception trap enable + * IXE bit 12 Inexact exception trap enable + * UFE bit 11 Underflow exception trap enable + * OFE bit 10 Overflow exception trap enable + * DZE bit 9 Division by Zero exception trap enable + * IOE bit 8 Invalid Operation exception trap enable + * + * VFP FPSCR mode bits + * ------------------- + * AHP bit 26 Alternate half-precision + * DN bit 25 Default NaN + * FZ bit 24 Flush to Zero + * RMode bits 22-23 Round to {0 = Nearest, 1 = Positive, 2 = Negative, 3 = Zero} + * Stride bits 20-21 Vector stride + * Len bits 16-18 Vector length + */ + +// NZCV; QC (ASIMD only), AHP; DN, FZ, RMode, Stride; SBZP; Len; trap enables; cumulative bits +constexpr u32 FPSCR_MODE_MASK = A32::LocationDescriptor::FPSCR_MODE_MASK; +constexpr u32 FPSCR_NZCV_MASK = 0xF0000000; + +u32 A32JitState::Fpscr() const { + DEBUG_ASSERT((fpsr_nzcv & ~FPSCR_NZCV_MASK) == 0); + + const u32 fpcr_mode = static_cast(upper_location_descriptor) & FPSCR_MODE_MASK; + + u32 FPSCR = fpcr_mode | fpsr_nzcv; + FPSCR |= (guest_fpsr & 0x1F); + FPSCR |= fpsr_exc; + + return FPSCR; +} + +void A32JitState::SetFpscr(u32 FPSCR) { + // Ensure that only upper half of upper_location_descriptor is used for FPSCR bits. 
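Editorial aside: from the bit table above, the mode bits (AHP, DN, FZ, RMode, Stride, Len) occupy bits 16-26, so the mode mask works out to 0x07F70000 and fits entirely in the upper half of upper_location_descriptor; the static_assert that follows pins that invariant down. A small round-trip sketch of the packing (the constant and helper names here are illustrative; the real mask comes from A32::LocationDescriptor):

#include <cassert>
#include <cstdint>

// Derived from the FPSCR bit table above: AHP|DN|FZ|RMode|Stride|Len.
constexpr std::uint32_t kFpscrModeMask = 0x07F70000;
static_assert((kFpscrModeMask & 0xFFFF0000) == kFpscrModeMask,
              "mode bits must stay in the upper half");

// Mirrors the two statements that follow in SetFpscr.
std::uint32_t PackUpper(std::uint32_t upper_location_descriptor, std::uint32_t fpscr) {
    upper_location_descriptor &= 0x0000FFFF;             // keep T/E/IT state
    upper_location_descriptor |= fpscr & kFpscrModeMask; // overlay FPSCR mode bits
    return upper_location_descriptor;
}

int main() {
    // T flag set in the lower half, DN|FZ requested by the guest FPSCR.
    const std::uint32_t upper = PackUpper(/*upper_location_descriptor=*/0x00000001,
                                          /*fpscr=*/0x03000000);
    assert((upper >> 16) == 0x0300);    // mode bits landed in the top half
    assert((upper & 0xFFFF) == 0x0001); // T/E/IT bits untouched
    return 0;
}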
+ static_assert((FPSCR_MODE_MASK & 0xFFFF0000) == FPSCR_MODE_MASK); + + upper_location_descriptor &= 0x0000FFFF; + upper_location_descriptor |= FPSCR & FPSCR_MODE_MASK; + + fpsr_nzcv = FPSCR & FPSCR_NZCV_MASK; + guest_fpcr = 0; + guest_fpsr = 0; + + // Cumulative flags IDC, IOC, IXC, UFC, OFC, DZC + fpsr_exc = FPSCR & 0x9F; + + // Mode Bits + guest_fpcr |= FPSCR & 0x07C09F00; + + // Exceptions + guest_fpsr |= FPSCR & 0x9F; +} + +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/a32_jitstate.h b/src/dynarmic/backend/A64/a32_jitstate.h new file mode 100644 index 00000000..0783fb2b --- /dev/null +++ b/src/dynarmic/backend/A64/a32_jitstate.h @@ -0,0 +1,111 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. + */ + +#pragma once + +#include + +#include "common/common_types.h" + +namespace Dynarmic::BackendA64 { + +class BlockOfCode; + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable:4324) // Structure was padded due to alignment specifier +#endif + +struct A32JitState { + using ProgramCounterType = u32; + + A32JitState() { ResetRSB(); } + + std::array Reg{}; // Current register file. + // TODO: Mode-specific register sets unimplemented. + + u32 upper_location_descriptor = 0; + + u32 cpsr_ge = 0; + u32 cpsr_q = 0; + u32 cpsr_nzcv = 0; + u32 cpsr_jaifm = 0; + u32 Cpsr() const; + void SetCpsr(u32 cpsr); + + alignas(u64) std::array ExtReg{}; // Extension registers. + + static constexpr size_t SpillCount = 64; + std::array Spill{}; // Spill. + static size_t GetSpillLocationOffsetFromIndex(size_t i) { + return static_cast(offsetof(A32JitState, Spill) + i * sizeof(u64)); + } + + // For internal use (See: BlockOfCode::RunCode) + u64 guest_fpcr = 0; + u64 guest_fpsr = 0; + u64 save_host_FPCR = 0; + s64 cycles_to_run = 0; + s64 cycles_remaining = 0; + bool halt_requested = false; + bool check_bit = false; + + // Exclusive state + static constexpr u32 RESERVATION_GRANULE_MASK = 0xFFFFFFF8; + u32 exclusive_state = 0; + u32 exclusive_address = 0; + + static constexpr size_t RSBSize = 8; // MUST be a power of 2. 
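Editorial aside: RSBSize must be a power of two so that RSBPtrMask, declared next, can wrap the ring-buffer index with a single AND rather than a modulo; the RSB fast path in Jit::Impl::Execute earlier in this patch relies on the same trick. A self-contained sketch of that return-stack-buffer scheme (the types and two-function interface are illustrative, not dynarmic's real API):

#include <array>
#include <cassert>
#include <cstdint>

constexpr std::size_t kRsbSize = 8;            // must be a power of two
constexpr std::size_t kRsbMask = kRsbSize - 1; // single-AND wraparound

struct MiniRsb {
    std::array<std::uint64_t, kRsbSize> location{};  // unique hashes of call sites
    std::array<const void*, kRsbSize> code_ptr{};    // predicted host entry points
    std::size_t ptr = 0;

    void Push(std::uint64_t hash, const void* code) {
        location[ptr] = hash;
        code_ptr[ptr] = code;
        ptr = (ptr + 1) & kRsbMask;                  // wrap without division
    }

    // Pop-side check, as in the RSB fast path: only hit when the predicted
    // location matches the current unique hash.
    const void* TryPop(std::uint64_t current_hash) {
        const std::size_t prev = (ptr - 1) & kRsbMask;
        if (location[prev] != current_hash)
            return nullptr;                          // misprediction: fall back to lookup
        ptr = prev;
        return code_ptr[prev];
    }
};

int main() {
    MiniRsb rsb;
    int dummy = 0;
    rsb.Push(/*hash=*/0x1234, &dummy);
    assert(rsb.TryPop(0x1234) == &dummy);   // predicted return hits
    assert(rsb.TryPop(0x9999) == nullptr);  // mismatch falls back to a full lookup
}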
+ static constexpr size_t RSBPtrMask = RSBSize - 1; + u32 rsb_ptr = 0; + std::array rsb_location_descriptors; + std::array rsb_codeptrs; + void ResetRSB(); + + u32 fpsr_exc = 0; + u32 fpsr_qc = 0; // Dummy value + u32 fpsr_nzcv = 0; + u32 Fpscr() const; + void SetFpscr(u32 FPSCR); + + u64 GetUniqueHash() const noexcept { + return (static_cast(upper_location_descriptor) << 32) | (static_cast(Reg[15])); + } + + void TransferJitState(const A32JitState& src, bool reset_rsb) { + Reg = src.Reg; + upper_location_descriptor = src.upper_location_descriptor; + cpsr_ge = src.cpsr_ge; + cpsr_q = src.cpsr_q; + cpsr_nzcv = src.cpsr_nzcv; + cpsr_jaifm = src.cpsr_jaifm; + ExtReg = src.ExtReg; + guest_fpcr = src.guest_fpcr; + guest_fpsr = src.guest_fpsr; + fpsr_exc = src.fpsr_exc; + fpsr_qc = src.fpsr_qc; + fpsr_nzcv = src.fpsr_nzcv; + + exclusive_state = 0; + exclusive_address = 0; + + if (reset_rsb) { + ResetRSB(); + } else { + rsb_ptr = src.rsb_ptr; + rsb_location_descriptors = src.rsb_location_descriptors; + rsb_codeptrs = src.rsb_codeptrs; + } + } +}; + +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +using CodePtr = const void*; + +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/abi.cpp b/src/dynarmic/backend/A64/abi.cpp new file mode 100644 index 00000000..24fc1cb8 --- /dev/null +++ b/src/dynarmic/backend/A64/abi.cpp @@ -0,0 +1,87 @@ +// Copyright (C) 2003 Dolphin Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0 or later versions. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// 20th Sep 2018: This code was modified for Dynarmic. 
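Editorial aside: the helpers defined below walk a list of HostLoc values and fold them into two bitmasks, one for GPRs and one for FPRs, before handing them to the emitter's bulk push/pop routines. A hedged sketch of just that mask-building step; the HostLoc numbering is simplified here, only the 32-registers-per-bank split from the real code is kept:

#include <cstdint>
#include <cstdio>
#include <initializer_list>

// Simplified stand-in for HostLoc: 0-30 are GPRs X0-X30, 32-63 are FPRs Q0-Q31.
enum class MiniHostLoc : unsigned { X0 = 0, X1, X2, X19 = 19, Q8 = 40, Q9 = 41 };

struct RegMasks {
    std::uint32_t gprs = 0;
    std::uint32_t fprs = 0;
};

// Mirrors the loop in ABI_PushRegistersAndAdjustStack: one bit per register.
RegMasks BuildMasks(std::initializer_list<MiniHostLoc> regs) {
    RegMasks m;
    for (MiniHostLoc r : regs) {
        const unsigned index = static_cast<unsigned>(r);
        if (index < 32)
            m.gprs |= 1u << index;         // GPR bank
        else
            m.fprs |= 1u << (index - 32);  // FPR bank
    }
    return m;
}

int main() {
    const RegMasks m = BuildMasks({MiniHostLoc::X0, MiniHostLoc::X19, MiniHostLoc::Q8});
    std::printf("gprs=%08x fprs=%08x\n", (unsigned)m.gprs, (unsigned)m.fprs);
    // prints: gprs=00080001 fprs=00000100
}

Packing the selection into masks lets the emitter store register pairs in bulk instead of issuing one push per register, which matters because these sequences run on every host call.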
+ +#include +#include + +#include "backend/A64/abi.h" +#include "common/common_types.h" +#include "common/math_util.h" +#include "common/iterator_util.h" + +namespace Dynarmic::BackendA64 { + +template +void ABI_PushRegistersAndAdjustStack(BlockOfCode& code, const RegisterArrayT& regs) { + u32 gprs = 0 , fprs = 0; + + for (HostLoc reg : regs) { + if (HostLocIsGPR(reg)) { + gprs |= 0x1 << static_cast(DecodeReg(HostLocToReg64(reg))); + } else if (HostLocIsFPR(reg)) { + fprs |= 0x1 << static_cast(DecodeReg(HostLocToFpr(reg))); + } + } + + code.fp_emitter.ABI_PushRegisters(fprs); + code.ABI_PushRegisters(gprs); +} + +template +void ABI_PopRegistersAndAdjustStack(BlockOfCode& code, const RegisterArrayT& regs) { + u32 gprs = 0, fprs = 0; + + for (HostLoc reg : regs) { + if (HostLocIsGPR(reg)) { + gprs |= 0x1 << static_cast(DecodeReg(HostLocToReg64(reg))); + } else if (HostLocIsFPR(reg)) { + fprs |= 0x1 << static_cast(DecodeReg(HostLocToFpr(reg))); + } + } + + code.ABI_PopRegisters(gprs); + code.fp_emitter.ABI_PopRegisters(fprs); +} + +void ABI_PushCalleeSaveRegistersAndAdjustStack(BlockOfCode& code) { + ABI_PushRegistersAndAdjustStack(code, ABI_ALL_CALLEE_SAVE); +} + +void ABI_PopCalleeSaveRegistersAndAdjustStack(BlockOfCode& code) { + ABI_PopRegistersAndAdjustStack(code, ABI_ALL_CALLEE_SAVE); +} + +void ABI_PushCallerSaveRegistersAndAdjustStack(BlockOfCode& code) { + ABI_PushRegistersAndAdjustStack(code, ABI_ALL_CALLER_SAVE); +} + +void ABI_PopCallerSaveRegistersAndAdjustStack(BlockOfCode& code) { + ABI_PopRegistersAndAdjustStack(code, ABI_ALL_CALLER_SAVE); +} + +void ABI_PushCallerSaveRegistersAndAdjustStackExcept(BlockOfCode& code, HostLoc exception) { + std::vector regs; + std::remove_copy(ABI_ALL_CALLER_SAVE.begin(), ABI_ALL_CALLER_SAVE.end(), std::back_inserter(regs), exception); + ABI_PushRegistersAndAdjustStack(code, regs); +} + +void ABI_PopCallerSaveRegistersAndAdjustStackExcept(BlockOfCode& code, HostLoc exception) { + std::vector regs; + std::remove_copy(ABI_ALL_CALLER_SAVE.begin(), ABI_ALL_CALLER_SAVE.end(), std::back_inserter(regs), exception); + ABI_PopRegistersAndAdjustStack(code, regs); +} + +} // namespace Dynarmic::BackendX64 diff --git a/src/dynarmic/backend/A64/abi.h b/src/dynarmic/backend/A64/abi.h new file mode 100644 index 00000000..1bbff25a --- /dev/null +++ b/src/dynarmic/backend/A64/abi.h @@ -0,0 +1,110 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. 
+ */ +#pragma once + +#include + +#include "backend/A64/block_of_code.h" +#include "backend/A64/hostloc.h" + +namespace Dynarmic::BackendA64 { + +constexpr HostLoc ABI_RETURN = HostLoc::X0; + +constexpr HostLoc ABI_PARAM1 = HostLoc::X0; +constexpr HostLoc ABI_PARAM2 = HostLoc::X1; +constexpr HostLoc ABI_PARAM3 = HostLoc::X2; +constexpr HostLoc ABI_PARAM4 = HostLoc::X3; +constexpr HostLoc ABI_PARAM5 = HostLoc::X4; +constexpr HostLoc ABI_PARAM6 = HostLoc::X5; +constexpr HostLoc ABI_PARAM7 = HostLoc::X6; +constexpr HostLoc ABI_PARAM8 = HostLoc::X7; + +constexpr std::array ABI_ALL_CALLER_SAVE = { + HostLoc::X0, + HostLoc::X1, + HostLoc::X2, + HostLoc::X3, + HostLoc::X4, + HostLoc::X5, + HostLoc::X6, + HostLoc::X7, + HostLoc::X8, + HostLoc::X9, + HostLoc::X10, + HostLoc::X11, + HostLoc::X12, + HostLoc::X13, + HostLoc::X14, + HostLoc::X15, + HostLoc::X16, + HostLoc::X17, + HostLoc::X18, + + HostLoc::Q0, + HostLoc::Q1, + HostLoc::Q2, + HostLoc::Q3, + HostLoc::Q4, + HostLoc::Q5, + HostLoc::Q6, + HostLoc::Q7, + + HostLoc::Q16, + HostLoc::Q17, + HostLoc::Q18, + HostLoc::Q19, + HostLoc::Q20, + HostLoc::Q21, + HostLoc::Q22, + HostLoc::Q23, + HostLoc::Q24, + HostLoc::Q25, + HostLoc::Q26, + HostLoc::Q27, + HostLoc::Q28, + HostLoc::Q29, + HostLoc::Q30, + HostLoc::Q31, +}; + +constexpr std::array ABI_ALL_CALLEE_SAVE = { + HostLoc::X19, + HostLoc::X20, + HostLoc::X21, + HostLoc::X22, + HostLoc::X23, + HostLoc::X24, + HostLoc::X25, + HostLoc::X26, + HostLoc::X27, + HostLoc::X28, + HostLoc::X29, + HostLoc::X30, + + HostLoc::Q8, + HostLoc::Q9, + HostLoc::Q10, + HostLoc::Q11, + HostLoc::Q12, + HostLoc::Q13, + HostLoc::Q14, + HostLoc::Q15, +}; + +constexpr size_t ABI_SHADOW_SPACE = 0; // bytes + +static_assert(ABI_ALL_CALLER_SAVE.size() + ABI_ALL_CALLEE_SAVE.size() == 63, "Invalid total number of registers"); + +void ABI_PushCalleeSaveRegistersAndAdjustStack(BlockOfCode& code); +void ABI_PopCalleeSaveRegistersAndAdjustStack(BlockOfCode& code); +void ABI_PushCallerSaveRegistersAndAdjustStack(BlockOfCode& code); +void ABI_PopCallerSaveRegistersAndAdjustStack(BlockOfCode& code); + +void ABI_PushCallerSaveRegistersAndAdjustStackExcept(BlockOfCode& code, HostLoc exception); +void ABI_PopCallerSaveRegistersAndAdjustStackExcept(BlockOfCode& code, HostLoc exception); + +} // namespace Dynarmic::BackendX64 diff --git a/src/dynarmic/backend/A64/block_of_code.cpp b/src/dynarmic/backend/A64/block_of_code.cpp new file mode 100644 index 00000000..2c534fc1 --- /dev/null +++ b/src/dynarmic/backend/A64/block_of_code.cpp @@ -0,0 +1,336 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. 
+ */ + +#include +#include +#include + +#include "backend/A64/a32_jitstate.h" +#include "backend/A64/abi.h" +#include "backend/A64/block_of_code.h" +#include "backend/A64/perf_map.h" +#include "common/assert.h" + +#ifdef _WIN32 + #include +#else + #include +#endif + +#ifdef __APPLE__ +#include +#endif + +namespace Dynarmic::BackendA64 { + +const Arm64Gen::ARM64Reg BlockOfCode::ABI_RETURN = Arm64Gen::ARM64Reg::X0; +const Arm64Gen::ARM64Reg BlockOfCode::ABI_RETURN2 = Arm64Gen::ARM64Reg::X1; + +const Arm64Gen::ARM64Reg BlockOfCode::ABI_PARAM1 = Arm64Gen::ARM64Reg::X0; +const Arm64Gen::ARM64Reg BlockOfCode::ABI_PARAM2 = Arm64Gen::ARM64Reg::X1; +const Arm64Gen::ARM64Reg BlockOfCode::ABI_PARAM3 = Arm64Gen::ARM64Reg::X2; +const Arm64Gen::ARM64Reg BlockOfCode::ABI_PARAM4 = Arm64Gen::ARM64Reg::X3; +const Arm64Gen::ARM64Reg BlockOfCode::ABI_PARAM5 = Arm64Gen::ARM64Reg::X4; +const Arm64Gen::ARM64Reg BlockOfCode::ABI_PARAM6 = Arm64Gen::ARM64Reg::X5; +const Arm64Gen::ARM64Reg BlockOfCode::ABI_PARAM7 = Arm64Gen::ARM64Reg::X6; +const Arm64Gen::ARM64Reg BlockOfCode::ABI_PARAM8 = Arm64Gen::ARM64Reg::X7; + +const Arm64Gen::ARM64Reg BlockOfCode::ABI_SCRATCH1 = Arm64Gen::ARM64Reg::X30; + +const std::array BlockOfCode::ABI_PARAMS = {BlockOfCode::ABI_PARAM1, BlockOfCode::ABI_PARAM2, + BlockOfCode::ABI_PARAM3, BlockOfCode::ABI_PARAM4, + BlockOfCode::ABI_PARAM5, BlockOfCode::ABI_PARAM6, + BlockOfCode::ABI_PARAM7, BlockOfCode::ABI_PARAM8}; + +namespace { + +constexpr size_t TOTAL_CODE_SIZE = 128 * 1024 * 1024; +constexpr size_t FAR_CODE_OFFSET = 100 * 1024 * 1024; + +#ifdef DYNARMIC_ENABLE_NO_EXECUTE_SUPPORT +void ProtectMemory([[maybe_unused]] const void* base, [[maybe_unused]] size_t size, bool is_executable) { +#if defined(_WIN32) + DWORD oldProtect = 0; + VirtualProtect(const_cast(base), size, is_executable ? PAGE_EXECUTE_READ : PAGE_READWRITE, &oldProtect); +#elif defined(__APPLE__) + pthread_jit_write_protect_np(is_executable); +#else + static const size_t pageSize = sysconf(_SC_PAGESIZE); + const size_t iaddr = reinterpret_cast(base); + const size_t roundAddr = iaddr & ~(pageSize - static_cast(1)); + const int mode = is_executable ? (PROT_READ | PROT_EXEC) : (PROT_READ | PROT_WRITE); + mprotect(reinterpret_cast(roundAddr), size + (iaddr - roundAddr), mode); +#endif +} +#endif + +} // anonymous namespace + +BlockOfCode::BlockOfCode(RunCodeCallbacks cb, JitStateInfo jsi) + : fp_emitter(this) + , cb(std::move(cb)) + , jsi(jsi) + , constant_pool(*this) { + AllocCodeSpace(TOTAL_CODE_SIZE); + EnableWriting(); + GenRunCode(); +} + +void BlockOfCode::PreludeComplete() { + prelude_complete = true; + near_code_begin = GetCodePtr(); + far_code_begin = GetCodePtr() + FAR_CODE_OFFSET; + FlushIcache(); + ClearCache(); + DisableWriting(); +} + +void BlockOfCode::EnableWriting() { +#ifdef DYNARMIC_ENABLE_NO_EXECUTE_SUPPORT + ProtectMemory(GetCodePtr(), TOTAL_CODE_SIZE, false); +#endif +} + +void BlockOfCode::DisableWriting() { +#ifdef DYNARMIC_ENABLE_NO_EXECUTE_SUPPORT + ProtectMemory(GetCodePtr(), TOTAL_CODE_SIZE, true); +#endif +} + +void BlockOfCode::ClearCache() { + ASSERT(prelude_complete); + in_far_code = false; + near_code_ptr = near_code_begin; + far_code_ptr = far_code_begin; + SetCodePtr(near_code_begin); + constant_pool.Clear(); +} + +size_t BlockOfCode::SpaceRemaining() const { + ASSERT(prelude_complete); + // This function provides an underestimate of near-code-size but that's okay. + // (Why? The maximum size of near code should be measured from near_code_begin, not top_.) 
+ // These are offsets from Xbyak::CodeArray::top_. + std::size_t far_code_offset, near_code_offset; + if (in_far_code) { + near_code_offset = static_cast(near_code_ptr) - static_cast(region); + far_code_offset = GetCodePtr() - static_cast(region); + } else { + near_code_offset = GetCodePtr() - static_cast(region); + far_code_offset = static_cast(far_code_ptr) - static_cast(region); + } + if (far_code_offset > TOTAL_CODE_SIZE) + return 0; + if (near_code_offset > FAR_CODE_OFFSET) + return 0; + return std::min(TOTAL_CODE_SIZE - far_code_offset, FAR_CODE_OFFSET - near_code_offset); +} + +void BlockOfCode::RunCode(void* jit_state, CodePtr code_ptr) const { + run_code(jit_state, code_ptr); +} + +void BlockOfCode::StepCode(void* jit_state, CodePtr code_ptr) const { + step_code(jit_state, code_ptr); +} + +void BlockOfCode::ReturnFromRunCode(bool fpscr_already_exited) { + size_t index = 0; + if (fpscr_already_exited) + index |= FPSCR_ALREADY_EXITED; + B(return_from_run_code[index]); +} + +void BlockOfCode::ForceReturnFromRunCode(bool fpscr_already_exited) { + size_t index = FORCE_RETURN; + if (fpscr_already_exited) + index |= FPSCR_ALREADY_EXITED; + B(return_from_run_code[index]); +} + +void BlockOfCode::GenRunCode() { + const u8* loop, *enter_fpscr_then_loop; + + AlignCode16(); + run_code = reinterpret_cast(GetWritableCodePtr()); + + // This serves two purposes: + // 1. It saves all the registers we as a callee need to save. + // 2. It aligns the stack so that the code the JIT emits can assume + // that the stack is appropriately aligned for CALLs. + ABI_PushCalleeSaveRegistersAndAdjustStack(*this); + + MOV(Arm64Gen::X28, ABI_PARAM1); + MOVI2R(Arm64Gen::X27, cb.value_in_X27); + MOV(Arm64Gen::X25, ABI_PARAM2); // save temporarily in non-volatile register + + cb.GetTicksRemaining->EmitCall(*this); + STR(Arm64Gen::INDEX_UNSIGNED, ABI_RETURN, Arm64Gen::X28, jsi.offsetof_cycles_to_run); + MOV(Arm64Gen::X26, ABI_RETURN); + + SwitchFpscrOnEntry(); + BR(Arm64Gen::X25); + + AlignCode16(); + step_code = reinterpret_cast(GetWritableCodePtr()); + ABI_PushCalleeSaveRegistersAndAdjustStack(*this); + + MOV(Arm64Gen::X28, ABI_PARAM1); + + MOVI2R(Arm64Gen::X26, 1); + STR(Arm64Gen::INDEX_UNSIGNED, Arm64Gen::X26, Arm64Gen::X28, jsi.offsetof_cycles_to_run); + + SwitchFpscrOnEntry(); + BR(ABI_PARAM2); + + enter_fpscr_then_loop = GetCodePtr(); + SwitchFpscrOnEntry(); + loop = GetCodePtr(); + cb.LookupBlock->EmitCall(*this); + BR(ABI_RETURN); + + // Return from run code variants + const auto emit_return_from_run_code = [this, &loop, &enter_fpscr_then_loop](bool fpscr_already_exited, bool force_return){ + if (!force_return) { + CMP(Arm64Gen::X26, Arm64Gen::ZR); + B(CC_GT, fpscr_already_exited ? 
enter_fpscr_then_loop : loop); + } + + if (!fpscr_already_exited) { + SwitchFpscrOnExit(); + } + + cb.AddTicks->EmitCall(*this, [this](RegList param) { + LDR(Arm64Gen::INDEX_UNSIGNED, param[0], Arm64Gen::X28, jsi.offsetof_cycles_to_run); + SUB(param[0], param[0], Arm64Gen::X26); + }); + + ABI_PopCalleeSaveRegistersAndAdjustStack(*this); + RET(); + }; + + return_from_run_code[0] = AlignCode16(); + emit_return_from_run_code(false, false); + + return_from_run_code[FPSCR_ALREADY_EXITED] = AlignCode16(); + emit_return_from_run_code(true, false); + + return_from_run_code[FORCE_RETURN] = AlignCode16(); + emit_return_from_run_code(false, true); + + return_from_run_code[FPSCR_ALREADY_EXITED | FORCE_RETURN] = AlignCode16(); + emit_return_from_run_code(true, true); + + PerfMapRegister(run_code, GetCodePtr(), "dynarmic_dispatcher"); +} + +void BlockOfCode::SwitchFpscrOnEntry() { + MRS(ABI_SCRATCH1, Arm64Gen::FIELD_FPCR); + STR(Arm64Gen::INDEX_UNSIGNED, ABI_SCRATCH1, Arm64Gen::X28, jsi.offsetof_save_host_FPCR); + + LDR(Arm64Gen::INDEX_UNSIGNED, ABI_SCRATCH1, Arm64Gen::X28, jsi.offsetof_guest_fpcr); + _MSR(Arm64Gen::FIELD_FPCR, ABI_SCRATCH1); + LDR(Arm64Gen::INDEX_UNSIGNED, ABI_SCRATCH1, Arm64Gen::X28, jsi.offsetof_guest_fpsr); + _MSR(Arm64Gen::FIELD_FPSR, ABI_SCRATCH1); +} + +void BlockOfCode::SwitchFpscrOnExit() { + MRS(ABI_SCRATCH1, Arm64Gen::FIELD_FPCR); + STR(Arm64Gen::INDEX_UNSIGNED, ABI_SCRATCH1, Arm64Gen::X28, jsi.offsetof_guest_fpcr); + MRS(ABI_SCRATCH1, Arm64Gen::FIELD_FPSR); + STR(Arm64Gen::INDEX_UNSIGNED, ABI_SCRATCH1, Arm64Gen::X28, jsi.offsetof_guest_fpsr); + + LDR(Arm64Gen::INDEX_UNSIGNED, ABI_SCRATCH1, Arm64Gen::X28, jsi.offsetof_save_host_FPCR); + _MSR(Arm64Gen::FIELD_FPCR, ABI_SCRATCH1); +} + +void BlockOfCode::UpdateTicks() { + cb.AddTicks->EmitCall(*this, [this](RegList param) { + LDR(Arm64Gen::INDEX_UNSIGNED, param[0], Arm64Gen::X28, jsi.offsetof_cycles_to_run); + SUB(param[0], param[0], Arm64Gen::X26); + }); + + cb.GetTicksRemaining->EmitCall(*this); + STR(Arm64Gen::INDEX_UNSIGNED, ABI_RETURN, Arm64Gen::X28, jsi.offsetof_cycles_to_run); + MOV(Arm64Gen::X26, ABI_RETURN); +} + +void BlockOfCode::LookupBlock() { + cb.LookupBlock->EmitCall(*this); +} + +void BlockOfCode::EmitPatchLDR(Arm64Gen::ARM64Reg Rt, u64 lower, u64 upper) { + ASSERT_MSG(!in_far_code, "Can't patch when in far code, yet!"); + constant_pool.EmitPatchLDR(Rt, lower, upper); +} + +void BlockOfCode::PatchConstPool() { + constant_pool.PatchPool(); +} + +void BlockOfCode::SwitchToFarCode() { + ASSERT(prelude_complete); + ASSERT(!in_far_code); + in_far_code = true; + near_code_ptr = GetCodePtr(); + SetCodePtr(far_code_ptr); + + ASSERT_MSG(near_code_ptr < far_code_begin, "Near code has overwritten far code!"); +} + +void BlockOfCode::SwitchToNearCode() { + ASSERT(prelude_complete); + ASSERT(in_far_code); + in_far_code = false; + far_code_ptr = GetCodePtr(); + SetCodePtr(near_code_ptr); +} + +CodePtr BlockOfCode::GetCodeBegin() const { + return near_code_begin; +} + +u8* BlockOfCode::GetRegion() const { + return region; +} + +std::size_t BlockOfCode::GetRegionSize() const { + return total_region_size; +} + +void* BlockOfCode::AllocateFromCodeSpace(size_t alloc_size) { + ASSERT_MSG(GetSpaceLeft() >= alloc_size, "ERR_CODE_IS_TOO_BIG"); + + void* ret = GetWritableCodePtr(); + region_size += alloc_size; + SetCodePtr(GetCodePtr() + alloc_size); + memset(ret, 0, alloc_size); + return ret; +} + +void BlockOfCode::SetCodePtr(CodePtr code_ptr) { + u8* ptr = const_cast(reinterpret_cast(code_ptr)); + ARM64XEmitter::SetCodePtr(ptr); +} 
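Editorial aside: SwitchToFarCode/SwitchToNearCode above only swap the emitter's write cursor between the two regions, each time remembering where the other region stopped, so cold paths accumulate far away from the hot path. A minimal model of that bookkeeping (the state shown here is an assumption about what matters; the real class additionally carries the prelude and overlap asserts seen above):

#include <cassert>
#include <cstdint>

struct MiniCodeBlock {
    std::uintptr_t cursor;    // where the next instruction would be emitted
    std::uintptr_t near_ptr;  // saved near-code position
    std::uintptr_t far_ptr;   // saved far-code position
    bool in_far = false;

    MiniCodeBlock(std::uintptr_t near_begin, std::uintptr_t far_begin)
        : cursor(near_begin), near_ptr(near_begin), far_ptr(far_begin) {}

    void SwitchToFarCode() {
        assert(!in_far);
        in_far = true;
        near_ptr = cursor;  // remember where near code stopped
        cursor = far_ptr;   // continue emitting in the far region
    }

    void SwitchToNearCode() {
        assert(in_far);
        in_far = false;
        far_ptr = cursor;   // remember where far code stopped
        cursor = near_ptr;  // resume the hot path
    }
};

int main() {
    MiniCodeBlock code(/*near_begin=*/0x1000, /*far_begin=*/0x9000);
    code.cursor += 8;                // emit two near instructions
    code.SwitchToFarCode();
    code.cursor += 4;                // emit one cold instruction
    code.SwitchToNearCode();
    assert(code.cursor == 0x1008);   // near emission resumes where it left off
    assert(code.far_ptr == 0x9004);  // far cursor advanced past the cold path
}

This is the pattern the terminal emitters earlier in the patch use: emit the unlikely exit under far code, then switch back so the fall-through stays dense in the instruction cache.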
+ +void BlockOfCode::EnsurePatchLocationSize(CodePtr begin, size_t size) { + size_t current_size = GetCodePtr() - reinterpret_cast(begin); + ASSERT(current_size <= size); + for (u32 i = 0; i < (size - current_size) / 4; i++) { + HINT(Arm64Gen::HINT_NOP); + } +} + +//bool BlockOfCode::DoesCpuSupport(Xbyak::util::Cpu::Type type) const { +//#ifdef DYNARMIC_ENABLE_CPU_FEATURE_DETECTION +// return cpu_info.has(type); +//#else +// (void)type; +// return false; +//#endif +//} + +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/block_of_code.h b/src/dynarmic/backend/A64/block_of_code.h new file mode 100644 index 00000000..44f5c9a0 --- /dev/null +++ b/src/dynarmic/backend/A64/block_of_code.h @@ -0,0 +1,147 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. + */ + +#pragma once + +#include +#include +#include + +#include "backend/A64/callback.h" +#include "backend/A64/constant_pool.h" +#include "backend/A64/jitstate_info.h" +#include "backend/A64/emitter/a64_emitter.h" +#include "common/common_types.h" + +namespace Dynarmic::BackendA64 { + +using CodePtr = const void*; + +struct RunCodeCallbacks { + std::unique_ptr LookupBlock; + std::unique_ptr AddTicks; + std::unique_ptr GetTicksRemaining; + u64 value_in_X27; +}; + +class BlockOfCode final : public Arm64Gen::ARM64CodeBlock { +public: + BlockOfCode(RunCodeCallbacks cb, JitStateInfo jsi); + BlockOfCode(const BlockOfCode&) = delete; + + + /// Call when external emitters have finished emitting their preludes. + void PreludeComplete(); + + /// Change permissions to RW. This is required to support systems with W^X enforced. + void EnableWriting(); + /// Change permissions to RX. This is required to support systems with W^X enforced. + void DisableWriting(); + + /// Clears this block of code and resets code pointer to beginning. + void ClearCache(); + /// Calculates how much space is remaining to use. This is the minimum of near code and far code. + size_t SpaceRemaining() const; + + /// Runs emulated code from code_ptr. + void RunCode(void* jit_state, CodePtr code_ptr) const; + /// Runs emulated code from code_ptr for a single cycle. + void StepCode(void* jit_state, CodePtr code_ptr) const; + /// Code emitter: Returns to dispatcher + void ReturnFromRunCode(bool fpscr_already_exited = false); + /// Code emitter: Returns to dispatcher, forces return to host + void ForceReturnFromRunCode(bool fpscr_already_exited = false); + /// Code emitter: Makes guest FPSR and FPCR the current FPSR and FPCR + void SwitchFpscrOnEntry(); + /// Code emitter: Makes saved host FPCR the current FPCR + void SwitchFpscrOnExit(); + /// Code emitter: Updates cycles remaining my calling cb.AddTicks and cb.GetTicksRemaining + /// @note this clobbers ABI caller-save registers + void UpdateTicks(); + /// Code emitter: Performs a block lookup based on current state + /// @note this clobbers ABI caller-save registers + void LookupBlock(); + + u64 MConst(u64 lower, u64 upper = 0); + + void EmitPatchLDR(Arm64Gen::ARM64Reg Rt, u64 lower, u64 upper = 0); + + void PatchConstPool(); + + /// Far code sits far away from the near code. Execution remains primarily in near code. + /// "Cold" / Rarely executed instructions sit in far code, so the CPU doesn't fetch them unless necessary. 
+ void SwitchToFarCode(); + void SwitchToNearCode(); + + CodePtr GetCodeBegin() const; + u8* GetRegion() const; + std::size_t GetRegionSize() const; + + const void* GetReturnFromRunCodeAddress() const { + return return_from_run_code[0]; + } + + const void* GetForceReturnFromRunCodeAddress() const { + return return_from_run_code[FORCE_RETURN]; + } + + /// Allocate memory of `size` bytes from the same block of memory the code is in. + /// This is useful for objects that need to be placed close to or within code. + /// The lifetime of this memory is the same as the code around it. + void* AllocateFromCodeSpace(size_t size); + + void SetCodePtr(CodePtr code_ptr); + void EnsurePatchLocationSize(CodePtr begin, size_t size); + + Arm64Gen::ARM64FloatEmitter fp_emitter; + + // ABI registers + + static const Arm64Gen::ARM64Reg ABI_RETURN; + static const Arm64Gen::ARM64Reg ABI_RETURN2; + static const Arm64Gen::ARM64Reg ABI_PARAM1; + static const Arm64Gen::ARM64Reg ABI_PARAM2; + static const Arm64Gen::ARM64Reg ABI_PARAM3; + static const Arm64Gen::ARM64Reg ABI_PARAM4; + static const Arm64Gen::ARM64Reg ABI_PARAM5; + static const Arm64Gen::ARM64Reg ABI_PARAM6; + static const Arm64Gen::ARM64Reg ABI_PARAM7; + static const Arm64Gen::ARM64Reg ABI_PARAM8; + + static const Arm64Gen::ARM64Reg ABI_SCRATCH1; + + static const std::array ABI_PARAMS; + + // bool DoesCpuSupport(Xbyak::util::Cpu::Type type) const; + + JitStateInfo GetJitStateInfo() const { return jsi; } + +private: + RunCodeCallbacks cb; + JitStateInfo jsi; + + bool prelude_complete = false; + CodePtr near_code_begin; + CodePtr far_code_begin; + + ConstantPool constant_pool; + + bool in_far_code = false; + CodePtr near_code_ptr; + CodePtr far_code_ptr; + + using RunCodeFuncType = void(*)(void*, CodePtr); + RunCodeFuncType run_code = nullptr; + RunCodeFuncType step_code = nullptr; + static constexpr size_t FPSCR_ALREADY_EXITED = 1 << 0; + static constexpr size_t FORCE_RETURN = 1 << 1; + std::array return_from_run_code; + void GenRunCode(); + + //Xbyak::util::Cpu cpu_info; +}; + +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/block_range_information.cpp b/src/dynarmic/backend/A64/block_range_information.cpp new file mode 100644 index 00000000..1f85c861 --- /dev/null +++ b/src/dynarmic/backend/A64/block_range_information.cpp @@ -0,0 +1,45 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. + */ + +#include + +#include +#include + +#include "backend/A64/block_range_information.h" +#include "common/common_types.h" + +namespace Dynarmic::BackendA64 { + +template +void BlockRangeInformation::AddRange(boost::icl::discrete_interval range, IR::LocationDescriptor location) { + block_ranges.add(std::make_pair(range, std::set{location})); +} + +template +void BlockRangeInformation::ClearCache() { + block_ranges.clear(); +} + +template +std::unordered_set BlockRangeInformation::InvalidateRanges(const boost::icl::interval_set& ranges) { + std::unordered_set erase_locations; + for (auto invalidate_interval : ranges) { + auto pair = block_ranges.equal_range(invalidate_interval); + for (auto it = pair.first; it != pair.second; ++it) { + for (const auto &descriptor : it->second) { + erase_locations.insert(descriptor); + } + } + } + // TODO: EFFICIENCY: Remove ranges that are to be erased. 
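Editorial aside: the lookup that produced erase_locations (returned just below) leans on boost::icl's interval containers. block_ranges maps guest address ranges to the set of blocks compiled from them, and equal_range yields every stored segment overlapping an invalidated interval. A standalone sketch of the same pattern, with plain ints standing in for IR::LocationDescriptor:

#include <cstdint>
#include <iostream>
#include <set>

#include <boost/icl/interval_map.hpp>
#include <boost/icl/interval_set.hpp>

int main() {
    boost::icl::interval_map<std::uint32_t, std::set<int>> block_ranges;

    // Two compiled blocks covering [0x1000,0x1003] and [0x1002,0x1007].
    block_ranges.add(std::make_pair(
        boost::icl::discrete_interval<std::uint32_t>::closed(0x1000, 0x1003), std::set<int>{1}));
    block_ranges.add(std::make_pair(
        boost::icl::discrete_interval<std::uint32_t>::closed(0x1002, 0x1007), std::set<int>{2}));

    // A write to 0x1002 dirties both blocks.
    boost::icl::interval_set<std::uint32_t> dirty;
    dirty.add(boost::icl::discrete_interval<std::uint32_t>::closed(0x1002, 0x1002));

    std::set<int> to_erase;
    for (const auto& invalidate_interval : dirty) {
        auto pair = block_ranges.equal_range(invalidate_interval);
        for (auto it = pair.first; it != pair.second; ++it)
            to_erase.insert(it->second.begin(), it->second.end());
    }

    for (int loc : to_erase)
        std::cout << "invalidate block " << loc << '\n';  // prints blocks 1 and 2
}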
+ return erase_locations; +} + +template class BlockRangeInformation; +template class BlockRangeInformation; + +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/block_range_information.h b/src/dynarmic/backend/A64/block_range_information.h new file mode 100644 index 00000000..f9d94315 --- /dev/null +++ b/src/dynarmic/backend/A64/block_range_information.h @@ -0,0 +1,29 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. + */ + +#pragma once + +#include + +#include +#include + +#include "frontend/ir/location_descriptor.h" + +namespace Dynarmic::BackendA64 { + +template +class BlockRangeInformation { +public: + void AddRange(boost::icl::discrete_interval range, IR::LocationDescriptor location); + void ClearCache(); + std::unordered_set InvalidateRanges(const boost::icl::interval_set& ranges); + +private: + boost::icl::interval_map> block_ranges; +}; + +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/callback.cpp b/src/dynarmic/backend/A64/callback.cpp new file mode 100644 index 00000000..c7e19f64 --- /dev/null +++ b/src/dynarmic/backend/A64/callback.cpp @@ -0,0 +1,41 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. + */ + +#include "backend/A64/callback.h" +#include "backend/A64/block_of_code.h" + +namespace Dynarmic::BackendA64 { + +Callback::~Callback() = default; + +void SimpleCallback::EmitCall(BlockOfCode& code, std::function l) const { + l({code.ABI_PARAM1, code.ABI_PARAM2, code.ABI_PARAM3, code.ABI_PARAM4}); + code.QuickCallFunction(fn); +} + +void SimpleCallback::EmitCallWithReturnPointer(BlockOfCode& code, std::function l) const { + l(code.ABI_PARAM1, {code.ABI_PARAM2, code.ABI_PARAM3, code.ABI_PARAM4}); + code.QuickCallFunction(fn); +} + +void ArgCallback::EmitCall(BlockOfCode& code, std::function l) const { + l({code.ABI_PARAM2, code.ABI_PARAM3, code.ABI_PARAM4}); + code.MOVI2R(code.ABI_PARAM1, arg); + code.QuickCallFunction(fn); +} + +void ArgCallback::EmitCallWithReturnPointer(BlockOfCode& code, std::function l) const { +#if defined(WIN32) && !defined(__MINGW64__) + l(code.ABI_PARAM2, {code.ABI_PARAM3, code.ABI_PARAM4}); + code.MOVI2R(code.ABI_PARAM1, arg); +#else + l(code.ABI_PARAM1, {code.ABI_PARAM3, code.ABI_PARAM4}); + code.MOVI2R(code.ABI_PARAM2, arg); +#endif + code.QuickCallFunction(fn); +} + +} // namespace Dynarmic::BackendX64 diff --git a/src/dynarmic/backend/A64/callback.h b/src/dynarmic/backend/A64/callback.h new file mode 100644 index 00000000..c9d88db4 --- /dev/null +++ b/src/dynarmic/backend/A64/callback.h @@ -0,0 +1,54 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. 
+ */ + +#pragma once + +#include +#include + +#include "backend/A64/emitter/a64_emitter.h" +#include "common/common_types.h" + +namespace Dynarmic::BackendA64 { + +using RegList = std::vector; + +class BlockOfCode; + +class Callback { +public: + virtual ~Callback(); + + virtual void EmitCall(BlockOfCode& code, std::function fn = [](RegList) {}) const = 0; + virtual void EmitCallWithReturnPointer(BlockOfCode& code, std::function fn) const = 0; +}; + +class SimpleCallback final : public Callback { +public: + template + SimpleCallback(Function fn) : fn(reinterpret_cast(fn)) {} + + void EmitCall(BlockOfCode& code, std::function fn = [](RegList) {}) const override; + void EmitCallWithReturnPointer(BlockOfCode& code, std::function fn) const override; + +private: + void (*fn)(); +}; + +class ArgCallback final : public Callback { +public: + template + ArgCallback(Function fn, u64 arg) : fn(reinterpret_cast(fn)), arg(arg) {} + + void EmitCall(BlockOfCode& code, std::function fn = [](RegList) {}) const override; + void EmitCallWithReturnPointer(BlockOfCode& code, std::function fn) const override; + +private: + void (*fn)(); + u64 arg; +}; + +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/constant_pool.cpp b/src/dynarmic/backend/A64/constant_pool.cpp new file mode 100644 index 00000000..0b3a0f20 --- /dev/null +++ b/src/dynarmic/backend/A64/constant_pool.cpp @@ -0,0 +1,65 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. + */ + +#include + +#include "backend/A64/block_of_code.h" +#include "backend/A64/constant_pool.h" +#include "common/assert.h" + +namespace Dynarmic::BackendA64 { + +ConstantPool::ConstantPool(BlockOfCode& code) : code(code) {} + +void ConstantPool::EmitPatchLDR(Arm64Gen::ARM64Reg Rt, u64 lower, u64 upper) { + const auto constant = std::make_tuple(lower, upper); + auto iter = constant_info.find(constant); + if (iter == constant_info.end()) { + struct PatchInfo p = { code.GetCodePtr(), Rt, constant }; + patch_info.emplace_back(p); + code.BRK(0); + return; + } + + const s32 offset = reinterpret_cast(iter->second) - reinterpret_cast(code.GetCodePtr()); + + if (!(offset >= -0x40000 && offset <= 0x3FFFF)) { + constant_info.erase(constant); + struct PatchInfo p = { code.GetCodePtr(), Rt, constant }; + patch_info.emplace_back(p); + code.BRK(0x42); + return; + } + DEBUG_ASSERT((offset & 3) == 0); + code.LDR(Rt, offset / 4); +} + +void ConstantPool::PatchPool() { + u8* pool_ptr = code.GetWritableCodePtr(); + for (PatchInfo patch : patch_info) { + auto iter = constant_info.find(patch.constant); + if (iter == constant_info.end()) { + std::memcpy(pool_ptr, &std::get<0>(patch.constant), sizeof(u64)); + std::memcpy(pool_ptr + sizeof(u64), &std::get<1>(patch.constant), sizeof(u64)); + iter = constant_info.emplace(patch.constant, pool_ptr).first; + pool_ptr += align_size; + } + code.SetCodePtr(patch.ptr); + + const s32 offset = reinterpret_cast(iter->second) - reinterpret_cast(code.GetCodePtr()); + DEBUG_ASSERT((offset & 3) == 0); + code.LDR(patch.Rt, offset / 4); + } + patch_info.clear(); + code.SetCodePtr(pool_ptr); +} + +void ConstantPool::Clear() { + constant_info.clear(); + patch_info.clear(); +} + +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/constant_pool.h b/src/dynarmic/backend/A64/constant_pool.h new file mode 100644 index 00000000..ede34e5b --- /dev/null +++ 
b/src/dynarmic/backend/A64/constant_pool.h @@ -0,0 +1,47 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. + */ + +#pragma once + +#include + +#include "common/common_types.h" + +namespace Dynarmic::BackendA64 { + +class BlockOfCode; + +/// ConstantPool allocates a block of memory from BlockOfCode. +/// It places constants into this block of memory, returning the address +/// of the memory location where the constant is placed. If the constant +/// already exists, its memory location is reused. +class ConstantPool final { +public: + ConstantPool(BlockOfCode& code); + + void EmitPatchLDR(Arm64Gen::ARM64Reg Rt, u64 lower, u64 upper = 0); + + void PatchPool(); + + void Clear(); + +private: + static constexpr size_t align_size = 16; // bytes + + std::map, void*> constant_info; + + BlockOfCode& code; + + struct PatchInfo { + const void* ptr; + Arm64Gen::ARM64Reg Rt; + std::tuple constant; + }; + + std::vector patch_info; +}; + +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/devirtualize.h b/src/dynarmic/backend/A64/devirtualize.h new file mode 100644 index 00000000..caefc9b0 --- /dev/null +++ b/src/dynarmic/backend/A64/devirtualize.h @@ -0,0 +1,77 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. + */ + +#pragma once + +#include +#include + +#include + +#include "backend/A64/callback.h" +#include "common/assert.h" +#include "common/cast_util.h" +#include "common/common_types.h" + +namespace Dynarmic::BackendA64 { + +namespace impl { + +template +struct ThunkBuilder; + +template +struct ThunkBuilder { + static R Thunk(C* this_, Args... args) { + return (this_->*mfp)(std::forward(args)...); + } +}; + +} // namespace impl + +template +ArgCallback DevirtualizeGeneric(mp::class_type* this_) { + return ArgCallback{&impl::ThunkBuilder::Thunk, reinterpret_cast(this_)}; +} + +template +ArgCallback DevirtualizeWindows(mp::class_type* this_) { + static_assert(sizeof(mfp) == 8); + return ArgCallback{Common::BitCast(mfp), reinterpret_cast(this_)}; +} + +template +ArgCallback DevirtualizeAarch64(mp::class_type* this_) { + struct MemberFunctionPointer { + /// For a non-virtual function, this is a simple function pointer. + /// For a virtual function, it is virtual table offset in bytes. + u64 ptr; + /// Twice the required adjustment to `this`, plus 1 if the member function is virtual. 
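Editorial aside (the adj field that the comment above describes follows right after this note): on the AArch64 variant of the Itanium C++ ABI the virtual/non-virtual discriminator is the low bit of adj, whereas on x86-64 it is the low bit of ptr, which is why this fast path only applies to some hosts. A hedged sketch that inspects the same 16-byte layout with the BitCast-style trick used below; it is only meaningful under an Itanium-ABI toolchain (GCC/Clang), and the printed values are implementation-defined:

#include <cstdint>
#include <cstdio>
#include <cstring>

// Assumed Itanium layout of a pointer-to-member-function (not valid on MSVC).
struct ItaniumPmf {
    std::uint64_t ptr;  // code address, or vtable offset when the target is virtual
    std::uint64_t adj;  // this-adjustment (and, on AArch64, the virtual flag in bit 0)
};

struct Callbacks {
    virtual ~Callbacks() = default;
    virtual void OnTick() {}  // virtual: resolved through the vtable
    void Helper() {}          // non-virtual: resolved directly
};

template <typename Mfp>
ItaniumPmf Inspect(Mfp mfp) {
    static_assert(sizeof(Mfp) == sizeof(ItaniumPmf), "expects the 16-byte Itanium layout");
    ItaniumPmf out;
    std::memcpy(&out, &mfp, sizeof(out));  // same idea as Common::BitCast above
    return out;
}

int main() {
    const ItaniumPmf v = Inspect(&Callbacks::OnTick);
    const ItaniumPmf n = Inspect(&Callbacks::Helper);
    // On AArch64, expect v.adj to have its low bit set and v.ptr to be a vtable offset;
    // n.ptr should be the address of Callbacks::Helper with n.adj even.
    std::printf("virtual:     ptr=0x%llx adj=%llu\n",
                (unsigned long long)v.ptr, (unsigned long long)v.adj);
    std::printf("non-virtual: ptr=0x%llx adj=%llu\n",
                (unsigned long long)n.ptr, (unsigned long long)n.adj);
}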
+ u64 adj; + } mfp_struct = Common::BitCast(mfp); + + static_assert(sizeof(MemberFunctionPointer) == 16); + static_assert(sizeof(MemberFunctionPointer) == sizeof(mfp)); + + u64 fn_ptr = mfp_struct.ptr; + u64 this_ptr = reinterpret_cast(this_) + mfp_struct.adj / 2; + if (mfp_struct.adj & 1) { + u64 vtable = Common::BitCastPointee(this_ptr); + fn_ptr = Common::BitCastPointee(vtable + fn_ptr); + } + return ArgCallback{fn_ptr, this_ptr}; +} + +template +ArgCallback Devirtualize(mp::class_type* this_) { +#if defined(linux) || defined(__linux) || defined(__linux__) + return DevirtualizeAarch64(this_); +#else + return DevirtualizeGeneric(this_); +#endif +} + +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/emit_a64.cpp b/src/dynarmic/backend/A64/emit_a64.cpp new file mode 100644 index 00000000..c1a3070f --- /dev/null +++ b/src/dynarmic/backend/A64/emit_a64.cpp @@ -0,0 +1,286 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. + */ + +#include +#include + +#include "backend/A64/block_of_code.h" +#include "backend/A64/emit_a64.h" +#include "backend/A64/hostloc.h" +#include "backend/A64/perf_map.h" +#include "common/assert.h" +#include "common/bit_util.h" +#include "common/common_types.h" +#include "common/scope_exit.h" +#include "common/variant_util.h" +#include "frontend/ir/basic_block.h" +#include "frontend/ir/microinstruction.h" +#include "frontend/ir/opcodes.h" + +// TODO: Have ARM flags in host flags and not have them use up GPR registers unless necessary. +// TODO: Actually implement that proper instruction selector you've always wanted to sweetheart. + +namespace Dynarmic::BackendA64 { + +EmitContext::EmitContext(RegAlloc& reg_alloc, IR::Block& block) + : reg_alloc(reg_alloc), block(block) {} + +void EmitContext::EraseInstruction(IR::Inst* inst) { + block.Instructions().erase(inst); + inst->ClearArgs(); +} + +EmitA64::EmitA64(BlockOfCode& code) + : code(code) {} + +EmitA64::~EmitA64() = default; + +std::optional EmitA64::GetBasicBlock(IR::LocationDescriptor descriptor) const { + auto iter = block_descriptors.find(descriptor); + if (iter == block_descriptors.end()) + return std::nullopt; + return iter->second; +} + +void EmitA64::EmitVoid(EmitContext&, IR::Inst*) { +} + +void EmitA64::EmitBreakpoint(EmitContext&, IR::Inst*) { + code.BRK(0); +} + +void EmitA64::EmitIdentity(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + if (!args[0].IsImmediate()) { + ctx.reg_alloc.DefineValue(inst, args[0]); + } +} + +void EmitA64::PushRSBHelper(ARM64Reg loc_desc_reg, ARM64Reg index_reg, IR::LocationDescriptor target) { + auto iter = block_descriptors.find(target); + CodePtr target_code_ptr = iter != block_descriptors.end() + ? 
iter->second.entrypoint + : code.GetReturnFromRunCodeAddress(); + + code.LDR(INDEX_UNSIGNED, DecodeReg(index_reg), X28, code.GetJitStateInfo().offsetof_rsb_ptr); + + code.MOVI2R(loc_desc_reg, target.Value()); + + patch_information[target].mov_x0.emplace_back(code.GetCodePtr()); + EmitPatchMovX0(target_code_ptr); + + code.ADD(code.ABI_SCRATCH1, X28, DecodeReg(index_reg), ArithOption{index_reg, ST_LSL, 3}); + code.STR(INDEX_UNSIGNED, loc_desc_reg, code.ABI_SCRATCH1, code.GetJitStateInfo().offsetof_rsb_location_descriptors); + code.STR(INDEX_UNSIGNED, X0, code.ABI_SCRATCH1, code.GetJitStateInfo().offsetof_rsb_codeptrs); + + code.ADDI2R(DecodeReg(index_reg), DecodeReg(index_reg), 1); + code.ANDI2R(DecodeReg(index_reg), DecodeReg(index_reg), code.GetJitStateInfo().rsb_ptr_mask, code.ABI_SCRATCH1); + code.STR(INDEX_UNSIGNED, DecodeReg(index_reg), X28, code.GetJitStateInfo().offsetof_rsb_ptr); +} + +void EmitA64::EmitPushRSB(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ASSERT(args[0].IsImmediate()); + u64 unique_hash_of_target = args[0].GetImmediateU64(); + + ctx.reg_alloc.ScratchGpr({HostLoc::X0}); + Arm64Gen::ARM64Reg loc_desc_reg = ctx.reg_alloc.ScratchGpr(); + Arm64Gen::ARM64Reg index_reg = ctx.reg_alloc.ScratchGpr(); + + PushRSBHelper(loc_desc_reg, index_reg, IR::LocationDescriptor{unique_hash_of_target}); +} + +void EmitA64::EmitGetCarryFromOp(EmitContext&, IR::Inst*) { + ASSERT_FALSE("should never happen"); +} + +void EmitA64::EmitGetOverflowFromOp(EmitContext&, IR::Inst*) { + ASSERT_FALSE("should never happen"); +} + +void EmitA64::EmitGetGEFromOp(EmitContext&, IR::Inst*) { + ASSERT_FALSE("should never happen"); +} + +void EmitA64::EmitGetUpperFromOp(EmitContext&, IR::Inst*) { + ASSERT_FALSE("should never happen"); +} + +void EmitA64::EmitGetLowerFromOp(EmitContext&, IR::Inst*) { + ASSERT_FALSE("should never happen"); +} + +void EmitA64::EmitGetNZCVFromOp(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + Arm64Gen::ARM64Reg nzcv = ctx.reg_alloc.ScratchGpr(); + Arm64Gen::ARM64Reg value = ctx.reg_alloc.UseGpr(args[0]); + code.CMP(value, ZR); + code.MRS(nzcv, FIELD_NZCV); + ctx.reg_alloc.DefineValue(inst, nzcv); +} + +void EmitA64::EmitNZCVFromPackedFlags(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + if (args[0].IsImmediate()) { + Arm64Gen::ARM64Reg nzcv = DecodeReg(ctx.reg_alloc.ScratchGpr()); + u32 value = 0; + value |= Common::Bit<31>(args[0].GetImmediateU32()) ? (1 << 15) : 0; + value |= Common::Bit<30>(args[0].GetImmediateU32()) ? (1 << 14) : 0; + value |= Common::Bit<29>(args[0].GetImmediateU32()) ? (1 << 8) : 0; + value |= Common::Bit<28>(args[0].GetImmediateU32()) ? 
(1 << 0) : 0; + code.MOVI2R(nzcv, value); + ctx.reg_alloc.DefineValue(inst, nzcv); + } else { + Arm64Gen::ARM64Reg nzcv = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[0])); + Arm64Gen::ARM64Reg scratch = DecodeReg(ctx.reg_alloc.ScratchGpr()); + // TODO: Optimize + code.LSR(nzcv, nzcv, 28); + code.MOVI2R(scratch, 0b00010000'10000001); + code.MUL(nzcv, nzcv, scratch); + code.ANDI2R(nzcv, nzcv, 1, scratch); + ctx.reg_alloc.DefineValue(inst, nzcv); + } +} + +void EmitA64::EmitAddCycles(size_t cycles) { + ASSERT(cycles < std::numeric_limits::max()); + code.SUBI2R(X26, X26, static_cast(cycles)); +} + +FixupBranch EmitA64::EmitCond(IR::Cond cond) { + FixupBranch label; + + const Arm64Gen::ARM64Reg cpsr = code.ABI_SCRATCH1; + code.LDR(INDEX_UNSIGNED, DecodeReg(cpsr), X28, code.GetJitStateInfo().offsetof_cpsr_nzcv); + code._MSR(FIELD_NZCV, cpsr); + + switch (cond) { + case IR::Cond::EQ: //z + label = code.B(CC_EQ); + break; + case IR::Cond::NE: //!z + label = code.B(CC_NEQ); + break; + case IR::Cond::CS: //c + label = code.B(CC_CS); + break; + case IR::Cond::CC: //!c + label = code.B(CC_CC); + break; + case IR::Cond::MI: //n + label = code.B(CC_MI); + break; + case IR::Cond::PL: //!n + label = code.B(CC_PL); + break; + case IR::Cond::VS: //v + label = code.B(CC_VS); + break; + case IR::Cond::VC: //!v + label = code.B(CC_VC); + break; + case IR::Cond::HI: //c & !z + label = code.B(CC_HI); + break; + case IR::Cond::LS: //!c | z + label = code.B(CC_LS); + break; + case IR::Cond::GE: // n == v + label = code.B(CC_GE); + break; + case IR::Cond::LT: // n != v + label = code.B(CC_LT); + break; + case IR::Cond::GT: // !z & (n == v) + label = code.B(CC_GT); + break; + case IR::Cond::LE: // z | (n != v) + label = code.B(CC_LE); + break; + default: + ASSERT_MSG(false, "Unknown cond {}", static_cast(cond)); + break; + } + + return label; +} + +EmitA64::BlockDescriptor EmitA64::RegisterBlock(const IR::LocationDescriptor& descriptor, CodePtr entrypoint, size_t size) { + PerfMapRegister(entrypoint, code.GetCodePtr(), LocationDescriptorToFriendlyName(descriptor)); + Patch(descriptor, entrypoint); + BlockDescriptor block_desc{entrypoint, size}; + + block_descriptors.emplace(descriptor.Value(), block_desc); + return block_desc; +} + +void EmitA64::EmitTerminal(IR::Terminal terminal, IR::LocationDescriptor initial_location, bool is_single_step) { + Common::VisitVariant(terminal, [this, initial_location, is_single_step](auto x) { + using T = std::decay_t; + if constexpr (!std::is_same_v) { + this->EmitTerminalImpl(x, initial_location, is_single_step); + } else { + ASSERT_MSG(false, "Invalid terminal"); + } + }); +} + +void EmitA64::Patch(const IR::LocationDescriptor& desc, CodePtr bb) { + const CodePtr save_code_ptr = code.GetCodePtr(); + const PatchInformation& patch_info = patch_information[desc]; + + for (CodePtr location : patch_info.jg) { + code.SetCodePtr(location); + EmitPatchJg(desc, bb); + code.FlushIcache(); + } + + for (CodePtr location : patch_info.jmp) { + code.SetCodePtr(location); + EmitPatchJmp(desc, bb); + code.FlushIcache(); + } + + for (CodePtr location : patch_info.mov_x0) { + code.SetCodePtr(location); + EmitPatchMovX0(bb); + code.FlushIcache(); + } + + code.SetCodePtr(save_code_ptr); +} + +void EmitA64::Unpatch(const IR::LocationDescriptor& desc) { + Patch(desc, nullptr); +} + +void EmitA64::ClearCache() { + block_descriptors.clear(); + patch_information.clear(); + + PerfMapClear(); +} + +void EmitA64::InvalidateBasicBlocks(const std::unordered_set& locations) { + code.EnableWriting(); + SCOPE_EXIT 
{ code.DisableWriting(); }; + + for (const auto &descriptor : locations) { + auto it = block_descriptors.find(descriptor); + if (it == block_descriptors.end()) { + continue; + } + + if (patch_information.count(descriptor)) { + Unpatch(descriptor); + } + block_descriptors.erase(it); + } +} + +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/emit_a64.h b/src/dynarmic/backend/A64/emit_a64.h new file mode 100644 index 00000000..1716af6d --- /dev/null +++ b/src/dynarmic/backend/A64/emit_a64.h @@ -0,0 +1,124 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "backend/A64/reg_alloc.h" +#include "backend/A64/emitter/a64_emitter.h" +#include "common/bit_util.h" +#include "common/fp/rounding_mode.h" +#include "frontend/ir/location_descriptor.h" +#include "frontend/ir/terminal.h" + +namespace Dynarmic::IR { +class Block; +class Inst; +} // namespace Dynarmic::IR + +namespace Dynarmic::BackendA64 { + +class BlockOfCode; + +using namespace Arm64Gen; + +using A64FullVectorWidth = std::integral_constant; + +// Array alias that always sizes itself according to the given type T +// relative to the size of a vector register. e.g. T = u32 would result +// in a std::array. +template +using VectorArray = std::array()>; + +struct EmitContext { + EmitContext(RegAlloc& reg_alloc, IR::Block& block); + + void EraseInstruction(IR::Inst* inst); + + virtual FP::RoundingMode FPSCR_RMode() const = 0; + virtual u32 FPCR() const = 0; + virtual bool FPSCR_FTZ() const = 0; + virtual bool FPSCR_DN() const = 0; + virtual bool AccurateNaN() const { return true; } + + RegAlloc& reg_alloc; + IR::Block& block; +}; + +class EmitA64 { +public: + struct BlockDescriptor { + CodePtr entrypoint; // Entrypoint of emitted code + size_t size; // Length in bytes of emitted code + }; + + EmitA64(BlockOfCode& code); + virtual ~EmitA64(); + + /// Looks up an emitted host block in the cache. + std::optional GetBasicBlock(IR::LocationDescriptor descriptor) const; + + /// Empties the entire cache. + virtual void ClearCache(); + + /// Invalidates a selection of basic blocks. + void InvalidateBasicBlocks(const std::unordered_set& locations); + +protected: + // Microinstruction emitters +#define OPCODE(name, type, ...) void Emit##name(EmitContext& ctx, IR::Inst* inst); +#define A32OPC(...) +#define A64OPC(...) 
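Note on the three OPCODE/A32OPC/A64OPC macro definitions above: together with the `#include "backend/A64/opcodes.inc"` that follows immediately below, they form an X-macro. opcodes.inc is a list of OPCODE(...)/A32OPC(...)/A64OPC(...) entries, so redefining the macros before including it expands to one Emit<Name> declaration per generic IR opcode while the A32/A64-specific entries expand to nothing here. A minimal, self-contained sketch of the same pattern follows; the opcode names and list macro in it are invented for illustration, the real list lives in opcodes.inc.

    #include <cstdio>

    // Stand-in for opcodes.inc: each entry names one IR opcode.
    #define SKETCH_OPCODE_LIST(X) \
        X(Add32)                  \
        X(Sub32)

    class EmitterSketch {
    public:
        // Expands to: void EmitAdd32(); void EmitSub32();
    #define OPCODE(name) void Emit##name();
        SKETCH_OPCODE_LIST(OPCODE)
    #undef OPCODE
    };

    void EmitterSketch::EmitAdd32() { std::puts("emit Add32"); }
    void EmitterSketch::EmitSub32() { std::puts("emit Sub32"); }

    int main() {
        EmitterSketch{}.EmitAdd32();
        EmitterSketch{}.EmitSub32();
    }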
+#include "backend/A64/opcodes.inc" +#undef OPCODE +#undef A32OPC +#undef A64OPC + + // Helpers + virtual std::string LocationDescriptorToFriendlyName(const IR::LocationDescriptor&) const = 0; + void EmitAddCycles(size_t cycles); + FixupBranch EmitCond(IR::Cond cond); + BlockDescriptor RegisterBlock(const IR::LocationDescriptor& location_descriptor, CodePtr entrypoint, size_t size); + void PushRSBHelper(Arm64Gen::ARM64Reg loc_desc_reg, Arm64Gen::ARM64Reg index_reg, IR::LocationDescriptor target); + + // Terminal instruction emitters + void EmitTerminal(IR::Terminal terminal, IR::LocationDescriptor initial_location, bool is_single_step); + virtual void EmitTerminalImpl(IR::Term::Interpret terminal, IR::LocationDescriptor initial_location, bool is_single_step) = 0; + virtual void EmitTerminalImpl(IR::Term::ReturnToDispatch terminal, IR::LocationDescriptor initial_location, bool is_single_step) = 0; + virtual void EmitTerminalImpl(IR::Term::LinkBlock terminal, IR::LocationDescriptor initial_location, bool is_single_step) = 0; + virtual void EmitTerminalImpl(IR::Term::LinkBlockFast terminal, IR::LocationDescriptor initial_location, bool is_single_step) = 0; + virtual void EmitTerminalImpl(IR::Term::PopRSBHint terminal, IR::LocationDescriptor initial_location, bool is_single_step) = 0; + virtual void EmitTerminalImpl(IR::Term::FastDispatchHint terminal, IR::LocationDescriptor initial_location, bool is_single_step) = 0; + virtual void EmitTerminalImpl(IR::Term::If terminal, IR::LocationDescriptor initial_location, bool is_single_step) = 0; + virtual void EmitTerminalImpl(IR::Term::CheckBit terminal, IR::LocationDescriptor initial_location, bool is_single_step) = 0; + virtual void EmitTerminalImpl(IR::Term::CheckHalt terminal, IR::LocationDescriptor initial_location, bool is_single_step) = 0; + + // Patching + struct PatchInformation { + std::vector jg; + std::vector jmp; + std::vector mov_x0; + }; + void Patch(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr); + virtual void Unpatch(const IR::LocationDescriptor& target_desc); + virtual void EmitPatchJg(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr = nullptr) = 0; + virtual void EmitPatchJmp(const IR::LocationDescriptor& target_desc, CodePtr target_code_ptr = nullptr) = 0; + virtual void EmitPatchMovX0(CodePtr target_code_ptr = nullptr) = 0; + + // State + BlockOfCode& code; + std::unordered_map block_descriptors; + std::unordered_map patch_information; +}; + +} // namespace Dynarmic::BackendX64 diff --git a/src/dynarmic/backend/A64/emit_a64_data_processing.cpp b/src/dynarmic/backend/A64/emit_a64_data_processing.cpp new file mode 100644 index 00000000..07b49276 --- /dev/null +++ b/src/dynarmic/backend/A64/emit_a64_data_processing.cpp @@ -0,0 +1,1128 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. 
+ */ + +#include "backend/A64/block_of_code.h" +#include "backend/A64/emit_a64.h" +#include "common/assert.h" +#include "common/common_types.h" +#include "frontend/ir/basic_block.h" +#include "frontend/ir/microinstruction.h" +#include "frontend/ir/opcodes.h" + +namespace Dynarmic::BackendA64 { + +void EmitA64::EmitPack2x32To1x64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ARM64Reg lo = ctx.reg_alloc.UseScratchGpr(args[0]); + ARM64Reg hi = ctx.reg_alloc.UseScratchGpr(args[1]); + + code.ORR(lo, lo, hi, ArithOption{hi, ST_LSL, 32}); + + ctx.reg_alloc.DefineValue(inst, lo); +} + +//void EmitA64::EmitPack2x64To1x128(EmitContext& ctx, IR::Inst* inst) { +// auto args = ctx.reg_alloc.GetArgumentInfo(inst); +// Xbyak::Reg64 lo = ctx.reg_alloc.UseGpr(args[0]); +// Xbyak::Reg64 hi = ctx.reg_alloc.UseGpr(args[1]); +// Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); +// +// if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE41)) { +// code.movq(result, lo); +// code.pinsrq(result, hi, 1); +// } else { +// Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); +// code.movq(result, lo); +// code.movq(tmp, hi); +// code.punpcklqdq(result, tmp); +// } +// +// ctx.reg_alloc.DefineValue(inst, result); +//} + +void EmitA64::EmitLeastSignificantWord(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ctx.reg_alloc.DefineValue(inst, args[0]); +} + +void EmitA64::EmitMostSignificantWord(EmitContext& ctx, IR::Inst* inst) { + auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ARM64Reg result = ctx.reg_alloc.UseScratchGpr(args[0]); + + if (carry_inst) { + ARM64Reg carry = ctx.reg_alloc.ScratchGpr(); + code.UBFX(carry, result, 31, 1); + ctx.reg_alloc.DefineValue(carry_inst, carry); + ctx.EraseInstruction(carry_inst); + } + + code.LSR(result, result, 32); + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitLeastSignificantHalf(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ctx.reg_alloc.DefineValue(inst, args[0]); +} + +void EmitA64::EmitLeastSignificantByte(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ctx.reg_alloc.DefineValue(inst, args[0]); +} + +void EmitA64::EmitMostSignificantBit(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[0])); + // TODO: Flag optimization + code.LSR(result,result, 31); + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitIsZero32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[0])); + // TODO: Flag optimization + code.CMP(result, WZR); + code.CSET(result, CC_EQ); + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitIsZero64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ARM64Reg result = ctx.reg_alloc.UseScratchGpr(args[0]); + // TODO: Flag optimization + code.CMP(result, ZR); + code.CSET(result, CC_EQ); + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitTestBit(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ARM64Reg result = ctx.reg_alloc.UseScratchGpr(args[0]); + ASSERT(args[1].IsImmediate()); + // TODO: Flag optimization + code.UBFX(result, result, args[1].GetImmediateU8(), 1); + 
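+    // Reader's note: UBFX Rd, Rn, #lsb, #width extracts `width` bits starting at
+    // bit `lsb` into the low bits of Rd. For the width-1 case used here the result
+    // is equivalent to (operand >> bit) & 1, which is exactly what TestBit needs.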
ctx.reg_alloc.DefineValue(inst, result); +} + +static void EmitConditionalSelect(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, int bitsize) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + Arm64Gen::ARM64Reg nzcv = ctx.reg_alloc.ScratchGpr(); + Arm64Gen::ARM64Reg then_ = ctx.reg_alloc.UseGpr(args[1]); + Arm64Gen::ARM64Reg else_ = ctx.reg_alloc.UseScratchGpr(args[2]); + + then_ = bitsize == 64 ? then_ : DecodeReg(then_); + else_ = bitsize == 64 ? else_ : DecodeReg(else_); + + code.LDR(INDEX_UNSIGNED, DecodeReg(nzcv), X28, code.GetJitStateInfo().offsetof_cpsr_nzcv); + // TODO: Flag optimization + code._MSR(FIELD_NZCV, nzcv); + + switch (args[0].GetImmediateCond()) { + case IR::Cond::EQ: //z + code.CSEL(else_, else_, then_ , CC_EQ); + break; + case IR::Cond::NE: //!z + code.CSEL(else_, else_, then_, CC_NEQ); + break; + case IR::Cond::CS: //c + code.CSEL(else_, else_, then_, CC_CS); + break; + case IR::Cond::CC: //!c + code.CSEL(else_, else_, then_ , CC_CC); + break; + case IR::Cond::MI: //n + code.CSEL(else_, else_, then_, CC_MI); + break; + case IR::Cond::PL: //!n + code.CSEL(else_, else_, then_, CC_PL); + break; + case IR::Cond::VS: //v + code.CSEL(else_, else_, then_, CC_VS); + break; + case IR::Cond::VC: //!v + code.CSEL(else_, else_, then_, CC_VC); + break; + case IR::Cond::HI: //c & !z + code.CSEL(else_, else_, then_, CC_HI); + break; + case IR::Cond::LS: //!c | z + code.CSEL(else_, else_, then_, CC_LS); + break; + case IR::Cond::GE: // n == v + code.CSEL(else_, else_, then_, CC_GE); + break; + case IR::Cond::LT: // n != v + code.CSEL(else_, else_, then_, CC_LT); + break; + case IR::Cond::GT: // !z & (n == v) + code.CSEL(else_, else_, then_, CC_GT); + break; + case IR::Cond::LE: // z | (n != v) + code.CSEL(else_, else_, then_, CC_LE); + break; + case IR::Cond::AL: + case IR::Cond::NV: + code.MOV(else_, then_); + break; + default: + ASSERT_MSG(false, "Invalid cond {}", static_cast(args[0].GetImmediateCond())); + } + + ctx.reg_alloc.DefineValue(inst, else_); +} + +void EmitA64::EmitConditionalSelect32(EmitContext& ctx, IR::Inst* inst) { + EmitConditionalSelect(code, ctx, inst, 32); +} + +void EmitA64::EmitConditionalSelect64(EmitContext& ctx, IR::Inst* inst) { + EmitConditionalSelect(code, ctx, inst, 64); +} + +void EmitA64::EmitConditionalSelectNZCV(EmitContext& ctx, IR::Inst* inst) { + EmitConditionalSelect(code, ctx, inst, 32); +} + +void EmitA64::EmitLogicalShiftLeft32(EmitContext& ctx, IR::Inst* inst) { + auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto& operand_arg = args[0]; + auto& shift_arg = args[1]; + auto& carry_arg = args[2]; + + if (!carry_inst) { + if (shift_arg.IsImmediate()) { + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg)); + u8 shift = shift_arg.GetImmediateU8(); + + if (shift <= 31) { + code.LSL(result, result, shift); + } else { + code.MOV(result, WZR); + } + + ctx.reg_alloc.DefineValue(inst, result); + } else { + //ctx.reg_alloc.Use(shift_arg, HostLoc::X0); + Arm64Gen::ARM64Reg shift = DecodeReg(ctx.reg_alloc.UseScratchGpr(shift_arg)); + Arm64Gen::ARM64Reg result = ctx.reg_alloc.UseScratchGpr(operand_arg); + + code.ANDI2R(shift, shift, 0xFF); + code.LSLV(result, result, shift); + code.CMPI2R(shift, 32); + code.CSEL(result, WZR, DecodeReg(result), CC_GE); + ctx.reg_alloc.DefineValue(inst, DecodeReg(result)); + } + } else { + if (shift_arg.IsImmediate()) { + u8 shift = shift_arg.GetImmediateU8(); + Arm64Gen::ARM64Reg result = 
DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg)); + Arm64Gen::ARM64Reg carry = DecodeReg(ctx.reg_alloc.UseScratchGpr(carry_arg)); + + if (shift == 0) { + // There is nothing more to do. + } else if (shift < 32) { + code.UBFX(carry, result, 32 - shift, 1); + code.LSL(result, result, shift); + } else if (shift > 32) { + code.MOV(result, WZR); + code.MOV(carry, WZR); + } else { + code.ANDI2R(carry, result, 1); + code.MOV(result, WZR); + } + + ctx.reg_alloc.DefineValue(carry_inst, carry); + ctx.EraseInstruction(carry_inst); + ctx.reg_alloc.DefineValue(inst, result); + } else { + Arm64Gen::ARM64Reg shift = DecodeReg(ctx.reg_alloc.UseScratchGpr(shift_arg)); + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg)); + Arm64Gen::ARM64Reg carry = DecodeReg(ctx.reg_alloc.UseScratchGpr(carry_arg)); + + FixupBranch end; + + code.ANDSI2R(shift, shift, 0xFF); + // if (Rs & 0xFF == 0) goto end; + end = code.B(CC_EQ); + + code.CMPI2R(shift, 32); + code.SUBI2R(shift, shift, 1); // Subtract 1 to get the bit that is shiftedout, into the MSB. + code.LSLV(result, result, shift); + code.UBFX(carry, result, 31, 1); + code.LSL(result, result, 1); + + code.CSEL(result, result, WZR, CC_LT); + code.CSEL(carry, carry, WZR, CC_LE); + + code.SetJumpTarget(end); + + ctx.reg_alloc.DefineValue(carry_inst, carry); + ctx.EraseInstruction(carry_inst); + ctx.reg_alloc.DefineValue(inst, result); + } + } +} + +void EmitA64::EmitLogicalShiftLeft64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto& operand_arg = args[0]; + auto& shift_arg = args[1]; + + if (shift_arg.IsImmediate()) { + ARM64Reg result = ctx.reg_alloc.UseScratchGpr(operand_arg); + u8 shift = shift_arg.GetImmediateU8(); + + if (shift < 64) { + code.LSL(result, result, shift); + } else { + code.MOV(result, ZR); + } + + ctx.reg_alloc.DefineValue(inst, result); + } else { + ARM64Reg result = ctx.reg_alloc.UseScratchGpr(operand_arg); + ARM64Reg shift = ctx.reg_alloc.UseGpr(shift_arg); + + code.LSLV(result, result, shift); + + ctx.reg_alloc.DefineValue(inst, result); + } +} + +void EmitA64::EmitLogicalShiftRight32(EmitContext& ctx, IR::Inst* inst) { + auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto& operand_arg = args[0]; + auto& shift_arg = args[1]; + auto& carry_arg = args[2]; + + if (!carry_inst) { + if (shift_arg.IsImmediate()) { + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg)); + u8 shift = shift_arg.GetImmediateU8(); + + if (shift <= 31) { + code.LSR(result, result, shift); + } else { + code.MOVI2R(result, 0); + } + ctx.reg_alloc.DefineValue(inst, result); + } else { + Arm64Gen::ARM64Reg shift = DecodeReg(ctx.reg_alloc.UseScratchGpr(shift_arg)); + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg)); + + // The 32-bit A64 LSR instruction masks the shift count by 0x1F before performing the shift. + // ARM differs from the behaviour: It does not mask the count, so shifts above 31 result in zeros. 
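+            // The sequence below masks the guest count to its bottom byte (ARM uses only
+            // the least-significant byte of Rs), lets LSRV shift (LSRV itself masks the
+            // count by 0x1F), then uses CMP/CSEL to force a zero result whenever the
+            // count exceeds 31. Roughly: result = (count > 31) ? 0 : operand >> count;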
+ + code.ANDI2R(shift, shift, 0xFF); + code.LSRV(result, result, shift); + code.CMPI2R(shift, 31); + code.CSEL(result, WZR, result, CC_GT); + + ctx.reg_alloc.DefineValue(inst, result); + } + } else { + if (shift_arg.IsImmediate()) { + u8 shift = shift_arg.GetImmediateU8(); + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg)); + Arm64Gen::ARM64Reg carry = DecodeReg(ctx.reg_alloc.UseScratchGpr(carry_arg)); + + if (shift == 0) { + // There is nothing more to do. + } else if (shift < 32) { + code.LSR(carry, result, shift - 1); + code.ANDI2R(carry, carry, 1); + code.LSR(result,result, shift); + } else if (shift == 32) { + code.UBFX(carry, result, 31, 1); + code.MOV(result, WZR); + } else { + code.MOV(result, WZR); + code.MOV(carry, WZR); + } + + ctx.reg_alloc.DefineValue(carry_inst, carry); + ctx.EraseInstruction(carry_inst); + ctx.reg_alloc.DefineValue(inst, result); + } else { + Arm64Gen::ARM64Reg shift = DecodeReg(ctx.reg_alloc.UseScratchGpr(shift_arg)); + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg)); + Arm64Gen::ARM64Reg carry = DecodeReg(ctx.reg_alloc.UseScratchGpr(carry_arg)); + + // TODO: Optimize this. + FixupBranch end; + + code.ANDSI2R(shift, shift, 0xFF); + // if (Rs & 0xFF == 0) goto end; + end = code.B(CC_EQ); + + code.CMPI2R(shift, 32); + code.SUBI2R(shift, shift, 1); // Subtract 1 to get the bit that is shifted out to the carry. + code.LSRV(result, result, shift); + code.ANDI2R(carry, result, 1); + code.LSR(result, result, 1); + + code.CSEL(result, result, WZR, CC_LT); + code.CSEL(carry, carry, WZR, CC_LE); + + code.SetJumpTarget(end); + + ctx.reg_alloc.DefineValue(carry_inst, carry); + ctx.EraseInstruction(carry_inst); + ctx.reg_alloc.DefineValue(inst, result); + } + } +} + +void EmitA64::EmitLogicalShiftRight64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto& operand_arg = args[0]; + auto& shift_arg = args[1]; + + if (shift_arg.IsImmediate()) { + ARM64Reg result = ctx.reg_alloc.UseScratchGpr(operand_arg); + u8 shift = shift_arg.GetImmediateU8(); + + if (shift < 64) { + code.LSR(result, result, shift); + } else { + code.MOV(result, ZR); + } + + ctx.reg_alloc.DefineValue(inst, result); + } else { + ARM64Reg shift = ctx.reg_alloc.UseScratchGpr(shift_arg); + ARM64Reg result = ctx.reg_alloc.UseScratchGpr(operand_arg); + + code.ANDI2R(shift, shift, 0xFF); + code.LSRV(result, result, shift); + code.CMP(shift, 63); + code.CSEL(result, WZR, result, CC_GT); + + ctx.reg_alloc.DefineValue(inst, result); + } +} + +void EmitA64::EmitArithmeticShiftRight32(EmitContext& ctx, IR::Inst* inst) { + auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto& operand_arg = args[0]; + auto& shift_arg = args[1]; + auto& carry_arg = args[2]; + + if (!carry_inst) { + if (shift_arg.IsImmediate()) { + u8 shift = shift_arg.GetImmediateU8(); + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg)); + + code.ASR(result, result, u8(shift < 31 ? shift : 31)); + + ctx.reg_alloc.DefineValue(inst, result); + } else { + //ctx.reg_alloc.UseScratch(shift_arg, HostLoc::X0); + Arm64Gen::ARM64Reg shift = DecodeReg(ctx.reg_alloc.UseScratchGpr(shift_arg)); + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg)); + Arm64Gen::ARM64Reg const31 = DecodeReg(ctx.reg_alloc.ScratchGpr()); + + // The 32-bit arm64 ASR instruction masks the shift count by 0x1F before performing the shift. 
+ // ARM differs from the behaviour: It does not mask the count. + + // We note that all shift values above 31 have the same behaviour as 31 does, so we saturate `shift` to 31. + code.ANDI2R(shift, shift, 0xFF); + code.MOVI2R(const31, 31); + code.CMPI2R(shift, u32(31)); + code.CSEL(shift, shift, const31, CC_LE); + code.ASRV(result, result, shift); + + ctx.reg_alloc.DefineValue(inst, result); + } + } else { + if (shift_arg.IsImmediate()) { + u8 shift = shift_arg.GetImmediateU8(); + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg)); + Arm64Gen::ARM64Reg carry = DecodeReg(ctx.reg_alloc.UseScratchGpr(carry_arg)); + + if (shift == 0) { + // There is nothing more to do. + } else if (shift <= 31) { + code.ASR(result, result, shift - 1); + code.ANDI2R(carry, result, 1); + code.ASR(result, result, 1); + } else { + code.ASR(result, result, 31); + code.ANDI2R(carry, result, 1); + } + + ctx.reg_alloc.DefineValue(carry_inst, carry); + ctx.EraseInstruction(carry_inst); + ctx.reg_alloc.DefineValue(inst, result); + } else { + Arm64Gen::ARM64Reg shift = DecodeReg(ctx.reg_alloc.UseScratchGpr(shift_arg)); + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg)); + Arm64Gen::ARM64Reg carry = DecodeReg(ctx.reg_alloc.UseScratchGpr(carry_arg)); + + // TODO: Optimize this. + + FixupBranch end; + + code.ANDSI2R(shift, shift, 0xFF); + // if (Rs & 0xFF == 0) goto end; + end = code.B(CC_EQ); + // else { + code.MOVI2R(carry, 32); + code.CMPI2R(shift, u32(31)); + code.CSEL(shift, shift, carry, CC_LE); + code.SUBI2R(shift, shift, 1); + code.ASRV(result, result, shift); + code.ANDI2R(carry, result, 1); + code.ASR(result, result, 1); + // } + + code.SetJumpTarget(end); + + ctx.reg_alloc.DefineValue(carry_inst, carry); + ctx.EraseInstruction(carry_inst); + ctx.reg_alloc.DefineValue(inst, result); + } + } +} + +//void EmitA64::EmitArithmeticShiftRight64(EmitContext& ctx, IR::Inst* inst) { +// auto args = ctx.reg_alloc.GetArgumentInfo(inst); +// auto& operand_arg = args[0]; +// auto& shift_arg = args[1]; +// +// if (shift_arg.IsImmediate()) { +// u8 shift = shift_arg.GetImmediateU8(); +// Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(operand_arg); +// +// code.sar(result, u8(shift < 63 ? shift : 63)); +// +// ctx.reg_alloc.DefineValue(inst, result); +// } else { +// ctx.reg_alloc.UseScratch(shift_arg, HostLoc::RCX); +// Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(operand_arg); +// Xbyak::Reg64 const63 = ctx.reg_alloc.ScratchGpr(); +// +// // The 64-bit x64 SAR instruction masks the shift count by 0x3F before performing the shift. +// // ARM differs from the behaviour: It does not mask the count. +// +// // We note that all shift values above 63 have the same behaviour as 63 does, so we saturate `shift` to 63. 
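+// A possible A64 counterpart (untested sketch, not part of this patch) would mirror
+// the 32-bit version above: saturate the count to 63, then shift with ASRV, e.g.
+//     code.ANDI2R(shift, shift, 0xFF);
+//     code.MOVI2R(const63, 63);
+//     code.CMPI2R(shift, 63);
+//     code.CSEL(shift, shift, const63, CC_LE);
+//     code.ASRV(result, result, shift);
+// The x64 reference implementation kept in this comment block continues below.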
+// code.mov(const63, 63); +// code.movzx(code.ecx, code.cl); +// code.cmp(code.ecx, u32(63)); +// code.cmovg(code.ecx, const63); +// code.sar(result, code.cl); +// +// ctx.reg_alloc.DefineValue(inst, result); +// } +//} + +void EmitA64::EmitRotateRight32(EmitContext& ctx, IR::Inst* inst) { + auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto& operand_arg = args[0]; + auto& shift_arg = args[1]; + auto& carry_arg = args[2]; + + if (!carry_inst) { + if (shift_arg.IsImmediate()) { + u8 shift = shift_arg.GetImmediateU8(); + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg)); + + code.ROR(result, result, u8(shift & 0x1F)); + + ctx.reg_alloc.DefineValue(inst, result); + } else { + Arm64Gen::ARM64Reg shift = DecodeReg(ctx.reg_alloc.UseGpr(shift_arg)); + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg)); + + // aarch64 ROR instruction does (shift & 0x1F) for us. + code.RORV(result, result, shift); + + ctx.reg_alloc.DefineValue(inst, result); + } + } else { + if (shift_arg.IsImmediate()) { + u8 shift = shift_arg.GetImmediateU8(); + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg)); + Arm64Gen::ARM64Reg carry = DecodeReg(ctx.reg_alloc.UseScratchGpr(carry_arg)); + + if (shift == 0) { + // There is nothing more to do. + } else if ((shift & 0x1F) == 0) { + code.MOV(carry, result, ArithOption{result, ST_LSR, 31}); + } else { + code.ROR(result, result, (shift & 0x1F) - 1); + code.ANDI2R(carry, result, 1); + code.ROR(result, result, 1); + } + + ctx.reg_alloc.DefineValue(carry_inst, carry); + ctx.EraseInstruction(carry_inst); + ctx.reg_alloc.DefineValue(inst, result); + } else { + Arm64Gen::ARM64Reg shift = DecodeReg(ctx.reg_alloc.UseScratchGpr(shift_arg)); + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(operand_arg)); + Arm64Gen::ARM64Reg carry = DecodeReg(ctx.reg_alloc.UseScratchGpr(carry_arg)); + + // TODO: Optimize + + std::vector end; + FixupBranch zero_1F; + + code.ANDSI2R(shift, shift, u32(0xFF)); + // if (Rs & 0xFF == 0) goto end; + end.push_back(code.B(CC_EQ)); + code.ANDSI2R(shift, shift, u32(0x1F)); + zero_1F = code.B(CC_EQ); + // if (Rs & 0x1F != 0) { + code.SUBI2R(shift, shift, 1); + code.RORV(result, result, shift); + code.ANDI2R(carry, result, 1); + code.ROR(result, result, 1); + end.push_back(code.B()); + // } else { + code.SetJumpTarget(zero_1F); + code.MOV(carry, result, ArithOption{result, ST_LSR, 31}); + // } + + for (FixupBranch e : end) { + code.SetJumpTarget(e); + } + + ctx.reg_alloc.DefineValue(carry_inst, carry); + ctx.EraseInstruction(carry_inst); + ctx.reg_alloc.DefineValue(inst, result); + } + } +} + +void EmitA64::EmitRotateRight64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto& operand_arg = args[0]; + auto& shift_arg = args[1]; + + if (shift_arg.IsImmediate()) { + u8 shift = shift_arg.GetImmediateU8(); + ARM64Reg result = ctx.reg_alloc.UseScratchGpr(operand_arg); + + code.ROR(result, result, u8(shift & 0x3F)); + + ctx.reg_alloc.DefineValue(inst, result); + } else { + ARM64Reg result = ctx.reg_alloc.UseScratchGpr(operand_arg); + ARM64Reg shift = ctx.reg_alloc.UseGpr(shift_arg); + + code.RORV(result, result, shift); + + ctx.reg_alloc.DefineValue(inst, result); + } +} + +void EmitA64::EmitRotateRightExtended(EmitContext& ctx, IR::Inst* inst) { + auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp); + + auto 
args = ctx.reg_alloc.GetArgumentInfo(inst); + + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[0])); + Arm64Gen::ARM64Reg carry = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[1])); + Arm64Gen::ARM64Reg temp = DecodeReg(ctx.reg_alloc.ScratchGpr()); + + if (carry_inst) { + code.MOV(temp, result); + } + + // Set carry to the LSB and perform ROR. + code.BFI(result, carry, 0, 1); + code.ROR(result, result, 1); + + if (carry_inst) { + code.ANDI2R(carry, temp, 1); + + ctx.reg_alloc.DefineValue(carry_inst, carry); + ctx.EraseInstruction(carry_inst); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +static Arm64Gen::ARM64Reg DoCarry(RegAlloc& reg_alloc, Argument& carry_in, IR::Inst* carry_out) { + if (carry_in.IsImmediate()) { + return carry_out ? reg_alloc.ScratchGpr() : INVALID_REG; + } else { + return carry_out ? reg_alloc.UseScratchGpr(carry_in) : reg_alloc.UseGpr(carry_in); + } +} + +static void EmitAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, int bitsize) { + auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp); + auto overflow_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetOverflowFromOp); + auto nzcv_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetNZCVFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto& carry_in = args[2]; + + Arm64Gen::ARM64Reg nzcv = nzcv_inst ? ctx.reg_alloc.ScratchGpr() : INVALID_REG; + Arm64Gen::ARM64Reg result = ctx.reg_alloc.UseScratchGpr(args[0]); + Arm64Gen::ARM64Reg carry = DecodeReg(DoCarry(ctx.reg_alloc, carry_in, carry_inst)); + Arm64Gen::ARM64Reg overflow = overflow_inst ? ctx.reg_alloc.ScratchGpr() : INVALID_REG; + + result = bitsize == 64 ? result : DecodeReg(result); + + if (args[1].IsImmediate() && args[1].GetType() == IR::Type::U32) { + if (carry_in.IsImmediate()) { + if (carry_in.GetImmediateU1()) { + Arm64Gen::ARM64Reg op_arg = ctx.reg_alloc.UseGpr(args[1]); + code.CMP(op_arg, op_arg); + code.ADCS(result, result, op_arg); + } else { + u32 op_arg = args[1].GetImmediateU32(); + code.ADDSI2R(result, result, op_arg, ctx.reg_alloc.ScratchGpr()); + } + } else { + Arm64Gen::ARM64Reg op_arg = ctx.reg_alloc.UseGpr(args[1]); + code.CMPI2R(carry, 1); + code.ADCS(result, result, op_arg); + } + } else { + Arm64Gen::ARM64Reg op_arg = ctx.reg_alloc.UseGpr(args[1]); + if (carry_in.IsImmediate()) { + if (carry_in.GetImmediateU1()) { + code.CMP(DecodeReg(op_arg), DecodeReg(op_arg)); + code.ADCS(result, result, op_arg); + } else { + code.ADDS(result,result, op_arg); + } + } else { + code.CMPI2R(DecodeReg(carry), 1); + code.ADCS(result, result, op_arg); + } + } + + if (nzcv_inst) { + code.MRS(nzcv, FIELD_NZCV); + ctx.reg_alloc.DefineValue(nzcv_inst, nzcv); + ctx.EraseInstruction(nzcv_inst); + } + if (carry_inst) { + code.CSET(carry, CC_CS); + ctx.reg_alloc.DefineValue(carry_inst, carry); + ctx.EraseInstruction(carry_inst); + } + if (overflow_inst) { + code.CSET(overflow, CC_VS); + ctx.reg_alloc.DefineValue(overflow_inst, overflow); + ctx.EraseInstruction(overflow_inst); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitAdd32(EmitContext& ctx, IR::Inst* inst) { + EmitAdd(code, ctx, inst, 32); +} + +void EmitA64::EmitAdd64(EmitContext& ctx, IR::Inst* inst) { + EmitAdd(code, ctx, inst, 64); +} + +static void EmitSub(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, int bitsize) { + auto carry_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetCarryFromOp); + auto overflow_inst = 
inst->GetAssociatedPseudoOperation(IR::Opcode::GetOverflowFromOp); + auto nzcv_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetNZCVFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto& carry_in = args[2]; + + Arm64Gen::ARM64Reg nzcv = nzcv_inst ? ctx.reg_alloc.ScratchGpr() : INVALID_REG; + Arm64Gen::ARM64Reg result = ctx.reg_alloc.UseScratchGpr(args[0]); + Arm64Gen::ARM64Reg carry = DoCarry(ctx.reg_alloc, carry_in, carry_inst); + Arm64Gen::ARM64Reg overflow = overflow_inst ? ctx.reg_alloc.ScratchGpr() : INVALID_REG; + + // TODO: Consider using LEA. + // TODO: Optimize CMP case. + + result = bitsize == 64 ? result : DecodeReg(result); + + if (args[1].IsImmediate() && args[1].GetType() == IR::Type::U32) { + if (carry_in.IsImmediate()) { + if (carry_in.GetImmediateU1()) { + u32 op_arg = args[1].GetImmediateU32(); + code.SUBSI2R(result, result, op_arg, ctx.reg_alloc.ScratchGpr()); + } else { + Arm64Gen::ARM64Reg op_arg = ctx.reg_alloc.UseGpr(args[1]); + + code.ADDSI2R(op_arg, op_arg, 0); // Clear carry + code.SBCS(result, result, op_arg); + } + } else { + Arm64Gen::ARM64Reg op_arg = ctx.reg_alloc.UseGpr(args[1]); + code.CMPI2R(carry, 0x1); + code.SBCS(result, result, op_arg); + } + } else { + Arm64Gen::ARM64Reg op_arg = ctx.reg_alloc.UseGpr(args[1]); + if (carry_in.IsImmediate()) { + if (carry_in.GetImmediateU1()) { + code.SUBS(result, result, op_arg); + } else { + code.ADDSI2R(DecodeReg(op_arg), DecodeReg(op_arg), 0); // Clear carry + code.SBCS(result,result, op_arg); + } + } else { + code.CMPI2R(DecodeReg(carry), 0x1); + code.SBCS(result,result, op_arg); + } + } + + if (nzcv_inst) { + code.MRS(nzcv, FIELD_NZCV); + ctx.reg_alloc.DefineValue(nzcv_inst, nzcv); + ctx.EraseInstruction(nzcv_inst); + } + if (carry_inst) { + code.CSET(carry, CC_CS); + ctx.reg_alloc.DefineValue(carry_inst, carry); + ctx.EraseInstruction(carry_inst); + } + if (overflow_inst) { + code.CSET(overflow, CC_VS); + ctx.reg_alloc.DefineValue(overflow_inst, overflow); + ctx.EraseInstruction(overflow_inst); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitSub32(EmitContext& ctx, IR::Inst* inst) { + EmitSub(code, ctx, inst, 32); +} + +void EmitA64::EmitSub64(EmitContext& ctx, IR::Inst* inst) { + EmitSub(code, ctx, inst, 64); +} + +void EmitA64::EmitMul32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[0])); + ARM64Reg op_arg = DecodeReg(ctx.reg_alloc.UseGpr(args[1])); + + code.MUL(result, result, op_arg); + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitMul64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + ARM64Reg result = ctx.reg_alloc.UseScratchGpr(args[0]); + ARM64Reg op_arg = ctx.reg_alloc.UseGpr(args[1]); + + code.MUL(result, result, op_arg); + + ctx.reg_alloc.DefineValue(inst, result); +} + + +void EmitA64::EmitUnsignedDiv32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[0])); + const ARM64Reg divisor = DecodeReg(ctx.reg_alloc.UseGpr(args[1])); + + code.UDIV(result, result, divisor); + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitUnsignedDiv64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg result = ctx.reg_alloc.UseScratchGpr(args[0]); + const ARM64Reg divisor = ctx.reg_alloc.UseGpr(args[1]); + + code.UDIV(result, result, 
divisor); + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitSignedDiv32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[0])); + const ARM64Reg divisor = DecodeReg(ctx.reg_alloc.UseGpr(args[1])); + + code.SDIV(result, result, divisor); + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitSignedDiv64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg result = ctx.reg_alloc.UseScratchGpr(args[0]); + const ARM64Reg divisor = ctx.reg_alloc.UseGpr(args[1]); + + code.SDIV(result, result, divisor); + ctx.reg_alloc.DefineValue(inst, result); +} + + +void EmitA64::EmitAnd32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[0])); + + if (args[1].IsImmediate()) { + u32 op_arg = args[1].GetImmediateU32(); + code.ANDI2R(result, result, op_arg, ctx.reg_alloc.ScratchGpr()); + } else { + Arm64Gen::ARM64Reg op_arg = DecodeReg(ctx.reg_alloc.UseGpr(args[1])); + code.AND(result, result, op_arg); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitAnd64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + Arm64Gen::ARM64Reg result = ctx.reg_alloc.UseScratchGpr(args[0]); + + if (args[1].IsImmediate()) { + u32 op_arg = args[1].GetImmediateU32(); + code.ANDI2R(result, result, op_arg, ctx.reg_alloc.ScratchGpr()); + } + else { + Arm64Gen::ARM64Reg op_arg = ctx.reg_alloc.UseGpr(args[1]); + code.AND(result, result, op_arg); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitEor32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[0])); + + if (args[1].IsImmediate()) { + u32 op_arg = args[1].GetImmediateU32(); + code.EORI2R(result, result, op_arg, ctx.reg_alloc.ScratchGpr()); + } else { + Arm64Gen::ARM64Reg op_arg = DecodeReg(ctx.reg_alloc.UseGpr(args[1])); + code.EOR(result, result, op_arg); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitEor64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + Arm64Gen::ARM64Reg result = ctx.reg_alloc.UseScratchGpr(args[0]); + + if (args[1].IsImmediate()) { + u32 op_arg = args[1].GetImmediateU32(); + code.EORI2R(result, result, op_arg, ctx.reg_alloc.ScratchGpr()); + } + else { + Arm64Gen::ARM64Reg op_arg = ctx.reg_alloc.UseGpr(args[1]); + code.EOR(result, result, op_arg); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitOr32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[0])); + + if (args[1].IsImmediate()) { + u32 op_arg = args[1].GetImmediateU32(); + code.ORRI2R(result, result, op_arg, ctx.reg_alloc.ScratchGpr()); + } else { + Arm64Gen::ARM64Reg op_arg = DecodeReg(ctx.reg_alloc.UseGpr(args[1])); + code.ORR(result, result , op_arg); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitOr64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + Arm64Gen::ARM64Reg result = ctx.reg_alloc.UseScratchGpr(args[0]); + + if (args[1].IsImmediate()) { + u32 op_arg = args[1].GetImmediateU32(); + code.ORRI2R(result, result, op_arg, 
ctx.reg_alloc.ScratchGpr()); + } + else { + Arm64Gen::ARM64Reg op_arg = ctx.reg_alloc.UseGpr(args[1]); + code.ORR(result, result, op_arg); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitNot32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + Arm64Gen::ARM64Reg result; + if (args[0].IsImmediate()) { + result = DecodeReg(ctx.reg_alloc.ScratchGpr()); + code.MOVI2R(result, u32(~args[0].GetImmediateU32())); + } else { + result = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[0])); + code.MVN(result, result); + } + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitNot64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + Arm64Gen::ARM64Reg result; + if (args[0].IsImmediate()) { + result = ctx.reg_alloc.ScratchGpr(); + code.MOVI2R(result, u32(~args[0].GetImmediateU32())); + } + else { + result = ctx.reg_alloc.UseScratchGpr(args[0]); + code.MVN(result, result); + } + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitSignExtendByteToWord(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[0])); + code.SXTB(result, result); + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitSignExtendHalfToWord(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[0])); + code.SXTH(result, result); + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitSignExtendByteToLong(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ARM64Reg result = ctx.reg_alloc.UseScratchGpr(args[0]); + code.SXTB(result, result); + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitSignExtendHalfToLong(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ARM64Reg result = ctx.reg_alloc.UseScratchGpr(args[0]); + code.SXTH(result, result); + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitSignExtendWordToLong(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ARM64Reg result = ctx.reg_alloc.UseScratchGpr(args[0]); + code.SXTW(result, result); + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitZeroExtendByteToWord(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[0])); + code.UXTB(result, result); + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitZeroExtendHalfToWord(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[0])); + code.UXTH(result, result); + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitZeroExtendByteToLong(EmitContext& ctx, IR::Inst* inst) { + // a64 zeros upper 32 bits on a 32-bit move + EmitZeroExtendByteToWord(ctx, inst); +} + +void EmitA64::EmitZeroExtendHalfToLong(EmitContext& ctx, IR::Inst* inst) { + // a64 zeros upper 32 bits on a 32-bit move + EmitZeroExtendHalfToWord(ctx, inst); +} + +void EmitA64::EmitZeroExtendWordToLong(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ARM64Reg result = ctx.reg_alloc.UseScratchGpr(args[0]); + code.MOV(result, DecodeReg(result)); + ctx.reg_alloc.DefineValue(inst, result); +} 
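The zero-extension emitters above lean on an AArch64 property: any write to a W register clears the upper 32 bits of the corresponding X register, so `MOV Wd, Wn` on its own performs a 32-to-64-bit zero-extension, and the byte/half variants can simply forward to their 32-bit counterparts. As a reference model only (not emitted code), the semantics are:

    #include <cstdint>
    #include <cstdio>

    // Reference semantics of ZeroExtendWordToLong above: truncate to 32 bits,
    // then widen with zeros -- the same effect as MOV Wd, Wn on AArch64.
    static std::uint64_t ZeroExtendWordToLong(std::uint64_t x) {
        return static_cast<std::uint32_t>(x);
    }

    int main() {
        std::printf("%llx\n",
                    static_cast<unsigned long long>(ZeroExtendWordToLong(0xFFFFFFFF80000000ull)));
        // prints 80000000
    }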
+ +//void EmitA64::EmitZeroExtendLongToQuad(EmitContext& ctx, IR::Inst* inst) { +// auto args = ctx.reg_alloc.GetArgumentInfo(inst); +// if (args[0].IsInGpr()) { +// Xbyak::Reg64 source = ctx.reg_alloc.UseGpr(args[0]); +// Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); +// code.movq(result, source); +// ctx.reg_alloc.DefineValue(inst, result); +// } else { +// Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); +// code.movq(result, result); +// ctx.reg_alloc.DefineValue(inst, result); +// } +//} + +void EmitA64::EmitByteReverseWord(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[0])); + code.REV32(result, result); + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitByteReverseHalf(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + Arm64Gen::ARM64Reg result = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[0])); + code.REV16(result, result); + ctx.reg_alloc.DefineValue(inst, result); +} + +//void EmitA64::EmitByteReverseDual(EmitContext& ctx, IR::Inst* inst) { +// auto args = ctx.reg_alloc.GetArgumentInfo(inst); +// Xbyak::Reg64 result = ctx.reg_alloc.UseScratchGpr(args[0]); +// code.bswap(result); +// ctx.reg_alloc.DefineValue(inst, result); +//} + +void EmitA64::EmitCountLeadingZeros32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ARM64Reg source = DecodeReg(ctx.reg_alloc.UseGpr(args[0])); + ARM64Reg result = DecodeReg(ctx.reg_alloc.ScratchGpr()); + + code.CLZ(result, source); + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitCountLeadingZeros64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ARM64Reg source = ctx.reg_alloc.UseGpr(args[0]); + ARM64Reg result = ctx.reg_alloc.ScratchGpr(); + + code.CLZ(result, source); + ctx.reg_alloc.DefineValue(inst, result); +} +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/emit_a64_floating_point.cpp b/src/dynarmic/backend/A64/emit_a64_floating_point.cpp new file mode 100644 index 00000000..be0b97a6 --- /dev/null +++ b/src/dynarmic/backend/A64/emit_a64_floating_point.cpp @@ -0,0 +1,471 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. 
+ */ + +#include +#include +#include + +#include "backend/A64/abi.h" +#include "backend/A64/block_of_code.h" +#include "backend/A64/emit_a64.h" +#include "common/assert.h" +#include "common/common_types.h" +#include "common/fp/fpcr.h" +#include "common/fp/fpsr.h" +#include "common/fp/info.h" +#include "common/fp/op.h" +#include "common/fp/rounding_mode.h" +#include "common/fp/util.h" +#include "frontend/ir/basic_block.h" +#include "frontend/ir/microinstruction.h" +#include "frontend/ir/opcodes.h" + +namespace Dynarmic::BackendA64 { + +namespace { + +Arm64Gen::RoundingMode ConvertRoundingModeToA64RoundingMode(FP::RoundingMode rounding_mode) { + switch (rounding_mode) { + case FP::RoundingMode::ToNearest_TieEven: + return RoundingMode::ROUND_N; + case FP::RoundingMode::TowardsPlusInfinity: + return RoundingMode::ROUND_P; + case FP::RoundingMode::TowardsMinusInfinity: + return RoundingMode::ROUND_M; + case FP::RoundingMode::TowardsZero: + return RoundingMode::ROUND_Z; + case FP::RoundingMode::ToNearest_TieAwayFromZero: + return RoundingMode::ROUND_A; + default: + UNREACHABLE(); + } +} + +template +void FPTwoOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + ARM64Reg result = ctx.reg_alloc.UseScratchFpr(args[0]); + result = fsize == 32 ? EncodeRegToSingle(result) : EncodeRegToDouble(result); + if constexpr (std::is_member_function_pointer_v) { + (code.fp_emitter.*fn)(result, result); + } else { + fn(result); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +template +void FPThreeOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + ARM64Reg result = ctx.reg_alloc.UseScratchFpr(args[0]); + ARM64Reg operand = ctx.reg_alloc.UseScratchFpr(args[1]); + result = fsize == 32 ? EncodeRegToSingle(result) : EncodeRegToDouble(result); + operand = fsize == 32 ? 
EncodeRegToSingle(operand) : EncodeRegToDouble(operand); + + if constexpr (std::is_member_function_pointer_v) { + (code.fp_emitter.*fn)(result, result, operand); + } + else { + fn(result, result, operand); + } + + ctx.reg_alloc.DefineValue(inst, result); +} +} // anonymous namespace + +//void EmitA64::EmitFPAbs16(EmitContext& ctx, IR::Inst* inst) { +// auto args = ctx.reg_alloc.GetArgumentInfo(inst); +// const ARM64Reg result = ctx.reg_alloc.UseScratchXmm(args[0]); +// +// code.pand(result, code.MConst(xword, f16_non_sign_mask)); +// +// ctx.reg_alloc.DefineValue(inst, result); +//} + +void EmitA64::EmitFPAbs32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const ARM64Reg result = EncodeRegToSingle(ctx.reg_alloc.UseScratchFpr(args[0])); + + code.fp_emitter.FABS(result, result); + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitFPAbs64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const ARM64Reg result = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + + code.fp_emitter.FABS(result, result); + + ctx.reg_alloc.DefineValue(inst, result); +} + +//void EmitA64::EmitFPNeg16(EmitContext& ctx, IR::Inst* inst) { +// auto args = ctx.reg_alloc.GetArgumentInfo(inst); +// const ARM64Reg result = ctx.reg_alloc.UseScratchXmm(args[0]); +// +// code.pxor(result, code.MConst(xword, f16_negative_zero)); +// +// ctx.reg_alloc.DefineValue(inst, result); +//} + +void EmitA64::EmitFPNeg32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const ARM64Reg result = EncodeRegToSingle(ctx.reg_alloc.UseScratchFpr(args[0])); + + code.fp_emitter.FNEG(result, result); + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitFPNeg64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const ARM64Reg result = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + + code.fp_emitter.FNEG(result, result); + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitFPAdd32(EmitContext& ctx, IR::Inst* inst) { + FPThreeOp<32, void(Arm64Gen::ARM64FloatEmitter::*)(ARM64Reg, ARM64Reg, ARM64Reg)>(code, ctx, inst, &Arm64Gen::ARM64FloatEmitter::FADD); +} + +void EmitA64::EmitFPAdd64(EmitContext& ctx, IR::Inst* inst) { + FPThreeOp<64, void(Arm64Gen::ARM64FloatEmitter::*)(ARM64Reg, ARM64Reg, ARM64Reg)>(code, ctx, inst, &Arm64Gen::ARM64FloatEmitter::FADD); +} + +void EmitA64::EmitFPDiv32(EmitContext& ctx, IR::Inst* inst) { + FPThreeOp<32, void(Arm64Gen::ARM64FloatEmitter::*)(ARM64Reg, ARM64Reg, ARM64Reg)>(code, ctx, inst, &Arm64Gen::ARM64FloatEmitter::FDIV); +} + +void EmitA64::EmitFPDiv64(EmitContext& ctx, IR::Inst* inst) { + FPThreeOp<64, void(Arm64Gen::ARM64FloatEmitter::*)(ARM64Reg, ARM64Reg, ARM64Reg)>(code, ctx, inst, &Arm64Gen::ARM64FloatEmitter::FDIV); +} + +void EmitA64::EmitFPMul32(EmitContext& ctx, IR::Inst* inst) { + FPThreeOp<32, void(Arm64Gen::ARM64FloatEmitter::*)(ARM64Reg, ARM64Reg, ARM64Reg)>(code, ctx, inst, &Arm64Gen::ARM64FloatEmitter::FMUL); +} + +void EmitA64::EmitFPMul64(EmitContext& ctx, IR::Inst* inst) { + FPThreeOp<64, void(Arm64Gen::ARM64FloatEmitter::*)(ARM64Reg, ARM64Reg, ARM64Reg)>(code, ctx, inst, &Arm64Gen::ARM64FloatEmitter::FMUL); +} +void EmitA64::EmitFPSqrt32(EmitContext& ctx, IR::Inst* inst) { + FPTwoOp<32>(code, ctx, inst, &Arm64Gen::ARM64FloatEmitter::FSQRT); +} + +void EmitA64::EmitFPSqrt64(EmitContext& ctx, IR::Inst* inst) { + FPTwoOp<64>(code, ctx, inst, 
&Arm64Gen::ARM64FloatEmitter::FSQRT); +} + +void EmitA64::EmitFPSub32(EmitContext& ctx, IR::Inst* inst) { + FPThreeOp<32, void(Arm64Gen::ARM64FloatEmitter::*)(ARM64Reg, ARM64Reg, ARM64Reg)>(code, ctx, inst, &Arm64Gen::ARM64FloatEmitter::FSUB); +} + +void EmitA64::EmitFPSub64(EmitContext& ctx, IR::Inst* inst) { + FPThreeOp<64, void(Arm64Gen::ARM64FloatEmitter::*)(ARM64Reg, ARM64Reg, ARM64Reg)>(code, ctx, inst, &Arm64Gen::ARM64FloatEmitter::FSUB); +} + +static ARM64Reg SetFpscrNzcvFromFlags(BlockOfCode& code, EmitContext& ctx) { + ARM64Reg nzcv = ctx.reg_alloc.ScratchGpr(); + // Fpsr's nzcv is copied across integer nzcv + code.MRS(nzcv, FIELD_NZCV); + return nzcv; +} + +void EmitA64::EmitFPCompare32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + ARM64Reg reg_a = EncodeRegToSingle(ctx.reg_alloc.UseFpr(args[0])); + ARM64Reg reg_b = EncodeRegToSingle(ctx.reg_alloc.UseFpr(args[1])); + bool exc_on_qnan = args[2].GetImmediateU1(); + + if (exc_on_qnan) { + code.fp_emitter.FCMPE(reg_a, reg_b); + } else { + code.fp_emitter.FCMP(reg_a, reg_b); + } + + ARM64Reg nzcv = SetFpscrNzcvFromFlags(code, ctx); + ctx.reg_alloc.DefineValue(inst, nzcv); +} + +void EmitA64::EmitFPCompare64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const ARM64Reg reg_a = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[0])); + const ARM64Reg reg_b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + bool exc_on_qnan = args[2].GetImmediateU1(); + + if (exc_on_qnan) { + code.fp_emitter.FCMPE(reg_a, reg_b); + } else { + code.fp_emitter.FCMP(reg_a, reg_b); + } + + ARM64Reg nzcv = SetFpscrNzcvFromFlags(code, ctx); + ctx.reg_alloc.DefineValue(inst, nzcv); +} + +void EmitA64::EmitFPHalfToDouble(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const ARM64Reg result = EncodeRegToSingle(ctx.reg_alloc.UseScratchFpr(args[0])); + + code.fp_emitter.FCVT(64, 16, result, result); + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitFPHalfToSingle(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const ARM64Reg result = EncodeRegToSingle(ctx.reg_alloc.UseScratchFpr(args[0])); + code.fp_emitter.FCVT(32, 16, result, result); + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitFPSingleToDouble(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const ARM64Reg result = EncodeRegToSingle(ctx.reg_alloc.UseScratchFpr(args[0])); + + code.fp_emitter.FCVT(64, 32, result, result); + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitFPSingleToHalf(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const ARM64Reg result = EncodeRegToSingle(ctx.reg_alloc.UseScratchFpr(args[0])); + code.fp_emitter.FCVT(16, 32, result, result); + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitFPDoubleToHalf(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const ARM64Reg result = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + code.fp_emitter.FCVT(16, 64, result, result); + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitFPDoubleToSingle(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const ARM64Reg result = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + code.fp_emitter.FCVT(32, 64, result, result); + ctx.reg_alloc.DefineValue(inst, result); +} + +template +static 
void EmitFPToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const size_t fbits = args[1].GetImmediateU8(); + const auto rounding_mode = static_cast(args[2].GetImmediateU8()); + const auto round_imm = ConvertRoundingModeToA64RoundingMode(rounding_mode); + + ASSERT_MSG(fbits == 0, "fixed point conversions are not supported yet"); + + ARM64Reg src = ctx.reg_alloc.UseScratchFpr(args[0]); + ARM64Reg result = ctx.reg_alloc.ScratchGpr(); + src = fsize == 64 ? EncodeRegToDouble(src) : EncodeRegToSingle(src); + result = isize == 64 ? result : DecodeReg(result); + + if constexpr (unsigned_) { + code.fp_emitter.FCVTU(result, src, round_imm); + } + else { + code.fp_emitter.FCVTS(result, src, round_imm); + } + + ctx.reg_alloc.DefineValue(inst, result); + +} + +void EmitA64::EmitFPDoubleToFixedS32(EmitContext& ctx, IR::Inst* inst) { + EmitFPToFixed<64, false, 32>(code, ctx, inst); +} + +void EmitA64::EmitFPDoubleToFixedS64(EmitContext& ctx, IR::Inst* inst) { + EmitFPToFixed<64, false, 64>(code, ctx, inst); +} + +void EmitA64::EmitFPDoubleToFixedU32(EmitContext& ctx, IR::Inst* inst) { + EmitFPToFixed<64, true, 32>(code, ctx, inst); +} + +void EmitA64::EmitFPDoubleToFixedU64(EmitContext& ctx, IR::Inst* inst) { + EmitFPToFixed<64, true, 64>(code, ctx, inst); +} + +void EmitA64::EmitFPSingleToFixedS32(EmitContext& ctx, IR::Inst* inst) { + EmitFPToFixed<32, false, 32>(code, ctx, inst); +} + +void EmitA64::EmitFPSingleToFixedS64(EmitContext& ctx, IR::Inst* inst) { + EmitFPToFixed<32, false, 64>(code, ctx, inst); +} + +void EmitA64::EmitFPSingleToFixedU32(EmitContext& ctx, IR::Inst* inst) { + EmitFPToFixed<32, true, 32>(code, ctx, inst); +} + +void EmitA64::EmitFPSingleToFixedU64(EmitContext& ctx, IR::Inst* inst) { + EmitFPToFixed<32, true, 64>(code, ctx, inst); +} + +void EmitA64::EmitFPFixedS32ToSingle(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg from = DecodeReg(ctx.reg_alloc.UseGpr(args[0])); + const ARM64Reg result = EncodeRegToSingle(ctx.reg_alloc.ScratchFpr()); + const size_t fbits = args[1].GetImmediateU8(); + const FP::RoundingMode rounding_mode = static_cast(args[2].GetImmediateU8()); + ASSERT(rounding_mode == ctx.FPSCR_RMode()); + + if (fbits != 0) { + code.fp_emitter.SCVTF(result, from, fbits); + } + else { + code.fp_emitter.SCVTF(result, from); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitFPFixedU32ToSingle(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg from = DecodeReg(ctx.reg_alloc.UseGpr(args[0])); + const ARM64Reg result = EncodeRegToSingle(ctx.reg_alloc.ScratchFpr()); + const size_t fbits = args[1].GetImmediateU8(); + const FP::RoundingMode rounding_mode = static_cast(args[2].GetImmediateU8()); + ASSERT(rounding_mode == ctx.FPSCR_RMode()); + + if (fbits != 0) { + code.fp_emitter.UCVTF(result, from, fbits); + } + else { + code.fp_emitter.UCVTF(result, from); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitFPFixedS32ToDouble(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg from = DecodeReg(ctx.reg_alloc.UseGpr(args[0])); + const ARM64Reg result = EncodeRegToDouble(ctx.reg_alloc.ScratchFpr()); + const size_t fbits = args[1].GetImmediateU8(); + const FP::RoundingMode rounding_mode = static_cast(args[2].GetImmediateU8()); + ASSERT(rounding_mode == ctx.FPSCR_RMode()); + + if (fbits != 0) { + 
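+        // SCVTF with an immediate fbits operand performs the fixed-point form of the
+        // conversion: the source register is treated as having fbits fractional bits.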
code.fp_emitter.SCVTF(result, from, fbits); + } + else { + code.fp_emitter.SCVTF(result, from); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitFPFixedS64ToDouble(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg from = ctx.reg_alloc.UseGpr(args[0]); + const ARM64Reg result = EncodeRegToDouble(ctx.reg_alloc.ScratchFpr()); + const size_t fbits = args[1].GetImmediateU8(); + const FP::RoundingMode rounding_mode = static_cast(args[2].GetImmediateU8()); + ASSERT(rounding_mode == ctx.FPSCR_RMode()); + + if (fbits != 0) { + code.fp_emitter.SCVTF(result, from, fbits); + } + else { + code.fp_emitter.SCVTF(result, from); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitFPFixedS64ToSingle(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg from = ctx.reg_alloc.UseGpr(args[0]); + const ARM64Reg result = EncodeRegToSingle(ctx.reg_alloc.ScratchFpr()); + const size_t fbits = args[1].GetImmediateU8(); + const FP::RoundingMode rounding_mode = static_cast(args[2].GetImmediateU8()); + ASSERT(rounding_mode == ctx.FPSCR_RMode()); + + if (fbits != 0) { + code.fp_emitter.SCVTF(result, from, fbits); + } + else { + code.fp_emitter.SCVTF(result, from); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitFPFixedU32ToDouble(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg from = DecodeReg(ctx.reg_alloc.UseGpr(args[0])); + const ARM64Reg result = EncodeRegToDouble(ctx.reg_alloc.ScratchFpr()); + const size_t fbits = args[1].GetImmediateU8(); + const FP::RoundingMode rounding_mode = static_cast(args[2].GetImmediateU8()); + ASSERT(rounding_mode == ctx.FPSCR_RMode()); + + if (fbits != 0) { + code.fp_emitter.UCVTF(result, from, fbits); + } + else { + code.fp_emitter.UCVTF(result, from); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitFPFixedU64ToDouble(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + + const ARM64Reg from = ctx.reg_alloc.UseGpr(args[0]); + const ARM64Reg result = EncodeRegToDouble(ctx.reg_alloc.ScratchFpr()); + const size_t fbits = args[1].GetImmediateU8(); + const FP::RoundingMode rounding_mode = static_cast(args[2].GetImmediateU8()); + ASSERT(rounding_mode == ctx.FPSCR_RMode()); + + if (fbits != 0) { + code.fp_emitter.UCVTF(result, from, fbits); + } + else { + code.fp_emitter.UCVTF(result, from); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitFPFixedU64ToSingle(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + + const ARM64Reg from = ctx.reg_alloc.UseGpr(args[0]); + const ARM64Reg result = EncodeRegToSingle(ctx.reg_alloc.ScratchFpr()); + const size_t fbits = args[1].GetImmediateU8(); + const FP::RoundingMode rounding_mode = static_cast(args[2].GetImmediateU8()); + ASSERT(rounding_mode == ctx.FPSCR_RMode()); + + if (fbits != 0) { + code.fp_emitter.UCVTF(result, from, fbits); + } + else { + code.fp_emitter.UCVTF(result, from); + } + + ctx.reg_alloc.DefineValue(inst, result); +} +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/emit_a64_packed.cpp b/src/dynarmic/backend/A64/emit_a64_packed.cpp new file mode 100644 index 00000000..fb54361d --- /dev/null +++ b/src/dynarmic/backend/A64/emit_a64_packed.cpp @@ -0,0 +1,469 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2016 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. + */ + +#include "backend/A64/block_of_code.h" +#include "backend/A64/emit_a64.h" +#include "frontend/ir/microinstruction.h" +#include "frontend/ir/opcodes.h" + +namespace Dynarmic::BackendA64 { + +void EmitA64::EmitPackedAddU8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp); + + const ARM64Reg sum = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + code.fp_emitter.ADD(B, sum, sum, b); + + if (ge_inst) { + const ARM64Reg ge = EncodeRegToDouble(ctx.reg_alloc.ScratchFpr()); + + code.fp_emitter.CMHI(B, ge, b, sum); + + ctx.reg_alloc.DefineValue(ge_inst, ge); + ctx.EraseInstruction(ge_inst); + } + + ctx.reg_alloc.DefineValue(inst, sum); +} + +void EmitA64::EmitPackedAddS8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp); + + const ARM64Reg a = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + if (ge_inst) { + const ARM64Reg ge = EncodeRegToDouble(ctx.reg_alloc.ScratchFpr()); + + code.fp_emitter.SQADD(B, ge, a, b); + code.fp_emitter.CMGE_zero(B, ge, ge); + + ctx.reg_alloc.DefineValue(ge_inst, ge); + ctx.EraseInstruction(ge_inst); + } + + code.fp_emitter.ADD(B, a, a, b); + + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitA64::EmitPackedAddU16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp); + + const ARM64Reg sum = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + code.fp_emitter.ADD(H, sum, sum, b); + + if (ge_inst) { + const ARM64Reg ge = EncodeRegToDouble(ctx.reg_alloc.ScratchFpr()); + + code.fp_emitter.CMHI(H, ge, b, sum); + + ctx.reg_alloc.DefineValue(ge_inst, ge); + ctx.EraseInstruction(ge_inst); + } + + ctx.reg_alloc.DefineValue(inst, sum); +} + +void EmitA64::EmitPackedAddS16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp); + + const ARM64Reg a = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + if (ge_inst) { + const ARM64Reg ge = EncodeRegToDouble(ctx.reg_alloc.ScratchFpr()); + + code.fp_emitter.SQADD(H, ge, a, b); + code.fp_emitter.CMGE_zero(H, ge, ge); + + ctx.reg_alloc.DefineValue(ge_inst, ge); + ctx.EraseInstruction(ge_inst); + } + + code.fp_emitter.ADD(H, a, a, b); + + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitA64::EmitPackedSubU8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp); + + const ARM64Reg a = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + if (ge_inst) { + const ARM64Reg ge = EncodeRegToDouble(ctx.reg_alloc.ScratchFpr()); + + code.fp_emitter.CMHS(B, ge, a, b); + + 
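+        // CMHS sets each byte of ge to all-ones where a >= b (unsigned), i.e. where the
+        // per-byte subtraction does not borrow, matching the GE flag semantics.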
ctx.reg_alloc.DefineValue(ge_inst, ge); + ctx.EraseInstruction(ge_inst); + } + + code.fp_emitter.SUB(B, a, a, b); + + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitA64::EmitPackedSubS8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp); + + const ARM64Reg a = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + if (ge_inst) { + const ARM64Reg ge = EncodeRegToDouble(ctx.reg_alloc.ScratchFpr()); + + code.fp_emitter.SQSUB(B, ge, a, b); + code.fp_emitter.CMGE_zero(B, ge, ge); + + ctx.reg_alloc.DefineValue(ge_inst, ge); + ctx.EraseInstruction(ge_inst); + } + + code.fp_emitter.SUB(B, a, a, b); + + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitA64::EmitPackedSubU16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp); + + const ARM64Reg a = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + if (ge_inst) { + const ARM64Reg ge = EncodeRegToDouble(ctx.reg_alloc.ScratchFpr()); + + code.fp_emitter.CMHS(H, ge, a, b); + + ctx.reg_alloc.DefineValue(ge_inst, ge); + ctx.EraseInstruction(ge_inst); + } + + code.fp_emitter.SUB(H, a, a, b); + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitA64::EmitPackedSubS16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp); + + const ARM64Reg a = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + if (ge_inst) { + const ARM64Reg ge = EncodeRegToDouble(ctx.reg_alloc.ScratchFpr()); + + code.fp_emitter.SQSUB(H, ge, a, b); + code.fp_emitter.CMGE_zero(H, ge, ge); + + ctx.reg_alloc.DefineValue(ge_inst, ge); + ctx.EraseInstruction(ge_inst); + } + + code.fp_emitter.SUB(H, a, a, b); + + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitA64::EmitPackedHalvingAddU8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg a = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + code.fp_emitter.UHADD(B, a, a, b); + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitA64::EmitPackedHalvingAddU16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg a = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + code.fp_emitter.UHADD(H, a, a, b); + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitA64::EmitPackedHalvingAddS8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg a = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + code.fp_emitter.SHADD(B, a, a, b); + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitA64::EmitPackedHalvingAddS16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg a = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + code.fp_emitter.SHADD(H, a, 
a, b); + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitA64::EmitPackedHalvingSubU8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg a = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + code.fp_emitter.UHSUB(B, a, a, b); + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitA64::EmitPackedHalvingSubS8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg a = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + code.fp_emitter.SHSUB(B, a, a, b); + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitA64::EmitPackedHalvingSubU16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg a = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + code.fp_emitter.UHSUB(H, a, a, b); + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitA64::EmitPackedHalvingSubS16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg a = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + code.fp_emitter.SHSUB(H, a, a, b); + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitPackedSubAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, bool hi_is_sum, bool is_signed, bool is_halving) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp); + + const ARM64Reg reg_a_hi = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[0])); + const ARM64Reg reg_b_hi = DecodeReg(ctx.reg_alloc.UseScratchGpr(args[1])); + const ARM64Reg reg_a_lo = DecodeReg(ctx.reg_alloc.ScratchGpr()); + const ARM64Reg reg_b_lo = DecodeReg(ctx.reg_alloc.ScratchGpr()); + ARM64Reg reg_sum, reg_diff; + + if (is_signed) { + code.SXTH(reg_a_lo, reg_a_hi); + code.SXTH(reg_b_lo, reg_b_hi); + code.ASR(reg_a_hi, reg_a_hi, 16); + code.ASR(reg_b_hi, reg_b_hi, 16); + } else { + code.UXTH(reg_a_lo, reg_a_hi); + code.UXTH(reg_b_lo, reg_b_hi); + code.LSR(reg_a_hi, reg_a_hi, 16); + code.LSR(reg_b_hi, reg_b_hi, 16); + } + + if (hi_is_sum) { + code.SUB(reg_a_lo, reg_a_lo, reg_b_hi); + code.ADD(reg_a_hi, reg_a_hi, reg_b_lo); + reg_diff = reg_a_lo; + reg_sum = reg_a_hi; + } else { + code.ADD(reg_a_lo, reg_a_lo, reg_b_hi); + code.SUB(reg_a_hi, reg_a_hi, reg_b_lo); + reg_diff = reg_a_hi; + reg_sum = reg_a_lo; + } + + if (ge_inst) { + // The reg_b registers are no longer required. + const ARM64Reg ge_sum = reg_b_hi; + const ARM64Reg ge_diff = reg_b_lo; + + if (!is_signed) { + code.LSL(ge_sum, reg_sum, 15); + code.ASR(ge_sum, ge_sum, 31); + } else { + code.MVN(ge_sum, reg_sum); + code.ASR(ge_sum, ge_sum, 31); + } + code.MVN(ge_diff, reg_diff); + code.ASR(ge_diff, ge_diff, 31); + code.ANDI2R(ge_sum, ge_sum, hi_is_sum ? 0xFFFF0000 : 0x0000FFFF); + code.ANDI2R(ge_diff, ge_diff, hi_is_sum ? 0x0000FFFF : 0xFFFF0000); + code.ORR(ge_sum, ge_sum, ge_diff); + + ctx.reg_alloc.DefineValue(ge_inst, ge_sum); + ctx.EraseInstruction(ge_inst); + } + + if (is_halving) { + code.LSR(reg_a_hi, reg_a_hi, 1); + code.LSR(reg_a_lo, reg_a_lo, 1); + } + + // reg_a_lo now contains the low word and reg_a_hi now contains the high word. + // Merge them. 
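+    // BFM with immr=16, imms=15 inserts bits [15:0] of reg_a_hi into bits [31:16] of reg_a_lo.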
+ code.BFM(reg_a_lo, reg_a_hi, 16, 15); + + ctx.reg_alloc.DefineValue(inst, reg_a_lo); +} + +void EmitA64::EmitPackedAddSubU16(EmitContext& ctx, IR::Inst* inst) { + EmitPackedSubAdd(code, ctx, inst, true, false, false); +} + +void EmitA64::EmitPackedAddSubS16(EmitContext& ctx, IR::Inst* inst) { + EmitPackedSubAdd(code, ctx, inst, true, true, false); +} + +void EmitA64::EmitPackedSubAddU16(EmitContext& ctx, IR::Inst* inst) { + EmitPackedSubAdd(code, ctx, inst, false, false, false); +} + +void EmitA64::EmitPackedSubAddS16(EmitContext& ctx, IR::Inst* inst) { + EmitPackedSubAdd(code, ctx, inst, false, true, false); +} + +void EmitA64::EmitPackedHalvingAddSubU16(EmitContext& ctx, IR::Inst* inst) { + EmitPackedSubAdd(code, ctx, inst, true, false, true); +} + +void EmitA64::EmitPackedHalvingAddSubS16(EmitContext& ctx, IR::Inst* inst) { + EmitPackedSubAdd(code, ctx, inst, true, true, true); +} + +void EmitA64::EmitPackedHalvingSubAddU16(EmitContext& ctx, IR::Inst* inst) { + EmitPackedSubAdd(code, ctx, inst, false, false, true); +} + +void EmitA64::EmitPackedHalvingSubAddS16(EmitContext& ctx, IR::Inst* inst) { + EmitPackedSubAdd(code, ctx, inst, false, true, true); +} + +void EmitA64::EmitPackedSaturatedAddU8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg a = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + code.fp_emitter.UQADD(B, a, a, b); + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitA64::EmitPackedSaturatedAddS8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg a = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + code.fp_emitter.SQADD(B, a, a, b); + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitA64::EmitPackedSaturatedSubU8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg a = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + code.fp_emitter.UQSUB(B, a, a, b); + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitA64::EmitPackedSaturatedSubS8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg a = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + code.fp_emitter.SQSUB(B, a, a, b); + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitA64::EmitPackedSaturatedAddU16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg a = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + code.fp_emitter.UQADD(H, a, a, b); + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitA64::EmitPackedSaturatedAddS16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg a = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + code.fp_emitter.SQADD(H, a, a, b); + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitA64::EmitPackedSaturatedSubU16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg a = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + 
const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + code.fp_emitter.UQSUB(H, a, a, b); + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitA64::EmitPackedSaturatedSubS16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg a = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + code.fp_emitter.SQSUB(H, a, a, b); + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitA64::EmitPackedAbsDiffSumS8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg a = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + code.fp_emitter.UABD(B, a, a, b); + code.fp_emitter.UADDLV(B, a, a); + + ctx.reg_alloc.DefineValue(inst, a); +} + +void EmitA64::EmitPackedSelect(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const ARM64Reg ge = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + const ARM64Reg a = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + const ARM64Reg b = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[2])); + + code.fp_emitter.BSL(ge, b, a); + + ctx.reg_alloc.DefineValue(inst, ge); +} + +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/emit_a64_saturation.cpp b/src/dynarmic/backend/A64/emit_a64_saturation.cpp new file mode 100644 index 00000000..5462fba4 --- /dev/null +++ b/src/dynarmic/backend/A64/emit_a64_saturation.cpp @@ -0,0 +1,167 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. 
+ */ + +#include + +#include "backend/A64/block_of_code.h" +#include "backend/A64/emit_a64.h" +#include "common/assert.h" +#include "common/bit_util.h" +#include "common/common_types.h" +#include "frontend/ir/basic_block.h" +#include "frontend/ir/microinstruction.h" +#include "frontend/ir/opcodes.h" + +namespace Dynarmic::BackendA64 { + +namespace { + +enum class Op { + Add, + Sub, +}; + +template +void EmitSignedSaturatedOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { + const auto overflow_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetOverflowFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + ARM64Reg result = EncodeRegToDouble(ctx.reg_alloc.UseScratchFpr(args[0])); + ARM64Reg addend = EncodeRegToDouble(ctx.reg_alloc.UseFpr(args[1])); + + if constexpr (op == Op::Add) { + code.fp_emitter.SQADD(size, result, result, addend); + } + else { + code.fp_emitter.SQSUB(size, result, result, addend); + } + + if (overflow_inst) { + ARM64Reg overflow = ctx.reg_alloc.ScratchGpr(); + + code.MRS(overflow, FIELD_FPSR); + code.UBFX(overflow, overflow, 27, 1); + + ctx.reg_alloc.DefineValue(overflow_inst, overflow); + ctx.EraseInstruction(overflow_inst); + } + + ctx.reg_alloc.DefineValue(inst, result); +} +} // anonymous namespace + +void EmitA64::EmitSignedSaturatedAdd8(EmitContext& ctx, IR::Inst* inst) { + EmitSignedSaturatedOp(code, ctx, inst); +} + +void EmitA64::EmitSignedSaturatedAdd16(EmitContext& ctx, IR::Inst* inst) { + EmitSignedSaturatedOp(code, ctx, inst); +} + +void EmitA64::EmitSignedSaturatedAdd32(EmitContext& ctx, IR::Inst* inst) { + EmitSignedSaturatedOp(code, ctx, inst); +} + +void EmitA64::EmitSignedSaturatedAdd64(EmitContext& ctx, IR::Inst* inst) { + EmitSignedSaturatedOp(code, ctx, inst); +} + +void EmitA64::EmitSignedSaturatedSub8(EmitContext& ctx, IR::Inst* inst) { + EmitSignedSaturatedOp(code, ctx, inst); +} + +void EmitA64::EmitSignedSaturatedSub16(EmitContext& ctx, IR::Inst* inst) { + EmitSignedSaturatedOp(code, ctx, inst); +} + +void EmitA64::EmitSignedSaturatedSub32(EmitContext& ctx, IR::Inst* inst) { + EmitSignedSaturatedOp(code, ctx, inst); +} + +void EmitA64::EmitSignedSaturatedSub64(EmitContext& ctx, IR::Inst* inst) { + EmitSignedSaturatedOp(code, ctx, inst); +} + +void EmitA64::EmitSignedSaturation(EmitContext& ctx, IR::Inst* inst) { + const auto overflow_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetOverflowFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const size_t N = args[1].GetImmediateU8(); + ASSERT(N >= 1 && N <= 32); + + if (N == 32) { + if (overflow_inst) { + const auto no_overflow = IR::Value(false); + overflow_inst->ReplaceUsesWith(no_overflow); + } + ctx.reg_alloc.DefineValue(inst, args[0]); + return; + } + + const u32 mask = (1u << N) - 1; + const u32 positive_saturated_value = (1u << (N - 1)) - 1; + const u32 negative_saturated_value = 1u << (N - 1); + const u32 sext_negative_satured_value = Common::SignExtend(N, negative_saturated_value); + + const ARM64Reg result = DecodeReg(ctx.reg_alloc.ScratchGpr()); + const ARM64Reg reg_a = DecodeReg(ctx.reg_alloc.UseGpr(args[0])); + const ARM64Reg overflow = DecodeReg(ctx.reg_alloc.ScratchGpr()); + const ARM64Reg tmp = DecodeReg(ctx.reg_alloc.ScratchGpr()); + + // overflow now contains a value between 0 and mask if it was originally between {negative,positive}_saturated_value. 
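+    // Biasing reg_a by 2^(N-1) maps the representable range [-(2^(N-1)), 2^(N-1) - 1] onto [0, mask].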
+ code.ADDI2R(overflow, reg_a, negative_saturated_value, overflow); + + // Put the appropriate saturated value in result + code.MOVI2R(tmp, positive_saturated_value); + code.CMP(reg_a, tmp); + code.MOVI2R(result, sext_negative_satured_value); + code.CSEL(result, tmp, result, CC_GT); + + // Do the saturation + code.CMPI2R(overflow, mask, tmp); + code.CSEL(result, reg_a, result, CC_LS); + + if (overflow_inst) { + code.CSET(overflow, CC_HI); + + ctx.reg_alloc.DefineValue(overflow_inst, overflow); + ctx.EraseInstruction(overflow_inst); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +void EmitA64::EmitUnsignedSaturation(EmitContext& ctx, IR::Inst* inst) { + const auto overflow_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetOverflowFromOp); + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const size_t N = args[1].GetImmediateU8(); + ASSERT(N <= 31); + + const u32 saturated_value = (1u << N) - 1; + + const ARM64Reg result = DecodeReg(ctx.reg_alloc.ScratchGpr()); + const ARM64Reg reg_a = DecodeReg(ctx.reg_alloc.UseGpr(args[0])); + const ARM64Reg overflow = DecodeReg(ctx.reg_alloc.ScratchGpr()); + + // Pseudocode: result = clamp(reg_a, 0, saturated_value); + code.MOVI2R(result, saturated_value); + code.CMP(reg_a, result); + code.CSEL(result, WZR, result, CC_LE); + code.CSEL(result, reg_a, result, CC_LS); + + if (overflow_inst) { + code.CSET(overflow, CC_HI); + + ctx.reg_alloc.DefineValue(overflow_inst, overflow); + ctx.EraseInstruction(overflow_inst); + } + + ctx.reg_alloc.DefineValue(inst, result); +} + +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/emitter/a64_emitter.cpp b/src/dynarmic/backend/A64/emitter/a64_emitter.cpp new file mode 100644 index 00000000..efbb4767 --- /dev/null +++ b/src/dynarmic/backend/A64/emitter/a64_emitter.cpp @@ -0,0 +1,3897 @@ +// Copyright 2015 Dolphin Emulator Project / 2018 dynarmic project +// Licensed under GPLv2+ +// Refer to the license.txt file included. + +#include +#include +#include +#include +#include + +#if defined(__APPLE__) +#include +#endif + +#include "a64_emitter.h" +#include "common/assert.h" +#include "common/bit_util.h" +#include "common/cast_util.h" +#include "common/common_types.h" +#include "common/math_util.h" + +namespace Dynarmic::BackendA64::Arm64Gen { + +namespace { +const int kWRegSizeInBits = 32; +const int kXRegSizeInBits = 64; + +// The below few functions are taken from V8. +int CountLeadingZeros(u64 value, int width) { +#ifdef _MSC_VER + if (width == 64) { + return _CountLeadingZeros64(value); + } +#else + if (width == 64) { + return __builtin_clzll(value); + } +#endif + // TODO(jbramley): Optimize this for ARM64 hosts. 
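+    // Portable fallback: walk down from the most significant bit of the given width,
+    // counting zero bits until the first set bit is found.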
+ int count = 0; + uint64_t bit_test = 1ULL << (width - 1); + while ((count < width) && ((bit_test & value) == 0)) { + count++; + bit_test >>= 1; + } + return count; +} + +uint64_t LargestPowerOf2Divisor(uint64_t value) { + return value & -(int64_t)value; +} + +// For ADD/SUB +bool IsImmArithmetic(uint64_t input, u32* val, bool* shift) { + if (input < 4096) { + *val = static_cast(input); + *shift = false; + return true; + } else if ((input & 0xFFF000) == input) { + *val = static_cast(input >> 12); + *shift = true; + return true; + } + return false; +} + +// For AND/TST/ORR/EOR etc +bool IsImmLogical(uint64_t value, unsigned int width, unsigned int* n, unsigned int* imm_s, + unsigned int* imm_r) { + bool negate = false; + + // Logical immediates are encoded using parameters n, imm_s and imm_r using + // the following table: + // + // N imms immr size S R + // 1 ssssss rrrrrr 64 UInt(ssssss) UInt(rrrrrr) + // 0 0sssss xrrrrr 32 UInt(sssss) UInt(rrrrr) + // 0 10ssss xxrrrr 16 UInt(ssss) UInt(rrrr) + // 0 110sss xxxrrr 8 UInt(sss) UInt(rrr) + // 0 1110ss xxxxrr 4 UInt(ss) UInt(rr) + // 0 11110s xxxxxr 2 UInt(s) UInt(r) + // (s bits must not be all set) + // + // A pattern is constructed of size bits, where the least significant S+1 bits + // are set. The pattern is rotated right by R, and repeated across a 32 or + // 64-bit value, depending on destination register width. + // + // Put another way: the basic format of a logical immediate is a single + // contiguous stretch of 1 bits, repeated across the whole word at intervals + // given by a power of 2. To identify them quickly, we first locate the + // lowest stretch of 1 bits, then the next 1 bit above that; that combination + // is different for every logical immediate, so it gives us all the + // information we need to identify the only logical immediate that our input + // could be, and then we simply check if that's the value we actually have. + // + // (The rotation parameter does give the possibility of the stretch of 1 bits + // going 'round the end' of the word. To deal with that, we observe that in + // any situation where that happens the bitwise NOT of the value is also a + // valid logical immediate. So we simply invert the input whenever its low bit + // is set, and then we know that the rotated case can't arise.) + + if (value & 1) { + // If the low bit is 1, negate the value, and set a flag to remember that we + // did (so that we can adjust the return values appropriately). + negate = true; + value = ~value; + } + + if (width == kWRegSizeInBits) { + // To handle 32-bit logical immediates, the very easiest thing is to repeat + // the input value twice to make a 64-bit word. The correct encoding of that + // as a logical immediate will also be the correct encoding of the 32-bit + // value. + + // The most-significant 32 bits may not be zero (ie. negate is true) so + // shift the value left before duplicating it. + value <<= kWRegSizeInBits; + value |= value >> kWRegSizeInBits; + } + + // The basic analysis idea: imagine our input word looks like this. + // + // 0011111000111110001111100011111000111110001111100011111000111110 + // c b a + // |<--d-->| + // + // We find the lowest set bit (as an actual power-of-2 value, not its index) + // and call it a. Then we add a to our original number, which wipes out the + // bottommost stretch of set bits and replaces it with a 1 carried into the + // next zero bit. 
Then we look for the new lowest set bit, which is in + // position b, and subtract it, so now our number is just like the original + // but with the lowest stretch of set bits completely gone. Now we find the + // lowest set bit again, which is position c in the diagram above. Then we'll + // measure the distance d between bit positions a and c (using CLZ), and that + // tells us that the only valid logical immediate that could possibly be equal + // to this number is the one in which a stretch of bits running from a to just + // below b is replicated every d bits. + uint64_t a = LargestPowerOf2Divisor(value); + uint64_t value_plus_a = value + a; + uint64_t b = LargestPowerOf2Divisor(value_plus_a); + uint64_t value_plus_a_minus_b = value_plus_a - b; + uint64_t c = LargestPowerOf2Divisor(value_plus_a_minus_b); + + int d, clz_a, out_n; + uint64_t mask; + + if (c != 0) { + // The general case, in which there is more than one stretch of set bits. + // Compute the repeat distance d, and set up a bitmask covering the basic + // unit of repetition (i.e. a word with the bottom d bits set). Also, in all + // of these cases the N bit of the output will be zero. + clz_a = CountLeadingZeros(a, kXRegSizeInBits); + int clz_c = CountLeadingZeros(c, kXRegSizeInBits); + d = clz_a - clz_c; + mask = ((UINT64_C(1) << d) - 1); + out_n = 0; + } else { + // Handle degenerate cases. + // + // If any of those 'find lowest set bit' operations didn't find a set bit at + // all, then the word will have been zero thereafter, so in particular the + // last lowest_set_bit operation will have returned zero. So we can test for + // all the special case conditions in one go by seeing if c is zero. + if (a == 0) { + // The input was zero (or all 1 bits, which will come to here too after we + // inverted it at the start of the function), for which we just return + // false. + return false; + } else { + // Otherwise, if c was zero but a was not, then there's just one stretch + // of set bits in our word, meaning that we have the trivial case of + // d == 64 and only one 'repetition'. Set up all the same variables as in + // the general case above, and set the N bit in the output. + clz_a = CountLeadingZeros(a, kXRegSizeInBits); + d = 64; + mask = ~UINT64_C(0); + out_n = 1; + } + } + + // If the repeat period d is not a power of two, it can't be encoded. + if (!Dynarmic::Common::IsPow2(d)) + return false; + + // If the bit stretch (b - a) does not fit within the mask derived from the + // repeat period, then fail. + if (((b - a) & ~mask) != 0) + return false; + + // The only possible option is b - a repeated every d bits. Now we're going to + // actually construct the valid logical immediate derived from that + // specification, and see if it equals our original input. + // + // To repeat a value every d bits, we multiply it by a number of the form + // (1 + 2^d + 2^(2d) + ...), i.e. 0x0001000100010001 or similar. These can + // be derived using a table lookup on CLZ(d). + static const std::array multipliers = {{ + 0x0000000000000001UL, + 0x0000000100000001UL, + 0x0001000100010001UL, + 0x0101010101010101UL, + 0x1111111111111111UL, + 0x5555555555555555UL, + }}; + + int multiplier_idx = CountLeadingZeros(d, kXRegSizeInBits) - 57; + + // Ensure that the index to the multipliers array is within bounds. 
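+    // Valid repeat periods d are powers of two between 2 and 64, so CLZ(d) lies in
+    // [57, 62] and multiplier_idx in [0, 5].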
+ DEBUG_ASSERT((multiplier_idx >= 0) && + (static_cast(multiplier_idx) < multipliers.size())); + + uint64_t multiplier = multipliers[multiplier_idx]; + uint64_t candidate = (b - a) * multiplier; + + // The candidate pattern doesn't match our input value, so fail. + if (value != candidate) + return false; + + // We have a match! This is a valid logical immediate, so now we have to + // construct the bits and pieces of the instruction encoding that generates + // it. + + // Count the set bits in our basic stretch. The special case of clz(0) == -1 + // makes the answer come out right for stretches that reach the very top of + // the word (e.g. numbers like 0xffffc00000000000). + int clz_b = (b == 0) ? -1 : CountLeadingZeros(b, kXRegSizeInBits); + int s = clz_a - clz_b; + + // Decide how many bits to rotate right by, to put the low bit of that basic + // stretch in position a. + int r; + if (negate) { + // If we inverted the input right at the start of this function, here's + // where we compensate: the number of set bits becomes the number of clear + // bits, and the rotation count is based on position b rather than position + // a (since b is the location of the 'lowest' 1 bit after inversion). + s = d - s; + r = (clz_b + 1) & (d - 1); + } else { + r = (clz_a + 1) & (d - 1); + } + + // Now we're done, except for having to encode the S output in such a way that + // it gives both the number of set bits and the length of the repeated + // segment. The s field is encoded like this: + // + // imms size S + // ssssss 64 UInt(ssssss) + // 0sssss 32 UInt(sssss) + // 10ssss 16 UInt(ssss) + // 110sss 8 UInt(sss) + // 1110ss 4 UInt(ss) + // 11110s 2 UInt(s) + // + // So we 'or' (-d << 1) with our computed s to form imms. + *n = out_n; + *imm_s = ((-d << 1) | (s - 1)) & 0x3f; + *imm_r = r; + + return true; +} + +float FPImm8ToFloat(u8 bits) { + const u32 sign = bits >> 7; + const u32 bit6 = (bits >> 6) & 1; + const u32 exp = ((!bit6) << 7) | (0x7C * bit6) | ((bits >> 4) & 3); + const u32 mantissa = (bits & 0xF) << 19; + const u32 f = (sign << 31) | (exp << 23) | mantissa; + + return Dynarmic::Common::BitCast(f); +} + +bool FPImm8FromFloat(float value, u8* imm_out) { + const u32 f = Dynarmic::Common::BitCast(value); + const u32 mantissa4 = (f & 0x7FFFFF) >> 19; + const u32 exponent = (f >> 23) & 0xFF; + const u32 sign = f >> 31; + + if ((exponent >> 7) == ((exponent >> 6) & 1)) + return false; + + const u8 imm8 = static_cast((sign << 7) | ((!(exponent >> 7)) << 6) | + ((exponent & 3) << 4) | mantissa4); + const float new_float = FPImm8ToFloat(imm8); + if (new_float == value) + *imm_out = imm8; + else + return false; + + return true; +} + +static constexpr bool IsInRangeImm19(s64 distance) { + return (distance >= -0x40000 && distance <= 0x3FFFF); +} + +static constexpr bool IsInRangeImm14(s64 distance) { + return (distance >= -0x2000 && distance <= 0x1FFF); +} + +static constexpr bool IsInRangeImm26(s64 distance) { + return (distance >= -0x2000000 && distance <= 0x1FFFFFF); +} + +static constexpr u32 MaskImm19(s64 distance) { + return distance & 0x7FFFF; +} + +static constexpr u32 MaskImm14(s64 distance) { + return distance & 0x3FFF; +} + +static constexpr u32 MaskImm26(s64 distance) { + return distance & 0x3FFFFFF; +} + +} // Anonymous namespace + +void ARM64XEmitter::SetCodePtrUnsafe(u8* ptr) { + m_code = ptr; +} + +void ARM64XEmitter::SetCodePtr(u8* ptr) { + SetCodePtrUnsafe(ptr); + m_lastCacheFlushEnd = ptr; +} + +const u8* ARM64XEmitter::GetCodePtr() const { + return m_code; +} + +u8* 
ARM64XEmitter::GetWritableCodePtr() { + return m_code; +} + +void ARM64XEmitter::ReserveCodeSpace(u32 bytes) { + for (u32 i = 0; i < bytes / 4; i++) + BRK(0); +} + +const u8* ARM64XEmitter::AlignCode16() { + int c = int((u64)m_code & 15); + if (c) + ReserveCodeSpace(16 - c); + return m_code; +} + +const u8* ARM64XEmitter::AlignCodePage() { + int c = int((u64)m_code & 4095); + if (c) + ReserveCodeSpace(4096 - c); + return m_code; +} + +void ARM64XEmitter::Write32(u32 value) { + std::memcpy(m_code, &value, sizeof(u32)); + m_code += sizeof(u32); +} + +void ARM64XEmitter::FlushIcache() { + FlushIcacheSection(m_lastCacheFlushEnd, m_code); + m_lastCacheFlushEnd = m_code; +} + +void ARM64XEmitter::FlushIcacheSection(const u8* start, const u8* end) { + if (start == end) + return; + +#if defined(__APPLE__) + // Header file says this is equivalent to: sys_icache_invalidate(start, end - + // start); + sys_cache_control(kCacheFunctionPrepareForExecution, const_cast(start), end - start); +#else + // Don't rely on GCC's __clear_cache implementation, as it caches + // icache/dcache cache line sizes, that can vary between cores on + // big.LITTLE architectures. + u64 addr, ctr_el0; + static size_t icache_line_size = 0xffff, dcache_line_size = 0xffff; + size_t isize, dsize; + + __asm__ volatile("mrs %0, ctr_el0" : "=r"(ctr_el0)); + isize = 4 << ((ctr_el0 >> 0) & 0xf); + dsize = 4 << ((ctr_el0 >> 16) & 0xf); + + // use the global minimum cache line size + icache_line_size = isize = icache_line_size < isize ? icache_line_size : isize; + dcache_line_size = dsize = dcache_line_size < dsize ? dcache_line_size : dsize; + + addr = reinterpret_cast(start) & ~static_cast(dsize - 1); + for (; addr < reinterpret_cast(end); addr += dsize) + // use "civac" instead of "cvau", as this is the suggested workaround for + // Cortex-A53 errata 819472, 826319, 827319 and 824069. 
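+    // Clean and invalidate each D-cache line covering [start, end) to the point of coherency.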
+ __asm__ volatile("dc civac, %0" : : "r"(addr) : "memory"); + __asm__ volatile("dsb ish" : : : "memory"); + + addr = reinterpret_cast(start) & ~static_cast(isize - 1); + for (; addr < reinterpret_cast(end); addr += isize) + __asm__ volatile("ic ivau, %0" : : "r"(addr) : "memory"); + + __asm__ volatile("dsb ish" : : : "memory"); + __asm__ volatile("isb" : : : "memory"); +#endif +} + +// Exception generation +static const u32 ExcEnc[][3] = { + {0, 0, 1}, // SVC + {0, 0, 2}, // HVC + {0, 0, 3}, // SMC + {1, 0, 0}, // BRK + {2, 0, 0}, // HLT + {5, 0, 1}, // DCPS1 + {5, 0, 2}, // DCPS2 + {5, 0, 3}, // DCPS3 +}; + +// Arithmetic generation +static const u32 ArithEnc[] = { + 0x058, // ADD + 0x258, // SUB +}; + +// Conditional Select +static const u32 CondSelectEnc[][2] = { + {0, 0}, // CSEL + {0, 1}, // CSINC + {1, 0}, // CSINV + {1, 1}, // CSNEG +}; + +// Data-Processing (1 source) +static const u32 Data1SrcEnc[][2] = { + {0, 0}, // RBIT + {0, 1}, // REV16 + {0, 2}, // REV32 + {0, 3}, // REV64 + {0, 4}, // CLZ + {0, 5}, // CLS +}; + +// Data-Processing (2 source) +static const u32 Data2SrcEnc[] = { + 0x02, // UDIV + 0x03, // SDIV + 0x08, // LSLV + 0x09, // LSRV + 0x0A, // ASRV + 0x0B, // RORV + 0x10, // CRC32B + 0x11, // CRC32H + 0x12, // CRC32W + 0x14, // CRC32CB + 0x15, // CRC32CH + 0x16, // CRC32CW + 0x13, // CRC32X (64bit Only) + 0x17, // XRC32CX (64bit Only) +}; + +// Data-Processing (3 source) +static const u32 Data3SrcEnc[][2] = { + {0, 0}, // MADD + {0, 1}, // MSUB + {1, 0}, // SMADDL (64Bit Only) + {1, 1}, // SMSUBL (64Bit Only) + {2, 0}, // SMULH (64Bit Only) + {5, 0}, // UMADDL (64Bit Only) + {5, 1}, // UMSUBL (64Bit Only) + {6, 0}, // UMULH (64Bit Only) +}; + +// Logical (shifted register) +static const u32 LogicalEnc[][2] = { + {0, 0}, // AND + {0, 1}, // BIC + {1, 0}, // OOR + {1, 1}, // ORN + {2, 0}, // EOR + {2, 1}, // EON + {3, 0}, // ANDS + {3, 1}, // BICS +}; + +// Load/Store Exclusive +static const u32 LoadStoreExcEnc[][5] = { + {0, 0, 0, 0, 0}, // STXRB + {0, 0, 0, 0, 1}, // STLXRB + {0, 0, 1, 0, 0}, // LDXRB + {0, 0, 1, 0, 1}, // LDAXRB + {0, 1, 0, 0, 1}, // STLRB + {0, 1, 1, 0, 1}, // LDARB + {1, 0, 0, 0, 0}, // STXRH + {1, 0, 0, 0, 1}, // STLXRH + {1, 0, 1, 0, 0}, // LDXRH + {1, 0, 1, 0, 1}, // LDAXRH + {1, 1, 0, 0, 1}, // STLRH + {1, 1, 1, 0, 1}, // LDARH + {2, 0, 0, 0, 0}, // STXR + {3, 0, 0, 0, 0}, // (64bit) STXR + {2, 0, 0, 0, 1}, // STLXR + {3, 0, 0, 0, 1}, // (64bit) STLXR + {2, 0, 0, 1, 0}, // STXP + {3, 0, 0, 1, 0}, // (64bit) STXP + {2, 0, 0, 1, 1}, // STLXP + {3, 0, 0, 1, 1}, // (64bit) STLXP + {2, 0, 1, 0, 0}, // LDXR + {3, 0, 1, 0, 0}, // (64bit) LDXR + {2, 0, 1, 0, 1}, // LDAXR + {3, 0, 1, 0, 1}, // (64bit) LDAXR + {2, 0, 1, 1, 0}, // LDXP + {3, 0, 1, 1, 0}, // (64bit) LDXP + {2, 0, 1, 1, 1}, // LDAXP + {3, 0, 1, 1, 1}, // (64bit) LDAXP + {2, 1, 0, 0, 1}, // STLR + {3, 1, 0, 0, 1}, // (64bit) STLR + {2, 1, 1, 0, 1}, // LDAR + {3, 1, 1, 0, 1}, // (64bit) LDAR +}; + +void ARM64XEmitter::EncodeCompareBranchInst(u32 op, ARM64Reg Rt, const void* ptr) { + bool b64Bit = Is64Bit(Rt); + s64 distance = reinterpret_cast(ptr) - reinterpret_cast(m_code); + + ASSERT_MSG(!(distance & 0x3), "%s: distance must be a multiple of 4: %" PRIx64, __func__, + distance); + + distance >>= 2; + + ASSERT_MSG(distance >= -0x40000 && distance <= 0x3FFFF, + "%s: Received too large distance: %" PRIx64, __func__, distance); + + Rt = DecodeReg(Rt); + Write32((b64Bit << 31) | (0x34 << 24) | (op << 24) | + ((static_cast(distance) << 5) & 0xFFFFE0) | Rt); +} + +void 
ARM64XEmitter::EncodeTestBranchInst(u32 op, ARM64Reg Rt, u8 bits, const void* ptr) { + bool b64Bit = Is64Bit(Rt); + s64 distance = reinterpret_cast(ptr) - reinterpret_cast(m_code); + + ASSERT_MSG(!(distance & 0x3), "%s: distance must be a multiple of 4: %" PRIx64, __func__, + distance); + + distance >>= 2; + + ASSERT_MSG(distance >= -0x3FFF && distance < 0x3FFF, + "%s: Received too large distance: %" PRIx64, __func__, distance); + + Rt = DecodeReg(Rt); + Write32((b64Bit << 31) | (0x36 << 24) | (op << 24) | (bits << 19) | + ((static_cast(distance) << 5) & 0x7FFE0) | Rt); +} + +void ARM64XEmitter::EncodeUnconditionalBranchInst(u32 op, const void* ptr) { + s64 distance = reinterpret_cast(ptr) - reinterpret_cast(m_code); + + ASSERT_MSG(!(distance & 0x3), "%s: distance must be a multiple of 4: %" PRIx64, __func__, + distance); + + distance >>= 2; + + ASSERT_MSG(distance >= -0x2000000LL && distance <= 0x1FFFFFFLL, + "%s: Received too large distance: %" PRIx64, __func__, distance); + + Write32((op << 31) | (0x5 << 26) | (distance & 0x3FFFFFF)); +} + +void ARM64XEmitter::EncodeUnconditionalBranchInst(u32 opc, u32 op2, u32 op3, u32 op4, ARM64Reg Rn) { + Rn = DecodeReg(Rn); + Write32((0x6B << 25) | (opc << 21) | (op2 << 16) | (op3 << 10) | (Rn << 5) | op4); +} + +void ARM64XEmitter::EncodeExceptionInst(u32 instenc, u32 imm) { + ASSERT_MSG(!(imm & ~0xFFFF), "%s: Exception instruction too large immediate: %d", __func__, + imm); + + Write32((0xD4 << 24) | (ExcEnc[instenc][0] << 21) | (imm << 5) | (ExcEnc[instenc][1] << 2) | + ExcEnc[instenc][2]); +} + +void ARM64XEmitter::EncodeSystemInst(u32 op0, u32 op1, u32 CRn, u32 CRm, u32 op2, ARM64Reg Rt) { + Write32((0x354 << 22) | (op0 << 19) | (op1 << 16) | (CRn << 12) | (CRm << 8) | (op2 << 5) | Rt); +} + +void ARM64XEmitter::EncodeArithmeticInst(u32 instenc, bool flags, ARM64Reg Rd, ARM64Reg Rn, + ARM64Reg Rm, ArithOption Option) { + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + Write32((b64Bit << 31) | (flags << 29) | (ArithEnc[instenc] << 21) | + (Option.GetType() == ArithOption::TYPE_EXTENDEDREG ? 
(1 << 21) : 0) | (Rm << 16) | + Option.GetData() | (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeArithmeticCarryInst(u32 op, bool flags, ARM64Reg Rd, ARM64Reg Rn, + ARM64Reg Rm) { + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rm = DecodeReg(Rm); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (op << 30) | (flags << 29) | (0xD0 << 21) | (Rm << 16) | (Rn << 5) | + Rd); +} + +void ARM64XEmitter::EncodeCondCompareImmInst(u32 op, ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond) { + bool b64Bit = Is64Bit(Rn); + + ASSERT_MSG(!(imm & ~0x1F), "%s: too large immediate: %d", __func__, imm); + ASSERT_MSG(!(nzcv & ~0xF), "%s: Flags out of range: %d", __func__, nzcv); + + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (op << 30) | (1 << 29) | (0xD2 << 21) | (imm << 16) | (cond << 12) | + (1 << 11) | (Rn << 5) | nzcv); +} + +void ARM64XEmitter::EncodeCondCompareRegInst(u32 op, ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, + CCFlags cond) { + bool b64Bit = Is64Bit(Rm); + + ASSERT_MSG(!(nzcv & ~0xF), "%s: Flags out of range: %d", __func__, nzcv); + + Rm = DecodeReg(Rm); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (op << 30) | (1 << 29) | (0xD2 << 21) | (Rm << 16) | (cond << 12) | + (Rn << 5) | nzcv); +} + +void ARM64XEmitter::EncodeCondSelectInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, + CCFlags cond) { + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rm = DecodeReg(Rm); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (CondSelectEnc[instenc][0] << 30) | (0xD4 << 21) | (Rm << 16) | + (cond << 12) | (CondSelectEnc[instenc][1] << 10) | (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeData1SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn) { + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (0x2D6 << 21) | (Data1SrcEnc[instenc][0] << 16) | + (Data1SrcEnc[instenc][1] << 10) | (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeData2SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rm = DecodeReg(Rm); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (0x0D6 << 21) | (Rm << 16) | (Data2SrcEnc[instenc] << 10) | (Rn << 5) | + Rd); +} + +void ARM64XEmitter::EncodeData3SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, + ARM64Reg Ra) { + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rm = DecodeReg(Rm); + Rn = DecodeReg(Rn); + Ra = DecodeReg(Ra); + Write32((b64Bit << 31) | (0xD8 << 21) | (Data3SrcEnc[instenc][0] << 21) | (Rm << 16) | + (Data3SrcEnc[instenc][1] << 15) | (Ra << 10) | (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeLogicalInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, + ArithOption Shift) { + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rm = DecodeReg(Rm); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (LogicalEnc[instenc][0] << 29) | (0x5 << 25) | + (LogicalEnc[instenc][1] << 21) | Shift.GetData() | (Rm << 16) | (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeLoadRegisterInst(u32 bitop, ARM64Reg Rt, s32 imm) { + bool b64Bit = Is64Bit(Rt); + bool bVec = IsVector(Rt); + + ASSERT_MSG(IsInRangeImm19(imm), "{}: offset too large {}", __func__, imm); + + Rt = DecodeReg(Rt); + if (b64Bit && bitop != 0x2) // LDRSW(0x2) uses 64bit reg, doesn't have 64bit bit set + bitop |= 0x1; + Write32((bitop << 30) | (bVec << 26) | (0x18 << 24) | (MaskImm19(imm) << 5) | Rt); +} + +void ARM64XEmitter::EncodeLoadStoreExcInst(u32 instenc, ARM64Reg Rs, ARM64Reg Rt2, ARM64Reg Rn, + ARM64Reg Rt) { + Rs = DecodeReg(Rs); + Rt2 = DecodeReg(Rt2); + Rn = 
DecodeReg(Rn); + Rt = DecodeReg(Rt); + Write32((LoadStoreExcEnc[instenc][0] << 30) | (0x8 << 24) | + (LoadStoreExcEnc[instenc][1] << 23) | (LoadStoreExcEnc[instenc][2] << 22) | + (LoadStoreExcEnc[instenc][3] << 21) | (Rs << 16) | (LoadStoreExcEnc[instenc][4] << 15) | + (Rt2 << 10) | (Rn << 5) | Rt); +} + +void ARM64XEmitter::EncodeLoadStorePairedInst(u32 op, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, + u32 imm) { + bool b64Bit = Is64Bit(Rt); + bool b128Bit = IsQuad(Rt); + bool bVec = IsVector(Rt); + + if (b128Bit) + imm >>= 4; + else if (b64Bit) + imm >>= 3; + else + imm >>= 2; + + ASSERT_MSG(!(imm & ~0xF), "%s: offset too large %d", __func__, imm); + + u32 opc = 0; + if (b128Bit) + opc = 2; + else if (b64Bit && bVec) + opc = 1; + else if (b64Bit && !bVec) + opc = 2; + + Rt = DecodeReg(Rt); + Rt2 = DecodeReg(Rt2); + Rn = DecodeReg(Rn); + Write32((opc << 30) | (bVec << 26) | (op << 22) | (imm << 15) | (Rt2 << 10) | (Rn << 5) | Rt); +} + +void ARM64XEmitter::EncodeLoadStoreIndexedInst(u32 op, u32 op2, ARM64Reg Rt, ARM64Reg Rn, s32 imm) { + bool b64Bit = Is64Bit(Rt); + bool bVec = IsVector(Rt); + + u32 offset = imm & 0x1FF; + + ASSERT_MSG(!(imm < -256 || imm > 255), "%s: offset too large %d", __func__, imm); + + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + Write32((b64Bit << 30) | (op << 22) | (bVec << 26) | (offset << 12) | (op2 << 10) | (Rn << 5) | + Rt); +} + +void ARM64XEmitter::EncodeLoadStoreIndexedInst(u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm, u8 size) { + bool b64Bit = Is64Bit(Rt); + bool bVec = IsVector(Rt); + + if (size == 64) + imm >>= 3; + else if (size == 32) + imm >>= 2; + else if (size == 16) + imm >>= 1; + + ASSERT_MSG(imm >= 0, "%s(INDEX_UNSIGNED): offset must be positive %d", __func__, imm); + ASSERT_MSG(!(imm & ~0xFFF), "%s(INDEX_UNSIGNED): offset too large %d", __func__, imm); + + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + Write32((b64Bit << 30) | (op << 22) | (bVec << 26) | (imm << 10) | (Rn << 5) | Rt); +} + +void ARM64XEmitter::EncodeMOVWideInst(u32 op, ARM64Reg Rd, u32 imm, ShiftAmount pos) { + bool b64Bit = Is64Bit(Rd); + + ASSERT_MSG(!(imm & ~0xFFFF), "%s: immediate out of range: %d", __func__, imm); + + Rd = DecodeReg(Rd); + Write32((b64Bit << 31) | (op << 29) | (0x25 << 23) | (pos << 21) | (imm << 5) | Rd); +} + +void ARM64XEmitter::EncodeBitfieldMOVInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms) { + bool b64Bit = Is64Bit(Rd); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (op << 29) | (0x26 << 23) | (b64Bit << 22) | (immr << 16) | + (imms << 10) | (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeLoadStoreRegisterOffset(u32 size, u32 opc, ARM64Reg Rt, ARM64Reg Rn, + ArithOption Rm) { + ASSERT_MSG(Rm.GetType() == ArithOption::TYPE_EXTENDEDREG, "Shifted registers are not supported used Indexed registers"); + + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + ARM64Reg decoded_Rm = DecodeReg(Rm.GetReg()); + + Write32((size << 30) | (opc << 22) | (0x1C1 << 21) | (decoded_Rm << 16) | Rm.GetData() | + (1 << 11) | (Rn << 5) | Rt); +} + +void ARM64XEmitter::EncodeAddSubImmInst(u32 op, bool flags, u32 shift, u32 imm, ARM64Reg Rn, + ARM64Reg Rd) { + bool b64Bit = Is64Bit(Rd); + + ASSERT_MSG(!(imm & ~0xFFF), "%s: immediate too large: %x", __func__, imm); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Write32((b64Bit << 31) | (op << 30) | (flags << 29) | (0x11 << 24) | (shift << 22) | + (imm << 10) | (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeLogicalImmInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, + int n) { + // Sometimes Rd is 
fixed to SP, but can still be 32bit or 64bit. + // Use Rn to determine bitness here. + bool b64Bit = Is64Bit(Rn); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((b64Bit << 31) | (op << 29) | (0x24 << 23) | (n << 22) | (immr << 16) | (imms << 10) | + (Rn << 5) | Rd); +} + +void ARM64XEmitter::EncodeLoadStorePair(u32 op, u32 load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, + ARM64Reg Rn, s32 imm) { + bool b64Bit = Is64Bit(Rt); + u32 type_encode = 0; + + switch (type) { + case INDEX_SIGNED: + type_encode = 0b010; + break; + case INDEX_POST: + type_encode = 0b001; + break; + case INDEX_PRE: + type_encode = 0b011; + break; + case INDEX_UNSIGNED: + ASSERT_MSG(false, "%s doesn't support INDEX_UNSIGNED!", __func__); + break; + } + + if (b64Bit) { + op |= 0b10; + imm >>= 3; + } else { + imm >>= 2; + } + + Rt = DecodeReg(Rt); + Rt2 = DecodeReg(Rt2); + Rn = DecodeReg(Rn); + + Write32((op << 30) | (0b101 << 27) | (type_encode << 23) | (load << 22) | ((imm & 0x7F) << 15) | + (Rt2 << 10) | (Rn << 5) | Rt); +} +void ARM64XEmitter::EncodeAddressInst(u32 op, ARM64Reg Rd, s32 imm) { + Rd = DecodeReg(Rd); + + Write32((op << 31) | ((imm & 0x3) << 29) | (0x10 << 24) | ((imm & 0x1FFFFC) << 3) | Rd); +} + +void ARM64XEmitter::EncodeLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm) { + ASSERT_MSG(!(imm < -256 || imm > 255), "%s received too large offset: %d", __func__, imm); + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + + Write32((size << 30) | (0b111 << 27) | (op << 22) | ((imm & 0x1FF) << 12) | (Rn << 5) | Rt); +} + +// FixupBranch branching +void ARM64XEmitter::SetJumpTarget(FixupBranch const& branch, u8* target) { + if(!target) + target = m_code; + bool Not = false; + u32 inst = 0; + s64 distance = static_cast(target - branch.ptr); + distance >>= 2; + + switch (branch.type) { + case 1: // CBNZ + Not = true; + [[fallthrough]]; + case 0: // CBZ + { + ASSERT_MSG(IsInRangeImm19(distance), "%s(%d): Received too large distance: %" PRIx64, + __func__, branch.type, distance); + bool b64Bit = Is64Bit(branch.reg); + ARM64Reg reg = DecodeReg(branch.reg); + inst = (b64Bit << 31) | (0x1A << 25) | (Not << 24) | (MaskImm19(distance) << 5) | reg; + } break; + case 2: // B (conditional) + ASSERT_MSG(IsInRangeImm19(distance), "%s(%d): Received too large distance: %" PRIx64, + __func__, branch.type, distance); + inst = (0x2A << 25) | (MaskImm19(distance) << 5) | branch.cond; + break; + case 4: // TBNZ + Not = true; + [[fallthrough]]; + case 3: // TBZ + { + ASSERT_MSG(IsInRangeImm14(distance), "%s(%d): Received too large distance: %" PRIx64, + __func__, branch.type, distance); + ARM64Reg reg = DecodeReg(branch.reg); + inst = ((branch.bit & 0x20) << 26) | (0x1B << 25) | (Not << 24) | + ((branch.bit & 0x1F) << 19) | (MaskImm14(distance) << 5) | reg; + } break; + case 5: // B (unconditional) + ASSERT_MSG(IsInRangeImm26(distance), "%s(%d): Received too large distance: %" PRIx64, + __func__, branch.type, distance); + inst = (0x5 << 26) | MaskImm26(distance); + break; + case 6: // BL (unconditional) + ASSERT_MSG(IsInRangeImm26(distance), "%s(%d): Received too large distance: %" PRIx64, + __func__, branch.type, distance); + inst = (0x25 << 26) | MaskImm26(distance); + break; + } + std::memcpy(branch.ptr, &inst, sizeof(inst)); +} + +FixupBranch ARM64XEmitter::CBZ(ARM64Reg Rt) { + FixupBranch branch; + branch.ptr = m_code; + branch.type = 0; + branch.reg = Rt; + HINT(HINT_NOP); + return branch; +} +FixupBranch ARM64XEmitter::CBNZ(ARM64Reg Rt) { + FixupBranch branch; + branch.ptr = m_code; + branch.type = 
1; + branch.reg = Rt; + HINT(HINT_NOP); + return branch; +} +FixupBranch ARM64XEmitter::B(CCFlags cond) { + FixupBranch branch; + branch.ptr = m_code; + branch.type = 2; + branch.cond = cond; + HINT(HINT_NOP); + return branch; +} +FixupBranch ARM64XEmitter::TBZ(ARM64Reg Rt, u8 bit) { + FixupBranch branch; + branch.ptr = m_code; + branch.type = 3; + branch.reg = Rt; + branch.bit = bit; + HINT(HINT_NOP); + return branch; +} +FixupBranch ARM64XEmitter::TBNZ(ARM64Reg Rt, u8 bit) { + FixupBranch branch; + branch.ptr = m_code; + branch.type = 4; + branch.reg = Rt; + branch.bit = bit; + HINT(HINT_NOP); + return branch; +} +FixupBranch ARM64XEmitter::B() { + FixupBranch branch; + branch.ptr = m_code; + branch.type = 5; + HINT(HINT_NOP); + return branch; +} +FixupBranch ARM64XEmitter::BL() { + FixupBranch branch; + branch.ptr = m_code; + branch.type = 6; + HINT(HINT_NOP); + return branch; +} + +// Compare and Branch +void ARM64XEmitter::CBZ(ARM64Reg Rt, const void* ptr) { + EncodeCompareBranchInst(0, Rt, ptr); +} +void ARM64XEmitter::CBNZ(ARM64Reg Rt, const void* ptr) { + EncodeCompareBranchInst(1, Rt, ptr); +} + +// Conditional Branch +void ARM64XEmitter::B(CCFlags cond, const void* ptr) { + s64 distance = reinterpret_cast(ptr) - reinterpret_cast(m_code); + + distance >>= 2; + + ASSERT_MSG(IsInRangeImm19(distance), + "%s: Received too large distance: %p->%p %" PRIi64 " %" PRIx64, __func__, m_code, + ptr, distance, distance); + Write32((0x54 << 24) | (MaskImm19(distance) << 5) | cond); +} + +// Test and Branch +void ARM64XEmitter::TBZ(ARM64Reg Rt, u8 bits, const void* ptr) { + EncodeTestBranchInst(0, Rt, bits, ptr); +} +void ARM64XEmitter::TBNZ(ARM64Reg Rt, u8 bits, const void* ptr) { + EncodeTestBranchInst(1, Rt, bits, ptr); +} + +// Unconditional Branch +void ARM64XEmitter::B(const void* ptr) { + EncodeUnconditionalBranchInst(0, ptr); +} +void ARM64XEmitter::BL(const void* ptr) { + EncodeUnconditionalBranchInst(1, ptr); +} + +void ARM64XEmitter::QuickCallFunction(const void* func, ARM64Reg scratchreg) { + s64 distance = reinterpret_cast(func) - reinterpret_cast(m_code); + distance >>= 2; // Can only branch to opcode-aligned (4) addresses + if (!IsInRangeImm26(distance)) { + // WARN_LOG( "Distance too far in function call (%p to %p)! 
Using scratch.", + // m_code, func); + MOVI2R(scratchreg, reinterpret_cast(func)); + BLR(scratchreg); + } else { + BL(func); + } +} + +// Unconditional Branch (register) +void ARM64XEmitter::BR(ARM64Reg Rn) { + EncodeUnconditionalBranchInst(0, 0x1F, 0, 0, Rn); +} +void ARM64XEmitter::BLR(ARM64Reg Rn) { + EncodeUnconditionalBranchInst(1, 0x1F, 0, 0, Rn); +} +void ARM64XEmitter::RET(ARM64Reg Rn) { + EncodeUnconditionalBranchInst(2, 0x1F, 0, 0, Rn); +} +void ARM64XEmitter::ERET() { + EncodeUnconditionalBranchInst(4, 0x1F, 0, 0, SP); +} +void ARM64XEmitter::DRPS() { + EncodeUnconditionalBranchInst(5, 0x1F, 0, 0, SP); +} + +// Exception generation +void ARM64XEmitter::SVC(u32 imm) { + EncodeExceptionInst(0, imm); +} + +void ARM64XEmitter::HVC(u32 imm) { + EncodeExceptionInst(1, imm); +} + +void ARM64XEmitter::SMC(u32 imm) { + EncodeExceptionInst(2, imm); +} + +void ARM64XEmitter::BRK(u32 imm) { + EncodeExceptionInst(3, imm); +} + +void ARM64XEmitter::HLT(u32 imm) { + EncodeExceptionInst(4, imm); +} + +void ARM64XEmitter::DCPS1(u32 imm) { + EncodeExceptionInst(5, imm); +} + +void ARM64XEmitter::DCPS2(u32 imm) { + EncodeExceptionInst(6, imm); +} + +void ARM64XEmitter::DCPS3(u32 imm) { + EncodeExceptionInst(7, imm); +} + +// System +void ARM64XEmitter::_MSR(PStateField field, u8 imm) { + u32 op1 = 0, op2 = 0; + switch (field) { + case FIELD_SPSel: + op1 = 0; + op2 = 5; + break; + case FIELD_DAIFSet: + op1 = 3; + op2 = 6; + break; + case FIELD_DAIFClr: + op1 = 3; + op2 = 7; + break; + default: + ASSERT_MSG(false, "Invalid PStateField to do a imm move to"); + break; + } + EncodeSystemInst(0, op1, 4, imm, op2, WSP); +} + +static void GetSystemReg(PStateField field, int& o0, int& op1, int& CRn, int& CRm, int& op2) { + switch (field) { + case FIELD_NZCV: + o0 = 3; + op1 = 3; + CRn = 4; + CRm = 2; + op2 = 0; + break; + case FIELD_FPCR: + o0 = 3; + op1 = 3; + CRn = 4; + CRm = 4; + op2 = 0; + break; + case FIELD_FPSR: + o0 = 3; + op1 = 3; + CRn = 4; + CRm = 4; + op2 = 1; + break; + case FIELD_PMCR_EL0: + o0 = 3; + op1 = 3; + CRn = 9; + CRm = 6; + op2 = 0; + break; + case FIELD_PMCCNTR_EL0: + o0 = 3; + op1 = 3; + CRn = 9; + CRm = 7; + op2 = 0; + break; + default: + ASSERT_MSG(false, "Invalid PStateField to do a register move from/to"); + break; + } +} + +void ARM64XEmitter::_MSR(PStateField field, ARM64Reg Rt) { + int o0 = 0, op1 = 0, CRn = 0, CRm = 0, op2 = 0; + ASSERT_MSG(Is64Bit(Rt), "MSR: Rt must be 64-bit"); + GetSystemReg(field, o0, op1, CRn, CRm, op2); + EncodeSystemInst(o0, op1, CRn, CRm, op2, DecodeReg(Rt)); +} + +void ARM64XEmitter::MRS(ARM64Reg Rt, PStateField field) { + int o0 = 0, op1 = 0, CRn = 0, CRm = 0, op2 = 0; + ASSERT_MSG(Is64Bit(Rt), "MRS: Rt must be 64-bit"); + GetSystemReg(field, o0, op1, CRn, CRm, op2); + EncodeSystemInst(o0 | 4, op1, CRn, CRm, op2, DecodeReg(Rt)); +} + +void ARM64XEmitter::CNTVCT(ARM64Reg Rt) { + ASSERT_MSG(Is64Bit(Rt), "CNTVCT: Rt must be 64-bit"); + + // MRS , CNTVCT_EL0 ; Read CNTVCT_EL0 into Xt + EncodeSystemInst(3 | 4, 3, 0xe, 0, 2, DecodeReg(Rt)); +} + +void ARM64XEmitter::HINT(SystemHint op) { + EncodeSystemInst(0, 3, 2, 0, op, WSP); +} +void ARM64XEmitter::CLREX() { + EncodeSystemInst(0, 3, 3, 0, 2, WSP); +} +void ARM64XEmitter::DSB(BarrierType type) { + EncodeSystemInst(0, 3, 3, type, 4, WSP); +} +void ARM64XEmitter::DMB(BarrierType type) { + EncodeSystemInst(0, 3, 3, type, 5, WSP); +} +void ARM64XEmitter::ISB(BarrierType type) { + EncodeSystemInst(0, 3, 3, type, 6, WSP); +} + +// Add/Subtract (extended register) +void ARM64XEmitter::ADD(ARM64Reg Rd, 
ARM64Reg Rn, ARM64Reg Rm) { + ADD(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); +} + +void ARM64XEmitter::ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option) { + EncodeArithmeticInst(0, false, Rd, Rn, Rm, Option); +} + +void ARM64XEmitter::ADDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EncodeArithmeticInst(0, true, Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); +} + +void ARM64XEmitter::ADDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option) { + EncodeArithmeticInst(0, true, Rd, Rn, Rm, Option); +} + +void ARM64XEmitter::SUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + SUB(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); +} + +void ARM64XEmitter::SUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option) { + EncodeArithmeticInst(1, false, Rd, Rn, Rm, Option); +} + +void ARM64XEmitter::SUBS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EncodeArithmeticInst(1, true, Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); +} + +void ARM64XEmitter::SUBS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option) { + EncodeArithmeticInst(1, true, Rd, Rn, Rm, Option); +} + +void ARM64XEmitter::CMN(ARM64Reg Rn, ARM64Reg Rm) { + CMN(Rn, Rm, ArithOption(Rn, ST_LSL, 0)); +} + +void ARM64XEmitter::CMN(ARM64Reg Rn, ARM64Reg Rm, ArithOption Option) { + EncodeArithmeticInst(0, true, Is64Bit(Rn) ? ZR : WZR, Rn, Rm, Option); +} + +void ARM64XEmitter::CMP(ARM64Reg Rn, ARM64Reg Rm) { + CMP(Rn, Rm, ArithOption(Rn, ST_LSL, 0)); +} + +void ARM64XEmitter::CMP(ARM64Reg Rn, ARM64Reg Rm, ArithOption Option) { + EncodeArithmeticInst(1, true, Is64Bit(Rn) ? ZR : WZR, Rn, Rm, Option); +} + +// Add/Subtract (with carry) +void ARM64XEmitter::ADC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EncodeArithmeticCarryInst(0, false, Rd, Rn, Rm); +} +void ARM64XEmitter::ADCS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EncodeArithmeticCarryInst(0, true, Rd, Rn, Rm); +} +void ARM64XEmitter::SBC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EncodeArithmeticCarryInst(1, false, Rd, Rn, Rm); +} +void ARM64XEmitter::SBCS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EncodeArithmeticCarryInst(1, true, Rd, Rn, Rm); +} + +// Conditional Compare (immediate) +void ARM64XEmitter::CCMN(ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond) { + EncodeCondCompareImmInst(0, Rn, imm, nzcv, cond); +} +void ARM64XEmitter::CCMP(ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond) { + EncodeCondCompareImmInst(1, Rn, imm, nzcv, cond); +} + +// Conditiona Compare (register) +void ARM64XEmitter::CCMN(ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond) { + EncodeCondCompareRegInst(0, Rn, Rm, nzcv, cond); +} +void ARM64XEmitter::CCMP(ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond) { + EncodeCondCompareRegInst(1, Rn, Rm, nzcv, cond); +} + +// Conditional Select +void ARM64XEmitter::CSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond) { + EncodeCondSelectInst(0, Rd, Rn, Rm, cond); +} +void ARM64XEmitter::CSINC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond) { + EncodeCondSelectInst(1, Rd, Rn, Rm, cond); +} +void ARM64XEmitter::CSINV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond) { + EncodeCondSelectInst(2, Rd, Rn, Rm, cond); +} +void ARM64XEmitter::CSNEG(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond) { + EncodeCondSelectInst(3, Rd, Rn, Rm, cond); +} + +// Data-Processing 1 source +void ARM64XEmitter::RBIT(ARM64Reg Rd, ARM64Reg Rn) { + EncodeData1SrcInst(0, Rd, Rn); +} +void ARM64XEmitter::REV16(ARM64Reg Rd, ARM64Reg Rn) { + EncodeData1SrcInst(1, Rd, Rn); +} +void ARM64XEmitter::REV32(ARM64Reg Rd, ARM64Reg Rn) { + EncodeData1SrcInst(2, Rd, Rn); +} +void 
ARM64XEmitter::REV64(ARM64Reg Rd, ARM64Reg Rn) { + EncodeData1SrcInst(3, Rd, Rn); +} +void ARM64XEmitter::CLZ(ARM64Reg Rd, ARM64Reg Rn) { + EncodeData1SrcInst(4, Rd, Rn); +} +void ARM64XEmitter::CLS(ARM64Reg Rd, ARM64Reg Rn) { + EncodeData1SrcInst(5, Rd, Rn); +} + +// Data-Processing 2 source +void ARM64XEmitter::UDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EncodeData2SrcInst(0, Rd, Rn, Rm); +} +void ARM64XEmitter::SDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EncodeData2SrcInst(1, Rd, Rn, Rm); +} +void ARM64XEmitter::LSLV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EncodeData2SrcInst(2, Rd, Rn, Rm); +} +void ARM64XEmitter::LSRV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EncodeData2SrcInst(3, Rd, Rn, Rm); +} +void ARM64XEmitter::ASRV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EncodeData2SrcInst(4, Rd, Rn, Rm); +} +void ARM64XEmitter::RORV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EncodeData2SrcInst(5, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32B(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EncodeData2SrcInst(6, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32H(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EncodeData2SrcInst(7, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32W(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EncodeData2SrcInst(8, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32CB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EncodeData2SrcInst(9, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32CH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EncodeData2SrcInst(10, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32CW(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EncodeData2SrcInst(11, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32X(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EncodeData2SrcInst(12, Rd, Rn, Rm); +} +void ARM64XEmitter::CRC32CX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EncodeData2SrcInst(13, Rd, Rn, Rm); +} + +// Data-Processing 3 source +void ARM64XEmitter::MADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) { + EncodeData3SrcInst(0, Rd, Rn, Rm, Ra); +} +void ARM64XEmitter::MSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) { + EncodeData3SrcInst(1, Rd, Rn, Rm, Ra); +} +void ARM64XEmitter::SMADDL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) { + EncodeData3SrcInst(2, Rd, Rn, Rm, Ra); +} +void ARM64XEmitter::SMULL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + SMADDL(Rd, Rn, Rm, SP); +} +void ARM64XEmitter::SMSUBL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) { + EncodeData3SrcInst(3, Rd, Rn, Rm, Ra); +} +void ARM64XEmitter::SMULH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EncodeData3SrcInst(4, Rd, Rn, Rm, SP); +} +void ARM64XEmitter::UMADDL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) { + EncodeData3SrcInst(5, Rd, Rn, Rm, Ra); +} +void ARM64XEmitter::UMULL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + UMADDL(Rd, Rn, Rm, SP); +} +void ARM64XEmitter::UMSUBL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) { + EncodeData3SrcInst(6, Rd, Rn, Rm, Ra); +} +void ARM64XEmitter::UMULH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EncodeData3SrcInst(7, Rd, Rn, Rm, SP); +} +void ARM64XEmitter::MUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EncodeData3SrcInst(0, Rd, Rn, Rm, SP); +} +void ARM64XEmitter::MNEG(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EncodeData3SrcInst(1, Rd, Rn, Rm, SP); +} + +// Logical (shifted register) +void ARM64XEmitter::AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) { + EncodeLogicalInst(0, Rd, Rn, Rm, Shift); +} +void ARM64XEmitter::BIC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) { + EncodeLogicalInst(1, Rd, Rn, Rm, Shift); +} +void 
ARM64XEmitter::ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) { + EncodeLogicalInst(2, Rd, Rn, Rm, Shift); +} +void ARM64XEmitter::ORN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) { + EncodeLogicalInst(3, Rd, Rn, Rm, Shift); +} +void ARM64XEmitter::EOR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) { + EncodeLogicalInst(4, Rd, Rn, Rm, Shift); +} +void ARM64XEmitter::EON(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) { + EncodeLogicalInst(5, Rd, Rn, Rm, Shift); +} +void ARM64XEmitter::ANDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) { + EncodeLogicalInst(6, Rd, Rn, Rm, Shift); +} +void ARM64XEmitter::BICS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift) { + EncodeLogicalInst(7, Rd, Rn, Rm, Shift); +} + +void ARM64XEmitter::MOV(ARM64Reg Rd, ARM64Reg Rm, ArithOption Shift) { + ORR(Rd, Is64Bit(Rd) ? ZR : WZR, Rm, Shift); +} + +void ARM64XEmitter::MOV(ARM64Reg Rd, ARM64Reg Rm) { + if (IsGPR(Rd) && IsGPR(Rm)) + ORR(Rd, Is64Bit(Rd) ? ZR : WZR, Rm, ArithOption(Rm, ST_LSL, 0)); + else + ASSERT_MSG(false, "Non-GPRs not supported in MOV"); +} +void ARM64XEmitter::MVN(ARM64Reg Rd, ARM64Reg Rm) { + ORN(Rd, Is64Bit(Rd) ? ZR : WZR, Rm, ArithOption(Rm, ST_LSL, 0)); +} +void ARM64XEmitter::LSL(ARM64Reg Rd, ARM64Reg Rm, int shift) { + int bits = Is64Bit(Rd) ? 64 : 32; + UBFM(Rd, Rm, (bits - shift) & (bits - 1), bits - shift - 1); +} +void ARM64XEmitter::LSR(ARM64Reg Rd, ARM64Reg Rm, int shift) { + int bits = Is64Bit(Rd) ? 64 : 32; + UBFM(Rd, Rm, shift, bits - 1); +} +void ARM64XEmitter::ASR(ARM64Reg Rd, ARM64Reg Rm, int shift) { + int bits = Is64Bit(Rd) ? 64 : 32; + SBFM(Rd, Rm, shift, bits - 1); +} +void ARM64XEmitter::ROR(ARM64Reg Rd, ARM64Reg Rm, int shift) { + EXTR(Rd, Rm, Rm, shift); +} + +// Logical (immediate) +void ARM64XEmitter::AND(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert) { + EncodeLogicalImmInst(0, Rd, Rn, immr, imms, invert); +} +void ARM64XEmitter::ANDS(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert) { + EncodeLogicalImmInst(3, Rd, Rn, immr, imms, invert); +} +void ARM64XEmitter::EOR(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert) { + EncodeLogicalImmInst(2, Rd, Rn, immr, imms, invert); +} +void ARM64XEmitter::ORR(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert) { + EncodeLogicalImmInst(1, Rd, Rn, immr, imms, invert); +} +void ARM64XEmitter::TST(ARM64Reg Rn, u32 immr, u32 imms, bool invert) { + EncodeLogicalImmInst(3, Is64Bit(Rn) ? ZR : WZR, Rn, immr, imms, invert); +} + +// Add/subtract (immediate) +void ARM64XEmitter::ADD(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift) { + EncodeAddSubImmInst(0, false, shift, imm, Rn, Rd); +} +void ARM64XEmitter::ADDS(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift) { + EncodeAddSubImmInst(0, true, shift, imm, Rn, Rd); +} +void ARM64XEmitter::SUB(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift) { + EncodeAddSubImmInst(1, false, shift, imm, Rn, Rd); +} +void ARM64XEmitter::SUBS(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift) { + EncodeAddSubImmInst(1, true, shift, imm, Rn, Rd); +} +void ARM64XEmitter::CMP(ARM64Reg Rn, u32 imm, bool shift) { + EncodeAddSubImmInst(1, true, shift, imm, Rn, Is64Bit(Rn) ? 
ZR : WZR); +} + +// Data Processing (Immediate) +void ARM64XEmitter::MOVZ(ARM64Reg Rd, u32 imm, ShiftAmount pos) { + EncodeMOVWideInst(2, Rd, imm, pos); +} +void ARM64XEmitter::MOVN(ARM64Reg Rd, u32 imm, ShiftAmount pos) { + EncodeMOVWideInst(0, Rd, imm, pos); +} +void ARM64XEmitter::MOVK(ARM64Reg Rd, u32 imm, ShiftAmount pos) { + EncodeMOVWideInst(3, Rd, imm, pos); +} + +// Bitfield move +void ARM64XEmitter::BFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms) { + EncodeBitfieldMOVInst(1, Rd, Rn, immr, imms); +} +void ARM64XEmitter::SBFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms) { + EncodeBitfieldMOVInst(0, Rd, Rn, immr, imms); +} +void ARM64XEmitter::UBFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms) { + EncodeBitfieldMOVInst(2, Rd, Rn, immr, imms); +} + +void ARM64XEmitter::BFI(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width) { + u32 size = Is64Bit(Rn) ? 64 : 32; + ASSERT_MSG((lsb + width) <= size, + "%s passed lsb %d and width %d which is greater than the register size!", __func__, + lsb, width); + EncodeBitfieldMOVInst(1, Rd, Rn, (size - lsb) % size, width - 1); +} +void ARM64XEmitter::UBFIZ(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width) { + u32 size = Is64Bit(Rn) ? 64 : 32; + ASSERT_MSG((lsb + width) <= size, + "%s passed lsb %d and width %d which is greater than the register size!", __func__, + lsb, width); + EncodeBitfieldMOVInst(2, Rd, Rn, (size - lsb) % size, width - 1); +} +void ARM64XEmitter::EXTR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u32 shift) { + bool sf = Is64Bit(Rd); + bool N = sf; + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + Write32((sf << 31) | (0x27 << 23) | (N << 22) | (Rm << 16) | (shift << 10) | (Rm << 5) | Rd); +} +void ARM64XEmitter::SXTB(ARM64Reg Rd, ARM64Reg Rn) { + SBFM(Rd, Rn, 0, 7); +} +void ARM64XEmitter::SXTH(ARM64Reg Rd, ARM64Reg Rn) { + SBFM(Rd, Rn, 0, 15); +} +void ARM64XEmitter::SXTW(ARM64Reg Rd, ARM64Reg Rn) { + ASSERT_MSG(Is64Bit(Rd), "%s requires 64bit register as destination", __func__); + SBFM(Rd, Rn, 0, 31); +} +void ARM64XEmitter::UXTB(ARM64Reg Rd, ARM64Reg Rn) { + UBFM(Rd, Rn, 0, 7); +} +void ARM64XEmitter::UXTH(ARM64Reg Rd, ARM64Reg Rn) { + UBFM(Rd, Rn, 0, 15); +} + +// Load Register (Literal) +void ARM64XEmitter::LDR(ARM64Reg Rt, s32 imm) { + EncodeLoadRegisterInst(0, Rt, imm); +} +void ARM64XEmitter::LDRSW(ARM64Reg Rt, s32 imm) { + EncodeLoadRegisterInst(2, Rt, imm); +} +void ARM64XEmitter::PRFM(ARM64Reg Rt, s32 imm) { + EncodeLoadRegisterInst(3, Rt, imm); +} + +// Load/Store pair +void ARM64XEmitter::LDP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm) { + EncodeLoadStorePair(0, 1, type, Rt, Rt2, Rn, imm); +} +void ARM64XEmitter::LDPSW(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm) { + EncodeLoadStorePair(1, 1, type, Rt, Rt2, Rn, imm); +} +void ARM64XEmitter::STP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm) { + EncodeLoadStorePair(0, 0, type, Rt, Rt2, Rn, imm); +} + +// Load/Store Exclusive +void ARM64XEmitter::STXRB(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn) { + EncodeLoadStoreExcInst(0, Rs, SP, Rt, Rn); +} +void ARM64XEmitter::STLXRB(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn) { + EncodeLoadStoreExcInst(1, Rs, SP, Rt, Rn); +} +void ARM64XEmitter::LDXRB(ARM64Reg Rt, ARM64Reg Rn) { + EncodeLoadStoreExcInst(2, SP, SP, Rt, Rn); +} +void ARM64XEmitter::LDAXRB(ARM64Reg Rt, ARM64Reg Rn) { + EncodeLoadStoreExcInst(3, SP, SP, Rt, Rn); +} +void ARM64XEmitter::STLRB(ARM64Reg Rt, ARM64Reg Rn) { + EncodeLoadStoreExcInst(4, SP, SP, Rt, Rn); +} +void ARM64XEmitter::LDARB(ARM64Reg Rt, 
ARM64Reg Rn) { + EncodeLoadStoreExcInst(5, SP, SP, Rt, Rn); +} +void ARM64XEmitter::STXRH(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn) { + EncodeLoadStoreExcInst(6, Rs, SP, Rt, Rn); +} +void ARM64XEmitter::STLXRH(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn) { + EncodeLoadStoreExcInst(7, Rs, SP, Rt, Rn); +} +void ARM64XEmitter::LDXRH(ARM64Reg Rt, ARM64Reg Rn) { + EncodeLoadStoreExcInst(8, SP, SP, Rt, Rn); +} +void ARM64XEmitter::LDAXRH(ARM64Reg Rt, ARM64Reg Rn) { + EncodeLoadStoreExcInst(9, SP, SP, Rt, Rn); +} +void ARM64XEmitter::STLRH(ARM64Reg Rt, ARM64Reg Rn) { + EncodeLoadStoreExcInst(10, SP, SP, Rt, Rn); +} +void ARM64XEmitter::LDARH(ARM64Reg Rt, ARM64Reg Rn) { + EncodeLoadStoreExcInst(11, SP, SP, Rt, Rn); +} +void ARM64XEmitter::STXR(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn) { + EncodeLoadStoreExcInst(12 + Is64Bit(Rt), Rs, SP, Rt, Rn); +} +void ARM64XEmitter::STLXR(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn) { + EncodeLoadStoreExcInst(14 + Is64Bit(Rt), Rs, SP, Rt, Rn); +} +void ARM64XEmitter::STXP(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn) { + EncodeLoadStoreExcInst(16 + Is64Bit(Rt), Rs, Rt2, Rt, Rn); +} +void ARM64XEmitter::STLXP(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn) { + EncodeLoadStoreExcInst(18 + Is64Bit(Rt), Rs, Rt2, Rt, Rn); +} +void ARM64XEmitter::LDXR(ARM64Reg Rt, ARM64Reg Rn) { + EncodeLoadStoreExcInst(20 + Is64Bit(Rt), SP, SP, Rt, Rn); +} +void ARM64XEmitter::LDAXR(ARM64Reg Rt, ARM64Reg Rn) { + EncodeLoadStoreExcInst(22 + Is64Bit(Rt), SP, SP, Rt, Rn); +} +void ARM64XEmitter::LDXP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn) { + EncodeLoadStoreExcInst(24 + Is64Bit(Rt), SP, Rt2, Rt, Rn); +} +void ARM64XEmitter::LDAXP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn) { + EncodeLoadStoreExcInst(26 + Is64Bit(Rt), SP, Rt2, Rt, Rn); +} +void ARM64XEmitter::STLR(ARM64Reg Rt, ARM64Reg Rn) { + EncodeLoadStoreExcInst(28 + Is64Bit(Rt), SP, SP, Rt, Rn); +} +void ARM64XEmitter::LDAR(ARM64Reg Rt, ARM64Reg Rn) { + EncodeLoadStoreExcInst(30 + Is64Bit(Rt), SP, SP, Rt, Rn); +} + +// Load/Store no-allocate pair (offset) +void ARM64XEmitter::STNP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm) { + EncodeLoadStorePairedInst(0xA0, Rt, Rt2, Rn, imm); +} +void ARM64XEmitter::LDNP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm) { + EncodeLoadStorePairedInst(0xA1, Rt, Rt2, Rn, imm); +} + +// Load/Store register (immediate post-indexed) +// XXX: Most of these support vectors +void ARM64XEmitter::STRB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) { + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(0x0E4, Rt, Rn, imm, 8); + else + EncodeLoadStoreIndexedInst(0x0E0, type == INDEX_POST ? 1 : 3, Rt, Rn, imm); +} +void ARM64XEmitter::LDRB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) { + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(0x0E5, Rt, Rn, imm, 8); + else + EncodeLoadStoreIndexedInst(0x0E1, type == INDEX_POST ? 1 : 3, Rt, Rn, imm); +} +void ARM64XEmitter::LDRSB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) { + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x0E6 : 0x0E7, Rt, Rn, imm, 8); + else + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x0E2 : 0x0E3, type == INDEX_POST ? 1 : 3, Rt, Rn, + imm); +} +void ARM64XEmitter::STRH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) { + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(0x1E4, Rt, Rn, imm, 16); + else + EncodeLoadStoreIndexedInst(0x1E0, type == INDEX_POST ? 
1 : 3, Rt, Rn, imm); +} +void ARM64XEmitter::LDRH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) { + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(0x1E5, Rt, Rn, imm, 16); + else + EncodeLoadStoreIndexedInst(0x1E1, type == INDEX_POST ? 1 : 3, Rt, Rn, imm); +} +void ARM64XEmitter::LDRSH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) { + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x1E6 : 0x1E7, Rt, Rn, imm, 16); + else + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x1E2 : 0x1E3, type == INDEX_POST ? 1 : 3, Rt, Rn, + imm); +} +void ARM64XEmitter::STR(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) { + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x3E4 : 0x2E4, Rt, Rn, imm, Is64Bit(Rt) ? 64 : 32); + else + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x3E0 : 0x2E0, type == INDEX_POST ? 1 : 3, Rt, Rn, + imm); +} +void ARM64XEmitter::LDR(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) { + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x3E5 : 0x2E5, Rt, Rn, imm, Is64Bit(Rt) ? 64 : 32); + else + EncodeLoadStoreIndexedInst(Is64Bit(Rt) ? 0x3E1 : 0x2E1, type == INDEX_POST ? 1 : 3, Rt, Rn, + imm); +} +void ARM64XEmitter::LDRSW(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) { + if (type == INDEX_UNSIGNED) + EncodeLoadStoreIndexedInst(0x2E6, Rt, Rn, imm, 32); + else + EncodeLoadStoreIndexedInst(0x2E2, type == INDEX_POST ? 1 : 3, Rt, Rn, imm); +} + +// Load/Store register (register offset) +void ARM64XEmitter::STRB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) { + EncodeLoadStoreRegisterOffset(0, 0, Rt, Rn, Rm); +} +void ARM64XEmitter::LDRB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) { + EncodeLoadStoreRegisterOffset(0, 1, Rt, Rn, Rm); +} +void ARM64XEmitter::LDRSB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) { + bool b64Bit = Is64Bit(Rt); + EncodeLoadStoreRegisterOffset(0, 3 - b64Bit, Rt, Rn, Rm); +} +void ARM64XEmitter::STRH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) { + EncodeLoadStoreRegisterOffset(1, 0, Rt, Rn, Rm); +} +void ARM64XEmitter::LDRH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) { + EncodeLoadStoreRegisterOffset(1, 1, Rt, Rn, Rm); +} +void ARM64XEmitter::LDRSH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) { + bool b64Bit = Is64Bit(Rt); + EncodeLoadStoreRegisterOffset(1, 3 - b64Bit, Rt, Rn, Rm); +} +void ARM64XEmitter::STR(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) { + bool b64Bit = Is64Bit(Rt); + EncodeLoadStoreRegisterOffset(2 + b64Bit, 0, Rt, Rn, Rm); +} +void ARM64XEmitter::LDR(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) { + bool b64Bit = Is64Bit(Rt); + EncodeLoadStoreRegisterOffset(2 + b64Bit, 1, Rt, Rn, Rm); +} +void ARM64XEmitter::LDRSW(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) { + EncodeLoadStoreRegisterOffset(2, 2, Rt, Rn, Rm); +} +void ARM64XEmitter::PRFM(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) { + EncodeLoadStoreRegisterOffset(3, 2, Rt, Rn, Rm); +} + +// Load/Store register (unscaled offset) +void ARM64XEmitter::STURB(ARM64Reg Rt, ARM64Reg Rn, s32 imm) { + EncodeLoadStoreUnscaled(0, 0, Rt, Rn, imm); +} +void ARM64XEmitter::LDURB(ARM64Reg Rt, ARM64Reg Rn, s32 imm) { + EncodeLoadStoreUnscaled(0, 1, Rt, Rn, imm); +} +void ARM64XEmitter::LDURSB(ARM64Reg Rt, ARM64Reg Rn, s32 imm) { + EncodeLoadStoreUnscaled(0, Is64Bit(Rt) ? 
2 : 3, Rt, Rn, imm); +} +void ARM64XEmitter::STURH(ARM64Reg Rt, ARM64Reg Rn, s32 imm) { + EncodeLoadStoreUnscaled(1, 0, Rt, Rn, imm); +} +void ARM64XEmitter::LDURH(ARM64Reg Rt, ARM64Reg Rn, s32 imm) { + EncodeLoadStoreUnscaled(1, 1, Rt, Rn, imm); +} +void ARM64XEmitter::LDURSH(ARM64Reg Rt, ARM64Reg Rn, s32 imm) { + EncodeLoadStoreUnscaled(1, Is64Bit(Rt) ? 2 : 3, Rt, Rn, imm); +} +void ARM64XEmitter::STUR(ARM64Reg Rt, ARM64Reg Rn, s32 imm) { + EncodeLoadStoreUnscaled(Is64Bit(Rt) ? 3 : 2, 0, Rt, Rn, imm); +} +void ARM64XEmitter::LDUR(ARM64Reg Rt, ARM64Reg Rn, s32 imm) { + EncodeLoadStoreUnscaled(Is64Bit(Rt) ? 3 : 2, 1, Rt, Rn, imm); +} +void ARM64XEmitter::LDURSW(ARM64Reg Rt, ARM64Reg Rn, s32 imm) { + ASSERT_MSG(!Is64Bit(Rt), "%s must have a 64bit destination register!", __func__); + EncodeLoadStoreUnscaled(2, 2, Rt, Rn, imm); +} + +// Address of label/page PC-relative +void ARM64XEmitter::ADR(ARM64Reg Rd, s32 imm) { + EncodeAddressInst(0, Rd, imm); +} +void ARM64XEmitter::ADRP(ARM64Reg Rd, s32 imm) { + EncodeAddressInst(1, Rd, imm >> 12); +} + +// Wrapper around MOVZ+MOVK (and later MOVN) +void ARM64XEmitter::MOVI2R(ARM64Reg Rd, u64 imm, bool optimize) { + unsigned int parts = Is64Bit(Rd) ? 4 : 2; + std::bitset<32> upload_part(0); + + // Always start with a movz! Kills the dependency on the register. + bool use_movz = true; + + if (!imm) { + // Zero immediate, just clear the register. EOR is pointless when we have + // MOVZ, which looks clearer in disasm too. + MOVZ(Rd, 0, SHIFT_0); + return; + } + + if ((Is64Bit(Rd) && imm == std::numeric_limits::max()) || + (!Is64Bit(Rd) && imm == std::numeric_limits::max())) { + // Max unsigned value (or if signed, -1) + // Set to ~ZR + ARM64Reg ZR = Is64Bit(Rd) ? SP : WSP; + ORN(Rd, ZR, ZR, ArithOption(ZR, ST_LSL, 0)); + return; + } + + // TODO: Make some more systemic use of MOVN, but this will take care of most + // cases. Small negative integer. Use MOVN + if (!Is64Bit(Rd) && (imm | 0xFFFF0000) == imm) { + MOVN(Rd, static_cast(~imm), SHIFT_0); + return; + } + + // XXX: Use MOVN when possible. 
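+ // For reference, MOVI2R's fallback below materialises a 64-bit immediate as one MOVZ
+ // for the first non-zero 16-bit chunk plus one MOVK per remaining non-zero chunk,
+ // unless the PC-relative ADR/ADRP shortcut just below applies. A minimal standalone
+ // sketch of that chunking, assuming imm != 0 and a 64-bit destination (the function
+ // name here is illustrative only):
+ //
+ //   #include <cstdint>
+ //   #include <cstdio>
+ //
+ //   void sketch_movi2r(std::uint64_t imm) {
+ //       bool first = true;
+ //       for (unsigned i = 0; i < 4; ++i) {
+ //           const auto part = static_cast<std::uint16_t>(imm >> (i * 16));
+ //           if (part == 0)
+ //               continue;  // MOVZ already zeroes the chunks we skip
+ //           std::printf("%s x0, #0x%x, lsl #%u\n", first ? "movz" : "movk",
+ //                       static_cast<unsigned>(part), i * 16);
+ //           first = false;
+ //       }
+ //   }
+ //
+ //   // sketch_movi2r(0x0000123400005678) prints:
+ //   //   movz x0, #0x5678, lsl #0
+ //   //   movk x0, #0x1234, lsl #32
+ //   // which the formula in EncodeMOVWideInst above,
+ //   // (b64Bit << 31) | (op << 29) | (0x25 << 23) | (pos << 21) | (imm << 5) | Rd,
+ //   // turns into the words 0xD28ACF00 and 0xF2C24680.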
+ // XXX: Optimize more + // XXX: Support rotating immediates to save instructions + if (optimize) { + for (unsigned int i = 0; i < parts; ++i) { + if ((imm >> (i * 16)) & 0xFFFF) + upload_part[i] = 1; + } + } + + u64 aligned_pc = reinterpret_cast(GetCodePtr()) & ~0xFFF; + s64 aligned_offset = static_cast(imm) - static_cast(aligned_pc); + // The offset for ADR/ADRP is an s32, so make sure it can be represented in + // that + if (upload_part.count() > 1 && std::abs(aligned_offset) < 0x7FFFFFFFLL) { + // Immediate we are loading is within 4GB of our aligned range + // Most likely a address that we can load in one or two instructions + if (!(std::abs(aligned_offset) & 0xFFF)) { + // Aligned ADR + ADRP(Rd, static_cast(aligned_offset)); + return; + } else { + // If the address is within 1MB of PC we can load it in a single + // instruction still + s64 offset = static_cast(imm) - reinterpret_cast(GetCodePtr()); + if (offset >= -0xFFFFF && offset <= 0xFFFFF) { + ADR(Rd, static_cast(offset)); + return; + } else { + ADRP(Rd, static_cast(aligned_offset & ~0xFFF)); + ADD(Rd, Rd, imm & 0xFFF); + return; + } + } + } + + for (unsigned i = 0; i < parts; ++i) { + if (use_movz && upload_part[i]) { + MOVZ(Rd, (imm >> (i * 16)) & 0xFFFF, static_cast(i)); + use_movz = false; + } else { + if (upload_part[i] || !optimize) + MOVK(Rd, (imm >> (i * 16)) & 0xFFFF, static_cast(i)); + } + } +} + +bool ARM64XEmitter::MOVI2R2(ARM64Reg Rd, u64 imm1, u64 imm2) { + // TODO: Also optimize for performance, not just for code size. + u8* start_pointer = GetWritableCodePtr(); + + MOVI2R(Rd, imm1); + u64 size1 = GetCodePtr() - start_pointer; + + SetCodePtrUnsafe(start_pointer); + + MOVI2R(Rd, imm2); + u64 size2 = GetCodePtr() - start_pointer; + + SetCodePtrUnsafe(start_pointer); + + bool element = size1 > size2; + + MOVI2R(Rd, element ? imm2 : imm1); + + return element; +} + +void ARM64XEmitter::ABI_PushRegisters(u32 registers) { + int num_regs = Common::BitCount(registers); + int stack_size = (num_regs + (num_regs & 1)) * 8; + int it = 0; + + std::array gpr{}; + + if (!num_regs) + return; + + for (int i = 0; i < 32; ++i) { + if (Common::Bit(i, registers)) { + gpr[it++] = static_cast(X0 + i); + } + } + + // 8 byte per register, but 16 byte alignment, so we may have to padd one register. + // Only update the SP on the last write to avoid the dependency between those stores. + + // The first push must adjust the SP, else a context switch may invalidate everything below SP. + + it = 0; + if (num_regs & 1) { + STR(INDEX_PRE, gpr[0], SP, -stack_size); + it++; + } else { + STP(INDEX_PRE, gpr[0], gpr[1], SP, -stack_size); + it += 2; + } + + // Fast store for all other registers, this is always an even number. + for (int i = 0; i < (num_regs - 1) / 2; i++) { + STP(INDEX_SIGNED, gpr[it], gpr[it + 1], SP, 16 * (i + 1)); + it += 2; + } + + ASSERT_MSG(it == num_regs, "%s registers don't match.", __func__); +} + +void ARM64XEmitter::ABI_PopRegisters(u32 registers) { + u8 num_regs = static_cast(Common::BitCount(registers)); + int stack_size = (num_regs + (num_regs & 1)) * 8; + int it = 0; + + std::array gpr{}; + + if (!num_regs) + return; + + for (int i = 0; i < 32; ++i) { + if (Common::Bit(i, registers)) { + gpr[it++] = static_cast(X0 + i); + } + } + it = 0; + // We must adjust the SP in the end, so load the first (two) registers at least. + ARM64Reg first = gpr[it++]; + ARM64Reg second = INVALID_REG; + if (!(num_regs & 1)) + second = gpr[it++]; + + // 8 byte per register, but 16 byte alignment, so we may have to padd one register. 
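+ // For reference, the pop path mirrors ABI_PushRegisters above: one 8-byte slot per
+ // register, rounded up to the 16-byte SP alignment AAPCS64 requires, with the SP
+ // writeback folded into the final post-indexed LDR/LDP. A small standalone sketch of
+ // the sizing rule (the helper name is illustrative only):
+ //
+ //   constexpr int abi_stack_size(int num_regs) {
+ //       return (num_regs + (num_regs & 1)) * 8;  // pad odd counts to keep SP 16-byte aligned
+ //   }
+ //   static_assert(abi_stack_size(1) == 16 && abi_stack_size(5) == 48 && abi_stack_size(6) == 48);
+ //
+ //   // e.g. popping five registers X0..X4 emits:
+ //   //   LDP X1, X2, [SP, #16]
+ //   //   LDP X3, X4, [SP, #32]
+ //   //   LDR X0, [SP], #48   // the post-index restores SP in the same instruction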
+ // Only update the SP on the last load to avoid the dependency between those loads. + + // Fast load for all but the first (two) registers, this is always an even number. + + for (int i = 0; i < (num_regs - 1) / 2; ++i) { + LDP(INDEX_SIGNED, gpr[it], gpr[it + 1], SP, 16 * (i + 1)); + it += 2; + } + + // Post loading the first (two) registers. + if (num_regs & 1) + LDR(INDEX_POST, first, SP, stack_size); + else + LDP(INDEX_POST, first, second, SP, stack_size); + + ASSERT_MSG(it == num_regs, "%s registers don't match.", __func__); +} + +// Float Emitter +void ARM64FloatEmitter::EmitLoadStoreImmediate(u8 size, u32 opc, IndexType type, ARM64Reg Rt, + ARM64Reg Rn, s32 imm) { + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + u32 encoded_size = 0; + u32 encoded_imm = 0; + + if (size == 8) + encoded_size = 0; + else if (size == 16) + encoded_size = 1; + else if (size == 32) + encoded_size = 2; + else if (size == 64) + encoded_size = 3; + else if (size == 128) + encoded_size = 0; + + if (type == INDEX_UNSIGNED) { + ASSERT_MSG(!(imm & ((size - 1) >> 3)), + "%s(INDEX_UNSIGNED) immediate offset must be aligned to size! " + "(%d) (%p)", + __func__, imm, m_emit->GetCodePtr()); + ASSERT_MSG(imm >= 0, "%s(INDEX_UNSIGNED) immediate offset must be positive!", __func__); + if (size == 16) + imm >>= 1; + else if (size == 32) + imm >>= 2; + else if (size == 64) + imm >>= 3; + else if (size == 128) + imm >>= 4; + encoded_imm = (imm & 0xFFF); + } else { + ASSERT_MSG(!(imm < -256 || imm > 255), + "%s immediate offset must be within range of -256 to 256!", __func__); + encoded_imm = (imm & 0x1FF) << 2; + if (type == INDEX_POST) + encoded_imm |= 1; + else + encoded_imm |= 3; + } + + Write32((encoded_size << 30) | (0xF << 26) | (type == INDEX_UNSIGNED ? (1 << 24) : 0) | + (size == 128 ? 
(1 << 23) : 0) | (opc << 22) | (encoded_imm << 10) | (Rn << 5) | Rt); +} + +void ARM64FloatEmitter::EmitScalar2Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, + ARM64Reg Rn, ARM64Reg Rm) { + ASSERT_MSG(!IsQuad(Rd), "%s only supports double and single registers!", __func__); + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((M << 31) | (S << 29) | (0b11110001 << 21) | (type << 22) | (Rm << 16) | + (opcode << 12) | (1 << 11) | (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, + ARM64Reg Rm) { + ASSERT_MSG(!IsSingle(Rd), "%s doesn't support singles!", __func__); + bool quad = IsQuad(Rd); + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((quad << 30) | (U << 29) | (0b1110001 << 21) | (size << 22) | (Rm << 16) | + (opcode << 11) | (1 << 10) | (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitScalarThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, + ARM64Reg Rm) { + ASSERT_MSG(!IsQuad(Rd), "%s doesn't support quads!", __func__); + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + int esize = 0; + switch (size) { + case 8: + esize = 0; + break; + case 16: + esize = 1; + break; + case 32: + esize = 2; + break; + case 64: + esize = 3; + break; + default: + ASSERT_MSG(false, "Size must be 8, 16, 32, or 64"); + break; + } + + + Write32((U << 29) | (0b1011110001 << 21) | (esize << 22) | (Rm << 16) | + (opcode << 11) | (1 << 10) | (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitCopy(bool Q, u32 op, u32 imm5, u32 imm4, ARM64Reg Rd, ARM64Reg Rn) { + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((Q << 30) | (op << 29) | (0b111 << 25) | (imm5 << 16) | (imm4 << 11) | (1 << 10) | + (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::Emit2RegMisc(bool Q, bool U, u32 size, u32 opcode, ARM64Reg Rd, + ARM64Reg Rn) { + ASSERT_MSG(!IsSingle(Rd), "%s doesn't support singles!", __func__); + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((Q << 30) | (U << 29) | (0b1110001 << 21) | (size << 22) | (opcode << 12) | (1 << 11) | + (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, + ARM64Reg Rt, ARM64Reg Rn) { + ASSERT_MSG(!IsSingle(Rt), "%s doesn't support singles!", __func__); + bool quad = IsQuad(Rt); + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + + Write32((quad << 30) | (0b1101 << 24) | (L << 22) | (R << 21) | (opcode << 13) | (S << 12) | + (size << 10) | (Rn << 5) | Rt); +} + +void ARM64FloatEmitter::EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, + ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm) { + ASSERT_MSG(!IsSingle(Rt), "%s doesn't support singles!", __func__); + bool quad = IsQuad(Rt); + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((quad << 30) | (0x1B << 23) | (L << 22) | (R << 21) | (Rm << 16) | (opcode << 13) | + (S << 12) | (size << 10) | (Rn << 5) | Rt); +} + +void ARM64FloatEmitter::Emit1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, + ARM64Reg Rn) { + ASSERT_MSG(!IsQuad(Rd), "%s doesn't support vector!", __func__); + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((M << 31) | (S << 29) | (0xF1 << 21) | (type << 22) | (opcode << 15) | (1 << 14) | + (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitConversion(bool sf, bool S, u32 type, u32 rmode, u32 opcode, + ARM64Reg Rd, ARM64Reg Rn) { + ASSERT_MSG(Rn <= SP, "%s only supports GPR as source!", __func__); + Rd = DecodeReg(Rd); + Rn = 
DecodeReg(Rn); + + Write32((sf << 31) | (S << 29) | (0xF1 << 21) | (type << 22) | (rmode << 19) | (opcode << 16) | + (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitConvertScalarToInt(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round, + bool sign) { + DEBUG_ASSERT_MSG(IsScalar(Rn), "fcvts: Rn must be floating point"); + if (IsGPR(Rd)) { + // Use the encoding that transfers the result to a GPR. + bool sf = Is64Bit(Rd); + int type = IsDouble(Rn) ? 1 : 0; + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + int opcode = (sign ? 1 : 0); + int rmode = 0; + switch (round) { + case ROUND_A: + rmode = 0; + opcode |= 4; + break; + case ROUND_P: + rmode = 1; + break; + case ROUND_M: + rmode = 2; + break; + case ROUND_Z: + rmode = 3; + break; + case ROUND_N: + rmode = 0; + break; + } + EmitConversion2(sf, 0, true, type, rmode, opcode, 0, Rd, Rn); + } else { + // Use the encoding (vector, single) that keeps the result in the fp + // register. + int sz = IsDouble(Rn); + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + int opcode = 0; + switch (round) { + case ROUND_A: + opcode = 0x1C; + break; + case ROUND_N: + opcode = 0x1A; + break; + case ROUND_M: + opcode = 0x1B; + break; + case ROUND_P: + opcode = 0x1A; + sz |= 2; + break; + case ROUND_Z: + opcode = 0x1B; + sz |= 2; + break; + } + Write32((0x5E << 24) | (sign << 29) | (sz << 22) | (1 << 21) | (opcode << 12) | (2 << 10) | + (Rn << 5) | Rd); + } +} + +void ARM64FloatEmitter::FCVTS(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round) { + EmitConvertScalarToInt(Rd, Rn, round, false); +} + +void ARM64FloatEmitter::FCVTU(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round) { + EmitConvertScalarToInt(Rd, Rn, round, true); +} + +void ARM64FloatEmitter::EmitConversion2(bool sf, bool S, bool direction, u32 type, u32 rmode, + u32 opcode, int scale, ARM64Reg Rd, ARM64Reg Rn) { + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((sf << 31) | (S << 29) | (0xF0 << 21) | (direction << 21) | (type << 22) | + (rmode << 19) | (opcode << 16) | (scale << 10) | (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitCompare(bool M, bool S, u32 op, u32 opcode2, ARM64Reg Rn, ARM64Reg Rm) { + ASSERT_MSG(!IsQuad(Rn), "%s doesn't support vector!", __func__); + bool is_double = IsDouble(Rn); + + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((M << 31) | (S << 29) | (0xF1 << 21) | (is_double << 22) | (Rm << 16) | (op << 14) | + (1 << 13) | (Rn << 5) | opcode2); +} + +void ARM64FloatEmitter::EmitCondSelect(bool M, bool S, CCFlags cond, ARM64Reg Rd, ARM64Reg Rn, + ARM64Reg Rm) { + ASSERT_MSG(!IsQuad(Rd), "%s doesn't support vector!", __func__); + bool is_double = IsDouble(Rd); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((M << 31) | (S << 29) | (0xF1 << 21) | (is_double << 22) | (Rm << 16) | (cond << 12) | + (3 << 10) | (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitPermute(u32 size, u32 op, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + ASSERT_MSG(!IsSingle(Rd), "%s doesn't support singles!", __func__); + + bool quad = IsQuad(Rd); + + u32 encoded_size = 0; + if (size == 16) + encoded_size = 1; + else if (size == 32) + encoded_size = 2; + else if (size == 64) + encoded_size = 3; + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((quad << 30) | (7 << 25) | (encoded_size << 22) | (Rm << 16) | (op << 12) | (1 << 11) | + (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitScalarImm(bool M, bool S, u32 type, u32 imm5, ARM64Reg Rd, u32 imm8) { + ASSERT_MSG(!IsQuad(Rd), "%s doesn't support vector!", __func__); + + bool is_double = 
!IsSingle(Rd); + + Rd = DecodeReg(Rd); + + Write32((M << 31) | (S << 29) | (0xF1 << 21) | (is_double << 22) | (type << 22) | (imm8 << 13) | + (1 << 12) | (imm5 << 5) | Rd); +} + +void ARM64FloatEmitter::EmitShiftImm(bool Q, bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd, + ARM64Reg Rn) { + ASSERT_MSG(immh, "%s bad encoding! Can't have zero immh", __func__); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((Q << 30) | (U << 29) | (0xF << 24) | (immh << 19) | (immb << 16) | (opcode << 11) | + (1 << 10) | (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitScalarShiftImm(bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd, + ARM64Reg Rn) { + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((2 << 30) | (U << 29) | (0x3E << 23) | (immh << 19) | (immb << 16) | (opcode << 11) | + (1 << 10) | (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitLoadStoreMultipleStructure(u32 size, bool L, u32 opcode, ARM64Reg Rt, + ARM64Reg Rn) { + bool quad = IsQuad(Rt); + u32 encoded_size = 0; + + if (size == 16) + encoded_size = 1; + else if (size == 32) + encoded_size = 2; + else if (size == 64) + encoded_size = 3; + + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + + Write32((quad << 30) | (3 << 26) | (L << 22) | (opcode << 12) | (encoded_size << 10) | + (Rn << 5) | Rt); +} + +void ARM64FloatEmitter::EmitLoadStoreMultipleStructurePost(u32 size, bool L, u32 opcode, + ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm) { + bool quad = IsQuad(Rt); + u32 encoded_size = 0; + + if (size == 16) + encoded_size = 1; + else if (size == 32) + encoded_size = 2; + else if (size == 64) + encoded_size = 3; + + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((quad << 30) | (0b11001 << 23) | (L << 22) | (Rm << 16) | (opcode << 12) | + (encoded_size << 10) | (Rn << 5) | Rt); +} + +void ARM64FloatEmitter::EmitScalar1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, + ARM64Reg Rn) { + ASSERT_MSG(!IsQuad(Rd), "%s doesn't support vector!", __func__); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((M << 31) | (S << 29) | (0xF1 << 21) | (type << 22) | (opcode << 15) | (1 << 14) | + (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitVectorxElement(bool U, u32 size, bool L, u32 opcode, bool H, + ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + bool quad = IsQuad(Rd); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((quad << 30) | (U << 29) | (0xF << 24) | (size << 22) | (L << 21) | (Rm << 16) | + (opcode << 12) | (H << 11) | (Rn << 5) | Rd); +} + +void ARM64FloatEmitter::EmitLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm) { + ASSERT_MSG(!(imm < -256 || imm > 255), "%s received too large offset: %d", __func__, imm); + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + + Write32((size << 30) | (0xF << 26) | (op << 22) | ((imm & 0x1FF) << 12) | (Rn << 5) | Rt); +} + +void ARM64FloatEmitter::EncodeLoadStorePair(u32 size, bool load, IndexType type, ARM64Reg Rt, + ARM64Reg Rt2, ARM64Reg Rn, s32 imm) { + u32 type_encode = 0; + u32 opc = 0; + + switch (type) { + case INDEX_SIGNED: + type_encode = 0b010; + break; + case INDEX_POST: + type_encode = 0b001; + break; + case INDEX_PRE: + type_encode = 0b011; + break; + case INDEX_UNSIGNED: + ASSERT_MSG(false, "%s doesn't support INDEX_UNSIGNED!", __func__); + break; + } + + if (size == 128) { + ASSERT_MSG(!(imm & 0xF), "%s received invalid offset 0x%x!", __func__, imm); + opc = 2; + imm >>= 4; + } else if (size == 64) { + ASSERT_MSG(!(imm & 0x7), "%s received invalid offset 0x%x!", __func__, imm); + opc = 1; + 
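+ // Note: the LDP/STP immediate is a signed 7-bit field scaled by the element size, so
+ // the byte offset passed in must be a multiple of that size and is stored as
+ // offset/size. For example, a 64-bit pair at byte offset 24 encodes imm7 = 24 >> 3 = 3;
+ // with imm7 in [-64, 63], 64-bit pairs can reach offsets from -512 to +504 bytes.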
imm >>= 3; + } else if (size == 32) { + ASSERT_MSG(!(imm & 0x3), "%s received invalid offset 0x%x!", __func__, imm); + opc = 0; + imm >>= 2; + } + + Rt = DecodeReg(Rt); + Rt2 = DecodeReg(Rt2); + Rn = DecodeReg(Rn); + + Write32((opc << 30) | (0b1011 << 26) | (type_encode << 23) | (load << 22) | + ((imm & 0x7F) << 15) | (Rt2 << 10) | (Rn << 5) | Rt); +} + +void ARM64FloatEmitter::EncodeLoadStoreRegisterOffset(u32 size, bool load, ARM64Reg Rt, ARM64Reg Rn, + ArithOption Rm) { + ASSERT_MSG(Rm.GetType() == ArithOption::TYPE_EXTENDEDREG, + "%s must contain an extended reg as Rm!", __func__); + + u32 encoded_size = 0; + u32 encoded_op = 0; + + if (size == 8) { + encoded_size = 0; + encoded_op = 0; + } else if (size == 16) { + encoded_size = 1; + encoded_op = 0; + } else if (size == 32) { + encoded_size = 2; + encoded_op = 0; + } else if (size == 64) { + encoded_size = 3; + encoded_op = 0; + } else if (size == 128) { + encoded_size = 0; + encoded_op = 2; + } + + if (load) + encoded_op |= 1; + + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + ARM64Reg decoded_Rm = DecodeReg(Rm.GetReg()); + + Write32((encoded_size << 30) | (encoded_op << 22) | (0b111100001 << 21) | (decoded_Rm << 16) | + Rm.GetData() | (1 << 11) | (Rn << 5) | Rt); +} + +void ARM64FloatEmitter::EncodeModImm(bool Q, u8 op, u8 cmode, u8 o2, ARM64Reg Rd, u8 abcdefgh) { + union V { + u8 hex; + struct InV { + unsigned defgh : 5; + unsigned abc : 3; + } in; + } v; + v.hex = abcdefgh; + Rd = DecodeReg(Rd); + Write32((Q << 30) | (op << 29) | (0xF << 24) | (v.in.abc << 16) | (cmode << 12) | (o2 << 11) | + (1 << 10) | (v.in.defgh << 5) | Rd); +} + +void ARM64FloatEmitter::LDR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) { + EmitLoadStoreImmediate(size, 1, type, Rt, Rn, imm); +} +void ARM64FloatEmitter::STR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) { + EmitLoadStoreImmediate(size, 0, type, Rt, Rn, imm); +} + +// Loadstore unscaled +void ARM64FloatEmitter::LDUR(u8 size, ARM64Reg Rt, ARM64Reg Rn, s32 imm) { + u32 encoded_size = 0; + u32 encoded_op = 0; + + if (size == 8) { + encoded_size = 0; + encoded_op = 1; + } else if (size == 16) { + encoded_size = 1; + encoded_op = 1; + } else if (size == 32) { + encoded_size = 2; + encoded_op = 1; + } else if (size == 64) { + encoded_size = 3; + encoded_op = 1; + } else if (size == 128) { + encoded_size = 0; + encoded_op = 3; + } + + EmitLoadStoreUnscaled(encoded_size, encoded_op, Rt, Rn, imm); +} +void ARM64FloatEmitter::STUR(u8 size, ARM64Reg Rt, ARM64Reg Rn, s32 imm) { + u32 encoded_size = 0; + u32 encoded_op = 0; + + if (size == 8) { + encoded_size = 0; + encoded_op = 0; + } else if (size == 16) { + encoded_size = 1; + encoded_op = 0; + } else if (size == 32) { + encoded_size = 2; + encoded_op = 0; + } else if (size == 64) { + encoded_size = 3; + encoded_op = 0; + } else if (size == 128) { + encoded_size = 0; + encoded_op = 2; + } + + EmitLoadStoreUnscaled(encoded_size, encoded_op, Rt, Rn, imm); +} + +// Loadstore single structure +void ARM64FloatEmitter::LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn) { + bool S = 0; + u32 opcode = 0; + u32 encoded_size = 0; + ARM64Reg encoded_reg = INVALID_REG; + + if (size == 8) { + S = (index & 4) != 0; + opcode = 0; + encoded_size = index & 3; + if (index & 8) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } else if (size == 16) { + S = (index & 2) != 0; + opcode = 2; + encoded_size = (index & 1) << 1; + if (index & 4) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = 
EncodeRegToDouble(Rt); + } else if (size == 32) { + S = (index & 1) != 0; + opcode = 4; + encoded_size = 0; + if (index & 2) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } else if (size == 64) { + S = 0; + opcode = 4; + encoded_size = 1; + if (index == 1) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + + EmitLoadStoreSingleStructure(1, 0, opcode, S, encoded_size, encoded_reg, Rn); +} + +void ARM64FloatEmitter::LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm) { + bool S = 0; + u32 opcode = 0; + u32 encoded_size = 0; + ARM64Reg encoded_reg = INVALID_REG; + + if (size == 8) { + S = (index & 4) != 0; + opcode = 0; + encoded_size = index & 3; + if (index & 8) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } else if (size == 16) { + S = (index & 2) != 0; + opcode = 2; + encoded_size = (index & 1) << 1; + if (index & 4) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } else if (size == 32) { + S = (index & 1) != 0; + opcode = 4; + encoded_size = 0; + if (index & 2) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } else if (size == 64) { + S = 0; + opcode = 4; + encoded_size = 1; + if (index == 1) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + + EmitLoadStoreSingleStructure(1, 0, opcode, S, encoded_size, encoded_reg, Rn, Rm); +} + +void ARM64FloatEmitter::LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn) { + EmitLoadStoreSingleStructure(1, 0, 6, 0, size >> 4, Rt, Rn); +} +void ARM64FloatEmitter::LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn) { + EmitLoadStoreSingleStructure(1, 1, 6, 0, size >> 4, Rt, Rn); +} +void ARM64FloatEmitter::LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm) { + EmitLoadStoreSingleStructure(1, 0, 6, 0, size >> 4, Rt, Rn, Rm); +} +void ARM64FloatEmitter::LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm) { + EmitLoadStoreSingleStructure(1, 1, 6, 0, size >> 4, Rt, Rn, Rm); +} + +void ARM64FloatEmitter::ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn) { + bool S = 0; + u32 opcode = 0; + u32 encoded_size = 0; + ARM64Reg encoded_reg = INVALID_REG; + + if (size == 8) { + S = (index & 4) != 0; + opcode = 0; + encoded_size = index & 3; + if (index & 8) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } else if (size == 16) { + S = (index & 2) != 0; + opcode = 2; + encoded_size = (index & 1) << 1; + if (index & 4) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } else if (size == 32) { + S = (index & 1) != 0; + opcode = 4; + encoded_size = 0; + if (index & 2) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } else if (size == 64) { + S = 0; + opcode = 4; + encoded_size = 1; + if (index == 1) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + + EmitLoadStoreSingleStructure(0, 0, opcode, S, encoded_size, encoded_reg, Rn); +} + +void ARM64FloatEmitter::ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm) { + bool S = 0; + u32 opcode = 0; + u32 encoded_size = 0; + ARM64Reg encoded_reg = INVALID_REG; + + if (size == 8) { + S = (index & 4) != 0; + opcode = 0; + encoded_size = index & 3; + if (index & 8) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } else if (size == 16) { + S = (index & 2) != 0; + opcode = 2; + encoded_size = (index & 1) << 1; + if (index & 4) + 
encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } else if (size == 32) { + S = (index & 1) != 0; + opcode = 4; + encoded_size = 0; + if (index & 2) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } else if (size == 64) { + S = 0; + opcode = 4; + encoded_size = 1; + if (index == 1) + encoded_reg = EncodeRegToQuad(Rt); + else + encoded_reg = EncodeRegToDouble(Rt); + } + + EmitLoadStoreSingleStructure(0, 0, opcode, S, encoded_size, encoded_reg, Rn, Rm); +} + +// Loadstore multiple structure +void ARM64FloatEmitter::LD1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn) { + ASSERT_MSG(!(count == 0 || count > 4), "%s must have a count of 1 to 4 registers!", __func__); + u32 opcode = 0; + if (count == 1) + opcode = 0b111; + else if (count == 2) + opcode = 0b1010; + else if (count == 3) + opcode = 0b0110; + else if (count == 4) + opcode = 0b0010; + EmitLoadStoreMultipleStructure(size, 1, opcode, Rt, Rn); +} +void ARM64FloatEmitter::LD1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn, + ARM64Reg Rm) { + ASSERT_MSG(!(count == 0 || count > 4), "%s must have a count of 1 to 4 registers!", __func__); + ASSERT_MSG(type == INDEX_POST, "%s only supports post indexing!", __func__); + + u32 opcode = 0; + if (count == 1) + opcode = 0b111; + else if (count == 2) + opcode = 0b1010; + else if (count == 3) + opcode = 0b0110; + else if (count == 4) + opcode = 0b0010; + EmitLoadStoreMultipleStructurePost(size, 1, opcode, Rt, Rn, Rm); +} +void ARM64FloatEmitter::ST1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn) { + ASSERT_MSG(!(count == 0 || count > 4), "%s must have a count of 1 to 4 registers!", __func__); + u32 opcode = 0; + if (count == 1) + opcode = 0b111; + else if (count == 2) + opcode = 0b1010; + else if (count == 3) + opcode = 0b0110; + else if (count == 4) + opcode = 0b0010; + EmitLoadStoreMultipleStructure(size, 0, opcode, Rt, Rn); +} +void ARM64FloatEmitter::ST1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn, + ARM64Reg Rm) { + ASSERT_MSG(!(count == 0 || count > 4), "%s must have a count of 1 to 4 registers!", __func__); + ASSERT_MSG(type == INDEX_POST, "%s only supports post indexing!", __func__); + + u32 opcode = 0; + if (count == 1) + opcode = 0b111; + else if (count == 2) + opcode = 0b1010; + else if (count == 3) + opcode = 0b0110; + else if (count == 4) + opcode = 0b0010; + EmitLoadStoreMultipleStructurePost(size, 0, opcode, Rt, Rn, Rm); +} + +// Scalar - 1 Source +void ARM64FloatEmitter::FMOV(ARM64Reg Rd, ARM64Reg Rn, bool top) { + if (IsScalar(Rd) && IsScalar(Rn)) { + EmitScalar1Source(0, 0, IsDouble(Rd), 0, Rd, Rn); + } else { + ASSERT_MSG(!IsQuad(Rd) && !IsQuad(Rn), "FMOV can't move to/from quads"); + int rmode = 0; + int opcode = 6; + int encoded_size = 0; + int sf = 0; + if (IsSingle(Rd) && !Is64Bit(Rn) && !top) { + // GPR to scalar single + opcode |= 1; + } else if (!Is64Bit(Rd) && IsSingle(Rn) && !top) { + // Scalar single to GPR - defaults are correct + } else if (Is64Bit(Rd) && IsDouble(Rn) && !top) { + // Scalar double to GPR + sf = 1; + encoded_size = 1; + } else if (IsDouble(Rd) && Is64Bit(Rn) && !top) { + // GPR to Scalar double + sf = 1; + encoded_size = 1; + opcode |= 1; + } else { + // TODO + ASSERT_MSG(0, "FMOV: Unhandled case"); + } + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Write32((sf << 31) | (encoded_size << 22) | (0x1e2 << 20) | (rmode << 19) | (opcode << 16) | (Rn << 5) | Rd); + } +} + +// Loadstore paired +void ARM64FloatEmitter::LDP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg 
Rt2, ARM64Reg Rn, + s32 imm) { + EncodeLoadStorePair(size, true, type, Rt, Rt2, Rn, imm); +} +void ARM64FloatEmitter::STP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, + s32 imm) { + EncodeLoadStorePair(size, false, type, Rt, Rt2, Rn, imm); +} + +// Loadstore register offset +void ARM64FloatEmitter::STR(u8 size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) { + EncodeLoadStoreRegisterOffset(size, false, Rt, Rn, Rm); +} +void ARM64FloatEmitter::LDR(u8 size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) { + EncodeLoadStoreRegisterOffset(size, true, Rt, Rn, Rm); +} + +void ARM64FloatEmitter::FABS(ARM64Reg Rd, ARM64Reg Rn) { + EmitScalar1Source(0, 0, IsDouble(Rd), 1, Rd, Rn); +} +void ARM64FloatEmitter::FNEG(ARM64Reg Rd, ARM64Reg Rn) { + EmitScalar1Source(0, 0, IsDouble(Rd), 2, Rd, Rn); +} +void ARM64FloatEmitter::FSQRT(ARM64Reg Rd, ARM64Reg Rn) { + EmitScalar1Source(0, 0, IsDouble(Rd), 3, Rd, Rn); +} + +// Scalar - 2 Source +void ARM64FloatEmitter::FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitScalar2Source(0, 0, IsDouble(Rd), 2, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitScalar2Source(0, 0, IsDouble(Rd), 0, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitScalar2Source(0, 0, IsDouble(Rd), 3, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitScalar2Source(0, 0, IsDouble(Rd), 1, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMAX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitScalar2Source(0, 0, IsDouble(Rd), 4, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMIN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitScalar2Source(0, 0, IsDouble(Rd), 5, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMAXNM(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitScalar2Source(0, 0, IsDouble(Rd), 6, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMINNM(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitScalar2Source(0, 0, IsDouble(Rd), 7, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FNMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitScalar2Source(0, 0, IsDouble(Rd), 8, Rd, Rn, Rm); +} + +void ARM64FloatEmitter::FMADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) { + EmitScalar3Source(IsDouble(Rd), Rd, Rn, Rm, Ra, 0); +} +void ARM64FloatEmitter::FMSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) { + EmitScalar3Source(IsDouble(Rd), Rd, Rn, Rm, Ra, 1); +} +void ARM64FloatEmitter::FNMADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) { + EmitScalar3Source(IsDouble(Rd), Rd, Rn, Rm, Ra, 2); +} +void ARM64FloatEmitter::FNMSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) { + EmitScalar3Source(IsDouble(Rd), Rd, Rn, Rm, Ra, 3); +} + +void ARM64FloatEmitter::EmitScalar3Source(bool isDouble, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, + ARM64Reg Ra, int opcode) { + int type = isDouble ? 
1 : 0; + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + Ra = DecodeReg(Ra); + int o1 = opcode >> 1; + int o0 = opcode & 1; + m_emit->Write32((0x1F << 24) | (type << 22) | (o1 << 21) | (Rm << 16) | (o0 << 15) | + (Ra << 10) | (Rn << 5) | Rd); +} + +// Scalar three same +void ARM64FloatEmitter::SQADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitScalarThreeSame(0, size, 0b00001, Rd, Rn, Rm); +} +void ARM64FloatEmitter::SQSUB(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitScalarThreeSame(0, size, 0b00101, Rd, Rn, Rm); +} +void ARM64FloatEmitter::UQADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitScalarThreeSame(1, size, 0b00001, Rd, Rn, Rm); +} +void ARM64FloatEmitter::UQSUB(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitScalarThreeSame(1, size, 0b00101, Rd, Rn, Rm); +} + +// Scalar floating point immediate +void ARM64FloatEmitter::FMOV(ARM64Reg Rd, uint8_t imm8) { + EmitScalarImm(0, 0, 0, 0, Rd, imm8); +} + +// Vector +void ARM64FloatEmitter::ADD(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + ASSERT(!(IsDouble(Rd) && esize == D)); + EmitThreeSame(0, static_cast(esize), 0b10000, Rd, Rn, Rm); +} +void ARM64FloatEmitter::ADDV(ESize esize, ARM64Reg Rd, ARM64Reg Rn) { + ASSERT(esize != D); + Emit2RegMisc(IsQuad(Rd), 0, static_cast(esize), 0b100011011, Rd, Rn); +} +void ARM64FloatEmitter::SUB(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + ASSERT(!(IsDouble(Rd) && esize == D)); + EmitThreeSame(1, static_cast(esize), 0b10000, Rd, Rn, Rm); +} +void ARM64FloatEmitter::AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitThreeSame(0, 0, 3, Rd, Rn, Rm); +} +void ARM64FloatEmitter::BSL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitThreeSame(1, 1, 3, Rd, Rn, Rm); +} +void ARM64FloatEmitter::CMGE(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + ASSERT(!(IsDouble(Rd) && esize == D)); + EmitThreeSame(0, static_cast(esize), 0b00111, Rd, Rn, Rm); +} +void ARM64FloatEmitter::CMGE_zero(ESize esize, ARM64Reg Rd, ARM64Reg Rn) { + ASSERT(!(IsDouble(Rd) && esize == D)); + Emit2RegMisc(IsQuad(Rd), 1, static_cast(esize), 0b1000, Rd, Rn); +} +void ARM64FloatEmitter::CMGT(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + ASSERT(!(IsDouble(Rd) && esize == D)); + EmitThreeSame(0, static_cast(esize), 0b00110, Rd, Rn, Rm); +} +void ARM64FloatEmitter::CMHI(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + ASSERT(!(IsDouble(Rd) && esize == D)); + EmitThreeSame(1, static_cast(esize), 0b00110, Rd, Rn, Rm); +} +void ARM64FloatEmitter::CMHS(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + ASSERT(!(IsDouble(Rd) && esize == D)); + EmitThreeSame(1, static_cast(esize), 0b00111, Rd, Rn, Rm); +} +void ARM64FloatEmitter::DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index) { + u32 imm5 = 0; + + if (size == 8) { + imm5 = 1; + imm5 |= index << 1; + } else if (size == 16) { + imm5 = 2; + imm5 |= index << 2; + } else if (size == 32) { + imm5 = 4; + imm5 |= index << 3; + } else if (size == 64) { + imm5 = 8; + imm5 |= index << 4; + } + + EmitCopy(IsQuad(Rd), 0, imm5, 0, Rd, Rn); +} +void ARM64FloatEmitter::FABS(u8 size, ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0xF, Rd, Rn); +} +void ARM64FloatEmitter::FADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitThreeSame(0, size >> 6, 0x1A, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMAX(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitThreeSame(0, size >> 6, 0b11110, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMLA(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg 
Rm) { + EmitThreeSame(0, size >> 6, 0x19, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMIN(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitThreeSame(0, 2 | size >> 6, 0b11110, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FCVTL(u8 size, ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(false, 0, size >> 6, 0x17, Rd, Rn); +} +void ARM64FloatEmitter::FCVTL2(u8 size, ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(true, 0, size >> 6, 0x17, Rd, Rn); +} +void ARM64FloatEmitter::FCVTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(IsQuad(Rd), 0, dest_size >> 5, 0x16, Rd, Rn); +} +void ARM64FloatEmitter::FCVTZS(u8 size, ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0x1B, Rd, Rn); +} +void ARM64FloatEmitter::FCVTZU(u8 size, ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0x1B, Rd, Rn); +} +void ARM64FloatEmitter::FDIV(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitThreeSame(1, size >> 6, 0x1F, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitThreeSame(1, size >> 6, 0x1B, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FNEG(u8 size, ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0xF, Rd, Rn); +} +void ARM64FloatEmitter::FRECPE(u8 size, ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0x1D, Rd, Rn); +} +void ARM64FloatEmitter::FRSQRTE(u8 size, ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0x1D, Rd, Rn); +} +void ARM64FloatEmitter::FSUB(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitThreeSame(0, 2 | (size >> 6), 0x1A, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FMLS(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitThreeSame(0, 2 | (size >> 6), 0x19, Rd, Rn, Rm); +} +void ARM64FloatEmitter::NOT(ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(IsQuad(Rd), 1, 0, 5, Rd, Rn); +} +void ARM64FloatEmitter::ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitThreeSame(0, 2, 3, Rd, Rn, Rm); +} +void ARM64FloatEmitter::REV16(u8 size, ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(IsQuad(Rd), 0, size >> 4, 1, Rd, Rn); +} +void ARM64FloatEmitter::REV32(u8 size, ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(IsQuad(Rd), 1, size >> 4, 0, Rd, Rn); +} +void ARM64FloatEmitter::REV64(u8 size, ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(IsQuad(Rd), 0, size >> 4, 0, Rd, Rn); +} +void ARM64FloatEmitter::SABD(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + ASSERT(esize != D); + EmitThreeSame(0, static_cast(esize), 0b01110, Rd, Rn, Rm); +} +void ARM64FloatEmitter::UABD(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + ASSERT(esize != D); + EmitThreeSame(1, static_cast(esize), 0b01110, Rd, Rn, Rm); +} +void ARM64FloatEmitter::SADDLV(ESize esize, ARM64Reg Rd, ARM64Reg Rn) { + ASSERT(esize != D); + Emit2RegMisc(IsQuad(Rd), 0, static_cast(esize), 0b100000011, Rd, Rn); +} +void ARM64FloatEmitter::UADDLV(ESize esize, ARM64Reg Rd, ARM64Reg Rn) { + ASSERT(esize != D); + Emit2RegMisc(IsQuad(Rd), 1, static_cast(esize), 0b100000011, Rd, Rn); +} +void ARM64FloatEmitter::SHADD(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + ASSERT(!(IsDouble(Rd) && esize == D)); + EmitThreeSame(0, static_cast(esize), 0b0, Rd, Rn, Rm); +} +void ARM64FloatEmitter::UHADD(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + ASSERT(!(IsDouble(Rd) && esize == D)); + EmitThreeSame(1, static_cast(esize), 0b0, Rd, Rn, Rm); +} +void ARM64FloatEmitter::SHSUB(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + ASSERT(!(IsDouble(Rd) && esize == D)); + 
EmitThreeSame(0, static_cast(esize), 0b00100, Rd, Rn, Rm); +} +void ARM64FloatEmitter::UHSUB(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + ASSERT(!(IsDouble(Rd) && esize == D)); + EmitThreeSame(1, static_cast(esize), 0b00100, Rd, Rn, Rm); +} +void ARM64FloatEmitter::SMIN(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + ASSERT(!(IsDouble(Rd) && esize == D)); + EmitThreeSame(0, static_cast(esize), 0b01101, Rd, Rn, Rm); +} +void ARM64FloatEmitter::UMIN(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + ASSERT(!(IsDouble(Rd) && esize == D)); + EmitThreeSame(1, static_cast(esize), 0b01101, Rd, Rn, Rm); +} +void ARM64FloatEmitter::SQADD(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + ASSERT(!(IsDouble(Rd) && esize == D)); + EmitThreeSame(0, static_cast(esize), 0b00001, Rd, Rn, Rm); +} +void ARM64FloatEmitter::SQSUB(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + ASSERT(!(IsDouble(Rd) && esize == D)); + EmitThreeSame(0, static_cast(esize), 0b00101, Rd, Rn, Rm); +} +void ARM64FloatEmitter::UQADD(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + ASSERT(!(IsDouble(Rd) && esize == D)); + EmitThreeSame(1, static_cast(esize), 0b00001, Rd, Rn, Rm); +} +void ARM64FloatEmitter::UQSUB(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + ASSERT(!(IsDouble(Rd) && esize == D)); + EmitThreeSame(1, static_cast(esize), 0b00101, Rd, Rn, Rm); +} +void ARM64FloatEmitter::SCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(IsQuad(Rd), 0, size >> 6, 0x1D, Rd, Rn); +} +void ARM64FloatEmitter::UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(IsQuad(Rd), 1, size >> 6, 0x1D, Rd, Rn); +} +void ARM64FloatEmitter::SCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale) { + int imm = size * 2 - scale; + EmitShiftImm(IsQuad(Rd), 0, imm >> 3, imm & 7, 0x1C, Rd, Rn); +} +void ARM64FloatEmitter::UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale) { + int imm = size * 2 - scale; + EmitShiftImm(IsQuad(Rd), 1, imm >> 3, imm & 7, 0x1C, Rd, Rn); +} +void ARM64FloatEmitter::SQXTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(false, 0, dest_size >> 4, 0b10100, Rd, Rn); +} +void ARM64FloatEmitter::SQXTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(true, 0, dest_size >> 4, 0b10100, Rd, Rn); +} +void ARM64FloatEmitter::UQXTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(false, 1, dest_size >> 4, 0b10100, Rd, Rn); +} +void ARM64FloatEmitter::UQXTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(true, 1, dest_size >> 4, 0b10100, Rd, Rn); +} +void ARM64FloatEmitter::XTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(false, 0, dest_size >> 4, 0b10010, Rd, Rn); +} +void ARM64FloatEmitter::XTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(true, 0, dest_size >> 4, 0b10010, Rd, Rn); +} + +// Move +void ARM64FloatEmitter::DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn) { + u32 imm5 = 0; + + if (size == 8) + imm5 = 1; + else if (size == 16) + imm5 = 2; + else if (size == 32) + imm5 = 4; + else if (size == 64) + imm5 = 8; + + EmitCopy(IsQuad(Rd), 0, imm5, 1, Rd, Rn); +} +void ARM64FloatEmitter::INS(u8 size, ARM64Reg Rd, u8 index, ARM64Reg Rn) { + u32 imm5 = 0; + + if (size == 8) { + imm5 = 1; + imm5 |= index << 1; + } else if (size == 16) { + imm5 = 2; + imm5 |= index << 2; + } else if (size == 32) { + imm5 = 4; + imm5 |= index << 3; + } else if (size == 64) { + imm5 = 8; + imm5 |= index << 4; + } + + EmitCopy(1, 0, imm5, 3, Rd, Rn); +} +void ARM64FloatEmitter::INS(u8 size, ARM64Reg Rd, u8 index1, ARM64Reg Rn, u8 index2) { + u32 imm5 = 
0, imm4 = 0; + + if (size == 8) { + imm5 = 1; + imm5 |= index1 << 1; + imm4 = index2; + } else if (size == 16) { + imm5 = 2; + imm5 |= index1 << 2; + imm4 = index2 << 1; + } else if (size == 32) { + imm5 = 4; + imm5 |= index1 << 3; + imm4 = index2 << 2; + } else if (size == 64) { + imm5 = 8; + imm5 |= index1 << 4; + imm4 = index2 << 3; + } + + EmitCopy(1, 1, imm5, imm4, Rd, Rn); +} + +void ARM64FloatEmitter::UMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index) { + bool b64Bit = Is64Bit(Rd); + ASSERT_MSG(Rd < SP, "%s destination must be a GPR!", __func__); + ASSERT_MSG(!(b64Bit && size != 64), "%s must have a size of 64 when destination is 64bit!", + __func__); + u32 imm5 = 0; + + if (size == 8) { + imm5 = 1; + imm5 |= index << 1; + } else if (size == 16) { + imm5 = 2; + imm5 |= index << 2; + } else if (size == 32) { + imm5 = 4; + imm5 |= index << 3; + } else if (size == 64) { + imm5 = 8; + imm5 |= index << 4; + } + + EmitCopy(b64Bit, 0, imm5, 7, Rd, Rn); +} +void ARM64FloatEmitter::SMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index) { + bool b64Bit = Is64Bit(Rd); + ASSERT_MSG(Rd < SP, "%s destination must be a GPR!", __func__); + ASSERT_MSG(size != 64, "%s doesn't support 64bit destination. Use UMOV!", __func__); + u32 imm5 = 0; + + if (size == 8) { + imm5 = 1; + imm5 |= index << 1; + } else if (size == 16) { + imm5 = 2; + imm5 |= index << 2; + } else if (size == 32) { + imm5 = 4; + imm5 |= index << 3; + } + + EmitCopy(b64Bit, 0, imm5, 5, Rd, Rn); +} + +// One source +void ARM64FloatEmitter::FCVT(u8 size_to, u8 size_from, ARM64Reg Rd, ARM64Reg Rn) { + u32 dst_encoding = 0; + u32 src_encoding = 0; + + if (size_to == 16) + dst_encoding = 3; + else if (size_to == 32) + dst_encoding = 0; + else if (size_to == 64) + dst_encoding = 1; + + if (size_from == 16) + src_encoding = 3; + else if (size_from == 32) + src_encoding = 0; + else if (size_from == 64) + src_encoding = 1; + + Emit1Source(0, 0, src_encoding, 4 | dst_encoding, Rd, Rn); +} + +void ARM64FloatEmitter::SCVTF(ARM64Reg Rd, ARM64Reg Rn) { + if (IsScalar(Rn)) { + // Source is in FP register (like destination!). We must use a vector + // encoding. + bool sign = false; + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + int sz = IsDouble(Rn); + Write32((0x5e << 24) | (sign << 29) | (sz << 22) | (0x876 << 10) | (Rn << 5) | Rd); + } else { + bool sf = Is64Bit(Rn); + u32 type = 0; + if (IsDouble(Rd)) + type = 1; + EmitConversion(sf, 0, type, 0, 2, Rd, Rn); + } +} + +void ARM64FloatEmitter::UCVTF(ARM64Reg Rd, ARM64Reg Rn) { + if (IsScalar(Rn)) { + // Source is in FP register (like destination!). We must use a vector + // encoding. 
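+        // Here "sign" is the unsigned-conversion flag (the bit shifted to
+        // position 29 below): it is set for UCVTF and left clear in SCVTF
+        // above, while "sz" picks double (1) or single (0) precision from Rn.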
+ bool sign = true; + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + int sz = IsDouble(Rn); + Write32((0x5e << 24) | (sign << 29) | (sz << 22) | (0x876 << 10) | (Rn << 5) | Rd); + } else { + bool sf = Is64Bit(Rn); + u32 type = 0; + if (IsDouble(Rd)) + type = 1; + + EmitConversion(sf, 0, type, 0, 3, Rd, Rn); + } +} + +void ARM64FloatEmitter::SCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale) { + bool sf = Is64Bit(Rn); + u32 type = 0; + if (IsDouble(Rd)) + type = 1; + + EmitConversion2(sf, 0, false, type, 0, 2, 64 - scale, Rd, Rn); +} + +void ARM64FloatEmitter::UCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale) { + bool sf = Is64Bit(Rn); + u32 type = 0; + if (IsDouble(Rd)) + type = 1; + + EmitConversion2(sf, 0, false, type, 0, 3, 64 - scale, Rd, Rn); +} + +void ARM64FloatEmitter::FCMP(ARM64Reg Rn, ARM64Reg Rm) { + EmitCompare(0, 0, 0, 0, Rn, Rm); +} +void ARM64FloatEmitter::FCMP(ARM64Reg Rn) { + EmitCompare(0, 0, 0, 8, Rn, static_cast(0)); +} +void ARM64FloatEmitter::FCMPE(ARM64Reg Rn, ARM64Reg Rm) { + EmitCompare(0, 0, 0, 0x10, Rn, Rm); +} +void ARM64FloatEmitter::FCMPE(ARM64Reg Rn) { + EmitCompare(0, 0, 0, 0x18, Rn, static_cast(0)); +} +void ARM64FloatEmitter::FCMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitThreeSame(0, size >> 6, 0x1C, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FCMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0xD, Rd, Rn); +} +void ARM64FloatEmitter::FCMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitThreeSame(1, size >> 6, 0x1C, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FCMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0x0C, Rd, Rn); +} +void ARM64FloatEmitter::FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitThreeSame(1, 2 | (size >> 6), 0x1C, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0x0C, Rd, Rn); +} +void ARM64FloatEmitter::FCMLE(u8 size, ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0xD, Rd, Rn); +} +void ARM64FloatEmitter::FCMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn) { + Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0xE, Rd, Rn); +} + +void ARM64FloatEmitter::FCSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond) { + EmitCondSelect(0, 0, cond, Rd, Rn, Rm); +} + +// Permute +void ARM64FloatEmitter::UZP1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitPermute(size, 0b001, Rd, Rn, Rm); +} +void ARM64FloatEmitter::TRN1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitPermute(size, 0b010, Rd, Rn, Rm); +} +void ARM64FloatEmitter::ZIP1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitPermute(size, 0b011, Rd, Rn, Rm); +} +void ARM64FloatEmitter::UZP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitPermute(size, 0b101, Rd, Rn, Rm); +} +void ARM64FloatEmitter::TRN2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitPermute(size, 0b110, Rd, Rn, Rm); +} +void ARM64FloatEmitter::ZIP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EmitPermute(size, 0b111, Rd, Rn, Rm); +} + +// Shift by immediate +void ARM64FloatEmitter::SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) { + SSHLL(src_size, Rd, Rn, shift, false); +} +void ARM64FloatEmitter::SSHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) { + SSHLL(src_size, Rd, Rn, shift, true); +} +void ARM64FloatEmitter::SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) { + SHRN(dest_size, Rd, Rn, shift, false); +} +void ARM64FloatEmitter::SHRN2(u8 dest_size, ARM64Reg Rd, 
ARM64Reg Rn, u32 shift) { + SHRN(dest_size, Rd, Rn, shift, true); +} +void ARM64FloatEmitter::USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) { + USHLL(src_size, Rd, Rn, shift, false); +} +void ARM64FloatEmitter::USHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) { + USHLL(src_size, Rd, Rn, shift, true); +} +void ARM64FloatEmitter::SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn) { + SXTL(src_size, Rd, Rn, false); +} +void ARM64FloatEmitter::SXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn) { + SXTL(src_size, Rd, Rn, true); +} +void ARM64FloatEmitter::UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn) { + UXTL(src_size, Rd, Rn, false); +} +void ARM64FloatEmitter::UXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn) { + UXTL(src_size, Rd, Rn, true); +} + +void ARM64FloatEmitter::SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper) { + ASSERT_MSG(shift < src_size, "%s shift amount must less than the element size!", __func__); + u32 immh = 0; + u32 immb = shift & 0xFFF; + + if (src_size == 8) { + immh = 1; + } else if (src_size == 16) { + immh = 2 | ((shift >> 3) & 1); + } else if (src_size == 32) { + immh = 4 | ((shift >> 3) & 3); + ; + } + EmitShiftImm(upper, 0, immh, immb, 0b10100, Rd, Rn); +} + +void ARM64FloatEmitter::USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper) { + ASSERT_MSG(shift < src_size, "%s shift amount must less than the element size!", __func__); + u32 immh = 0; + u32 immb = shift & 0xFFF; + + if (src_size == 8) { + immh = 1; + } else if (src_size == 16) { + immh = 2 | ((shift >> 3) & 1); + } else if (src_size == 32) { + immh = 4 | ((shift >> 3) & 3); + ; + } + EmitShiftImm(upper, 1, immh, immb, 0b10100, Rd, Rn); +} + +void ARM64FloatEmitter::SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper) { + ASSERT_MSG(shift < dest_size, "%s shift amount must less than the element size!", __func__); + u32 immh = 0; + u32 immb = shift & 0xFFF; + + if (dest_size == 8) { + immh = 1; + } else if (dest_size == 16) { + immh = 2 | ((shift >> 3) & 1); + } else if (dest_size == 32) { + immh = 4 | ((shift >> 3) & 3); + ; + } + EmitShiftImm(upper, 1, immh, immb, 0b10000, Rd, Rn); +} + +void ARM64FloatEmitter::SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper) { + SSHLL(src_size, Rd, Rn, 0, upper); +} + +void ARM64FloatEmitter::UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper) { + USHLL(src_size, Rd, Rn, 0, upper); +} + +// vector x indexed element +void ARM64FloatEmitter::FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index) { + ASSERT_MSG(size == 32 || size == 64, "%s only supports 32bit or 64bit size!", __func__); + + bool L = false; + bool H = false; + if (size == 32) { + L = index & 1; + H = (index >> 1) & 1; + } else if (size == 64) { + H = index == 1; + } + + EmitVectorxElement(0, 2 | (size >> 6), L, 0x9, H, Rd, Rn, Rm); +} + +void ARM64FloatEmitter::FMLA(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index) { + ASSERT_MSG(size == 32 || size == 64, "%s only supports 32bit or 64bit size!", __func__); + + bool L = false; + bool H = false; + if (size == 32) { + L = index & 1; + H = (index >> 1) & 1; + } else if (size == 64) { + H = index == 1; + } + + EmitVectorxElement(0, 2 | (size >> 6), L, 1, H, Rd, Rn, Rm); +} + +// Modified Immediate +void ARM64FloatEmitter::MOVI(u8 size, ARM64Reg Rd, u64 imm, u8 shift) { + bool Q = IsQuad(Rd); + u8 cmode = 0; + u8 op = 0; + u8 abcdefgh = imm & 0xFF; + if (size == 8) { + ASSERT_MSG(shift == 0, "%s(size8) doesn't support shift!", __func__); + ASSERT_MSG(!(imm & ~0xFFULL), "%s(size8) only 
supports 8bit values!", __func__); + } else if (size == 16) { + ASSERT_MSG(shift == 0 || shift == 8, "%s(size16) only supports shift of {0, 8}!", __func__); + ASSERT_MSG(!(imm & ~0xFFULL), "%s(size16) only supports 8bit values!", __func__); + + if (shift == 8) + cmode |= 2; + } else if (size == 32) { + ASSERT_MSG(shift == 0 || shift == 8 || shift == 16 || shift == 24, + "%s(size32) only supports shift of {0, 8, 16, 24}!", __func__); + // XXX: Implement support for MOVI - shifting ones variant + ASSERT_MSG(!(imm & ~0xFFULL), "%s(size32) only supports 8bit values!", __func__); + switch (shift) { + case 8: + cmode |= 2; + break; + case 16: + cmode |= 4; + break; + case 24: + cmode |= 6; + break; + default: + break; + } + } else // 64 + { + ASSERT_MSG(shift == 0, "%s(size64) doesn't support shift!", __func__); + + op = 1; + cmode = 0xE; + abcdefgh = 0; + for (int i = 0; i < 8; ++i) { + u8 tmp = (imm >> (i << 3)) & 0xFF; + ASSERT_MSG(tmp == 0xFF || tmp == 0, "%s(size64) Invalid immediate!", __func__); + if (tmp == 0xFF) + abcdefgh |= (1 << i); + } + } + EncodeModImm(Q, op, cmode, 0, Rd, abcdefgh); +} + +void ARM64FloatEmitter::BIC(u8 size, ARM64Reg Rd, u8 imm, u8 shift) { + bool Q = IsQuad(Rd); + u8 cmode = 1; + u8 op = 1; + if (size == 16) { + ASSERT_MSG(shift == 0 || shift == 8, "%s(size16) only supports shift of {0, 8}!", __func__); + + if (shift == 8) + cmode |= 2; + } else if (size == 32) { + ASSERT_MSG(shift == 0 || shift == 8 || shift == 16 || shift == 24, + "%s(size32) only supports shift of {0, 8, 16, 24}!", __func__); + // XXX: Implement support for MOVI - shifting ones variant + switch (shift) { + case 8: + cmode |= 2; + break; + case 16: + cmode |= 4; + break; + case 24: + cmode |= 6; + break; + default: + break; + } + } else { + ASSERT_MSG(false, "%s only supports size of {16, 32}!", __func__); + } + EncodeModImm(Q, op, cmode, 0, Rd, imm); +} + +void ARM64FloatEmitter::ABI_PushRegisters(u32 registers, ARM64Reg tmp) { + bool bundled_loadstore = false; + + for (int i = 0; i < 32; ++i) { + if (!Common::Bit(i, registers)) + continue; + + int count = 0; + while (++count < 4 && (i + count) < 32 && Common::Bit(i + count, registers)) { + } + if (count > 1) { + bundled_loadstore = true; + break; + } + } + + if (bundled_loadstore && tmp != INVALID_REG) { + int num_regs = Common::BitCount(registers); + m_emit->SUB(SP, SP, num_regs * 16); + m_emit->ADD(tmp, SP, 0); + std::vector island_regs; + for (int i = 0; i < 32; ++i) { + if (!Common::Bit(i, registers)) + continue; + + int count = 0; + + // 0 = true + // 1 < 4 && registers[i + 1] true! + // 2 < 4 && registers[i + 2] true! + // 3 < 4 && registers[i + 3] true! + // 4 < 4 && registers[i + 4] false! 
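+            // i.e. count how many consecutive registers (at most 4) are set
+            // starting at bit i, so they can be stored with a single ST1 below;
+            // lone registers are collected into island_regs and paired later.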
+ while (++count < 4 && (i + count) < 32 && Common::Bit(i + count, registers)) { + } + + if (count == 1) + island_regs.push_back((ARM64Reg)(Q0 + i)); + else + ST1(64, count, INDEX_POST, (ARM64Reg)(Q0 + i), tmp); + + i += count - 1; + } + + // Handle island registers + std::vector pair_regs; + for (auto& it : island_regs) { + pair_regs.push_back(it); + if (pair_regs.size() == 2) { + STP(128, INDEX_POST, pair_regs[0], pair_regs[1], tmp, 32); + pair_regs.clear(); + } + } + if (pair_regs.size()) + STR(128, INDEX_POST, pair_regs[0], tmp, 16); + } else { + std::vector pair_regs; + for (int i = 0; i < 32; ++i) { + if (Common::Bit(i, registers)) { + pair_regs.push_back((ARM64Reg)(Q0 + i)); + if (pair_regs.size() == 2) { + STP(128, INDEX_PRE, pair_regs[0], pair_regs[1], SP, -32); + pair_regs.clear(); + } + } + } + if (pair_regs.size()) + STR(128, INDEX_PRE, pair_regs[0], SP, -16); + } +} +void ARM64FloatEmitter::ABI_PopRegisters(u32 registers, ARM64Reg tmp) { + bool bundled_loadstore = false; + int num_regs = Common::BitCount(registers); + + for (int i = 0; i < 32; ++i) { + if (!Common::Bit(i, registers)) + continue; + + int count = 0; + while (++count < 4 && (i + count) < 32 && Common::Bit(i + count, registers)) { + } + if (count > 1) { + bundled_loadstore = true; + break; + } + } + + if (bundled_loadstore && tmp != INVALID_REG) { + // The temporary register is only used to indicate that we can use this code path + std::vector island_regs; + for (int i = 0; i < 32; ++i) { + if (!Common::Bit(i, registers)) + continue; + + u8 count = 0; + while (++count < 4 && (i + count) < 32 && Common::Bit(i + count, registers)) { + } + + if (count == 1) + island_regs.push_back(static_cast(Q0 + i)); + else + LD1(64, count, INDEX_POST, static_cast(Q0 + i), SP); + + i += count - 1; + } + + // Handle island registers + std::vector pair_regs; + for (auto& it : island_regs) { + pair_regs.push_back(it); + if (pair_regs.size() == 2) { + LDP(128, INDEX_POST, pair_regs[0], pair_regs[1], SP, 32); + pair_regs.clear(); + } + } + if (pair_regs.size()) + LDR(128, INDEX_POST, pair_regs[0], SP, 16); + } else { + bool odd = num_regs % 2; + std::vector pair_regs; + for (int i = 31; i >= 0; --i) { + if (!Common::Bit(i, registers)) + continue; + + if (odd) { + // First load must be a regular LDR if odd + odd = false; + LDR(128, INDEX_POST, static_cast(Q0 + i), SP, 16); + } else { + pair_regs.push_back(static_cast(Q0 + i)); + if (pair_regs.size() == 2) { + LDP(128, INDEX_POST, pair_regs[1], pair_regs[0], SP, 32); + pair_regs.clear(); + } + } + } + } +} + +void ARM64XEmitter::ANDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) { + unsigned int n, imm_s, imm_r; + if (!Is64Bit(Rn)) + imm &= 0xFFFFFFFF; + if (IsImmLogical(imm, Is64Bit(Rn) ? 64 : 32, &n, &imm_s, &imm_r)) { + AND(Rd, Rn, imm_r, imm_s, n != 0); + } else { + ASSERT_MSG(scratch != INVALID_REG, + "ANDI2R - failed to construct logical immediate value from " + "%08x, need scratch", + static_cast(imm)); + MOVI2R(scratch, imm); + AND(Rd, Rn, scratch); + } +} + +void ARM64XEmitter::ORRI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) { + unsigned int n, imm_s, imm_r; + if (IsImmLogical(imm, Is64Bit(Rn) ? 
64 : 32, &n, &imm_s, &imm_r)) { + ORR(Rd, Rn, imm_r, imm_s, n != 0); + } else { + ASSERT_MSG(scratch != INVALID_REG, + "ORRI2R - failed to construct logical immediate value from " + "%08x, need scratch", + static_cast(imm)); + MOVI2R(scratch, imm); + ORR(Rd, Rn, scratch); + } +} + +void ARM64XEmitter::EORI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) { + unsigned int n, imm_s, imm_r; + if (IsImmLogical(imm, Is64Bit(Rn) ? 64 : 32, &n, &imm_s, &imm_r)) { + EOR(Rd, Rn, imm_r, imm_s, n != 0); + } else { + ASSERT_MSG(scratch != INVALID_REG, + "EORI2R - failed to construct logical immediate value from " + "%08x, need scratch", + static_cast(imm)); + MOVI2R(scratch, imm); + EOR(Rd, Rn, scratch); + } +} + +void ARM64XEmitter::ANDSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) { + unsigned int n, imm_s, imm_r; + if (IsImmLogical(imm, Is64Bit(Rn) ? 64 : 32, &n, &imm_s, &imm_r)) { + ANDS(Rd, Rn, imm_r, imm_s, n != 0); + } else { + ASSERT_MSG(scratch != INVALID_REG, + "ANDSI2R - failed to construct logical immediate value from " + "%08x, need scratch", + static_cast(imm)); + MOVI2R(scratch, imm); + ANDS(Rd, Rn, scratch); + } +} + +void ARM64XEmitter::AddImmediate(ARM64Reg Rd, ARM64Reg Rn, u64 imm, bool shift, bool negative, + bool flags) { + switch ((negative << 1) | static_cast(flags)) { + case 0: + ADD(Rd, Rn, static_cast(imm), shift); + break; + case 1: + ADDS(Rd, Rn, static_cast(imm), shift); + break; + case 2: + SUB(Rd, Rn, static_cast(imm), shift); + break; + case 3: + SUBS(Rd, Rn, static_cast(imm), shift); + break; + } +} + +void ARM64XEmitter::ADDI2R_internal(ARM64Reg Rd, ARM64Reg Rn, u64 imm, bool negative, bool flags, + ARM64Reg scratch) { + bool has_scratch = scratch != INVALID_REG; + u64 imm_neg = Is64Bit(Rd) ? ~imm + 1 : (~imm + 1) & 0xFFFFFFFFuLL; + bool neg_neg = negative ? false : true; + + // Fast paths, aarch64 immediate instructions + // Try them all first + if (imm <= 0xFFF) { + AddImmediate(Rd, Rn, imm, false, negative, flags); + return; + } + if (imm <= 0xFFFFFF && (imm & 0xFFF) == 0) { + AddImmediate(Rd, Rn, imm >> 12, true, negative, flags); + return; + } + if (imm_neg <= 0xFFF) { + AddImmediate(Rd, Rn, imm_neg, false, neg_neg, flags); + return; + } + if (imm_neg <= 0xFFFFFF && (imm_neg & 0xFFF) == 0) { + AddImmediate(Rd, Rn, imm_neg >> 12, true, neg_neg, flags); + return; + } + + // ADD+ADD is slower than MOVK+ADD, but inplace. + // But it supports a few more bits, so use it to avoid MOVK+MOVK+ADD. + // As this splits the addition in two parts, this must not be done on setting + // flags. 
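+    // For example, imm = 0x123456 is emitted as ADD Rd, Rn, #0x456 followed
+    // by ADD Rd, Rd, #0x123, LSL #12, with no scratch register needed.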
+ if (!flags && (imm >= 0x10000u || !has_scratch) && imm < 0x1000000u) { + AddImmediate(Rd, Rn, imm & 0xFFF, false, negative, false); + AddImmediate(Rd, Rd, imm >> 12, true, negative, false); + return; + } + if (!flags && (imm_neg >= 0x10000u || !has_scratch) && imm_neg < 0x1000000u) { + AddImmediate(Rd, Rn, imm_neg & 0xFFF, false, neg_neg, false); + AddImmediate(Rd, Rd, imm_neg >> 12, true, neg_neg, false); + return; + } + + ASSERT_MSG(has_scratch, + "ADDI2R - failed to construct arithmetic immediate value from " + "%08x, need scratch", + static_cast(imm)); + + negative ^= MOVI2R2(scratch, imm, imm_neg); + switch ((negative << 1) | static_cast(flags)) { + case 0: + ADD(Rd, Rn, scratch); + break; + case 1: + ADDS(Rd, Rn, scratch); + break; + case 2: + SUB(Rd, Rn, scratch); + break; + case 3: + SUBS(Rd, Rn, scratch); + break; + } +} + +void ARM64XEmitter::ADDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) { + ADDI2R_internal(Rd, Rn, imm, false, false, scratch); +} + +void ARM64XEmitter::ADDSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) { + ADDI2R_internal(Rd, Rn, imm, false, true, scratch); +} + +void ARM64XEmitter::SUBI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) { + ADDI2R_internal(Rd, Rn, imm, true, false, scratch); +} + +void ARM64XEmitter::SUBSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) { + ADDI2R_internal(Rd, Rn, imm, true, true, scratch); +} + +void ARM64XEmitter::CMPI2R(ARM64Reg Rn, u64 imm, ARM64Reg scratch) { + ADDI2R_internal(Is64Bit(Rn) ? ZR : WZR, Rn, imm, true, true, scratch); +} + +bool ARM64XEmitter::TryADDI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm) { + u32 val; + bool shift; + if (IsImmArithmetic(imm, &val, &shift)) + ADD(Rd, Rn, val, shift); + else + return false; + + return true; +} + +bool ARM64XEmitter::TrySUBI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm) { + u32 val; + bool shift; + if (IsImmArithmetic(imm, &val, &shift)) + SUB(Rd, Rn, val, shift); + else + return false; + + return true; +} + +bool ARM64XEmitter::TryCMPI2R(ARM64Reg Rn, u32 imm) { + u32 val; + bool shift; + if (IsImmArithmetic(imm, &val, &shift)) + CMP(Rn, val, shift); + else + return false; + + return true; +} + +bool ARM64XEmitter::TryANDI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm) { + u32 n, imm_r, imm_s; + if (IsImmLogical(imm, 32, &n, &imm_s, &imm_r)) + AND(Rd, Rn, imm_r, imm_s, n != 0); + else + return false; + + return true; +} +bool ARM64XEmitter::TryORRI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm) { + u32 n, imm_r, imm_s; + if (IsImmLogical(imm, 32, &n, &imm_s, &imm_r)) + ORR(Rd, Rn, imm_r, imm_s, n != 0); + else + return false; + + return true; +} +bool ARM64XEmitter::TryEORI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm) { + u32 n, imm_r, imm_s; + if (IsImmLogical(imm, 32, &n, &imm_s, &imm_r)) + EOR(Rd, Rn, imm_r, imm_s, n != 0); + else + return false; + + return true; +} + +void ARM64FloatEmitter::MOVI2F(ARM64Reg Rd, float value, ARM64Reg scratch, bool negate) { + ASSERT_MSG(!IsDouble(Rd), "MOVI2F does not yet support double precision"); + uint8_t imm8; + if (value == 0.0) { + FMOV(Rd, IsDouble(Rd) ? ZR : WZR); + if (negate) + FNEG(Rd, Rd); + // TODO: There are some other values we could generate with the float-imm + // instruction, like 1.0... 
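+        // (FPImm8FromFloat below should only succeed for values that fit the
+        // 8-bit FMOV immediate format, roughly +/-(16..31)/16 * 2^-3..2^4,
+        // e.g. 0.5, 1.0 or 10.0; anything else falls back to MOVI2R + FMOV.)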
+ } else if (FPImm8FromFloat(value, &imm8)) { + FMOV(Rd, imm8); + } else { + ASSERT_MSG(scratch != INVALID_REG, + "Failed to find a way to generate FP immediate %f without scratch", value); + if (negate) + value = -value; + + const u32 ival = Dynarmic::Common::BitCast(value); + m_emit->MOVI2R(scratch, ival); + FMOV(Rd, scratch); + } +} + +// TODO: Quite a few values could be generated easily using the MOVI instruction +// and friends. +void ARM64FloatEmitter::MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch) { + // TODO: Make it work with more element sizes + // TODO: Optimize - there are shorter solution for many values + ARM64Reg s = static_cast(S0 + DecodeReg(Rd)); + MOVI2F(s, value, scratch); + DUP(32, Rd, Rd, 0); +} + +} // namespace Dynarmic::BackendA64::Arm64Gen diff --git a/src/dynarmic/backend/A64/emitter/a64_emitter.h b/src/dynarmic/backend/A64/emitter/a64_emitter.h new file mode 100644 index 00000000..e7d84638 --- /dev/null +++ b/src/dynarmic/backend/A64/emitter/a64_emitter.h @@ -0,0 +1,1172 @@ +// Copyright 2015 Dolphin Emulator Project / 2018 dynarmic project +// Licensed under GPLv2+ +// Refer to the license.txt file included. + +#pragma once + +#include +#include +#include + +#include "arm_common.h" +#include "code_block.h" +#include "common/assert.h" +#include "common/common_types.h" + +namespace Dynarmic::BackendA64::Arm64Gen { + +// X30 serves a dual purpose as a link register +// Encoded as +// Types: +// 000 - 32bit GPR +// 001 - 64bit GPR +// 010 - VFP single precision +// 100 - VFP double precision +// 110 - VFP quad precision +enum ARM64Reg { + // 32bit registers + W0 = 0, + W1, + W2, + W3, + W4, + W5, + W6, + W7, + W8, + W9, + W10, + W11, + W12, + W13, + W14, + W15, + W16, + W17, + W18, + W19, + W20, + W21, + W22, + W23, + W24, + W25, + W26, + W27, + W28, + W29, + W30, + + WSP, // 32bit stack pointer + + // 64bit registers + X0 = 0x20, + X1, + X2, + X3, + X4, + X5, + X6, + X7, + X8, + X9, + X10, + X11, + X12, + X13, + X14, + X15, + X16, + X17, + X18, + X19, + X20, + X21, + X22, + X23, + X24, + X25, + X26, + X27, + X28, + X29, + X30, + + SP, // 64bit stack pointer + + // VFP single precision registers + S0 = 0x40, + S1, + S2, + S3, + S4, + S5, + S6, + S7, + S8, + S9, + S10, + S11, + S12, + S13, + S14, + S15, + S16, + S17, + S18, + S19, + S20, + S21, + S22, + S23, + S24, + S25, + S26, + S27, + S28, + S29, + S30, + S31, + + // VFP Double Precision registers + D0 = 0x80, + D1, + D2, + D3, + D4, + D5, + D6, + D7, + D8, + D9, + D10, + D11, + D12, + D13, + D14, + D15, + D16, + D17, + D18, + D19, + D20, + D21, + D22, + D23, + D24, + D25, + D26, + D27, + D28, + D29, + D30, + D31, + + // ASIMD Quad-Word registers + Q0 = 0xC0, + Q1, + Q2, + Q3, + Q4, + Q5, + Q6, + Q7, + Q8, + Q9, + Q10, + Q11, + Q12, + Q13, + Q14, + Q15, + Q16, + Q17, + Q18, + Q19, + Q20, + Q21, + Q22, + Q23, + Q24, + Q25, + Q26, + Q27, + Q28, + Q29, + Q30, + Q31, + + // For PRFM(prefetch memory) encoding + // This is encoded in the Rt register + // Data preload + PLDL1KEEP = 0, + PLDL1STRM, + PLDL2KEEP, + PLDL2STRM, + PLDL3KEEP, + PLDL3STRM, + // Instruction preload + PLIL1KEEP = 8, + PLIL1STRM, + PLIL2KEEP, + PLIL2STRM, + PLIL3KEEP, + PLIL3STRM, + // Prepare for store + PLTL1KEEP = 16, + PLTL1STRM, + PLTL2KEEP, + PLTL2STRM, + PLTL3KEEP, + PLTL3STRM, + + WZR = WSP, + ZR = SP, + + INVALID_REG = 0xFFFFFFFF +}; + +constexpr bool Is64Bit(ARM64Reg reg) { + return (reg & 0x20) != 0; +} +constexpr bool IsSingle(ARM64Reg reg) { + return (reg & 0xC0) == 0x40; +} +constexpr bool IsDouble(ARM64Reg reg) { + return 
(reg & 0xC0) == 0x80; +} +constexpr bool IsScalar(ARM64Reg reg) { + return IsSingle(reg) || IsDouble(reg); +} +constexpr bool IsQuad(ARM64Reg reg) { + return (reg & 0xC0) == 0xC0; +} +constexpr bool IsVector(ARM64Reg reg) { + return (reg & 0xC0) != 0; +} +constexpr bool IsGPR(ARM64Reg reg) { + return static_cast(reg) < 0x40; +} + +constexpr ARM64Reg DecodeReg(ARM64Reg reg) { + return static_cast(reg & 0x1F); +} +constexpr ARM64Reg EncodeRegTo64(ARM64Reg reg) { + return static_cast(reg | 0x20); +} +constexpr ARM64Reg EncodeRegToSingle(ARM64Reg reg) { + return static_cast(DecodeReg(reg) + S0); +} +constexpr ARM64Reg EncodeRegToDouble(ARM64Reg reg) { + return static_cast((reg & ~0xC0) | 0x80); +} +constexpr ARM64Reg EncodeRegToQuad(ARM64Reg reg) { + return static_cast(reg | 0xC0); +} + +enum OpType { TYPE_IMM = 0, TYPE_REG, TYPE_IMMSREG, TYPE_RSR, TYPE_MEM }; + +enum ShiftType { + ST_LSL = 0, + ST_LSR = 1, + ST_ASR = 2, + ST_ROR = 3, +}; + +enum IndexType { + INDEX_UNSIGNED, + INDEX_POST, + INDEX_PRE, + INDEX_SIGNED, // used in LDP/STP +}; + +enum ShiftAmount { + SHIFT_0 = 0, + SHIFT_16 = 1, + SHIFT_32 = 2, + SHIFT_48 = 3, +}; + +enum RoundingMode { + ROUND_A, // round to nearest, ties to away + ROUND_M, // round towards -inf + ROUND_N, // round to nearest, ties to even + ROUND_P, // round towards +inf + ROUND_Z, // round towards zero +}; + +// Size of each element in the Vector +enum ESize { + B, // Byte + H, // Half Word + S, // Single Word + D, // Double Word +}; + +struct FixupBranch { + u8* ptr; + // Type defines + // 0 = CBZ (32bit) + // 1 = CBNZ (32bit) + // 2 = B (conditional) + // 3 = TBZ + // 4 = TBNZ + // 5 = B (unconditional) + // 6 = BL (unconditional) + u32 type; + + // Used with B.cond + CCFlags cond; + + // Used with TBZ/TBNZ + u8 bit; + + // Used with Test/Compare and Branch + ARM64Reg reg; +}; + +// The only system registers accessible from EL0 (user space) +enum PStateField { + FIELD_SPSel = 0, + FIELD_DAIFSet, + FIELD_DAIFClr, + FIELD_NZCV, + FIELD_PMCR_EL0, + FIELD_PMCCNTR_EL0, + FIELD_FPCR = 0x340, + FIELD_FPSR = 0x341, +}; + +enum SystemHint { + HINT_NOP = 0, + HINT_YIELD, + HINT_WFE, + HINT_WFI, + HINT_SEV, + HINT_SEVL, +}; + +enum BarrierType { + OSHLD = 1, + OSHST = 2, + OSH = 3, + NSHLD = 5, + NSHST = 6, + NSH = 7, + ISHLD = 9, + ISHST = 10, + ISH = 11, + LD = 13, + ST = 14, + SY = 15, +}; + +class ArithOption { +public: + enum WidthSpecifier { + WIDTH_DEFAULT, + WIDTH_32BIT, + WIDTH_64BIT, + }; + + enum ExtendSpecifier { + EXTEND_UXTB = 0x0, + EXTEND_UXTH = 0x1, + EXTEND_UXTW = 0x2, /* Also LSL on 32bit width */ + EXTEND_UXTX = 0x3, /* Also LSL on 64bit width */ + EXTEND_SXTB = 0x4, + EXTEND_SXTH = 0x5, + EXTEND_SXTW = 0x6, + EXTEND_SXTX = 0x7, + }; + + enum TypeSpecifier { + TYPE_EXTENDEDREG, + TYPE_IMM, + TYPE_SHIFTEDREG, + }; + +private: + ARM64Reg m_destReg; + WidthSpecifier m_width; + ExtendSpecifier m_extend; + TypeSpecifier m_type; + ShiftType m_shifttype; + u32 m_shift; + +public: + ArithOption(ARM64Reg Rd, bool index = false) { + // Indexed registers are a certain feature of AARch64 + // On Loadstore instructions that use a register offset + // We can have the register as an index + // If we are indexing then the offset register will + // be shifted to the left so we are indexing at intervals + // of the size of what we are loading + // 8-bit: Index does nothing + // 16-bit: Index LSL 1 + // 32-bit: Index LSL 2 + // 64-bit: Index LSL 3 + if (index) + m_shift = 4; + else + m_shift = 0; + + m_destReg = Rd; + m_type = TYPE_EXTENDEDREG; + if (Is64Bit(Rd)) { + 
m_width = WIDTH_64BIT; + m_extend = EXTEND_UXTX; + } else { + m_width = WIDTH_32BIT; + m_extend = EXTEND_UXTW; + } + m_shifttype = ST_LSL; + } + ArithOption(ARM64Reg Rd, ShiftType shift_type, u32 shift) { + m_destReg = Rd; + m_shift = shift; + m_shifttype = shift_type; + m_type = TYPE_SHIFTEDREG; + if (Is64Bit(Rd)) { + m_width = WIDTH_64BIT; + if (shift == 64) + m_shift = 0; + } else { + m_width = WIDTH_32BIT; + if (shift == 32) + m_shift = 0; + } + } + TypeSpecifier GetType() const { + return m_type; + } + ARM64Reg GetReg() const { + return m_destReg; + } + u32 GetData() const { + switch (m_type) { + case TYPE_EXTENDEDREG: + return (m_extend << 13) | (m_shift << 10); + break; + case TYPE_SHIFTEDREG: + return (m_shifttype << 22) | (m_shift << 10); + break; + default: + ASSERT_MSG(false, "Invalid type in GetData"); + break; + } + return 0; + } +}; + +class ARM64XEmitter { + friend class ARM64FloatEmitter; + +private: + u8* m_code; + u8* m_lastCacheFlushEnd; + + void AddImmediate(ARM64Reg Rd, ARM64Reg Rn, u64 imm, bool shift, bool negative, bool flags); + void EncodeCompareBranchInst(u32 op, ARM64Reg Rt, const void* ptr); + void EncodeTestBranchInst(u32 op, ARM64Reg Rt, u8 bits, const void* ptr); + void EncodeUnconditionalBranchInst(u32 op, const void* ptr); + void EncodeUnconditionalBranchInst(u32 opc, u32 op2, u32 op3, u32 op4, ARM64Reg Rn); + void EncodeExceptionInst(u32 instenc, u32 imm); + void EncodeSystemInst(u32 op0, u32 op1, u32 CRn, u32 CRm, u32 op2, ARM64Reg Rt); + void EncodeArithmeticInst(u32 instenc, bool flags, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, + ArithOption Option); + void EncodeArithmeticCarryInst(u32 op, bool flags, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void EncodeCondCompareImmInst(u32 op, ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond); + void EncodeCondCompareRegInst(u32 op, ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond); + void EncodeCondSelectInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond); + void EncodeData1SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn); + void EncodeData2SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void EncodeData3SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void EncodeLogicalInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void EncodeLoadRegisterInst(u32 bitop, ARM64Reg Rt, s32 imm); + void EncodeLoadStoreExcInst(u32 instenc, ARM64Reg Rs, ARM64Reg Rt2, ARM64Reg Rn, ARM64Reg Rt); + void EncodeLoadStorePairedInst(u32 op, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm); + void EncodeLoadStoreIndexedInst(u32 op, u32 op2, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void EncodeLoadStoreIndexedInst(u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm, u8 size); + void EncodeMOVWideInst(u32 op, ARM64Reg Rd, u32 imm, ShiftAmount pos); + void EncodeBitfieldMOVInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms); + void EncodeLoadStoreRegisterOffset(u32 size, u32 opc, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void EncodeAddSubImmInst(u32 op, bool flags, u32 shift, u32 imm, ARM64Reg Rn, ARM64Reg Rd); + void EncodeLogicalImmInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, int n); + void EncodeLoadStorePair(u32 op, u32 load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, + ARM64Reg Rn, s32 imm); + void EncodeAddressInst(u32 op, ARM64Reg Rd, s32 imm); + void EncodeLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + +protected: + void Write32(u32 value); + +public: + ARM64XEmitter() : m_code(nullptr), m_lastCacheFlushEnd(nullptr) { + 
} + + ARM64XEmitter(u8* code_ptr) { + m_code = code_ptr; + m_lastCacheFlushEnd = code_ptr; + } + + virtual ~ARM64XEmitter() { + } + + void SetCodePtr(u8* ptr); + void SetCodePtrUnsafe(u8* ptr); + void ReserveCodeSpace(u32 bytes); + const u8* AlignCode16(); + const u8* AlignCodePage(); + const u8* GetCodePtr() const; + void FlushIcache(); + void FlushIcacheSection(const u8* start, const u8* end); + u8* GetWritableCodePtr(); + + // FixupBranch branching + void SetJumpTarget(FixupBranch const& branch, u8* target = nullptr); + FixupBranch CBZ(ARM64Reg Rt); + FixupBranch CBNZ(ARM64Reg Rt); + FixupBranch B(CCFlags cond); + FixupBranch TBZ(ARM64Reg Rt, u8 bit); + FixupBranch TBNZ(ARM64Reg Rt, u8 bit); + FixupBranch B(); + FixupBranch BL(); + + // Compare and Branch + void CBZ(ARM64Reg Rt, const void* ptr); + void CBNZ(ARM64Reg Rt, const void* ptr); + + // Conditional Branch + void B(CCFlags cond, const void* ptr); + + // Test and Branch + void TBZ(ARM64Reg Rt, u8 bits, const void* ptr); + void TBNZ(ARM64Reg Rt, u8 bits, const void* ptr); + + // Unconditional Branch + void B(const void* ptr); + void BL(const void* ptr); + + // Unconditional Branch (register) + void BR(ARM64Reg Rn); + void BLR(ARM64Reg Rn); + void RET(ARM64Reg Rn = X30); + void ERET(); + void DRPS(); + + // Exception generation + void SVC(u32 imm); + void HVC(u32 imm); + void SMC(u32 imm); + void BRK(u32 imm); + void HLT(u32 imm); + void DCPS1(u32 imm); + void DCPS2(u32 imm); + void DCPS3(u32 imm); + + // System + void _MSR(PStateField field, u8 imm); + void _MSR(PStateField field, ARM64Reg Rt); + void MRS(ARM64Reg Rt, PStateField field); + void CNTVCT(ARM64Reg Rt); + + void HINT(SystemHint op); + void CLREX(); + void DSB(BarrierType type); + void DMB(BarrierType type); + void ISB(BarrierType type); + + // Add/Subtract (Extended/Shifted register) + void ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option); + void ADDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void ADDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option); + void SUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option); + void SUBS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SUBS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option); + void CMN(ARM64Reg Rn, ARM64Reg Rm); + void CMN(ARM64Reg Rn, ARM64Reg Rm, ArithOption Option); + void CMP(ARM64Reg Rn, ARM64Reg Rm); + void CMP(ARM64Reg Rn, ARM64Reg Rm, ArithOption Option); + + // Add/Subtract (with carry) + void ADC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void ADCS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SBC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SBCS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + + // Conditional Compare (immediate) + void CCMN(ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond); + void CCMP(ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond); + + // Conditional Compare (register) + void CCMN(ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond); + void CCMP(ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond); + + // Conditional Select + void CSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond); + void CSINC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond); + void CSINV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond); + void CSNEG(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond); + + // Aliases + void CSET(ARM64Reg Rd, CCFlags cond) { + ARM64Reg zr = Is64Bit(Rd) ? 
ZR : WZR; + CSINC(Rd, zr, zr, (CCFlags)((u32)cond ^ 1)); + } + void CSETM(ARM64Reg Rd, CCFlags cond) { + ARM64Reg zr = Is64Bit(Rd) ? ZR : WZR; + CSINV(Rd, zr, zr, (CCFlags)((u32)cond ^ 1)); + } + void NEG(ARM64Reg Rd, ARM64Reg Rs) { + SUB(Rd, Is64Bit(Rd) ? ZR : WZR, Rs); + } + // Data-Processing 1 source + void RBIT(ARM64Reg Rd, ARM64Reg Rn); + void REV16(ARM64Reg Rd, ARM64Reg Rn); + void REV32(ARM64Reg Rd, ARM64Reg Rn); + void REV64(ARM64Reg Rd, ARM64Reg Rn); + void CLZ(ARM64Reg Rd, ARM64Reg Rn); + void CLS(ARM64Reg Rd, ARM64Reg Rn); + + // Data-Processing 2 source + void UDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void LSLV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void LSRV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void ASRV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void RORV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32B(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32H(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32W(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32CB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32CH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32CW(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32X(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CRC32CX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + + // Data-Processing 3 source + void MADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void MSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void SMADDL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void SMULL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SMSUBL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void SMULH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void UMADDL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void UMULL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void UMSUBL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void UMULH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void MUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void MNEG(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + + // Logical (shifted register) + void AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void BIC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void ORN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void EOR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void EON(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void ANDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + void BICS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift); + + // Wrap the above for saner syntax + void AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + AND(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); + } + void BIC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + BIC(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); + } + void ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + ORR(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); + } + void ORN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + ORN(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); + } + void EOR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EOR(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); + } + void EON(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + EON(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); + } + void ANDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + ANDS(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); + } + void BICS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { + BICS(Rd, Rn, Rm, ArithOption(Rd, ST_LSL, 0)); + } + + // Convenience 
wrappers around ORR. These match the official convenience + // syntax. + void MOV(ARM64Reg Rd, ARM64Reg Rm, ArithOption Shift); + void MOV(ARM64Reg Rd, ARM64Reg Rm); + void MVN(ARM64Reg Rd, ARM64Reg Rm); + + // Convenience wrappers around UBFM/EXTR. + void LSR(ARM64Reg Rd, ARM64Reg Rm, int shift); + void LSL(ARM64Reg Rd, ARM64Reg Rm, int shift); + void ASR(ARM64Reg Rd, ARM64Reg Rm, int shift); + void ROR(ARM64Reg Rd, ARM64Reg Rm, int shift); + + // Logical (immediate) + void AND(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert = false); + void ANDS(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert = false); + void EOR(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert = false); + void ORR(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, bool invert = false); + void TST(ARM64Reg Rn, u32 immr, u32 imms, bool invert = false); + void TST(ARM64Reg Rn, ARM64Reg Rm) { + ANDS(Is64Bit(Rn) ? ZR : WZR, Rn, Rm); + } + + // Add/subtract (immediate) + void ADD(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false); + void ADDS(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false); + void SUB(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false); + void SUBS(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false); + void CMP(ARM64Reg Rn, u32 imm, bool shift = false); + + // Data Processing (Immediate) + void MOVZ(ARM64Reg Rd, u32 imm, ShiftAmount pos = SHIFT_0); + void MOVN(ARM64Reg Rd, u32 imm, ShiftAmount pos = SHIFT_0); + void MOVK(ARM64Reg Rd, u32 imm, ShiftAmount pos = SHIFT_0); + + // Bitfield move + void BFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms); + void SBFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms); + void UBFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms); + void BFI(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width); + void UBFIZ(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width); + + // Extract register (ROR with two inputs, if same then faster on A67) + void EXTR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u32 shift); + + // Aliases + void SXTB(ARM64Reg Rd, ARM64Reg Rn); + void SXTH(ARM64Reg Rd, ARM64Reg Rn); + void SXTW(ARM64Reg Rd, ARM64Reg Rn); + void UXTB(ARM64Reg Rd, ARM64Reg Rn); + void UXTH(ARM64Reg Rd, ARM64Reg Rn); + void UBFX(ARM64Reg Rd, ARM64Reg Rn, int lsb, int width) { + UBFM(Rd, Rn, lsb, lsb + width - 1); + } + + // Load Register (Literal) + void LDR(ARM64Reg Rt, s32 imm); + void LDRSW(ARM64Reg Rt, s32 imm); + void PRFM(ARM64Reg Rt, s32 imm); + + // Load/Store Exclusive + void STXRB(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn); + void STLXRB(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn); + void LDXRB(ARM64Reg Rt, ARM64Reg Rn); + void LDAXRB(ARM64Reg Rt, ARM64Reg Rn); + void STLRB(ARM64Reg Rt, ARM64Reg Rn); + void LDARB(ARM64Reg Rt, ARM64Reg Rn); + void STXRH(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn); + void STLXRH(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn); + void LDXRH(ARM64Reg Rt, ARM64Reg Rn); + void LDAXRH(ARM64Reg Rt, ARM64Reg Rn); + void STLRH(ARM64Reg Rt, ARM64Reg Rn); + void LDARH(ARM64Reg Rt, ARM64Reg Rn); + void STXR(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn); + void STLXR(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn); + void STXP(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn); + void STLXP(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn); + void LDXR(ARM64Reg Rt, ARM64Reg Rn); + void LDAXR(ARM64Reg Rt, ARM64Reg Rn); + void LDXP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn); + void LDAXP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn); + void STLR(ARM64Reg Rt, ARM64Reg Rn); + void LDAR(ARM64Reg Rt, ARM64Reg Rn); + + // Load/Store no-allocate pair (offset) + void STNP(ARM64Reg Rt, 
ARM64Reg Rt2, ARM64Reg Rn, u32 imm); + void LDNP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm); + + // Load/Store register (immediate indexed) + void STRB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDRB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDRSB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void STRH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDRH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDRSH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void STR(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDR(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDRSW(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + + // Load/Store register (register offset) + void STRB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void LDRB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void LDRSB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void STRH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void LDRH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void LDRSH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void STR(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void LDR(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void LDRSW(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + void PRFM(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + + // Load/Store register (unscaled offset) + void STURB(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDURB(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDURSB(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void STURH(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDURH(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDURSH(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void STUR(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDUR(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDURSW(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + + // Load/Store pair + void LDP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm); + void LDPSW(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm); + void STP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm); + + // Address of label/page PC-relative + void ADR(ARM64Reg Rd, s32 imm); + void ADRP(ARM64Reg Rd, s32 imm); + + // Wrapper around MOVZ+MOVK + void MOVI2R(ARM64Reg Rd, u64 imm, bool optimize = true); + bool MOVI2R2(ARM64Reg Rd, u64 imm1, u64 imm2); + template + void MOVP2R(ARM64Reg Rd, P* ptr) { + ASSERT_MSG(Is64Bit(Rd), "Can't store pointers in 32-bit registers"); + MOVI2R(Rd, (uintptr_t)ptr); + } + + // Wrapper around AND x, y, imm etc. If you are sure the imm will work, no + // need to pass a scratch register. + void ANDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + void ANDSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + void TSTI2R(ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG) { + ANDSI2R(Is64Bit(Rn) ? 
ZR : WZR, Rn, imm, scratch); + } + void ORRI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + void EORI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + void CMPI2R(ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + + void ADDI2R_internal(ARM64Reg Rd, ARM64Reg Rn, u64 imm, bool negative, bool flags, + ARM64Reg scratch); + void ADDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + void ADDSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + void SUBI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + void SUBSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG); + + bool TryADDI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm); + bool TrySUBI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm); + bool TryCMPI2R(ARM64Reg Rn, u32 imm); + + bool TryANDI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm); + bool TryORRI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm); + bool TryEORI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm); + + // ABI related + void ABI_PushRegisters(u32 registers); + void ABI_PopRegisters(u32 registers); + + // Utility to generate a call to a std::function object. + // + // Unfortunately, calling operator() directly is undefined behavior in C++ + // (this method might be a thunk in the case of multi-inheritance) so we + // have to go through a trampoline function. + template + static T CallLambdaTrampoline(const std::function* f, Args... args) { + return (*f)(args...); + } + + // This function expects you to have set up the state. + // Overwrites X0 and X30 + template + ARM64Reg ABI_SetupLambda(const std::function* f) { + auto trampoline = &ARM64XEmitter::CallLambdaTrampoline; + MOVI2R(X30, (uintptr_t)trampoline); + MOVI2R(X0, (uintptr_t) const_cast((const void*)f)); + return X30; + } + + // Plain function call + void QuickCallFunction(const void* func, ARM64Reg scratchreg = X16); + template + void QuickCallFunction(T func, ARM64Reg scratchreg = X16) { + QuickCallFunction((const void*)func, scratchreg); + } +}; + +class ARM64FloatEmitter { +public: + ARM64FloatEmitter(ARM64XEmitter* emit) : m_emit(emit) { + } + + void LDR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void STR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + + // Loadstore unscaled + void LDUR(u8 size, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void STUR(u8 size, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + + // Loadstore single structure + void LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn); + void LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm); + void LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn); + void LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn); + void LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm); + void LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm); + void ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn); + void ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm); + + // Loadstore multiple structure + void LD1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn); + void LD1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm = SP); + void ST1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn); + void ST1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm = SP); + + // Loadstore paired + void LDP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm); + void STP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm); + + // Loadstore register offset + void STR(u8 size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); 
+ void LDR(u8 size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + + // Scalar - 1 Source + void FABS(ARM64Reg Rd, ARM64Reg Rn); + void FNEG(ARM64Reg Rd, ARM64Reg Rn); + void FSQRT(ARM64Reg Rd, ARM64Reg Rn); + void FMOV(ARM64Reg Rd, ARM64Reg Rn, bool top = false); // Also generalized move between GPR/FP + + // Scalar - 2 Source + void FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMAX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMIN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMAXNM(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMINNM(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FNMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + + // Scalar - 3 Source. Note - the accumulator is last on ARM! + void FMADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void FMSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void FNMADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void FNMSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + + // Scalar three same + void SQADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void UQADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SQSUB(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void UQSUB(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + + // Scalar floating point immediate + void FMOV(ARM64Reg Rd, uint8_t imm8); + + // Vector + void ADD(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void ADDV(ESize esize, ARM64Reg Rd, ARM64Reg Rn); + void SUB(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void BSL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CMGE(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CMGE_zero(ESize esize, ARM64Reg Rd, ARM64Reg Rn); + void CMGT(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CMHI(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CMHS(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index); + void FABS(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMAX(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMLA(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMLS(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMIN(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FCVTL(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCVTL2(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCVTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + void FCVTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + void FCVTZS(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCVTZU(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FDIV(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FNEG(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FRECPE(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FRSQRTE(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FSUB(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void NOT(ARM64Reg Rd, ARM64Reg Rn); + void ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void MOV(ARM64Reg Rd, ARM64Reg Rn) { + ORR(Rd, Rn, Rn); + } + void REV16(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void REV32(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void REV64(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void SABD(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void UABD(ESize esize, ARM64Reg Rd, 
ARM64Reg Rn, ARM64Reg Rm); + void SADDLV(ESize esize, ARM64Reg Rd, ARM64Reg Rn); + void UADDLV(ESize esize, ARM64Reg Rd, ARM64Reg Rn); + void SHADD(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void UHADD(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SHSUB(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void UHSUB(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SMIN(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void UMIN(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SQADD(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SQSUB(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void UQADD(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void UQSUB(ESize esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void SCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void SCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale); + void UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale); + void SQXTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + void SQXTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + void UQXTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + void UQXTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + void XTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + void XTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn); + + // Move + void DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void INS(u8 size, ARM64Reg Rd, u8 index, ARM64Reg Rn); + void INS(u8 size, ARM64Reg Rd, u8 index1, ARM64Reg Rn, u8 index2); + void UMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index); + void SMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index); + + // One source + void FCVT(u8 size_to, u8 size_from, ARM64Reg Rd, ARM64Reg Rn); + + // Scalar convert float to int, in a lot of variants. + // Note that the scalar version of this operation has two encodings, one that + // goes to an integer register and one that outputs to a scalar fp register. + void FCVTS(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round); + void FCVTU(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round); + + // Scalar convert int to float. No rounding mode specifier necessary. + void SCVTF(ARM64Reg Rd, ARM64Reg Rn); + void UCVTF(ARM64Reg Rd, ARM64Reg Rn); + + // Scalar fixed point to float. scale is the number of fractional bits. 
+ void SCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale); + void UCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale); + + // Float comparison + void FCMP(ARM64Reg Rn, ARM64Reg Rm); + void FCMP(ARM64Reg Rn); + void FCMPE(ARM64Reg Rn, ARM64Reg Rm); + void FCMPE(ARM64Reg Rn); + void FCMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FCMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FCMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCMLE(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FCMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn); + + // Conditional select + void FCSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond); + + // Permute + void UZP1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void TRN1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void ZIP1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void UZP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void TRN2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void ZIP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + + // Shift by immediate + void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void SSHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void USHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void SHRN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn); + void SXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn); + void UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn); + void UXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn); + + // vector x indexed element + void FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index); + void FMLA(u8 esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index); + + // Modified Immediate + void MOVI(u8 size, ARM64Reg Rd, u64 imm, u8 shift = 0); + void BIC(u8 size, ARM64Reg Rd, u8 imm, u8 shift = 0); + + void MOVI2F(ARM64Reg Rd, float value, ARM64Reg scratch = INVALID_REG, bool negate = false); + void MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch = INVALID_REG); + + // ABI related + void ABI_PushRegisters(u32 registers, ARM64Reg tmp = INVALID_REG); + void ABI_PopRegisters(u32 registers, ARM64Reg tmp = INVALID_REG); + +private: + ARM64XEmitter* m_emit; + + inline void Write32(u32 value) { + m_emit->Write32(value); + } + + // Emitting functions + void EmitLoadStoreImmediate(u8 size, u32 opc, IndexType type, ARM64Reg Rt, ARM64Reg Rn, + s32 imm); + void EmitScalar2Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, + ARM64Reg Rm); + void EmitThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void EmitScalarThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void EmitCopy(bool Q, u32 op, u32 imm5, u32 imm4, ARM64Reg Rd, ARM64Reg Rn); + void Emit2RegMisc(bool Q, bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); + void EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, ARM64Reg Rt, + ARM64Reg Rn); + void EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, ARM64Reg Rt, + ARM64Reg Rn, ARM64Reg Rm); + void Emit1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); + void EmitConversion(bool sf, bool S, u32 type, u32 rmode, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); + void 
EmitConversion2(bool sf, bool S, bool direction, u32 type, u32 rmode, u32 opcode, + int scale, ARM64Reg Rd, ARM64Reg Rn); + void EmitCompare(bool M, bool S, u32 op, u32 opcode2, ARM64Reg Rn, ARM64Reg Rm); + void EmitCondSelect(bool M, bool S, CCFlags cond, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void EmitPermute(u32 size, u32 op, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void EmitScalarImm(bool M, bool S, u32 type, u32 imm5, ARM64Reg Rd, u32 imm8); + void EmitShiftImm(bool Q, bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); + void EmitScalarShiftImm(bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); + void EmitLoadStoreMultipleStructure(u32 size, bool L, u32 opcode, ARM64Reg Rt, ARM64Reg Rn); + void EmitLoadStoreMultipleStructurePost(u32 size, bool L, u32 opcode, ARM64Reg Rt, ARM64Reg Rn, + ARM64Reg Rm); + void EmitScalar1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); + void EmitVectorxElement(bool U, u32 size, bool L, u32 opcode, bool H, ARM64Reg Rd, ARM64Reg Rn, + ARM64Reg Rm); + void EmitLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void EmitConvertScalarToInt(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round, bool sign); + void EmitScalar3Source(bool isDouble, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra, + int opcode); + void EncodeLoadStorePair(u32 size, bool load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, + ARM64Reg Rn, s32 imm); + void EncodeLoadStoreRegisterOffset(u32 size, bool load, ARM64Reg Rt, ARM64Reg Rn, + ArithOption Rm); + void EncodeModImm(bool Q, u8 op, u8 cmode, u8 o2, ARM64Reg Rd, u8 abcdefgh); + + void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper); + void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper); + void SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper); + void SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper); + void UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper); +}; + +class ARM64CodeBlock : public CodeBlock { +private: + void PoisonMemory() override { + // If our memory isn't a multiple of u32 then this won't write the last + // remaining bytes with anything Less than optimal, but there would be + // nothing we could do but throw a runtime warning anyway. AArch64: + // 0xD4200000 = BRK 0 + constexpr u32 brk_0 = 0xD4200000; + + for (size_t i = 0; i < region_size; i += sizeof(u32)) { + std::memcpy(region + i, &brk_0, sizeof(u32)); + } + } +}; + +} // namespace Dynarmic::BackendA64::Arm64Gen diff --git a/src/dynarmic/backend/A64/emitter/arm_common.h b/src/dynarmic/backend/A64/emitter/arm_common.h new file mode 100644 index 00000000..257467a6 --- /dev/null +++ b/src/dynarmic/backend/A64/emitter/arm_common.h @@ -0,0 +1,28 @@ +// Copyright 2014 Dolphin Emulator Project / 2018 dynarmic project +// Licensed under GPLv2+ +// Refer to the license.txt file included. 
+ +#include "common/common_types.h" + +namespace Dynarmic::BackendA64 { +enum CCFlags { + CC_EQ = 0, // Equal + CC_NEQ, // Not equal + CC_CS, // Carry Set + CC_CC, // Carry Clear + CC_MI, // Minus (Negative) + CC_PL, // Plus + CC_VS, // Overflow + CC_VC, // No Overflow + CC_HI, // Unsigned higher + CC_LS, // Unsigned lower or same + CC_GE, // Signed greater than or equal + CC_LT, // Signed less than + CC_GT, // Signed greater than + CC_LE, // Signed less than or equal + CC_AL, // Always (unconditional) 14 + CC_HS = CC_CS, // Alias of CC_CS Unsigned higher or same + CC_LO = CC_CC, // Alias of CC_CC Unsigned lower +}; +const u32 NO_COND = 0xE0000000; +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/emitter/code_block.h b/src/dynarmic/backend/A64/emitter/code_block.h new file mode 100644 index 00000000..26b0ebbc --- /dev/null +++ b/src/dynarmic/backend/A64/emitter/code_block.h @@ -0,0 +1,139 @@ +// Copyright 2014 Dolphin Emulator Project / 2018 dynarmic project +// Licensed under GPLv2+ +// Refer to the license.txt file included. + +#pragma once + +#include +#include + +#ifdef _WIN32 +#include +#else +#include +#endif + +#include "common/assert.h" +#include "common/common_types.h" + +namespace Dynarmic::BackendA64 { +// Everything that needs to generate code should inherit from this. +// You get memory management for free, plus, you can use all emitter functions +// without having to prefix them with gen-> or something similar. Example +// implementation: class JIT : public CodeBlock {} +template +class CodeBlock : public T { +private: + // A privately used function to set the executable RAM space to something + // invalid. For debugging usefulness it should be used to set the RAM to a + // host specific breakpoint instruction + virtual void PoisonMemory() = 0; + +protected: + u8* region = nullptr; + // Size of region we can use. + size_t region_size = 0; + // Original size of the region we allocated. + size_t total_region_size = 0; + + bool m_is_child = false; + std::vector m_children; + +public: + CodeBlock() = default; + virtual ~CodeBlock() { + if (region) + FreeCodeSpace(); + } + CodeBlock(const CodeBlock&) = delete; + CodeBlock& operator=(const CodeBlock&) = delete; + CodeBlock(CodeBlock&&) = delete; + CodeBlock& operator=(CodeBlock&&) = delete; + + // Call this before you generate any code. + void AllocCodeSpace(size_t size) { + region_size = size; + total_region_size = size; +#if defined(_WIN32) + void* ptr = VirtualAlloc(nullptr, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE); +#else +#if defined(__APPLE__) + void* ptr = mmap(nullptr, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_ANON | MAP_PRIVATE | MAP_JIT, -1, 0); +#else + void* ptr = mmap(nullptr, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_ANON | MAP_PRIVATE, -1, 0); +#endif + + if (ptr == MAP_FAILED) + ptr = nullptr; +#endif + ASSERT_MSG(ptr != nullptr, "Failed to allocate executable memory"); + region = static_cast(ptr); + T::SetCodePtr(region); + } + + // Always clear code space with breakpoints, so that if someone accidentally + // executes uninitialized, it just breaks into the debugger. + void ClearCodeSpace() { + PoisonMemory(); + ResetCodePtr(); + } + + // Call this when shutting down. Don't rely on the destructor, even though + // it'll do the job. 
+ void FreeCodeSpace() { + ASSERT(!m_is_child); + ASSERT(munmap(region, total_region_size) == 0); + region = nullptr; + region_size = 0; + total_region_size = 0; + for (CodeBlock* child : m_children) { + child->region = nullptr; + child->region_size = 0; + child->total_region_size = 0; + } + } + + bool IsInSpace(const u8* ptr) const { + return ptr >= region && ptr < (region + region_size); + } + // Cannot currently be undone. Will write protect the entire code region. + // Start over if you need to change the code (call FreeCodeSpace(), + // AllocCodeSpace()). + void WriteProtect() { + ASSERT(mprotect(region, region_size, PROT_READ | PROT_EXEC) == 0); + } + void ResetCodePtr() { + T::SetCodePtr(region); + } + size_t GetSpaceLeft() const { + ASSERT(static_cast<size_t>(T::GetCodePtr() - region) < region_size); + return region_size - (T::GetCodePtr() - region); + } + + bool IsAlmostFull() const { + // This should be bigger than the biggest block ever. + return GetSpaceLeft() < 0x10000; + } + + bool HasChildren() const { + return region_size != total_region_size; + } + + u8* AllocChildCodeSpace(size_t child_size) { + ASSERT_MSG(child_size < GetSpaceLeft(), "Insufficient space for child allocation."); + u8* child_region = region + region_size - child_size; + region_size -= child_size; + return child_region; + } + + void AddChildCodeSpace(CodeBlock* child, size_t child_size) { + u8* child_region = AllocChildCodeSpace(child_size); + child->m_is_child = true; + child->region = child_region; + child->region_size = child_size; + child->total_region_size = child_size; + child->ResetCodePtr(); + m_children.emplace_back(child); + } +}; +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/exception_handler.h b/src/dynarmic/backend/A64/exception_handler.h new file mode 100644 index 00000000..04eb7d0c --- /dev/null +++ b/src/dynarmic/backend/A64/exception_handler.h @@ -0,0 +1,39 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. + */ + +#pragma once + +#include <array> +#include <functional> +#include <memory> + +#include "backend/A64/a32_jitstate.h" +#include "common/common_types.h" + +namespace Dynarmic::BackendA64 { + +class BlockOfCode; + +struct A64State { + std::array X; + std::array, 16> Q; +}; +static_assert(sizeof(A64State) == sizeof(A64State::X) + sizeof(A64State::Q)); + +class ExceptionHandler final { +public: + ExceptionHandler(); + ~ExceptionHandler(); + + void Register(BlockOfCode& code, std::function<void(CodePtr)> segv_callback = nullptr); + + bool SupportsFastmem() const; +private: + struct Impl; + std::unique_ptr<Impl> impl; +}; + +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/exception_handler_generic.cpp b/src/dynarmic/backend/A64/exception_handler_generic.cpp new file mode 100644 index 00000000..c5b17c07 --- /dev/null +++ b/src/dynarmic/backend/A64/exception_handler_generic.cpp @@ -0,0 +1,25 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version.
+ */ + +#include "backend/A64/exception_handler.h" + +namespace Dynarmic::BackendA64 { + +struct ExceptionHandler::Impl final { +}; + +ExceptionHandler::ExceptionHandler() = default; +ExceptionHandler::~ExceptionHandler() = default; + +void ExceptionHandler::Register(BlockOfCode&, std::function) { + // Do nothing +} + +bool ExceptionHandler::SupportsFastmem() const { + return false; +} + +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/exception_handler_posix.cpp b/src/dynarmic/backend/A64/exception_handler_posix.cpp new file mode 100644 index 00000000..0ddb55e4 --- /dev/null +++ b/src/dynarmic/backend/A64/exception_handler_posix.cpp @@ -0,0 +1,166 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2019 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. + */ + +// Copyright 2008 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license.txt file included. + +#include +#include + +#include +#ifdef __APPLE__ +#include +#else +#include +#endif + +#include "backend/A64/a32_jitstate.h" +#include "backend/A64/block_of_code.h" +#include "backend/A64/exception_handler.h" +#include "common/assert.h" +#include "common/cast_util.h" +#include "common/common_types.h" + +namespace Dynarmic::BackendA64 { + +namespace { + +struct CodeBlockInfo { + BlockOfCode* block; + std::function callback; +}; + +class SigHandler { +public: + SigHandler(); + + ~SigHandler(); + + void AddCodeBlock(CodeBlockInfo info); + + void RemoveCodeBlock(CodePtr PC); + +private: + auto FindCodeBlockInfo(CodePtr PC) { + return std::find_if(code_block_infos.begin(), code_block_infos.end(), + [&](const CodeBlockInfo& x) { return x.block->GetRegion() <= PC && x.block->GetRegion() + x.block->GetRegionSize() > PC; }); + } + + std::vector code_block_infos; + std::mutex code_block_infos_mutex; + + struct sigaction old_sa_segv; + struct sigaction old_sa_bus; + + static void SigAction(int sig, siginfo_t* info, void* raw_context); +}; + +SigHandler sig_handler; + +SigHandler::SigHandler() { + // Method below from dolphin. + + const size_t signal_stack_size = std::max(SIGSTKSZ, 2 * 1024 * 1024); + + stack_t signal_stack; + signal_stack.ss_sp = malloc(signal_stack_size); + signal_stack.ss_size = signal_stack_size; + signal_stack.ss_flags = 0; + ASSERT_MSG(sigaltstack(&signal_stack, nullptr) == 0, + "dynarmic: POSIX SigHandler: init failure at sigaltstack"); + + struct sigaction sa; + sa.sa_handler = nullptr; + sa.sa_sigaction = &SigHandler::SigAction; + sa.sa_flags = SA_SIGINFO | SA_ONSTACK | SA_RESTART; + sigemptyset(&sa.sa_mask); + sigaction(SIGSEGV, &sa, &old_sa_segv); +} + +SigHandler::~SigHandler() { + // No cleanup required. 
+} + +void SigHandler::AddCodeBlock(CodeBlockInfo cb) { + std::lock_guard<std::mutex> guard(code_block_infos_mutex); + ASSERT(FindCodeBlockInfo(cb.block->GetRegion()) == code_block_infos.end()); + code_block_infos.push_back(std::move(cb)); +} + +void SigHandler::RemoveCodeBlock(CodePtr PC) { + std::lock_guard<std::mutex> guard(code_block_infos_mutex); + const auto iter = FindCodeBlockInfo(PC); + ASSERT(iter != code_block_infos.end()); + code_block_infos.erase(iter); +} + +void SigHandler::SigAction(int sig, siginfo_t* info, void* raw_context) { + ASSERT(sig == SIGSEGV || sig == SIGBUS); + + std::lock_guard<std::mutex> guard(sig_handler.code_block_infos_mutex); +#ifdef __APPLE__ + auto PC = reinterpret_cast<CodePtr>(((ucontext_t*)raw_context)->uc_mcontext->__ss.__pc); +#else + auto PC = reinterpret_cast<CodePtr>(((ucontext_t*)raw_context)->uc_mcontext.pc); +#endif + const auto iter = sig_handler.FindCodeBlockInfo(PC); + if (iter != sig_handler.code_block_infos.end()) { + iter->callback(PC); + return; + } + + fmt::print( + stderr, + "dynarmic: POSIX SigHandler: Exception was not in registered code blocks (PC {})\n", + PC); + + struct sigaction* retry_sa = + sig == SIGSEGV ? &sig_handler.old_sa_segv : &sig_handler.old_sa_bus; + if (retry_sa->sa_flags & SA_SIGINFO) { + retry_sa->sa_sigaction(sig, info, raw_context); + return; + } + if (retry_sa->sa_handler == SIG_DFL) { + signal(sig, SIG_DFL); + return; + } + if (retry_sa->sa_handler == SIG_IGN) { + return; + } + retry_sa->sa_handler(sig); +} + +} // anonymous namespace + +struct ExceptionHandler::Impl final { + Impl(BlockOfCode& code, std::function<void(CodePtr)> cb) { + code_begin = code.GetRegion(); + sig_handler.AddCodeBlock({&code, std::move(cb)}); + } + + ~Impl() { + sig_handler.RemoveCodeBlock(code_begin); + } + +private: + CodePtr code_begin; +}; + +ExceptionHandler::ExceptionHandler() = default; + +ExceptionHandler::~ExceptionHandler() = default; + +void ExceptionHandler::Register(BlockOfCode& code, std::function<void(CodePtr)> cb) { + if (cb) + impl = std::make_unique<Impl>(code, std::move(cb)); +} + +bool ExceptionHandler::SupportsFastmem() const { + return static_cast<bool>(impl); +} + +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/hostloc.cpp b/src/dynarmic/backend/A64/hostloc.cpp new file mode 100644 index 00000000..8d1094ec --- /dev/null +++ b/src/dynarmic/backend/A64/hostloc.cpp @@ -0,0 +1,21 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. + */ + +#include "backend/A64/hostloc.h" + +namespace Dynarmic::BackendA64 { + +Arm64Gen::ARM64Reg HostLocToReg64(HostLoc loc) { + ASSERT(HostLocIsGPR(loc)); + return static_cast<Arm64Gen::ARM64Reg>(static_cast<int>(Arm64Gen::X0) + static_cast<int>(loc)); +} + +Arm64Gen::ARM64Reg HostLocToFpr(HostLoc loc) { + ASSERT(HostLocIsFPR(loc)); + return EncodeRegToQuad(static_cast<Arm64Gen::ARM64Reg>(static_cast<int>(loc) - static_cast<int>(HostLoc::Q0))); +} + +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/hostloc.h b/src/dynarmic/backend/A64/hostloc.h new file mode 100644 index 00000000..7183d0a8 --- /dev/null +++ b/src/dynarmic/backend/A64/hostloc.h @@ -0,0 +1,176 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version.
+ */ +#pragma once + +#include "backend/A64/emitter/a64_emitter.h" +#include "common/assert.h" +#include "common/common_types.h" + +namespace Dynarmic::BackendA64 { + +enum class HostLoc { + // Ordering of the registers is intentional. See also: HostLocToReg64 and HostLocToFpr. + + // 64bit GPR registers + X0, + X1, + X2, + X3, + X4, + X5, + X6, + X7, + X8, + X9, + X10, + X11, + X12, + X13, + X14, + X15, + X16, + X17, + X18, + X19, + X20, + X21, + X22, + X23, + X24, + X25, + X26, + X27, + X28, + X29, + X30, + + SP, // 64bit stack pointer + + // Qword FPR registers + Q0, + Q1, + Q2, + Q3, + Q4, + Q5, + Q6, + Q7, + Q8, + Q9, + Q10, + Q11, + Q12, + Q13, + Q14, + Q15, + Q16, + Q17, + Q18, + Q19, + Q20, + Q21, + Q22, + Q23, + Q24, + Q25, + Q26, + Q27, + Q28, + Q29, + Q30, + Q31, + + FirstSpill, +}; + +constexpr size_t NonSpillHostLocCount = static_cast<size_t>(HostLoc::FirstSpill); + +inline bool HostLocIsGPR(HostLoc reg) { + return reg >= HostLoc::X0 && reg <= HostLoc::X30; +} + +inline bool HostLocIsFPR(HostLoc reg) { + return reg >= HostLoc::Q0 && reg <= HostLoc::Q31; +} + +inline bool HostLocIsRegister(HostLoc reg) { + return HostLocIsGPR(reg) || HostLocIsFPR(reg); +} + +inline HostLoc HostLocRegIdx(int idx) { + ASSERT(idx >= 0 && idx <= 30); + return static_cast<HostLoc>(idx); +} + +inline HostLoc HostLocFprIdx(int idx) { + ASSERT(idx >= 0 && idx <= 31); + return static_cast<HostLoc>(static_cast<int>(HostLoc::Q0) + idx); +} + +inline HostLoc HostLocSpill(size_t i) { + return static_cast<HostLoc>(static_cast<size_t>(HostLoc::FirstSpill) + i); +} + +inline bool HostLocIsSpill(HostLoc reg) { + return reg >= HostLoc::FirstSpill; +} + +inline size_t HostLocBitWidth(HostLoc loc) { + if (HostLocIsGPR(loc)) + return 64; + if (HostLocIsFPR(loc)) + return 128; + if (HostLocIsSpill(loc)) + return 128; + UNREACHABLE(); +} + +using HostLocList = std::initializer_list<HostLoc>; + +// X18 may be reserved (Windows and iOS). +// X26 holds the cycle counter. +// X27 contains an emulated memory related pointer. +// X28 is used for holding the JitState. +// X30 is the link register.
+// In order of desirability, based first on the ABI +constexpr HostLocList any_gpr = { + HostLoc::X19, HostLoc::X20, HostLoc::X21, HostLoc::X22, HostLoc::X23, + HostLoc::X24, HostLoc::X25, + + HostLoc::X8, HostLoc::X9, HostLoc::X10, HostLoc::X11, HostLoc::X12, + HostLoc::X13, HostLoc::X14, HostLoc::X15, HostLoc::X16, HostLoc::X17, + + HostLoc::X7, HostLoc::X6, HostLoc::X5, HostLoc::X4, HostLoc::X3, + HostLoc::X2, HostLoc::X1, HostLoc::X0, +}; + +constexpr HostLocList any_fpr = { + HostLoc::Q8, HostLoc::Q9, HostLoc::Q10, HostLoc::Q11, HostLoc::Q12, HostLoc::Q13, + HostLoc::Q14, HostLoc::Q15, + + HostLoc::Q16, HostLoc::Q17, HostLoc::Q18, HostLoc::Q19, HostLoc::Q20, HostLoc::Q21, + HostLoc::Q22, HostLoc::Q23, HostLoc::Q24, HostLoc::Q25, HostLoc::Q26, HostLoc::Q27, + HostLoc::Q28, HostLoc::Q29, HostLoc::Q30, HostLoc::Q31, + + HostLoc::Q7, HostLoc::Q6, HostLoc::Q5, HostLoc::Q4, HostLoc::Q3, HostLoc::Q2, + HostLoc::Q1, HostLoc::Q0, +}; + +Arm64Gen::ARM64Reg HostLocToReg64(HostLoc loc); +Arm64Gen::ARM64Reg HostLocToFpr(HostLoc loc); + +template <typename JitStateType> +size_t SpillToOpArg(HostLoc loc) { + ASSERT(HostLocIsSpill(loc)); + + size_t i = static_cast<size_t>(loc) - static_cast<size_t>(HostLoc::FirstSpill); + ASSERT_MSG(i < JitStateType::SpillCount, + "Spill index greater than number of available spill locations"); + + return JitStateType::GetSpillLocationOffsetFromIndex(i); +} + +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/jitstate_info.h b/src/dynarmic/backend/A64/jitstate_info.h new file mode 100644 index 00000000..63336d79 --- /dev/null +++ b/src/dynarmic/backend/A64/jitstate_info.h @@ -0,0 +1,44 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. + */ + +#pragma once + +#include <cstddef> + +namespace Dynarmic::BackendA64 { + +struct JitStateInfo { + template <typename JitStateType> + JitStateInfo(const JitStateType&) + : offsetof_cycles_remaining(offsetof(JitStateType, cycles_remaining)) + , offsetof_cycles_to_run(offsetof(JitStateType, cycles_to_run)) + , offsetof_save_host_FPCR(offsetof(JitStateType, save_host_FPCR)) + , offsetof_guest_fpcr(offsetof(JitStateType, guest_fpcr)) + , offsetof_guest_fpsr(offsetof(JitStateType, guest_fpsr)) + , offsetof_rsb_ptr(offsetof(JitStateType, rsb_ptr)) + , rsb_ptr_mask(JitStateType::RSBPtrMask) + , offsetof_rsb_location_descriptors(offsetof(JitStateType, rsb_location_descriptors)) + , offsetof_rsb_codeptrs(offsetof(JitStateType, rsb_codeptrs)) + , offsetof_cpsr_nzcv(offsetof(JitStateType, cpsr_nzcv)) + , offsetof_fpsr_exc(offsetof(JitStateType, fpsr_exc)) + , offsetof_fpsr_qc(offsetof(JitStateType, fpsr_qc)) + {} + + const size_t offsetof_cycles_remaining; + const size_t offsetof_cycles_to_run; + const size_t offsetof_save_host_FPCR; + const size_t offsetof_guest_fpcr; + const size_t offsetof_guest_fpsr; + const size_t offsetof_rsb_ptr; + const size_t rsb_ptr_mask; + const size_t offsetof_rsb_location_descriptors; + const size_t offsetof_rsb_codeptrs; + const size_t offsetof_cpsr_nzcv; + const size_t offsetof_fpsr_exc; + const size_t offsetof_fpsr_qc; +}; + +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/opcodes.inc b/src/dynarmic/backend/A64/opcodes.inc new file mode 100644 index 00000000..8857cf78 --- /dev/null +++ b/src/dynarmic/backend/A64/opcodes.inc @@ -0,0 +1,651 @@ +// opcode name, return type, arg1 type, arg2 type, arg3 type, arg4 type, ...
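[Editor's note] opcodes.inc below is an X-macro table: each OPCODE(...)/A32OPC(...) line gives an IR instruction's name, its return type, and its argument types, and the commented-out entries are opcodes this backend does not handle yet. As a rough sketch of how such a table is typically consumed (a hypothetical consumer, not dynarmic's actual IR machinery), the same file is re-included under different macro definitions to stamp out, for example, an enum and a name table from one source of truth:

    // Hypothetical consumer of an X-macro table shaped like opcodes.inc.
    #include <array>
    #include <string_view>

    enum class Opcode {
    #define OPCODE(name, ret, ...) name,
    #define A32OPC(name, ret, ...) A32##name,
    #include "opcodes.inc"  // assumed include path for this sketch
    #undef A32OPC
    #undef OPCODE
        NUM_OPCODES,
    };

    constexpr std::array opcode_names{
    #define OPCODE(name, ret, ...) std::string_view{#name},
    #define A32OPC(name, ret, ...) std::string_view{"A32" #name},
    #include "opcodes.inc"
    #undef A32OPC
    #undef OPCODE
    };
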
+ +OPCODE(Void, Void, ) +OPCODE(Identity, Opaque, Opaque ) +OPCODE(Breakpoint, Void, ) + +// A32 Context getters/setters +A32OPC(SetCheckBit, Void, U1 ) +A32OPC(GetRegister, U32, A32Reg ) +A32OPC(GetExtendedRegister32, U32, A32ExtReg ) +A32OPC(GetExtendedRegister64, U64, A32ExtReg ) +A32OPC(SetRegister, Void, A32Reg, U32 ) +A32OPC(SetExtendedRegister32, Void, A32ExtReg, U32 ) +A32OPC(SetExtendedRegister64, Void, A32ExtReg, U64 ) +A32OPC(GetCpsr, U32, ) +A32OPC(SetCpsr, Void, U32 ) +A32OPC(SetCpsrNZCVRaw, Void, U32 ) +A32OPC(SetCpsrNZCV, Void, NZCV ) +A32OPC(SetCpsrNZCVQ, Void, U32 ) +A32OPC(GetNFlag, U1, ) +A32OPC(SetNFlag, Void, U1 ) +A32OPC(GetZFlag, U1, ) +A32OPC(SetZFlag, Void, U1 ) +A32OPC(GetCFlag, U1, ) +A32OPC(SetCFlag, Void, U1 ) +A32OPC(GetVFlag, U1, ) +A32OPC(SetVFlag, Void, U1 ) +A32OPC(OrQFlag, Void, U1 ) +A32OPC(GetGEFlags, U32, ) +A32OPC(SetGEFlags, Void, U32 ) +A32OPC(SetGEFlagsCompressed, Void, U32 ) +A32OPC(BXWritePC, Void, U32 ) +A32OPC(CallSupervisor, Void, U32 ) +A32OPC(ExceptionRaised, Void, U32, U64 ) +A32OPC(GetFpscr, U32, ) +A32OPC(SetFpscr, Void, U32, ) +A32OPC(GetFpscrNZCV, U32, ) +A32OPC(SetFpscrNZCV, Void, NZCV ) + +// A64 Context getters/setters +//A64OPC(SetCheckBit, Void, U1 ) +//A64OPC(GetCFlag, U1, ) +//A64OPC(GetNZCVRaw, U32, ) +//A64OPC(SetNZCVRaw, Void, U32 ) +//A64OPC(SetNZCV, Void, NZCV ) +//A64OPC(GetW, U32, A64Reg ) +//A64OPC(GetX, U64, A64Reg ) +//A64OPC(GetS, U128, A64Vec ) +//A64OPC(GetD, U128, A64Vec ) +//A64OPC(GetQ, U128, A64Vec ) +//A64OPC(GetSP, U64, ) +//A64OPC(GetFPCR, U32, ) +//A64OPC(GetFPSR, U32, ) +//A64OPC(SetW, Void, A64Reg, U32 ) +//A64OPC(SetX, Void, A64Reg, U64 ) +//A64OPC(SetS, Void, A64Vec, U128 ) +//A64OPC(SetD, Void, A64Vec, U128 ) +//A64OPC(SetQ, Void, A64Vec, U128 ) +//A64OPC(SetSP, Void, U64 ) +//A64OPC(SetFPCR, Void, U32 ) +//A64OPC(SetFPSR, Void, U32 ) +//A64OPC(OrQC, Void, U1 ) +//A64OPC(SetPC, Void, U64 ) +//A64OPC(CallSupervisor, Void, U32 ) +//A64OPC(ExceptionRaised, Void, U64, U64 ) +//A64OPC(DataCacheOperationRaised, Void, U64, U64 ) +//A64OPC(DataSynchronizationBarrier, Void, ) +//A64OPC(DataMemoryBarrier, Void, ) +//A64OPC(InstructionSynchronizationBarrier, Void, ) +//A64OPC(GetCNTFRQ, U32, ) +//A64OPC(GetCNTPCT, U64, ) +//A64OPC(GetCTR, U32, ) +//A64OPC(GetDCZID, U32, ) +//A64OPC(GetTPIDR, U64, ) +//A64OPC(GetTPIDRRO, U64, ) +//A64OPC(SetTPIDR, Void, U64 ) + +// Hints +OPCODE(PushRSB, Void, U64 ) + +// Pseudo-operation, handled specially at final emit +OPCODE(GetCarryFromOp, U1, Opaque ) +OPCODE(GetOverflowFromOp, U1, Opaque ) +OPCODE(GetGEFromOp, U32, Opaque ) +OPCODE(GetNZCVFromOp, NZCV, Opaque ) +OPCODE(GetUpperFromOp, U128, Opaque ) +OPCODE(GetLowerFromOp, U128, Opaque ) + +OPCODE(NZCVFromPackedFlags, NZCV, U32 ) + +// Calculations +OPCODE(Pack2x32To1x64, U64, U32, U32 ) +//OPCODE(Pack2x64To1x128, U128, U64, U64 ) +OPCODE(LeastSignificantWord, U32, U64 ) +OPCODE(MostSignificantWord, U32, U64 ) +OPCODE(LeastSignificantHalf, U16, U32 ) +OPCODE(LeastSignificantByte, U8, U32 ) +OPCODE(MostSignificantBit, U1, U32 ) +OPCODE(IsZero32, U1, U32 ) +OPCODE(IsZero64, U1, U64 ) +OPCODE(TestBit, U1, U64, U8 ) +OPCODE(ConditionalSelect32, U32, Cond, U32, U32 ) +OPCODE(ConditionalSelect64, U64, Cond, U64, U64 ) +OPCODE(ConditionalSelectNZCV, NZCV, Cond, NZCV, NZCV ) +OPCODE(LogicalShiftLeft32, U32, U32, U8, U1 ) +OPCODE(LogicalShiftLeft64, U64, U64, U8 ) +OPCODE(LogicalShiftRight32, U32, U32, U8, U1 ) +OPCODE(LogicalShiftRight64, U64, U64, U8 ) +OPCODE(ArithmeticShiftRight32, U32, U32, U8, U1 ) 
+//OPCODE(ArithmeticShiftRight64, U64, U64, U8 ) +OPCODE(RotateRight32, U32, U32, U8, U1 ) +OPCODE(RotateRight64, U64, U64, U8 ) +OPCODE(RotateRightExtended, U32, U32, U1 ) +OPCODE(Add32, U32, U32, U32, U1 ) +OPCODE(Add64, U64, U64, U64, U1 ) +OPCODE(Sub32, U32, U32, U32, U1 ) +OPCODE(Sub64, U64, U64, U64, U1 ) +OPCODE(Mul32, U32, U32, U32 ) +OPCODE(Mul64, U64, U64, U64 ) +//OPCODE(SignedMultiplyHigh64, U64, U64, U64 ) +//OPCODE(UnsignedMultiplyHigh64, U64, U64, U64 ) +OPCODE(UnsignedDiv32, U32, U32, U32 ) +OPCODE(UnsignedDiv64, U64, U64, U64 ) +OPCODE(SignedDiv32, U32, U32, U32 ) +OPCODE(SignedDiv64, U64, U64, U64 ) +OPCODE(And32, U32, U32, U32 ) +OPCODE(And64, U64, U64, U64 ) +OPCODE(Eor32, U32, U32, U32 ) +OPCODE(Eor64, U64, U64, U64 ) +OPCODE(Or32, U32, U32, U32 ) +OPCODE(Or64, U64, U64, U64 ) +OPCODE(Not32, U32, U32 ) +OPCODE(Not64, U64, U64 ) +OPCODE(SignExtendByteToWord, U32, U8 ) +OPCODE(SignExtendHalfToWord, U32, U16 ) +OPCODE(SignExtendByteToLong, U64, U8 ) +OPCODE(SignExtendHalfToLong, U64, U16 ) +OPCODE(SignExtendWordToLong, U64, U32 ) +OPCODE(ZeroExtendByteToWord, U32, U8 ) +OPCODE(ZeroExtendHalfToWord, U32, U16 ) +OPCODE(ZeroExtendByteToLong, U64, U8 ) +OPCODE(ZeroExtendHalfToLong, U64, U16 ) +OPCODE(ZeroExtendWordToLong, U64, U32 ) +//OPCODE(ZeroExtendLongToQuad, U128, U64 ) +//OPCODE(ByteReverseDual, U64, U64 ) +OPCODE(ByteReverseWord, U32, U32 ) +OPCODE(ByteReverseHalf, U16, U16 ) +OPCODE(CountLeadingZeros32, U32, U32 ) +OPCODE(CountLeadingZeros64, U64, U64 ) +//OPCODE(ExtractRegister32, U32, U32, U32, U8 ) +//OPCODE(ExtractRegister64, U64, U64, U64, U8 ) +//OPCODE(MaxSigned32, U32, U32, U32 ) +//OPCODE(MaxSigned64, U64, U64, U64 ) +//OPCODE(MaxUnsigned32, U32, U32, U32 ) +//OPCODE(MaxUnsigned64, U64, U64, U64 ) +//OPCODE(MinSigned32, U32, U32, U32 ) +//OPCODE(MinSigned64, U64, U64, U64 ) +//OPCODE(MinUnsigned32, U32, U32, U32 ) +//OPCODE(MinUnsigned64, U64, U64, U64 ) + +// Saturated instructions +OPCODE(SignedSaturatedAdd8, U8, U8, U8 ) +OPCODE(SignedSaturatedAdd16, U16, U16, U16 ) +OPCODE(SignedSaturatedAdd32, U32, U32, U32 ) +OPCODE(SignedSaturatedAdd64, U64, U64, U64 ) +//OPCODE(SignedSaturatedDoublingMultiplyReturnHigh16, U16, U16, U16 ) +//OPCODE(SignedSaturatedDoublingMultiplyReturnHigh32, U32, U32, U32 ) +OPCODE(SignedSaturatedSub8, U8, U8, U8 ) +OPCODE(SignedSaturatedSub16, U16, U16, U16 ) +OPCODE(SignedSaturatedSub32, U32, U32, U32 ) +OPCODE(SignedSaturatedSub64, U64, U64, U64 ) +OPCODE(SignedSaturation, U32, U32, U8 ) +//OPCODE(UnsignedSaturatedAdd8, U8, U8, U8 ) +//OPCODE(UnsignedSaturatedAdd16, U16, U16, U16 ) +//OPCODE(UnsignedSaturatedAdd32, U32, U32, U32 ) +//OPCODE(UnsignedSaturatedAdd64, U64, U64, U64 ) +//OPCODE(UnsignedSaturatedSub8, U8, U8, U8 ) +//OPCODE(UnsignedSaturatedSub16, U16, U16, U16 ) +//OPCODE(UnsignedSaturatedSub32, U32, U32, U32 ) +//OPCODE(UnsignedSaturatedSub64, U64, U64, U64 ) +OPCODE(UnsignedSaturation, U32, U32, U8 ) + +// Packed instructions +OPCODE(PackedAddU8, U32, U32, U32 ) +OPCODE(PackedAddS8, U32, U32, U32 ) +OPCODE(PackedSubU8, U32, U32, U32 ) +OPCODE(PackedSubS8, U32, U32, U32 ) +OPCODE(PackedAddU16, U32, U32, U32 ) +OPCODE(PackedAddS16, U32, U32, U32 ) +OPCODE(PackedSubU16, U32, U32, U32 ) +OPCODE(PackedSubS16, U32, U32, U32 ) +OPCODE(PackedAddSubU16, U32, U32, U32 ) +OPCODE(PackedAddSubS16, U32, U32, U32 ) +OPCODE(PackedSubAddU16, U32, U32, U32 ) +OPCODE(PackedSubAddS16, U32, U32, U32 ) +OPCODE(PackedHalvingAddU8, U32, U32, U32 ) +OPCODE(PackedHalvingAddS8, U32, U32, U32 ) +OPCODE(PackedHalvingSubU8, U32, U32, U32 ) 
+OPCODE(PackedHalvingSubS8, U32, U32, U32 ) +OPCODE(PackedHalvingAddU16, U32, U32, U32 ) +OPCODE(PackedHalvingAddS16, U32, U32, U32 ) +OPCODE(PackedHalvingSubU16, U32, U32, U32 ) +OPCODE(PackedHalvingSubS16, U32, U32, U32 ) +OPCODE(PackedHalvingAddSubU16, U32, U32, U32 ) +OPCODE(PackedHalvingAddSubS16, U32, U32, U32 ) +OPCODE(PackedHalvingSubAddU16, U32, U32, U32 ) +OPCODE(PackedHalvingSubAddS16, U32, U32, U32 ) +OPCODE(PackedSaturatedAddU8, U32, U32, U32 ) +OPCODE(PackedSaturatedAddS8, U32, U32, U32 ) +OPCODE(PackedSaturatedSubU8, U32, U32, U32 ) +OPCODE(PackedSaturatedSubS8, U32, U32, U32 ) +OPCODE(PackedSaturatedAddU16, U32, U32, U32 ) +OPCODE(PackedSaturatedAddS16, U32, U32, U32 ) +OPCODE(PackedSaturatedSubU16, U32, U32, U32 ) +OPCODE(PackedSaturatedSubS16, U32, U32, U32 ) +OPCODE(PackedAbsDiffSumS8, U32, U32, U32 ) +OPCODE(PackedSelect, U32, U32, U32, U32 ) + +// CRC instructions +//OPCODE(CRC32Castagnoli8, U32, U32, U32 ) +//OPCODE(CRC32Castagnoli16, U32, U32, U32 ) +//OPCODE(CRC32Castagnoli32, U32, U32, U32 ) +//OPCODE(CRC32Castagnoli64, U32, U32, U64 ) +//OPCODE(CRC32ISO8, U32, U32, U32 ) +//OPCODE(CRC32ISO16, U32, U32, U32 ) +//OPCODE(CRC32ISO32, U32, U32, U32 ) +//OPCODE(CRC32ISO64, U32, U32, U64 ) + +// AES instructions +//OPCODE(AESDecryptSingleRound, U128, U128 ) +//OPCODE(AESEncryptSingleRound, U128, U128 ) +//OPCODE(AESInverseMixColumns, U128, U128 ) +//OPCODE(AESMixColumns, U128, U128 ) + +// SM4 instructions +//OPCODE(SM4AccessSubstitutionBox, U8, U8 ) + +// Vector instructions +//OPCODE(VectorGetElement8, U8, U128, U8 ) +//OPCODE(VectorGetElement16, U16, U128, U8 ) +//OPCODE(VectorGetElement32, U32, U128, U8 ) +//OPCODE(VectorGetElement64, U64, U128, U8 ) +//OPCODE(VectorSetElement8, U128, U128, U8, U8 ) +//OPCODE(VectorSetElement16, U128, U128, U8, U16 ) +//OPCODE(VectorSetElement32, U128, U128, U8, U32 ) +//OPCODE(VectorSetElement64, U128, U128, U8, U64 ) +//OPCODE(VectorAbs8, U128, U128 ) +//OPCODE(VectorAbs16, U128, U128 ) +//OPCODE(VectorAbs32, U128, U128 ) +//OPCODE(VectorAbs64, U128, U128 ) +//OPCODE(VectorAdd8, U128, U128, U128 ) +//OPCODE(VectorAdd16, U128, U128, U128 ) +//OPCODE(VectorAdd32, U128, U128, U128 ) +//OPCODE(VectorAdd64, U128, U128, U128 ) +//OPCODE(VectorAnd, U128, U128, U128 ) +//OPCODE(VectorArithmeticShiftRight8, U128, U128, U8 ) +//OPCODE(VectorArithmeticShiftRight16, U128, U128, U8 ) +//OPCODE(VectorArithmeticShiftRight32, U128, U128, U8 ) +//OPCODE(VectorArithmeticShiftRight64, U128, U128, U8 ) +//OPCODE(VectorArithmeticVShift8, U128, U128, U128 ) +//OPCODE(VectorArithmeticVShift16, U128, U128, U128 ) +//OPCODE(VectorArithmeticVShift32, U128, U128, U128 ) +//OPCODE(VectorArithmeticVShift64, U128, U128, U128 ) +//OPCODE(VectorBroadcastLower8, U128, U8 ) +//OPCODE(VectorBroadcastLower16, U128, U16 ) +//OPCODE(VectorBroadcastLower32, U128, U32 ) +//OPCODE(VectorBroadcast8, U128, U8 ) +//OPCODE(VectorBroadcast16, U128, U16 ) +//OPCODE(VectorBroadcast32, U128, U32 ) +//OPCODE(VectorBroadcast64, U128, U64 ) +//OPCODE(VectorCountLeadingZeros8, U128, U128 ) +//OPCODE(VectorCountLeadingZeros16, U128, U128 ) +//OPCODE(VectorCountLeadingZeros32, U128, U128 ) +//OPCODE(VectorDeinterleaveEven8, U128, U128, U128 ) +//OPCODE(VectorDeinterleaveEven16, U128, U128, U128 ) +//OPCODE(VectorDeinterleaveEven32, U128, U128, U128 ) +//OPCODE(VectorDeinterleaveEven64, U128, U128, U128 ) +//OPCODE(VectorDeinterleaveOdd8, U128, U128, U128 ) +//OPCODE(VectorDeinterleaveOdd16, U128, U128, U128 ) +//OPCODE(VectorDeinterleaveOdd32, U128, U128, U128 ) 
+//OPCODE(VectorDeinterleaveOdd64, U128, U128, U128 ) +//OPCODE(VectorEor, U128, U128, U128 ) +//OPCODE(VectorEqual8, U128, U128, U128 ) +//OPCODE(VectorEqual16, U128, U128, U128 ) +//OPCODE(VectorEqual32, U128, U128, U128 ) +//OPCODE(VectorEqual64, U128, U128, U128 ) +//OPCODE(VectorEqual128, U128, U128, U128 ) +//OPCODE(VectorExtract, U128, U128, U128, U8 ) +//OPCODE(VectorExtractLower, U128, U128, U128, U8 ) +//OPCODE(VectorGreaterS8, U128, U128, U128 ) +//OPCODE(VectorGreaterS16, U128, U128, U128 ) +//OPCODE(VectorGreaterS32, U128, U128, U128 ) +//OPCODE(VectorGreaterS64, U128, U128, U128 ) +//OPCODE(VectorHalvingAddS8, U128, U128, U128 ) +//OPCODE(VectorHalvingAddS16, U128, U128, U128 ) +//OPCODE(VectorHalvingAddS32, U128, U128, U128 ) +//OPCODE(VectorHalvingAddU8, U128, U128, U128 ) +//OPCODE(VectorHalvingAddU16, U128, U128, U128 ) +//OPCODE(VectorHalvingAddU32, U128, U128, U128 ) +//OPCODE(VectorHalvingSubS8, U128, U128, U128 ) +//OPCODE(VectorHalvingSubS16, U128, U128, U128 ) +//OPCODE(VectorHalvingSubS32, U128, U128, U128 ) +//OPCODE(VectorHalvingSubU8, U128, U128, U128 ) +//OPCODE(VectorHalvingSubU16, U128, U128, U128 ) +//OPCODE(VectorHalvingSubU32, U128, U128, U128 ) +//OPCODE(VectorInterleaveLower8, U128, U128, U128 ) +//OPCODE(VectorInterleaveLower16, U128, U128, U128 ) +//OPCODE(VectorInterleaveLower32, U128, U128, U128 ) +//OPCODE(VectorInterleaveLower64, U128, U128, U128 ) +//OPCODE(VectorInterleaveUpper8, U128, U128, U128 ) +//OPCODE(VectorInterleaveUpper16, U128, U128, U128 ) +//OPCODE(VectorInterleaveUpper32, U128, U128, U128 ) +//OPCODE(VectorInterleaveUpper64, U128, U128, U128 ) +//OPCODE(VectorLogicalShiftLeft8, U128, U128, U8 ) +//OPCODE(VectorLogicalShiftLeft16, U128, U128, U8 ) +//OPCODE(VectorLogicalShiftLeft32, U128, U128, U8 ) +//OPCODE(VectorLogicalShiftLeft64, U128, U128, U8 ) +//OPCODE(VectorLogicalShiftRight8, U128, U128, U8 ) +//OPCODE(VectorLogicalShiftRight16, U128, U128, U8 ) +//OPCODE(VectorLogicalShiftRight32, U128, U128, U8 ) +//OPCODE(VectorLogicalShiftRight64, U128, U128, U8 ) +//OPCODE(VectorLogicalVShift8, U128, U128, U128 ) +//OPCODE(VectorLogicalVShift16, U128, U128, U128 ) +//OPCODE(VectorLogicalVShift32, U128, U128, U128 ) +//OPCODE(VectorLogicalVShift64, U128, U128, U128 ) +//OPCODE(VectorMaxS8, U128, U128, U128 ) +//OPCODE(VectorMaxS16, U128, U128, U128 ) +//OPCODE(VectorMaxS32, U128, U128, U128 ) +//OPCODE(VectorMaxS64, U128, U128, U128 ) +//OPCODE(VectorMaxU8, U128, U128, U128 ) +//OPCODE(VectorMaxU16, U128, U128, U128 ) +//OPCODE(VectorMaxU32, U128, U128, U128 ) +//OPCODE(VectorMaxU64, U128, U128, U128 ) +//OPCODE(VectorMinS8, U128, U128, U128 ) +//OPCODE(VectorMinS16, U128, U128, U128 ) +//OPCODE(VectorMinS32, U128, U128, U128 ) +//OPCODE(VectorMinS64, U128, U128, U128 ) +//OPCODE(VectorMinU8, U128, U128, U128 ) +//OPCODE(VectorMinU16, U128, U128, U128 ) +//OPCODE(VectorMinU32, U128, U128, U128 ) +//OPCODE(VectorMinU64, U128, U128, U128 ) +//OPCODE(VectorMultiply8, U128, U128, U128 ) +//OPCODE(VectorMultiply16, U128, U128, U128 ) +//OPCODE(VectorMultiply32, U128, U128, U128 ) +//OPCODE(VectorMultiply64, U128, U128, U128 ) +//OPCODE(VectorNarrow16, U128, U128 ) +//OPCODE(VectorNarrow32, U128, U128 ) +//OPCODE(VectorNarrow64, U128, U128 ) +//OPCODE(VectorNot, U128, U128 ) +//OPCODE(VectorOr, U128, U128, U128 ) +//OPCODE(VectorPairedAddLower8, U128, U128, U128 ) +//OPCODE(VectorPairedAddLower16, U128, U128, U128 ) +//OPCODE(VectorPairedAddLower32, U128, U128, U128 ) +//OPCODE(VectorPairedAddSignedWiden8, U128, U128 ) 
+//OPCODE(VectorPairedAddSignedWiden16, U128, U128 ) +//OPCODE(VectorPairedAddSignedWiden32, U128, U128 ) +//OPCODE(VectorPairedAddUnsignedWiden8, U128, U128 ) +//OPCODE(VectorPairedAddUnsignedWiden16, U128, U128 ) +//OPCODE(VectorPairedAddUnsignedWiden32, U128, U128 ) +//OPCODE(VectorPairedAdd8, U128, U128, U128 ) +//OPCODE(VectorPairedAdd16, U128, U128, U128 ) +//OPCODE(VectorPairedAdd32, U128, U128, U128 ) +//OPCODE(VectorPairedAdd64, U128, U128, U128 ) +//OPCODE(VectorPairedMaxS8, U128, U128, U128 ) +//OPCODE(VectorPairedMaxS16, U128, U128, U128 ) +//OPCODE(VectorPairedMaxS32, U128, U128, U128 ) +//OPCODE(VectorPairedMaxU8, U128, U128, U128 ) +//OPCODE(VectorPairedMaxU16, U128, U128, U128 ) +//OPCODE(VectorPairedMaxU32, U128, U128, U128 ) +//OPCODE(VectorPairedMinS8, U128, U128, U128 ) +//OPCODE(VectorPairedMinS16, U128, U128, U128 ) +//OPCODE(VectorPairedMinS32, U128, U128, U128 ) +//OPCODE(VectorPairedMinU8, U128, U128, U128 ) +//OPCODE(VectorPairedMinU16, U128, U128, U128 ) +//OPCODE(VectorPairedMinU32, U128, U128, U128 ) +//OPCODE(VectorPolynomialMultiply8, U128, U128, U128 ) +//OPCODE(VectorPolynomialMultiplyLong8, U128, U128, U128 ) +//OPCODE(VectorPolynomialMultiplyLong64, U128, U128, U128 ) +//OPCODE(VectorPopulationCount, U128, U128 ) +//OPCODE(VectorReverseBits, U128, U128 ) +//OPCODE(VectorRoundingHalvingAddS8, U128, U128, U128 ) +//OPCODE(VectorRoundingHalvingAddS16, U128, U128, U128 ) +//OPCODE(VectorRoundingHalvingAddS32, U128, U128, U128 ) +//OPCODE(VectorRoundingHalvingAddU8, U128, U128, U128 ) +//OPCODE(VectorRoundingHalvingAddU16, U128, U128, U128 ) +//OPCODE(VectorRoundingHalvingAddU32, U128, U128, U128 ) +//OPCODE(VectorRoundingShiftLeftS8, U128, U128, U128 ) +//OPCODE(VectorRoundingShiftLeftS16, U128, U128, U128 ) +//OPCODE(VectorRoundingShiftLeftS32, U128, U128, U128 ) +//OPCODE(VectorRoundingShiftLeftS64, U128, U128, U128 ) +//OPCODE(VectorRoundingShiftLeftU8, U128, U128, U128 ) +//OPCODE(VectorRoundingShiftLeftU16, U128, U128, U128 ) +//OPCODE(VectorRoundingShiftLeftU32, U128, U128, U128 ) +//OPCODE(VectorRoundingShiftLeftU64, U128, U128, U128 ) +//OPCODE(VectorShuffleHighHalfwords, U128, U128, U8 ) +//OPCODE(VectorShuffleLowHalfwords, U128, U128, U8 ) +//OPCODE(VectorShuffleWords, U128, U128, U8 ) +//OPCODE(VectorSignExtend8, U128, U128 ) +//OPCODE(VectorSignExtend16, U128, U128 ) +//OPCODE(VectorSignExtend32, U128, U128 ) +//OPCODE(VectorSignExtend64, U128, U128 ) +//OPCODE(VectorSignedAbsoluteDifference8, U128, U128, U128 ) +//OPCODE(VectorSignedAbsoluteDifference16, U128, U128, U128 ) +//OPCODE(VectorSignedAbsoluteDifference32, U128, U128, U128 ) +//OPCODE(VectorSignedMultiply16, Void, U128, U128 ) +//OPCODE(VectorSignedMultiply32, Void, U128, U128 ) +//OPCODE(VectorSignedSaturatedAbs8, U128, U128 ) +//OPCODE(VectorSignedSaturatedAbs16, U128, U128 ) +//OPCODE(VectorSignedSaturatedAbs32, U128, U128 ) +//OPCODE(VectorSignedSaturatedAbs64, U128, U128 ) +//OPCODE(VectorSignedSaturatedAccumulateUnsigned8, U128, U128, U128 ) +//OPCODE(VectorSignedSaturatedAccumulateUnsigned16, U128, U128, U128 ) +//OPCODE(VectorSignedSaturatedAccumulateUnsigned32, U128, U128, U128 ) +//OPCODE(VectorSignedSaturatedAccumulateUnsigned64, U128, U128, U128 ) +//OPCODE(VectorSignedSaturatedDoublingMultiply16, Void, U128, U128 ) +//OPCODE(VectorSignedSaturatedDoublingMultiply32, Void, U128, U128 ) +//OPCODE(VectorSignedSaturatedDoublingMultiplyLong16, U128, U128, U128 ) +//OPCODE(VectorSignedSaturatedDoublingMultiplyLong32, U128, U128, U128 ) 
+//OPCODE(VectorSignedSaturatedNarrowToSigned16, U128, U128 ) +//OPCODE(VectorSignedSaturatedNarrowToSigned32, U128, U128 ) +//OPCODE(VectorSignedSaturatedNarrowToSigned64, U128, U128 ) +//OPCODE(VectorSignedSaturatedNarrowToUnsigned16, U128, U128 ) +//OPCODE(VectorSignedSaturatedNarrowToUnsigned32, U128, U128 ) +//OPCODE(VectorSignedSaturatedNarrowToUnsigned64, U128, U128 ) +//OPCODE(VectorSignedSaturatedNeg8, U128, U128 ) +//OPCODE(VectorSignedSaturatedNeg16, U128, U128 ) +//OPCODE(VectorSignedSaturatedNeg32, U128, U128 ) +//OPCODE(VectorSignedSaturatedNeg64, U128, U128 ) +//OPCODE(VectorSignedSaturatedShiftLeft8, U128, U128, U128 ) +//OPCODE(VectorSignedSaturatedShiftLeft16, U128, U128, U128 ) +//OPCODE(VectorSignedSaturatedShiftLeft32, U128, U128, U128 ) +//OPCODE(VectorSignedSaturatedShiftLeft64, U128, U128, U128 ) +//OPCODE(VectorSignedSaturatedShiftLeftUnsigned8, U128, U128, U128 ) +//OPCODE(VectorSignedSaturatedShiftLeftUnsigned16, U128, U128, U128 ) +//OPCODE(VectorSignedSaturatedShiftLeftUnsigned32, U128, U128, U128 ) +//OPCODE(VectorSignedSaturatedShiftLeftUnsigned64, U128, U128, U128 ) +//OPCODE(VectorSub8, U128, U128, U128 ) +//OPCODE(VectorSub16, U128, U128, U128 ) +//OPCODE(VectorSub32, U128, U128, U128 ) +//OPCODE(VectorSub64, U128, U128, U128 ) +//OPCODE(VectorTable, Table, U128, Opaque, Opaque, Opaque ) +//OPCODE(VectorTableLookup, U128, U128, Table, U128 ) +//OPCODE(VectorUnsignedAbsoluteDifference8, U128, U128, U128 ) +//OPCODE(VectorUnsignedAbsoluteDifference16, U128, U128, U128 ) +//OPCODE(VectorUnsignedAbsoluteDifference32, U128, U128, U128 ) +//OPCODE(VectorUnsignedMultiply16, Void, U128, U128 ) +//OPCODE(VectorUnsignedMultiply32, Void, U128, U128 ) +//OPCODE(VectorUnsignedRecipEstimate, U128, U128 ) +//OPCODE(VectorUnsignedRecipSqrtEstimate, U128, U128 ) +//OPCODE(VectorUnsignedSaturatedAccumulateSigned8, U128, U128, U128 ) +//OPCODE(VectorUnsignedSaturatedAccumulateSigned16, U128, U128, U128 ) +//OPCODE(VectorUnsignedSaturatedAccumulateSigned32, U128, U128, U128 ) +//OPCODE(VectorUnsignedSaturatedAccumulateSigned64, U128, U128, U128 ) +//OPCODE(VectorUnsignedSaturatedNarrow16, U128, U128 ) +//OPCODE(VectorUnsignedSaturatedNarrow32, U128, U128 ) +//OPCODE(VectorUnsignedSaturatedNarrow64, U128, U128 ) +//OPCODE(VectorUnsignedSaturatedShiftLeft8, U128, U128, U128 ) +//OPCODE(VectorUnsignedSaturatedShiftLeft16, U128, U128, U128 ) +//OPCODE(VectorUnsignedSaturatedShiftLeft32, U128, U128, U128 ) +//OPCODE(VectorUnsignedSaturatedShiftLeft64, U128, U128, U128 ) +//OPCODE(VectorZeroExtend8, U128, U128 ) +//OPCODE(VectorZeroExtend16, U128, U128 ) +//OPCODE(VectorZeroExtend32, U128, U128 ) +//OPCODE(VectorZeroExtend64, U128, U128 ) +//OPCODE(VectorZeroUpper, U128, U128 ) +//OPCODE(ZeroVector, U128, ) + +// Floating-point operations +//OPCODE(FPAbs16, U16, U16 ) +OPCODE(FPAbs32, U32, U32 ) +OPCODE(FPAbs64, U64, U64 ) +OPCODE(FPAdd32, U32, U32, U32 ) +OPCODE(FPAdd64, U64, U64, U64 ) +OPCODE(FPCompare32, NZCV, U32, U32, U1 ) +OPCODE(FPCompare64, NZCV, U64, U64, U1 ) +OPCODE(FPDiv32, U32, U32, U32 ) +OPCODE(FPDiv64, U64, U64, U64 ) +//OPCODE(FPMax32, U32, U32, U32 ) +//OPCODE(FPMax64, U64, U64, U64 ) +//OPCODE(FPMaxNumeric32, U32, U32, U32 ) +//OPCODE(FPMaxNumeric64, U64, U64, U64 ) +//OPCODE(FPMin32, U32, U32, U32 ) +//OPCODE(FPMin64, U64, U64, U64 ) +//OPCODE(FPMinNumeric32, U32, U32, U32 ) +//OPCODE(FPMinNumeric64, U64, U64, U64 ) +OPCODE(FPMul32, U32, U32, U32 ) +OPCODE(FPMul64, U64, U64, U64 ) +//OPCODE(FPMulAdd16, U16, U16, U16, U16 ) +//OPCODE(FPMulAdd32, U32, U32, U32, 
U32 ) +//OPCODE(FPMulAdd64, U64, U64, U64, U64 ) +//OPCODE(FPMulX32, U32, U32, U32 ) +//OPCODE(FPMulX64, U64, U64, U64 ) +//OPCODE(FPNeg16, U16, U16 ) +OPCODE(FPNeg32, U32, U32 ) +OPCODE(FPNeg64, U64, U64 ) +//OPCODE(FPRecipEstimate16, U16, U16 ) +//OPCODE(FPRecipEstimate32, U32, U32 ) +//OPCODE(FPRecipEstimate64, U64, U64 ) +//OPCODE(FPRecipExponent16, U16, U16 ) +//OPCODE(FPRecipExponent32, U32, U32 ) +//OPCODE(FPRecipExponent64, U64, U64 ) +//OPCODE(FPRecipStepFused16, U16, U16, U16 ) +//OPCODE(FPRecipStepFused32, U32, U32, U32 ) +//OPCODE(FPRecipStepFused64, U64, U64, U64 ) +//OPCODE(FPRoundInt16, U16, U16, U8, U1 ) +//OPCODE(FPRoundInt32, U32, U32, U8, U1 ) +//OPCODE(FPRoundInt64, U64, U64, U8, U1 ) +//OPCODE(FPRSqrtEstimate16, U16, U16 ) +//OPCODE(FPRSqrtEstimate32, U32, U32 ) +//OPCODE(FPRSqrtEstimate64, U64, U64 ) +//OPCODE(FPRSqrtStepFused16, U16, U16, U16 ) +//OPCODE(FPRSqrtStepFused32, U32, U32, U32 ) +//OPCODE(FPRSqrtStepFused64, U64, U64, U64 ) +OPCODE(FPSqrt32, U32, U32 ) +OPCODE(FPSqrt64, U64, U64 ) +OPCODE(FPSub32, U32, U32, U32 ) +OPCODE(FPSub64, U64, U64, U64 ) + +// Floating-point conversions +OPCODE(FPHalfToDouble, U64, U16, U8 ) +OPCODE(FPHalfToSingle, U32, U16, U8 ) +OPCODE(FPSingleToDouble, U64, U32, U8 ) +OPCODE(FPSingleToHalf, U16, U32, U8 ) +OPCODE(FPDoubleToHalf, U16, U64, U8 ) +OPCODE(FPDoubleToSingle, U32, U64, U8 ) +OPCODE(FPDoubleToFixedS32, U32, U64, U8, U8 ) +OPCODE(FPDoubleToFixedS64, U64, U64, U8, U8 ) +OPCODE(FPDoubleToFixedU32, U32, U64, U8, U8 ) +OPCODE(FPDoubleToFixedU64, U64, U64, U8, U8 ) +//OPCODE(FPHalfToFixedS32, U32, U16, U8, U8 ) +//OPCODE(FPHalfToFixedS64, U64, U16, U8, U8 ) +//OPCODE(FPHalfToFixedU32, U32, U16, U8, U8 ) +//OPCODE(FPHalfToFixedU64, U64, U16, U8, U8 ) +OPCODE(FPSingleToFixedS32, U32, U32, U8, U8 ) +OPCODE(FPSingleToFixedS64, U64, U32, U8, U8 ) +OPCODE(FPSingleToFixedU32, U32, U32, U8, U8 ) +OPCODE(FPSingleToFixedU64, U64, U32, U8, U8 ) +OPCODE(FPFixedU32ToSingle, U32, U32, U8, U8 ) +OPCODE(FPFixedS32ToSingle, U32, U32, U8, U8 ) +OPCODE(FPFixedU32ToDouble, U64, U32, U8, U8 ) +OPCODE(FPFixedU64ToDouble, U64, U64, U8, U8 ) +OPCODE(FPFixedU64ToSingle, U32, U64, U8, U8 ) +OPCODE(FPFixedS32ToDouble, U64, U32, U8, U8 ) +OPCODE(FPFixedS64ToDouble, U64, U64, U8, U8 ) +OPCODE(FPFixedS64ToSingle, U32, U64, U8, U8 ) + +// Floating-point vector instructions +//OPCODE(FPVectorAbs16, U128, U128 ) +//OPCODE(FPVectorAbs32, U128, U128 ) +//OPCODE(FPVectorAbs64, U128, U128 ) +//OPCODE(FPVectorAdd32, U128, U128, U128 ) +//OPCODE(FPVectorAdd64, U128, U128, U128 ) +//OPCODE(FPVectorDiv32, U128, U128, U128 ) +//OPCODE(FPVectorDiv64, U128, U128, U128 ) +//OPCODE(FPVectorEqual32, U128, U128, U128 ) +//OPCODE(FPVectorEqual64, U128, U128, U128 ) +//OPCODE(FPVectorFromSignedFixed32, U128, U128, U8, U8 ) +//OPCODE(FPVectorFromSignedFixed64, U128, U128, U8, U8 ) +//OPCODE(FPVectorFromUnsignedFixed32, U128, U128, U8, U8 ) +//OPCODE(FPVectorFromUnsignedFixed64, U128, U128, U8, U8 ) +//OPCODE(FPVectorGreater32, U128, U128, U128 ) +//OPCODE(FPVectorGreater64, U128, U128, U128 ) +//OPCODE(FPVectorGreaterEqual32, U128, U128, U128 ) +//OPCODE(FPVectorGreaterEqual64, U128, U128, U128 ) +//OPCODE(FPVectorMax32, U128, U128, U128 ) +//OPCODE(FPVectorMax64, U128, U128, U128 ) +//OPCODE(FPVectorMin32, U128, U128, U128 ) +//OPCODE(FPVectorMin64, U128, U128, U128 ) +//OPCODE(FPVectorMul32, U128, U128, U128 ) +//OPCODE(FPVectorMul64, U128, U128, U128 ) +//OPCODE(FPVectorMulAdd16, U128, U128, U128, U128 ) +//OPCODE(FPVectorMulAdd32, U128, U128, U128, U128 ) 
+//OPCODE(FPVectorMulAdd64, U128, U128, U128, U128 ) +//OPCODE(FPVectorMulX32, U128, U128, U128 ) +//OPCODE(FPVectorMulX64, U128, U128, U128 ) +//OPCODE(FPVectorNeg16, U128, U128 ) +//OPCODE(FPVectorNeg32, U128, U128 ) +//OPCODE(FPVectorNeg64, U128, U128 ) +//OPCODE(FPVectorPairedAdd32, U128, U128, U128 ) +//OPCODE(FPVectorPairedAdd64, U128, U128, U128 ) +//OPCODE(FPVectorPairedAddLower32, U128, U128, U128 ) +//OPCODE(FPVectorPairedAddLower64, U128, U128, U128 ) +//OPCODE(FPVectorRecipEstimate16, U128, U128 ) +//OPCODE(FPVectorRecipEstimate32, U128, U128 ) +//OPCODE(FPVectorRecipEstimate64, U128, U128 ) +//OPCODE(FPVectorRecipStepFused16, U128, U128, U128 ) +//OPCODE(FPVectorRecipStepFused32, U128, U128, U128 ) +//OPCODE(FPVectorRecipStepFused64, U128, U128, U128 ) +//OPCODE(FPVectorRoundInt16, U128, U128, U8, U1 ) +//OPCODE(FPVectorRoundInt32, U128, U128, U8, U1 ) +//OPCODE(FPVectorRoundInt64, U128, U128, U8, U1 ) +//OPCODE(FPVectorRSqrtEstimate16, U128, U128 ) +//OPCODE(FPVectorRSqrtEstimate32, U128, U128 ) +//OPCODE(FPVectorRSqrtEstimate64, U128, U128 ) +//OPCODE(FPVectorRSqrtStepFused16, U128, U128, U128 ) +//OPCODE(FPVectorRSqrtStepFused32, U128, U128, U128 ) +//OPCODE(FPVectorRSqrtStepFused64, U128, U128, U128 ) +//OPCODE(FPVectorSqrt32, U128, U128 ) +//OPCODE(FPVectorSqrt64, U128, U128 ) +//OPCODE(FPVectorSub32, U128, U128, U128 ) +//OPCODE(FPVectorSub64, U128, U128, U128 ) +//OPCODE(FPVectorToSignedFixed16, U128, U128, U8, U8 ) +//OPCODE(FPVectorToSignedFixed32, U128, U128, U8, U8 ) +//OPCODE(FPVectorToSignedFixed64, U128, U128, U8, U8 ) +//OPCODE(FPVectorToUnsignedFixed16, U128, U128, U8, U8 ) +//OPCODE(FPVectorToUnsignedFixed32, U128, U128, U8, U8 ) +//OPCODE(FPVectorToUnsignedFixed64, U128, U128, U8, U8 ) + +// A32 Memory access +A32OPC(ClearExclusive, Void, ) +A32OPC(SetExclusive, Void, U32, U8 ) +A32OPC(ReadMemory8, U8, U32 ) +A32OPC(ReadMemory16, U16, U32 ) +A32OPC(ReadMemory32, U32, U32 ) +A32OPC(ReadMemory64, U64, U32 ) +A32OPC(WriteMemory8, Void, U32, U8 ) +A32OPC(WriteMemory16, Void, U32, U16 ) +A32OPC(WriteMemory32, Void, U32, U32 ) +A32OPC(WriteMemory64, Void, U32, U64 ) +A32OPC(ExclusiveWriteMemory8, U32, U32, U8 ) +A32OPC(ExclusiveWriteMemory16, U32, U32, U16 ) +A32OPC(ExclusiveWriteMemory32, U32, U32, U32 ) +A32OPC(ExclusiveWriteMemory64, U32, U32, U64 ) + +// A64 Memory access +//A64OPC(ClearExclusive, Void, ) +//A64OPC(SetExclusive, Void, U64, U8 ) +//A64OPC(ReadMemory8, U8, U64 ) +//A64OPC(ReadMemory16, U16, U64 ) +//A64OPC(ReadMemory32, U32, U64 ) +//A64OPC(ReadMemory64, U64, U64 ) +//A64OPC(ReadMemory128, U128, U64 ) +//A64OPC(WriteMemory8, Void, U64, U8 ) +//A64OPC(WriteMemory16, Void, U64, U16 ) +//A64OPC(WriteMemory32, Void, U64, U32 ) +//A64OPC(WriteMemory64, Void, U64, U64 ) +//A64OPC(WriteMemory128, Void, U64, U128 ) +//A64OPC(ExclusiveWriteMemory8, U32, U64, U8 ) +//A64OPC(ExclusiveWriteMemory16, U32, U64, U16 ) +//A64OPC(ExclusiveWriteMemory32, U32, U64, U32 ) +//A64OPC(ExclusiveWriteMemory64, U32, U64, U64 ) +//A64OPC(ExclusiveWriteMemory128, U32, U64, U128 ) + +// Coprocessor +A32OPC(CoprocInternalOperation, Void, CoprocInfo ) +A32OPC(CoprocSendOneWord, Void, CoprocInfo, U32 ) +A32OPC(CoprocSendTwoWords, Void, CoprocInfo, U32, U32 ) +A32OPC(CoprocGetOneWord, U32, CoprocInfo ) +A32OPC(CoprocGetTwoWords, U64, CoprocInfo ) +A32OPC(CoprocLoadWords, Void, CoprocInfo, U32 ) +A32OPC(CoprocStoreWords, Void, CoprocInfo, U32 ) diff --git a/src/dynarmic/backend/A64/perf_map.cpp b/src/dynarmic/backend/A64/perf_map.cpp new file mode 100644 index 00000000..af46fa08 
--- /dev/null +++ b/src/dynarmic/backend/A64/perf_map.cpp @@ -0,0 +1,89 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. + */ + +#include <cstddef> +#include <string> + +#ifdef __linux__ + +#include <cstdio> +#include <cstdlib> +#include <mutex> +#include <sys/types.h> +#include <unistd.h> + +#include <fmt/format.h> + +#include "common/common_types.h" + +namespace Dynarmic::BackendA64 { + +namespace { +std::mutex mutex; +std::FILE* file = nullptr; + +void OpenFile() { + const char* perf_dir = std::getenv("PERF_BUILDID_DIR"); + if (!perf_dir) { + file = nullptr; + return; + } + + const pid_t pid = getpid(); + const std::string filename = fmt::format("{:s}/perf-{:d}.map", perf_dir, pid); + + file = std::fopen(filename.c_str(), "w"); + if (!file) { + return; + } + + std::setvbuf(file, nullptr, _IONBF, 0); +} +} // anonymous namespace + +namespace detail { +void PerfMapRegister(const void* start, const void* end, const std::string& friendly_name) { + std::lock_guard<std::mutex> guard{mutex}; + + if (!file) { + OpenFile(); + if (!file) { + return; + } + } + + const std::string line = fmt::format("{:016x} {:016x} {:s}\n", reinterpret_cast<u64>(start), reinterpret_cast<u64>(end) - reinterpret_cast<u64>(start), friendly_name); + std::fwrite(line.data(), sizeof *line.data(), line.size(), file); +} +} // namespace detail + +void PerfMapClear() { + std::lock_guard<std::mutex> guard{mutex}; + + if (!file) { + return; + } + + std::fclose(file); + file = nullptr; + OpenFile(); +} + +} // namespace Dynarmic::BackendA64 + +#else + +namespace Dynarmic::BackendA64 { + +namespace detail { +void PerfMapRegister(const void*, const void*, const std::string&) {} +} // namespace detail + +void PerfMapClear() {} + +} // namespace Dynarmic::BackendA64 + +#endif diff --git a/src/dynarmic/backend/A64/perf_map.h b/src/dynarmic/backend/A64/perf_map.h new file mode 100644 index 00000000..0b563dd1 --- /dev/null +++ b/src/dynarmic/backend/A64/perf_map.h @@ -0,0 +1,27 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2018 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. + */ + +#pragma once + +#include <cstddef> +#include <string> + +#include "common/cast_util.h" + +namespace Dynarmic::BackendA64 { + +namespace detail { +void PerfMapRegister(const void* start, const void* end, const std::string& friendly_name); +} // namespace detail + +template<typename T> +void PerfMapRegister(T start, const void* end, const std::string& friendly_name) { + detail::PerfMapRegister(Common::BitCast<const void*>(start), end, friendly_name); +} + +void PerfMapClear(); + +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/reg_alloc.cpp b/src/dynarmic/backend/A64/reg_alloc.cpp new file mode 100644 index 00000000..353eecac --- /dev/null +++ b/src/dynarmic/backend/A64/reg_alloc.cpp @@ -0,0 +1,650 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version.
+ */ + +#include <algorithm> +#include <numeric> +#include <utility> + +#include <fmt/ostream.h> + +#include "backend/A64/abi.h" +#include "backend/A64/reg_alloc.h" +#include "common/assert.h" + +namespace Dynarmic::BackendA64 { + +static u64 ImmediateToU64(const IR::Value& imm) { + switch (imm.GetType()) { + case IR::Type::U1: + return u64(imm.GetU1()); + case IR::Type::U8: + return u64(imm.GetU8()); + case IR::Type::U16: + return u64(imm.GetU16()); + case IR::Type::U32: + return u64(imm.GetU32()); + case IR::Type::U64: + return u64(imm.GetU64()); + default: + ASSERT_FALSE("This should never happen."); + } +} + +static bool CanExchange(HostLoc a, HostLoc b) { + return HostLocIsGPR(a) && HostLocIsGPR(b); +} + +// Minimum number of bits required to represent a type +static size_t GetBitWidth(IR::Type type) { + switch (type) { + case IR::Type::A32Reg: + case IR::Type::A32ExtReg: + case IR::Type::A64Reg: + case IR::Type::A64Vec: + case IR::Type::CoprocInfo: + case IR::Type::Cond: + case IR::Type::Void: + case IR::Type::Table: + ASSERT_FALSE("Type {} cannot be represented at runtime", type); + return 0; + case IR::Type::Opaque: + ASSERT_FALSE("Not a concrete type"); + return 0; + case IR::Type::U1: + return 8; + case IR::Type::U8: + return 8; + case IR::Type::U16: + return 16; + case IR::Type::U32: + return 32; + case IR::Type::U64: + return 64; + case IR::Type::U128: + return 128; + case IR::Type::NZCVFlags: + return 32; // TODO: Update to 16 when flags optimization is done + } + UNREACHABLE(); + return 0; +} + +static bool IsValuelessType(IR::Type type) { + switch (type) { + case IR::Type::Table: + return true; + default: + return false; + } +} + +bool HostLocInfo::IsLocked() const { + return is_being_used_count > 0; +} + +bool HostLocInfo::IsEmpty() const { + return is_being_used_count == 0 && values.empty(); +} + +bool HostLocInfo::IsLastUse() const { + return is_being_used_count == 0 && current_references == 1 && accumulated_uses + 1 == total_uses; +} + +void HostLocInfo::ReadLock() { + ASSERT(!is_scratch); + is_being_used_count++; +} + +void HostLocInfo::WriteLock() { + ASSERT(is_being_used_count == 0); + is_being_used_count++; + is_scratch = true; +} + +void HostLocInfo::AddArgReference() { + current_references++; + ASSERT(accumulated_uses + current_references <= total_uses); +} + +void HostLocInfo::ReleaseOne() { + is_being_used_count--; + is_scratch = false; + + if (current_references == 0) + return; + + accumulated_uses++; + current_references--; + + if (current_references == 0) + ReleaseAll(); +} + +void HostLocInfo::ReleaseAll() { + accumulated_uses += current_references; + current_references = 0; + + ASSERT(total_uses == std::accumulate(values.begin(), values.end(), size_t(0), [](size_t sum, IR::Inst* inst) { return sum + inst->UseCount(); })); + + if (total_uses == accumulated_uses) { + values.clear(); + accumulated_uses = 0; + total_uses = 0; + max_bit_width = 0; + } + + is_being_used_count = 0; + is_scratch = false; +} + +bool HostLocInfo::ContainsValue(const IR::Inst* inst) const { + return std::find(values.begin(), values.end(), inst) != values.end(); +} + +size_t HostLocInfo::GetMaxBitWidth() const { + return max_bit_width; +} + +void HostLocInfo::AddValue(IR::Inst* inst) { + values.push_back(inst); + total_uses += inst->UseCount(); + max_bit_width = std::max(max_bit_width, GetBitWidth(inst->GetType())); +} + +IR::Type Argument::GetType() const { + return value.GetType(); +} + +bool Argument::IsImmediate() const { + return value.IsImmediate(); +} + +bool Argument::IsVoid() const { + return GetType() == IR::Type::Void; +}
+ +bool Argument::FitsInImmediateU32() const { + if (!IsImmediate()) + return false; + u64 imm = ImmediateToU64(value); + return imm < 0x100000000; +} + +bool Argument::FitsInImmediateS32() const { + if (!IsImmediate()) + return false; + s64 imm = static_cast<s64>(ImmediateToU64(value)); + return -s64(0x80000000) <= imm && imm <= s64(0x7FFFFFFF); +} + +bool Argument::GetImmediateU1() const { + return value.GetU1(); +} + +u8 Argument::GetImmediateU8() const { + u64 imm = ImmediateToU64(value); + ASSERT(imm < 0x100); + return u8(imm); +} + +u16 Argument::GetImmediateU16() const { + u64 imm = ImmediateToU64(value); + ASSERT(imm < 0x10000); + return u16(imm); +} + +u32 Argument::GetImmediateU32() const { + u64 imm = ImmediateToU64(value); + ASSERT(imm < 0x100000000); + return u32(imm); +} + +u64 Argument::GetImmediateS32() const { + ASSERT(FitsInImmediateS32()); + u64 imm = ImmediateToU64(value); + return imm; +} + +u64 Argument::GetImmediateU64() const { + return ImmediateToU64(value); +} + +IR::Cond Argument::GetImmediateCond() const { + ASSERT(IsImmediate() && GetType() == IR::Type::Cond); + return value.GetCond(); +} + +bool Argument::IsInGpr() const { + if (IsImmediate()) + return false; + return HostLocIsGPR(*reg_alloc.ValueLocation(value.GetInst())); +} + +bool Argument::IsInFpr() const { + if (IsImmediate()) + return false; + return HostLocIsFPR(*reg_alloc.ValueLocation(value.GetInst())); +} + +bool Argument::IsInMemory() const { + if (IsImmediate()) + return false; + return HostLocIsSpill(*reg_alloc.ValueLocation(value.GetInst())); +} + +RegAlloc::ArgumentInfo RegAlloc::GetArgumentInfo(IR::Inst* inst) { + ArgumentInfo ret = {Argument{*this}, Argument{*this}, Argument{*this}, Argument{*this}}; + for (size_t i = 0; i < inst->NumArgs(); i++) { + const IR::Value& arg = inst->GetArg(i); + ret[i].value = arg; + if (!arg.IsImmediate() && !IsValuelessType(arg.GetType())) { + ASSERT_MSG(ValueLocation(arg.GetInst()), "argument must already be defined"); + LocInfo(*ValueLocation(arg.GetInst())).AddArgReference(); + } + } + return ret; +} + +Arm64Gen::ARM64Reg RegAlloc::UseGpr(Argument& arg) { + ASSERT(!arg.allocated); + arg.allocated = true; + return HostLocToReg64(UseImpl(arg.value, any_gpr)); +} + +Arm64Gen::ARM64Reg RegAlloc::UseFpr(Argument& arg) { + ASSERT(!arg.allocated); + arg.allocated = true; + return HostLocToFpr(UseImpl(arg.value, any_fpr)); +} + +//OpArg RegAlloc::UseOpArg(Argument& arg) { +// return UseGpr(arg); +//} + +void RegAlloc::Use(Argument& arg, HostLoc host_loc) { + ASSERT(!arg.allocated); + arg.allocated = true; + UseImpl(arg.value, {host_loc}); +} + +Arm64Gen::ARM64Reg RegAlloc::UseScratchGpr(Argument& arg) { + ASSERT(!arg.allocated); + arg.allocated = true; + return HostLocToReg64(UseScratchImpl(arg.value, any_gpr)); +} + +Arm64Gen::ARM64Reg RegAlloc::UseScratchFpr(Argument& arg) { + ASSERT(!arg.allocated); + arg.allocated = true; + return HostLocToFpr(UseScratchImpl(arg.value, any_fpr)); +} + +void RegAlloc::UseScratch(Argument& arg, HostLoc host_loc) { + ASSERT(!arg.allocated); + arg.allocated = true; + UseScratchImpl(arg.value, {host_loc}); +} + +void RegAlloc::DefineValue(IR::Inst* inst, const Arm64Gen::ARM64Reg& reg) { + ASSERT(IsVector(reg) || IsGPR(reg)); + HostLoc hostloc = static_cast<HostLoc>(DecodeReg(reg) + static_cast<size_t>(IsVector(reg) ?
HostLoc::Q0 : HostLoc::X0)); + DefineValueImpl(inst, hostloc); +} + +void RegAlloc::DefineValue(IR::Inst* inst, Argument& arg) { + ASSERT(!arg.allocated); + arg.allocated = true; + DefineValueImpl(inst, arg.value); +} + +void RegAlloc::Release(const Arm64Gen::ARM64Reg& reg) { + ASSERT(IsVector(reg) || IsGPR(reg)); + const HostLoc hostloc = static_cast<HostLoc>(DecodeReg(reg) + static_cast<size_t>(IsVector(reg) ? HostLoc::Q0 : HostLoc::X0)); + LocInfo(hostloc).ReleaseOne(); +} + +Arm64Gen::ARM64Reg RegAlloc::ScratchGpr(HostLocList desired_locations) { + return HostLocToReg64(ScratchImpl(desired_locations)); +} + +Arm64Gen::ARM64Reg RegAlloc::ScratchFpr(HostLocList desired_locations) { + return HostLocToFpr(ScratchImpl(desired_locations)); +} + +HostLoc RegAlloc::UseImpl(IR::Value use_value, HostLocList desired_locations) { + if (use_value.IsImmediate()) { + return LoadImmediate(use_value, ScratchImpl(desired_locations)); + } + + const IR::Inst* use_inst = use_value.GetInst(); + const HostLoc current_location = *ValueLocation(use_inst); + const size_t max_bit_width = LocInfo(current_location).GetMaxBitWidth(); + + const bool can_use_current_location = std::find(desired_locations.begin(), desired_locations.end(), current_location) != desired_locations.end(); + if (can_use_current_location) { + LocInfo(current_location).ReadLock(); + return current_location; + } + + if (LocInfo(current_location).IsLocked()) { + return UseScratchImpl(use_value, desired_locations); + } + + const HostLoc destination_location = SelectARegister(desired_locations); + if (max_bit_width > HostLocBitWidth(destination_location)) { + return UseScratchImpl(use_value, desired_locations); + } else if (CanExchange(destination_location, current_location)) { + Exchange(destination_location, current_location); + } else { + MoveOutOfTheWay(destination_location); + Move(destination_location, current_location); + } + LocInfo(destination_location).ReadLock(); + return destination_location; +} + +HostLoc RegAlloc::UseScratchImpl(IR::Value use_value, HostLocList desired_locations) { + if (use_value.IsImmediate()) { + return LoadImmediate(use_value, ScratchImpl(desired_locations)); + } + + const IR::Inst* use_inst = use_value.GetInst(); + const HostLoc current_location = *ValueLocation(use_inst); + const size_t bit_width = GetBitWidth(use_inst->GetType()); + + const bool can_use_current_location = std::find(desired_locations.begin(), desired_locations.end(), current_location) != desired_locations.end(); + if (can_use_current_location && !LocInfo(current_location).IsLocked()) { + if (!LocInfo(current_location).IsLastUse()) { + MoveOutOfTheWay(current_location); + } + LocInfo(current_location).WriteLock(); + return current_location; + } + + const HostLoc destination_location = SelectARegister(desired_locations); + MoveOutOfTheWay(destination_location); + CopyToScratch(bit_width, destination_location, current_location); + LocInfo(destination_location).WriteLock(); + return destination_location; +} + +HostLoc RegAlloc::ScratchImpl(HostLocList desired_locations) { + HostLoc location = SelectARegister(desired_locations); + MoveOutOfTheWay(location); + LocInfo(location).WriteLock(); + return location; +} + +void RegAlloc::HostCall(IR::Inst* result_def, std::optional<Argument::copyable_reference> arg0, + std::optional<Argument::copyable_reference> arg1, + std::optional<Argument::copyable_reference> arg2, + std::optional<Argument::copyable_reference> arg3, + std::optional<Argument::copyable_reference> arg4, + std::optional<Argument::copyable_reference> arg5, + std::optional<Argument::copyable_reference> arg6, + std::optional<Argument::copyable_reference> arg7) { + constexpr size_t args_count = 8; + constexpr std::array<HostLoc, args_count> args_hostloc = { ABI_PARAM1, ABI_PARAM2, ABI_PARAM3, ABI_PARAM4, ABI_PARAM5,
ABI_PARAM6, ABI_PARAM7, ABI_PARAM8 }; + const std::array<std::optional<Argument::copyable_reference>, args_count> args = {arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; + + static const std::vector<HostLoc> other_caller_save = [args_hostloc]() { + std::vector<HostLoc> ret(ABI_ALL_CALLER_SAVE.begin(), ABI_ALL_CALLER_SAVE.end()); + + for (auto hostloc : args_hostloc) + ret.erase(std::find(ret.begin(), ret.end(), hostloc)); + + return ret; + }(); + + for (size_t i = 0; i < args_count; i++) { + if (args[i]) { + UseScratch(*args[i], args_hostloc[i]); + } + } + + for (size_t i = 0; i < args_count; i++) { + if (!args[i]) { + // TODO: Force spill + ScratchGpr({args_hostloc[i]}); + } + } + + for (HostLoc caller_saved : other_caller_save) { + ScratchImpl({caller_saved}); + } + + if (result_def) { + DefineValueImpl(result_def, ABI_RETURN); + } +} + +void RegAlloc::EndOfAllocScope() { + for (auto& iter : hostloc_info) { + iter.ReleaseAll(); + } +} + +void RegAlloc::AssertNoMoreUses() { + ASSERT(std::all_of(hostloc_info.begin(), hostloc_info.end(), [](const auto& i) { return i.IsEmpty(); })); +} + +HostLoc RegAlloc::SelectARegister(HostLocList desired_locations) const { + std::vector<HostLoc> candidates = desired_locations; + + // Find all locations that have not been allocated. + const auto allocated_locs = std::partition(candidates.begin(), candidates.end(), [this](auto loc){ + return !this->LocInfo(loc).IsLocked(); + }); + candidates.erase(allocated_locs, candidates.end()); + ASSERT_MSG(!candidates.empty(), "All candidate registers have already been allocated"); + + // Selects the best location out of the available locations. + // TODO: Actually do LRU or something. Currently we just try to pick something without a value if possible. + + std::partition(candidates.begin(), candidates.end(), [this](auto loc){ + return this->LocInfo(loc).IsEmpty(); + }); + + return candidates.front(); +} + +std::optional<HostLoc> RegAlloc::ValueLocation(const IR::Inst* value) const { + for (size_t i = 0; i < hostloc_info.size(); i++) + if (hostloc_info[i].ContainsValue(value)) + return static_cast<HostLoc>(i); + + return std::nullopt; +} + +void RegAlloc::DefineValueImpl(IR::Inst* def_inst, HostLoc host_loc) { + ASSERT_MSG(!ValueLocation(def_inst), "def_inst has already been defined"); + LocInfo(host_loc).AddValue(def_inst); +} + +void RegAlloc::DefineValueImpl(IR::Inst* def_inst, const IR::Value& use_inst) { + ASSERT_MSG(!ValueLocation(def_inst), "def_inst has already been defined"); + + if (use_inst.IsImmediate()) { + HostLoc location = ScratchImpl(any_gpr); + DefineValueImpl(def_inst, location); + LoadImmediate(use_inst, location); + return; + } + + ASSERT_MSG(ValueLocation(use_inst.GetInst()), "use_inst must already be defined"); + HostLoc location = *ValueLocation(use_inst.GetInst()); + DefineValueImpl(def_inst, location); +} + +HostLoc RegAlloc::LoadImmediate(IR::Value imm, HostLoc host_loc) { + ASSERT_MSG(imm.IsImmediate(), "imm is not an immediate"); + + if (HostLocIsGPR(host_loc)) { + Arm64Gen::ARM64Reg reg = HostLocToReg64(host_loc); + u64 imm_value = ImmediateToU64(imm); + code.MOVI2R(reg, imm_value); + return host_loc; + } + + if (HostLocIsFPR(host_loc)) { + Arm64Gen::ARM64Reg reg = Arm64Gen::EncodeRegToDouble(HostLocToFpr(host_loc)); + u64 imm_value = ImmediateToU64(imm); + if (imm_value == 0) + code.fp_emitter.FMOV(reg, 0); + else { + code.EmitPatchLDR(reg, imm_value); + } + return host_loc; + } + + UNREACHABLE(); +} + +void RegAlloc::Move(HostLoc to, HostLoc from) { + const size_t bit_width = LocInfo(from).GetMaxBitWidth(); + + ASSERT(LocInfo(to).IsEmpty() && !LocInfo(from).IsLocked()); +
ASSERT(bit_width <= HostLocBitWidth(to)); + + if (LocInfo(from).IsEmpty()) { + return; + } + + EmitMove(bit_width, to, from); + + LocInfo(to) = std::exchange(LocInfo(from), {}); +} + +void RegAlloc::CopyToScratch(size_t bit_width, HostLoc to, HostLoc from) { + ASSERT(LocInfo(to).IsEmpty() && !LocInfo(from).IsEmpty()); + + EmitMove(bit_width, to, from); +} + +void RegAlloc::Exchange(HostLoc a, HostLoc b) { + ASSERT(!LocInfo(a).IsLocked() && !LocInfo(b).IsLocked()); + ASSERT(LocInfo(a).GetMaxBitWidth() <= HostLocBitWidth(b)); + ASSERT(LocInfo(b).GetMaxBitWidth() <= HostLocBitWidth(a)); + + if (LocInfo(a).IsEmpty()) { + Move(a, b); + return; + } + + if (LocInfo(b).IsEmpty()) { + Move(b, a); + return; + } + + EmitExchange(a, b); + + std::swap(LocInfo(a), LocInfo(b)); +} + +void RegAlloc::MoveOutOfTheWay(HostLoc reg) { + ASSERT(!LocInfo(reg).IsLocked()); + if (!LocInfo(reg).IsEmpty()) { + SpillRegister(reg); + } +} + +void RegAlloc::SpillRegister(HostLoc loc) { + ASSERT_MSG(HostLocIsRegister(loc), "Only registers can be spilled"); + ASSERT_MSG(!LocInfo(loc).IsEmpty(), "There is no need to spill unoccupied registers"); + ASSERT_MSG(!LocInfo(loc).IsLocked(), "Registers that have been allocated must not be spilt"); + + HostLoc new_loc = FindFreeSpill(); + Move(new_loc, loc); +} + +HostLoc RegAlloc::FindFreeSpill() const { + for (size_t i = static_cast<size_t>(HostLoc::FirstSpill); i < hostloc_info.size(); i++) { + HostLoc loc = static_cast<HostLoc>(i); + if (LocInfo(loc).IsEmpty()) + return loc; + } + + ASSERT_FALSE("All spill locations are full"); +} + +HostLocInfo& RegAlloc::LocInfo(HostLoc loc) { + ASSERT(loc != HostLoc::SP && loc != HostLoc::X28 && loc != HostLoc::X29 && loc != HostLoc::X30); + return hostloc_info[static_cast<size_t>(loc)]; +} + +const HostLocInfo& RegAlloc::LocInfo(HostLoc loc) const { + ASSERT(loc != HostLoc::SP && loc != HostLoc::X28 && loc != HostLoc::X29 && loc != HostLoc::X30); + return hostloc_info[static_cast<size_t>(loc)]; +} + +void RegAlloc::EmitMove(size_t bit_width, HostLoc to, HostLoc from) { + if (HostLocIsFPR(to) && HostLocIsFPR(from)) { + // bit_width == 128 + //mov(HostLocToFpr(to), HostLocToFpr(from)); + + ASSERT_FALSE("Unimplemented"); + } else if (HostLocIsGPR(to) && HostLocIsGPR(from)) { + ASSERT(bit_width != 128); + if (bit_width == 64) { + code.MOV(HostLocToReg64(to), HostLocToReg64(from)); + } else { + code.MOV(DecodeReg(HostLocToReg64(to)), DecodeReg(HostLocToReg64(from))); + } + } else if (HostLocIsFPR(to) && HostLocIsGPR(from)) { + ASSERT(bit_width != 128); + if (bit_width == 64) { + code.fp_emitter.FMOV(EncodeRegToDouble(HostLocToFpr(to)), HostLocToReg64(from)); + } else { + code.fp_emitter.FMOV(EncodeRegToSingle(HostLocToFpr(to)), DecodeReg(HostLocToReg64(from))); + } + } else if (HostLocIsGPR(to) && HostLocIsFPR(from)) { + ASSERT(bit_width != 128); + if (bit_width == 64) { + code.fp_emitter.FMOV(HostLocToReg64(to), EncodeRegToDouble(HostLocToFpr(from))); + } else { + code.fp_emitter.FMOV(DecodeReg(HostLocToReg64(to)), EncodeRegToSingle(HostLocToFpr(from))); + } + } else if (HostLocIsFPR(to) && HostLocIsSpill(from)) { + s32 spill_addr = spill_to_addr(from); + // ASSERT(spill_addr.getBit() >= bit_width); + code.fp_emitter.LDR(bit_width, Arm64Gen::INDEX_UNSIGNED, HostLocToFpr(to), Arm64Gen::X28, spill_addr); + } else if (HostLocIsSpill(to) && HostLocIsFPR(from)) { + s32 spill_addr = spill_to_addr(to); + // ASSERT(spill_addr.getBit() >= bit_width); + code.fp_emitter.STR(bit_width, Arm64Gen::INDEX_UNSIGNED, HostLocToFpr(from), Arm64Gen::X28, spill_addr); + } else if (HostLocIsGPR(to)
&& HostLocIsSpill(from)) { + ASSERT(bit_width != 128); + if (bit_width == 64) { + code.LDR(Arm64Gen::INDEX_UNSIGNED, HostLocToReg64(to), Arm64Gen::X28, spill_to_addr(from)); + } else { + code.LDR(Arm64Gen::INDEX_UNSIGNED, DecodeReg(HostLocToReg64(to)), Arm64Gen::X28, spill_to_addr(from)); + } + } else if (HostLocIsSpill(to) && HostLocIsGPR(from)) { + ASSERT(bit_width != 128); + if (bit_width == 64) { + code.STR(Arm64Gen::INDEX_UNSIGNED, HostLocToReg64(from), Arm64Gen::X28, spill_to_addr(to)); + } else { + code.STR(Arm64Gen::INDEX_UNSIGNED, DecodeReg(HostLocToReg64(from)), Arm64Gen::X28, spill_to_addr(to)); + } + } else { + ASSERT_FALSE("Invalid RegAlloc::EmitMove"); + } +} + +void RegAlloc::EmitExchange(HostLoc a, HostLoc b) { + if (HostLocIsGPR(a) && HostLocIsGPR(b)) { + // Is this the best way to do it? + code.EOR(HostLocToReg64(a), HostLocToReg64(a), HostLocToReg64(b)); + code.EOR(HostLocToReg64(b), HostLocToReg64(a), HostLocToReg64(b)); + code.EOR(HostLocToReg64(a), HostLocToReg64(a), HostLocToReg64(b)); + } else if (HostLocIsFPR(a) && HostLocIsFPR(b)) { + ASSERT_FALSE("Check your code: Exchanging vector registers is unnecessary"); + } else { + ASSERT_FALSE("Invalid RegAlloc::EmitExchange"); + } +} + +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/backend/A64/reg_alloc.h b/src/dynarmic/backend/A64/reg_alloc.h new file mode 100644 index 00000000..3eec7fa6 --- /dev/null +++ b/src/dynarmic/backend/A64/reg_alloc.h @@ -0,0 +1,167 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2016 MerryMage + * This software may be used and distributed according to the terms of the GNU + * General Public License version 2 or any later version. + */ + +#pragma once + +#include <array> +#include <functional> +#include <optional> +#include <vector> + +#include <utility> + +#include "backend/A64/block_of_code.h" +#include "backend/A64/hostloc.h" +//#include "backend/A64/oparg.h" +#include "common/common_types.h" +#include "frontend/ir/cond.h" +#include "frontend/ir/microinstruction.h" +#include "frontend/ir/value.h" + +namespace Dynarmic::BackendA64 { + +class RegAlloc; + +struct HostLocInfo { +public: + bool IsLocked() const; + bool IsEmpty() const; + bool IsLastUse() const; + + void ReadLock(); + void WriteLock(); + void AddArgReference(); + void ReleaseOne(); + void ReleaseAll(); + + bool ContainsValue(const IR::Inst* inst) const; + size_t GetMaxBitWidth() const; + + void AddValue(IR::Inst* inst); + +private: + // Current instruction state + size_t is_being_used_count = 0; + bool is_scratch = false; + + // Block state + size_t current_references = 0; + size_t accumulated_uses = 0; + size_t total_uses = 0; + + // Value state + std::vector<IR::Inst*> values; + size_t max_bit_width = 0; +}; + +struct Argument { +public: + using copyable_reference = std::reference_wrapper<Argument>; + + IR::Type GetType() const; + bool IsImmediate() const; + bool IsVoid() const; + + bool FitsInImmediateU32() const; + bool FitsInImmediateS32() const; + + bool GetImmediateU1() const; + u8 GetImmediateU8() const; + u16 GetImmediateU16() const; + u32 GetImmediateU32() const; + u64 GetImmediateS32() const; + u64 GetImmediateU64() const; + IR::Cond GetImmediateCond() const; + + /// Is this value currently in a GPR? + bool IsInGpr() const; + /// Is this value currently in a FPR? + bool IsInFpr() const; + /// Is this value currently in memory?
+ bool IsInMemory() const; + +private: + friend class RegAlloc; + explicit Argument(RegAlloc& reg_alloc) : reg_alloc(reg_alloc) {} + + bool allocated = false; + RegAlloc& reg_alloc; + IR::Value value; +}; + +class RegAlloc final { +public: + using ArgumentInfo = std::array<Argument, IR::max_arg_count>; + + explicit RegAlloc(BlockOfCode& code, size_t num_spills, std::function<s32(HostLoc)> spill_to_addr) + : hostloc_info(NonSpillHostLocCount + num_spills), code(code), spill_to_addr(std::move(spill_to_addr)) {} + + ArgumentInfo GetArgumentInfo(IR::Inst* inst); + + Arm64Gen::ARM64Reg UseGpr(Argument& arg); + Arm64Gen::ARM64Reg UseFpr(Argument& arg); + //OpArg UseOpArg(Argument& arg); + void Use(Argument& arg, HostLoc host_loc); + + Arm64Gen::ARM64Reg UseScratchGpr(Argument& arg); + Arm64Gen::ARM64Reg UseScratchFpr(Argument& arg); + void UseScratch(Argument& arg, HostLoc host_loc); + + void DefineValue(IR::Inst* inst, const Arm64Gen::ARM64Reg& reg); + void DefineValue(IR::Inst* inst, Argument& arg); + + void Release(const Arm64Gen::ARM64Reg& reg); + + Arm64Gen::ARM64Reg ScratchGpr(HostLocList desired_locations = any_gpr); + Arm64Gen::ARM64Reg ScratchFpr(HostLocList desired_locations = any_fpr); + + void HostCall(IR::Inst* result_def = nullptr, std::optional<Argument::copyable_reference> arg0 = {}, + std::optional<Argument::copyable_reference> arg1 = {}, + std::optional<Argument::copyable_reference> arg2 = {}, + std::optional<Argument::copyable_reference> arg3 = {}, + std::optional<Argument::copyable_reference> arg4 = {}, + std::optional<Argument::copyable_reference> arg5 = {}, + std::optional<Argument::copyable_reference> arg6 = {}, + std::optional<Argument::copyable_reference> arg7 = {}); + + // TODO: Values in host flags + + void EndOfAllocScope(); + + void AssertNoMoreUses(); + +private: + friend struct Argument; + + HostLoc SelectARegister(HostLocList desired_locations) const; + std::optional<HostLoc> ValueLocation(const IR::Inst* value) const; + + HostLoc UseImpl(IR::Value use_value, HostLocList desired_locations); + HostLoc UseScratchImpl(IR::Value use_value, HostLocList desired_locations); + HostLoc ScratchImpl(HostLocList desired_locations); + void DefineValueImpl(IR::Inst* def_inst, HostLoc host_loc); + void DefineValueImpl(IR::Inst* def_inst, const IR::Value& use_inst); + + HostLoc LoadImmediate(IR::Value imm, HostLoc reg); + void Move(HostLoc to, HostLoc from); + void CopyToScratch(size_t bit_width, HostLoc to, HostLoc from); + void Exchange(HostLoc a, HostLoc b); + void MoveOutOfTheWay(HostLoc reg); + + void SpillRegister(HostLoc loc); + HostLoc FindFreeSpill() const; + + std::vector<HostLocInfo> hostloc_info; + HostLocInfo& LocInfo(HostLoc loc); + const HostLocInfo& LocInfo(HostLoc loc) const; + + BlockOfCode& code; + std::function<s32(HostLoc)> spill_to_addr; + void EmitMove(size_t bit_width, HostLoc to, HostLoc from); + void EmitExchange(HostLoc a, HostLoc b); +}; + +} // namespace Dynarmic::BackendA64 diff --git a/src/dynarmic/common/math_util.h b/src/dynarmic/common/math_util.h index 5c1f784c..3b278031 100644 --- a/src/dynarmic/common/math_util.h +++ b/src/dynarmic/common/math_util.h @@ -44,4 +44,9 @@ u8 RecipEstimate(u64 a); */ u8 RecipSqrtEstimate(u64 a); +template <typename T> +constexpr bool IsPow2(T imm){ + return imm > 0 && (imm & (imm - 1)) == 0; +} + } // namespace Dynarmic::Common