From 55bddc767f634fbe5f38ab034e6bb3627b523a3b Mon Sep 17 00:00:00 2001
From: MerryMage <MerryMage@users.noreply.github.com>
Date: Tue, 9 Jun 2020 22:28:28 +0100
Subject: [PATCH] backend/x64: Touch PEXT/PDEP code

* Use pext/pdep in code paths that previously did not use them
* Restrict pext/pdep to non-AMD CPUs, since current AMD implementations
  execute these instructions in microcode and they are very slow there
* Use imul/and bit-packing sequences as the fallback on AMD and on CPUs
  without BMI2 (sketched below)
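
The imul/and fallback is the usual multiply-to-scatter/gather bit trick.
For reference, a minimal standalone sketch of the NZCV conversion
follows; the constant names mirror nzcv_util.h as used in the diff, the
values are inferred from the literals this patch replaces, and the
helper function names are illustrative only (they are not part of the
emitted code):

    #include <cstdint>

    // Host flag layout stored in cpsr_nzcv: N=bit 15, Z=bit 14, C=bit 8, V=bit 0.
    constexpr uint32_t x64_mask = 0xC101;
    // Guest layout: N=bit 31, Z=bit 30, C=bit 29, V=bit 28.
    constexpr uint32_t arm_mask = 0xF0000000;
    // Multiplying by a sum of powers of two copies the masked bits to several
    // offsets at once; the final mask keeps only the copies that landed where
    // we want them.
    constexpr uint32_t to_x64_multiplier = 0x1081;        // (1 << 12) | (1 << 7) | (1 << 0)
    constexpr uint32_t from_x64_multiplier = 0x10210000;  // ((1 << 12) | (1 << 5) | (1 << 0)) << 16

    constexpr uint32_t ArmToX64(uint32_t cpsr) {
        return ((cpsr >> 28) * to_x64_multiplier) & x64_mask;
    }

    constexpr uint32_t X64ToArm(uint32_t nzcv) {
        return ((nzcv & x64_mask) * from_x64_multiplier) & arm_mask;
    }

    static_assert(ArmToX64(0xF0000000) == 0xC101, "all flags set round-trips");
    static_assert(X64ToArm(0xC101) == 0xF0000000, "all flags set round-trips");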
---
 src/backend/x64/a32_emit_x64.cpp  | 140 ++++++++++++++++++------------
 src/backend/x64/a64_emit_x64.cpp  |  25 ++++--
 src/backend/x64/block_of_code.cpp |   4 +
 src/backend/x64/block_of_code.h   |   1 +
 src/backend/x64/emit_x64.cpp      |  17 +++-
 5 files changed, 125 insertions(+), 62 deletions(-)
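
Note (below the ---, so not part of the commit message): the non-BMI2
GE-flag paths in a32_emit_x64.cpp use the same multiply trick with the
constant 0x00204081 ((1 << 0) | (1 << 7) | (1 << 14) | (1 << 21)). A
sketch of the round trip, with hypothetical helper names, assuming
cpsr_ge keeps each GE flag as a whole byte (0x00 or 0xFF), which is the
representation the immediate path in EmitA32SetGEFlagsCompressed also
uses:

    #include <cstdint>

    // Pack the four GE flag bytes into CPSR bits 16-19.
    constexpr uint32_t PackGE(uint32_t ge_bytes) {
        uint32_t x = ge_bytes & 0x80808080;  // MSB of each byte: bits 7, 15, 23, 31
        x *= 0x00204081;                     // gathers those bits at positions 28-31
        x >>= 12;
        return x & 0x000F0000;
    }

    // Expand CPSR bits 16-19 back into one byte per flag.
    constexpr uint32_t UnpackGE(uint32_t cpsr) {
        uint32_t x = (cpsr & 0x000F0000) >> 16;  // GE bits now at positions 0-3
        x *= 0x00204081;                         // scatters them to bits 0, 8, 16, 24
        x &= 0x01010101;
        return 0x80808080 ^ (0x80808080 - x);    // per byte: 0x01 -> 0xFF, 0x00 -> 0x00
    }

    static_assert(PackGE(0xFF00FF00) == 0x000A0000, "GE1 and GE3 pack correctly");
    static_assert(UnpackGE(0x000A0000) == 0xFF00FF00, "GE1 and GE3 unpack correctly");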

diff --git a/src/backend/x64/a32_emit_x64.cpp b/src/backend/x64/a32_emit_x64.cpp
index 33fe4ccd..d1c5c64a 100644
--- a/src/backend/x64/a32_emit_x64.cpp
+++ b/src/backend/x64/a32_emit_x64.cpp
@@ -412,16 +412,12 @@ void A32EmitX64::EmitA32SetVector(A32EmitContext& ctx, IR::Inst* inst) {
     }
 }
 
-static u32 GetCpsrImpl(A32JitState* jit_state) {
-    return jit_state->Cpsr();
-}
-
 void A32EmitX64::EmitA32GetCpsr(A32EmitContext& ctx, IR::Inst* inst) {
-    if (code.HasBMI2()) {
-        const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
-        const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
-        const Xbyak::Reg32 tmp2 = ctx.reg_alloc.ScratchGpr().cvt32();
+    const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
+    const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
+    const Xbyak::Reg32 tmp2 = ctx.reg_alloc.ScratchGpr().cvt32();
 
+    if (code.HasFastBMI2()) {
         // Here we observe that cpsr_et and cpsr_ge are right next to each other in memory,
         // so we load them both at the same time with one 64-bit read. This allows us to
         // extract all of their bits together at once with one pext.
@@ -431,60 +427,78 @@ void A32EmitX64::EmitA32GetCpsr(A32EmitContext& ctx, IR::Inst* inst) {
         code.pext(result.cvt64(), result.cvt64(), tmp.cvt64());
         code.mov(tmp, 0x000f0220);
         code.pdep(result, result, tmp);
-        code.mov(tmp, dword[r15 + offsetof(A32JitState, cpsr_q)]);
-        code.shl(tmp, 27);
+    } else {
+        code.mov(result, dword[r15 + offsetof(A32JitState, upper_location_descriptor)]);
+        code.imul(result, result, 0x12);
+        code.and_(result, 0x00000220);
+
+        code.mov(tmp, dword[r15 + offsetof(A32JitState, cpsr_ge)]);
+        code.and_(tmp, 0x80808080);
+        code.imul(tmp, tmp, 0x00204081);
+        code.shr(tmp, 12);
+        code.and_(tmp, 0x000f0000);
         code.or_(result, tmp);
-        code.mov(tmp2, dword[r15 + offsetof(A32JitState, cpsr_nzcv)]);
+    }
+
+    code.mov(tmp, dword[r15 + offsetof(A32JitState, cpsr_q)]);
+    code.shl(tmp, 27);
+    code.or_(result, tmp);
+
+    code.mov(tmp2, dword[r15 + offsetof(A32JitState, cpsr_nzcv)]);
+    if (code.HasFastBMI2()) {
         code.mov(tmp, NZCV::x64_mask);
         code.pext(tmp2, tmp2, tmp);
         code.shl(tmp2, 28);
-        code.or_(result, tmp2);
-        code.or_(result, dword[r15 + offsetof(A32JitState, cpsr_jaifm)]);
-
-        ctx.reg_alloc.DefineValue(inst, result);
     } else {
-        ctx.reg_alloc.HostCall(inst);
-        code.mov(code.ABI_PARAM1, code.r15);
-        code.CallFunction(&GetCpsrImpl);
+        code.and_(tmp2, NZCV::x64_mask);
+        code.imul(tmp2, tmp2, NZCV::from_x64_multiplier);
+        code.and_(tmp2, NZCV::arm_mask);
     }
-}
+    code.or_(result, tmp2);
 
-static void SetCpsrImpl(u32 value, A32JitState* jit_state) {
-    jit_state->SetCpsr(value);
+    code.or_(result, dword[r15 + offsetof(A32JitState, cpsr_jaifm)]);
+
+    ctx.reg_alloc.DefineValue(inst, result);
 }
 
 void A32EmitX64::EmitA32SetCpsr(A32EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    if (code.HasBMI2()) {
-        const Xbyak::Reg32 cpsr = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
-        const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
-        const Xbyak::Reg32 tmp2 = ctx.reg_alloc.ScratchGpr().cvt32();
+    const Xbyak::Reg32 cpsr = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
+    const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
+    const Xbyak::Reg32 tmp2 = ctx.reg_alloc.ScratchGpr().cvt32();
 
-        if (config.always_little_endian) {
-            code.and_(cpsr, 0xFFFFFDFF);
-        }
+    if (config.always_little_endian) {
+        code.and_(cpsr, 0xFFFFFDFF);
+    }
 
-        // cpsr_q
-        code.bt(cpsr, 27);
-        code.setc(code.byte[r15 + offsetof(A32JitState, cpsr_q)]);
+    // cpsr_q
+    code.bt(cpsr, 27);
+    code.setc(code.byte[r15 + offsetof(A32JitState, cpsr_q)]);
 
-        // cpsr_nzcv
-        code.mov(tmp, cpsr);
-        code.shr(tmp, 28);
+    // cpsr_nzcv
+    code.mov(tmp, cpsr);
+    code.shr(tmp, 28);
+    if (code.HasFastBMI2()) {
         code.mov(tmp2, NZCV::x64_mask);
         code.pdep(tmp, tmp, tmp2);
-        code.mov(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], tmp);
+    } else {
+        code.imul(tmp, tmp, NZCV::to_x64_multiplier);
+        code.and_(tmp, NZCV::x64_mask);
+    }
+    code.mov(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], tmp);
 
-        // cpsr_jaifm
-        code.mov(tmp, cpsr);
-        code.and_(tmp, 0x07F0FDDF);
-        code.mov(dword[r15 + offsetof(A32JitState, cpsr_jaifm)], tmp);
+    // cpsr_jaifm
+    code.mov(tmp, cpsr);
+    code.and_(tmp, 0x07F0FDDF);
+    code.mov(dword[r15 + offsetof(A32JitState, cpsr_jaifm)], tmp);
 
+    if (code.HasFastBMI2()) {
         // cpsr_et and cpsr_ge
         static_assert(offsetof(A32JitState, upper_location_descriptor) + 4 == offsetof(A32JitState, cpsr_ge));
         // This mask is 0x7FFF0000, because we do not want the MSB to be sign extended to the upper dword.
         static_assert((A32::LocationDescriptor::FPSCR_MODE_MASK & ~0x7FFF0000) == 0);
+
         code.and_(qword[r15 + offsetof(A32JitState, upper_location_descriptor)], u32(0x7FFF0000));
         code.mov(tmp, 0x000f0220);
         code.pext(cpsr, cpsr, tmp);
@@ -497,14 +511,21 @@ void A32EmitX64::EmitA32SetCpsr(A32EmitContext& ctx, IR::Inst* inst) {
         code.xor_(tmp.cvt64(), tmp2.cvt64());
         code.or_(qword[r15 + offsetof(A32JitState, upper_location_descriptor)], tmp.cvt64());
     } else {
-        ctx.reg_alloc.HostCall(nullptr, args[0]);
+        code.and_(dword[r15 + offsetof(A32JitState, upper_location_descriptor)], u32(0xFFFF0000));
+        code.mov(tmp, cpsr);
+        code.and_(tmp, 0x00000220);
+        code.imul(tmp, tmp, 0x00900000);
+        code.shr(tmp, 28);
+        code.or_(dword[r15 + offsetof(A32JitState, upper_location_descriptor)], tmp);
 
-        if (config.always_little_endian) {
-            code.and_(code.ABI_PARAM1, 0xFFFFFDFF);
-        }
-
-        code.mov(code.ABI_PARAM2, code.r15);
-        code.CallFunction(&SetCpsrImpl);
+        code.and_(cpsr, 0x000f0000);
+        code.shr(cpsr, 16);
+        code.imul(cpsr, cpsr, 0x00204081);
+        code.and_(cpsr, 0x01010101);
+        code.mov(tmp, 0x80808080);
+        code.sub(tmp, cpsr);
+        code.xor_(tmp, 0x80808080);
+        code.mov(dword[r15 + offsetof(A32JitState, cpsr_ge)], tmp);
     }
 }
 
@@ -514,7 +535,7 @@ void A32EmitX64::EmitA32SetCpsrNZCV(A32EmitContext& ctx, IR::Inst* inst) {
         const u32 imm = args[0].GetImmediateU32();
 
         code.mov(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], NZCV::ToX64(imm));
-    } else if (code.HasBMI2()) {
+    } else if (code.HasFastBMI2()) {
         const Xbyak::Reg32 a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
         const Xbyak::Reg32 b = ctx.reg_alloc.ScratchGpr().cvt32();
 
@@ -539,7 +560,7 @@ void A32EmitX64::EmitA32SetCpsrNZCVQ(A32EmitContext& ctx, IR::Inst* inst) {
 
         code.mov(dword[r15 + offsetof(A32JitState, cpsr_nzcv)], NZCV::ToX64(imm));
         code.mov(code.byte[r15 + offsetof(A32JitState, cpsr_q)], u8((imm & 0x08000000) != 0 ? 1 : 0));
-    } else if (code.HasBMI2()) {
+    } else if (code.HasFastBMI2()) {
         const Xbyak::Reg32 a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
         const Xbyak::Reg32 b = ctx.reg_alloc.ScratchGpr().cvt32();
 
@@ -666,7 +687,7 @@ void A32EmitX64::EmitA32SetGEFlagsCompressed(A32EmitContext& ctx, IR::Inst* inst
         ge |= Common::Bit<16>(imm) ? 0x000000FF : 0;
 
         code.mov(dword[r15 + offsetof(A32JitState, cpsr_ge)], ge);
-    } else if (code.HasBMI2()) {
+    } else if (code.HasFastBMI2()) {
         const Xbyak::Reg32 a = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
         const Xbyak::Reg32 b = ctx.reg_alloc.ScratchGpr().cvt32();
 
@@ -802,13 +823,24 @@ void A32EmitX64::EmitA32GetFpscrNZCV(A32EmitContext& ctx, IR::Inst* inst) {
 
 void A32EmitX64::EmitA32SetFpscrNZCV(A32EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    if (code.HasFastBMI2()) {
+        const Xbyak::Reg32 value = ctx.reg_alloc.UseGpr(args[0]).cvt32();
+        const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
+
+        code.mov(tmp, NZCV::x64_mask);
+        code.pext(tmp, value, tmp);
+        code.shl(tmp, 28);
+        code.mov(dword[r15 + offsetof(A32JitState, fpsr_nzcv)], tmp);
+
+        return;
+    }
+
     const Xbyak::Reg32 value = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
 
-    code.and_(value, 0b11000001'00000001);
-    code.imul(value, value, 0b00010000'00100001);
-    code.shl(value, 16);
-    code.and_(value, 0xF0000000);
-
+    code.and_(value, NZCV::x64_mask);
+    code.imul(value, value, NZCV::from_x64_multiplier);
+    code.and_(value, NZCV::arm_mask);
     code.mov(dword[r15 + offsetof(A32JitState, fpsr_nzcv)], value);
 }
 
diff --git a/src/backend/x64/a64_emit_x64.cpp b/src/backend/x64/a64_emit_x64.cpp
index c09d8631..6097a983 100644
--- a/src/backend/x64/a64_emit_x64.cpp
+++ b/src/backend/x64/a64_emit_x64.cpp
@@ -381,9 +381,18 @@ void A64EmitX64::EmitA64GetNZCVRaw(A64EmitContext& ctx, IR::Inst* inst) {
     const Xbyak::Reg32 nzcv_raw = ctx.reg_alloc.ScratchGpr().cvt32();
 
     code.mov(nzcv_raw, dword[r15 + offsetof(A64JitState, cpsr_nzcv)]);
-    code.and_(nzcv_raw, NZCV::x64_mask);
-    code.imul(nzcv_raw, nzcv_raw, NZCV::from_x64_multiplier);
-    code.and_(nzcv_raw, NZCV::arm_mask);
+
+    if (code.HasFastBMI2()) {
+        const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
+        code.mov(tmp, NZCV::x64_mask);
+        code.pext(nzcv_raw, nzcv_raw, tmp);
+        code.shl(nzcv_raw, 28);
+    } else {
+        code.and_(nzcv_raw, NZCV::x64_mask);
+        code.imul(nzcv_raw, nzcv_raw, NZCV::from_x64_multiplier);
+        code.and_(nzcv_raw, NZCV::arm_mask);
+    }
+
     ctx.reg_alloc.DefineValue(inst, nzcv_raw);
 }
 
@@ -392,8 +401,14 @@ void A64EmitX64::EmitA64SetNZCVRaw(A64EmitContext& ctx, IR::Inst* inst) {
     const Xbyak::Reg32 nzcv_raw = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
 
     code.shr(nzcv_raw, 28);
-    code.imul(nzcv_raw, nzcv_raw, NZCV::to_x64_multiplier);
-    code.and_(nzcv_raw, NZCV::x64_mask);
+    if (code.HasFastBMI2()) {
+        const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
+        code.mov(tmp, NZCV::x64_mask);
+        code.pdep(nzcv_raw, nzcv_raw, tmp);
+    } else {
+        code.imul(nzcv_raw, nzcv_raw, NZCV::to_x64_multiplier);
+        code.and_(nzcv_raw, NZCV::x64_mask);
+    }
     code.mov(dword[r15 + offsetof(A64JitState, cpsr_nzcv)], nzcv_raw);
 }
 
diff --git a/src/backend/x64/block_of_code.cpp b/src/backend/x64/block_of_code.cpp
index 0557a3c0..b6c1b546 100644
--- a/src/backend/x64/block_of_code.cpp
+++ b/src/backend/x64/block_of_code.cpp
@@ -355,6 +355,10 @@ bool BlockOfCode::HasBMI2() const {
     return DoesCpuSupport(Xbyak::util::Cpu::tBMI2);
 }
 
+bool BlockOfCode::HasFastBMI2() const {
+    return DoesCpuSupport(Xbyak::util::Cpu::tBMI2) && !DoesCpuSupport(Xbyak::util::Cpu::tAMD);
+}
+
 bool BlockOfCode::HasFMA() const {
     return DoesCpuSupport(Xbyak::util::Cpu::tFMA);
 }
diff --git a/src/backend/x64/block_of_code.h b/src/backend/x64/block_of_code.h
index 4d4d3294..f55051fe 100644
--- a/src/backend/x64/block_of_code.h
+++ b/src/backend/x64/block_of_code.h
@@ -150,6 +150,7 @@ public:
     bool HasLZCNT() const;
     bool HasBMI1() const;
     bool HasBMI2() const;
+    bool HasFastBMI2() const;
     bool HasFMA() const;
     bool HasAVX2() const;
     bool HasAVX512_Skylake() const;
diff --git a/src/backend/x64/emit_x64.cpp b/src/backend/x64/emit_x64.cpp
index 4e3f9b48..6261f9ab 100644
--- a/src/backend/x64/emit_x64.cpp
+++ b/src/backend/x64/emit_x64.cpp
@@ -9,6 +9,7 @@
 
 #include "backend/x64/block_of_code.h"
 #include "backend/x64/emit_x64.h"
+#include "backend/x64/nzcv_util.h"
 #include "backend/x64/perf_map.h"
 #include "common/assert.h"
 #include "common/bit_util.h"
@@ -158,13 +159,23 @@ void EmitX64::EmitNZCVFromPackedFlags(EmitContext& ctx, IR::Inst* inst) {
         value |= Common::Bit<29>(args[0].GetImmediateU32()) ? (1 << 8) : 0;
         value |= Common::Bit<28>(args[0].GetImmediateU32()) ? (1 << 0) : 0;
         code.mov(nzcv, value);
+        ctx.reg_alloc.DefineValue(inst, nzcv);
+    } else if (code.HasFastBMI2()) {
+        const Xbyak::Reg32 nzcv = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
+        const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
+
+        code.shr(nzcv, 28);
+        code.mov(tmp, NZCV::x64_mask);
+        code.pdep(nzcv, nzcv, tmp);
+
         ctx.reg_alloc.DefineValue(inst, nzcv);
     } else {
         const Xbyak::Reg32 nzcv = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
-        // TODO: Optimize
+
         code.shr(nzcv, 28);
-        code.imul(nzcv, nzcv, 0b00010000'10000001);
-        code.and_(nzcv.cvt8(), 1);
+        code.imul(nzcv, nzcv, NZCV::to_x64_multiplier);
+        code.and_(nzcv, NZCV::x64_mask);
+
         ctx.reg_alloc.DefineValue(inst, nzcv);
     }
 }