From 685b85ce0e471d8dcd64cc97b1460acd784d2743 Mon Sep 17 00:00:00 2001
From: MerryMage
Date: Thu, 25 Jun 2020 19:34:02 +0100
Subject: [PATCH] A32: Implement v8 ASIMD V{MAX,MIN}NM

---
 .../x64/emit_x64_vector_floating_point.cpp   | 117 ++++++++++++++++++
 src/frontend/A32/decoder/asimd.inc           |   2 +
 .../A32/translate/impl/asimd_three_same.cpp  |  12 ++
 .../A32/translate/impl/translate_arm.h       |   2 +
 src/frontend/ir/ir_emitter.cpp               |  20 +++
 src/frontend/ir/ir_emitter.h                 |   2 +
 src/frontend/ir/opcodes.inc                  |   4 +
 7 files changed, 159 insertions(+)

diff --git a/src/backend/x64/emit_x64_vector_floating_point.cpp b/src/backend/x64/emit_x64_vector_floating_point.cpp
index ba41b658..0501cc9d 100644
--- a/src/backend/x64/emit_x64_vector_floating_point.cpp
+++ b/src/backend/x64/emit_x64_vector_floating_point.cpp
@@ -950,6 +950,123 @@ void EmitX64::EmitFPVectorMin64(EmitContext& ctx, IR::Inst* inst) {
     EmitFPVectorMinMax<64, false>(code, ctx, inst);
 }
 
+template<size_t fsize, bool is_max>
+static void EmitFPVectorMinMaxNumeric(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+    using FPT = mp::unsigned_integer_of_size<fsize>;
+    constexpr u8 mantissa_msb_bit = static_cast<u8>(FP::FPInfo<FPT>::explicit_mantissa_width - 1);
+
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+    const bool fpcr_controlled = args[2].GetImmediateU1();
+    const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+    const Xbyak::Xmm xmm_a = ctx.FPCR(fpcr_controlled).FZ() ? ctx.reg_alloc.UseScratchXmm(args[0]) : ctx.reg_alloc.UseXmm(args[0]);
+    const Xbyak::Xmm xmm_b = ctx.FPCR(fpcr_controlled).FZ() ? ctx.reg_alloc.UseScratchXmm(args[1]) : ctx.reg_alloc.UseXmm(args[1]);
+
+    const Xbyak::Xmm eq = ctx.reg_alloc.ScratchXmm();
+    const Xbyak::Xmm tmp0 = ctx.reg_alloc.ScratchXmm();
+    const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm();
+
+    Xbyak::Label end, fallback;
+
+    MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&]{
+        DenormalsAreZero<fsize>(code, ctx.FPCR(fpcr_controlled), {xmm_a, xmm_b}, xmm0);
+
+        if (code.HasAVX()) {
+            FCODE(vcmpeqp)(xmm0, xmm_a, xmm_b);
+            FCODE(vcmpunordp)(tmp0, xmm_a, xmm_a);
+            FCODE(vcmpunordp)(tmp1, xmm_b, xmm_b);
+            code.pand(tmp0, xmm_a);
+            code.vpandn(tmp1, xmm_b, tmp1);
+            FCODE(orp)(tmp0, tmp1);
+            if constexpr (is_max) {
+                code.vpand(eq, xmm_a, xmm_b);
+                FCODE(vmaxp)(result, xmm_a, xmm_b);
+            } else {
+                code.vpor(eq, xmm_a, xmm_b);
+                FCODE(vminp)(result, xmm_a, xmm_b);
+            }
+            ICODE(psll)(tmp0, static_cast<u8>(fsize - mantissa_msb_bit));
+
+            // At this point:
+            // tmp0 = IsSNaN(xmm_a) || IsQNaN(xmm_b)
+            // xmm0 == (xmm_a == xmm_b)
+            // result = xmm_a {<,>} xmm_b ? xmm_a : xmm_b
+
+            FCODE(blendvp)(result, eq);
+            FCODE(vblendvp)(result, result, xmm_a, tmp0);
+        } else {
+            /*
+            FCODE(vcmpunordp)(tmp0, xmm_a, xmm_a);
+            FCODE(vcmpunordp)(tmp1, xmm_b, xmm_b);
+            */
+            code.movaps(tmp0, xmm_a);
+            code.movaps(tmp1, xmm_b);
+            FCODE(cmpunordp)(tmp0, tmp0);
+            FCODE(cmpunordp)(tmp1, tmp1);
+
+            code.pand(tmp0, xmm_a);
+
+            /*
+            code.vpandn(tmp1, xmm_b, tmp1);
+            FCODE(orp)(tmp0, tmp1);
+            */
+            code.movaps(xmm0, xmm_b);
+            code.pandn(xmm0, tmp1);
+            code.por(tmp0, xmm0);
+
+            ICODE(psll)(tmp0, static_cast<u8>(fsize - mantissa_msb_bit));
+            code.psrad(tmp0, 31);
+            if constexpr (fsize == 64) {
+                code.pshufd(tmp0, tmp0, 0b11110101);
+            }
+
+            /*
+            FCODE(vcmpeqp)(xmm0, xmm_a, xmm_b);
+            */
+            code.movaps(xmm0, xmm_a);
+            FCODE(cmpeqp)(xmm0, xmm_b);
+
+            code.movaps(eq, xmm_a);
+            code.movaps(result, xmm_a);
+            if constexpr (is_max) {
+                code.pand(eq, xmm_b);
+                FCODE(maxp)(result, xmm_b);
+            } else {
+                code.por(eq, xmm_b);
+                FCODE(minp)(result, xmm_b);
+            }
+
+            code.pand(eq, xmm0);
+            code.pandn(xmm0, result);
+            code.por(eq, xmm0);
+
+            code.movaps(result, xmm_a);
+            code.pand(result, tmp0);
+            code.pandn(tmp0, eq);
+            code.por(result, tmp0);
+        }
+
+        ForceToDefaultNaN<fsize>(code, ctx.FPCR(fpcr_controlled), result);
+    });
+
+    ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitFPVectorMaxNumeric32(EmitContext& ctx, IR::Inst* inst) {
+    EmitFPVectorMinMaxNumeric<32, true>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPVectorMaxNumeric64(EmitContext& ctx, IR::Inst* inst) {
+    EmitFPVectorMinMaxNumeric<64, true>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPVectorMinNumeric32(EmitContext& ctx, IR::Inst* inst) {
+    EmitFPVectorMinMaxNumeric<32, false>(code, ctx, inst);
+}
+
+void EmitX64::EmitFPVectorMinNumeric64(EmitContext& ctx, IR::Inst* inst) {
+    EmitFPVectorMinMaxNumeric<64, false>(code, ctx, inst);
+}
+
 void EmitX64::EmitFPVectorMul32(EmitContext& ctx, IR::Inst* inst) {
     EmitThreeOpVectorOperation<32, DefaultIndexer>(code, ctx, inst, &Xbyak::CodeGenerator::mulps);
 }
diff --git a/src/frontend/A32/decoder/asimd.inc b/src/frontend/A32/decoder/asimd.inc
index 44029fb6..90048fdb 100644
--- a/src/frontend/A32/decoder/asimd.inc
+++ b/src/frontend/A32/decoder/asimd.inc
@@ -50,6 +50,8 @@ INST(asimd_VCGT_reg_float,    "VCGT (register)",          "111100110D1znnnndddd111
 INST(asimd_VACGE,             "VACGE",                    "111100110Doznnnndddd1110NQM1mmmm") // ASIMD
 INST(asimd_VMAX_float,        "VMAX (floating-point)",    "111100100D0znnnndddd1111NQM0mmmm") // ASIMD
 INST(asimd_VMIN_float,        "VMIN (floating-point)",    "111100100D1znnnndddd1111NQM0mmmm") // ASIMD
+INST(v8_VMAXNM,               "VMAXNM",                   "111100110D0znnnndddd1111NQM1mmmm") // v8
+INST(v8_VMINNM,               "VMINNM",                   "111100110D1znnnndddd1111NQM1mmmm") // v8
 INST(asimd_VRECPS,            "VRECPS",                   "111100100D0znnnndddd1111NQM1mmmm") // ASIMD
 INST(asimd_VRSQRTS,           "VRSQRTS",                  "111100100D1znnnndddd1111NQM1mmmm") // ASIMD
diff --git a/src/frontend/A32/translate/impl/asimd_three_same.cpp b/src/frontend/A32/translate/impl/asimd_three_same.cpp
index 9994e4e7..4c9ac0b2 100644
--- a/src/frontend/A32/translate/impl/asimd_three_same.cpp
+++ b/src/frontend/A32/translate/impl/asimd_three_same.cpp
@@ -795,6 +795,18 @@ bool ArmTranslatorVisitor::asimd_VMIN_float(bool D, bool sz, size_t Vn, size_t V
     });
 }
 
+bool ArmTranslatorVisitor::v8_VMAXNM(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+    return FloatingPointInstruction(*this, D, sz, Vn, Vd, N, Q, M, Vm, [this](const auto&, const auto& reg_n, const auto& reg_m) {
+        return ir.FPVectorMaxNumeric(32, reg_n, reg_m, false);
+    });
+}
+
+bool ArmTranslatorVisitor::v8_VMINNM(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
+    return FloatingPointInstruction(*this, D, sz, Vn, Vd, N, Q, M, Vm, [this](const auto&, const auto& reg_n, const auto& reg_m) {
+        return ir.FPVectorMinNumeric(32, reg_n, reg_m, false);
+    });
+}
+
 bool ArmTranslatorVisitor::asimd_VRECPS(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
     return FloatingPointInstruction(*this, D, sz, Vn, Vd, N, Q, M, Vm, [this](const auto&, const auto& reg_n, const auto& reg_m) {
         return ir.FPVectorRecipStepFused(32, reg_n, reg_m, false);
diff --git a/src/frontend/A32/translate/impl/translate_arm.h b/src/frontend/A32/translate/impl/translate_arm.h
index dfc02b1d..4938e0ae 100644
--- a/src/frontend/A32/translate/impl/translate_arm.h
+++ b/src/frontend/A32/translate/impl/translate_arm.h
@@ -503,6 +503,8 @@ struct ArmTranslatorVisitor final {
     bool asimd_VACGE(bool D, bool op, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
     bool asimd_VMAX_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
     bool asimd_VMIN_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+    bool v8_VMAXNM(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+    bool v8_VMINNM(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
     bool asimd_VRECPS(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
     bool asimd_VRSQRTS(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
diff --git a/src/frontend/ir/ir_emitter.cpp b/src/frontend/ir/ir_emitter.cpp
index 7f19e4f8..0e684bec 100644
--- a/src/frontend/ir/ir_emitter.cpp
+++ b/src/frontend/ir/ir_emitter.cpp
@@ -2426,6 +2426,16 @@ U128 IREmitter::FPVectorMax(size_t esize, const U128& a, const U128& b, bool fpc
     UNREACHABLE();
 }
 
+U128 IREmitter::FPVectorMaxNumeric(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) {
+    switch (esize) {
+    case 32:
+        return Inst<U128>(Opcode::FPVectorMaxNumeric32, a, b, Imm1(fpcr_controlled));
+    case 64:
+        return Inst<U128>(Opcode::FPVectorMaxNumeric64, a, b, Imm1(fpcr_controlled));
+    }
+    UNREACHABLE();
+}
+
 U128 IREmitter::FPVectorMin(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) {
     switch (esize) {
     case 32:
@@ -2436,6 +2446,16 @@ U128 IREmitter::FPVectorMin(size_t esize, const U128& a, const U128& b, bool fpc
     UNREACHABLE();
 }
 
+U128 IREmitter::FPVectorMinNumeric(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) {
+    switch (esize) {
+    case 32:
+        return Inst<U128>(Opcode::FPVectorMinNumeric32, a, b, Imm1(fpcr_controlled));
+    case 64:
+        return Inst<U128>(Opcode::FPVectorMinNumeric64, a, b, Imm1(fpcr_controlled));
+    }
+    UNREACHABLE();
+}
+
 U128 IREmitter::FPVectorMul(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) {
     switch (esize) {
     case 32:
diff --git a/src/frontend/ir/ir_emitter.h b/src/frontend/ir/ir_emitter.h
index 95051edb..5125216b 100644
--- a/src/frontend/ir/ir_emitter.h
+++ b/src/frontend/ir/ir_emitter.h
@@ -358,7 +358,9 @@ public:
     U128 FPVectorGreater(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
     U128 FPVectorGreaterEqual(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
     U128 FPVectorMax(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
+    U128 FPVectorMaxNumeric(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
     U128 FPVectorMin(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
+    U128 FPVectorMinNumeric(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
     U128 FPVectorMul(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
     U128 FPVectorMulAdd(size_t esize, const U128& addend, const U128& op1, const U128& op2, bool fpcr_controlled = true);
     U128 FPVectorMulX(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
diff --git a/src/frontend/ir/opcodes.inc b/src/frontend/ir/opcodes.inc
index 4314d5a3..3e144994 100644
--- a/src/frontend/ir/opcodes.inc
+++ b/src/frontend/ir/opcodes.inc
@@ -612,8 +612,12 @@ OPCODE(FPVectorGreaterEqual32,                              U128,           U128
 OPCODE(FPVectorGreaterEqual64,                              U128,           U128,           U128,           U1              )
 OPCODE(FPVectorMax32,                                       U128,           U128,           U128,           U1              )
 OPCODE(FPVectorMax64,                                       U128,           U128,           U128,           U1              )
+OPCODE(FPVectorMaxNumeric32,                                U128,           U128,           U128,           U1              )
+OPCODE(FPVectorMaxNumeric64,                                U128,           U128,           U128,           U1              )
 OPCODE(FPVectorMin32,                                       U128,           U128,           U128,           U1              )
 OPCODE(FPVectorMin64,                                       U128,           U128,           U128,           U1              )
+OPCODE(FPVectorMinNumeric32,                                U128,           U128,           U128,           U1              )
+OPCODE(FPVectorMinNumeric64,                                U128,           U128,           U128,           U1              )
 OPCODE(FPVectorMul32,                                       U128,           U128,           U128,           U1              )
 OPCODE(FPVectorMul64,                                       U128,           U128,           U128,           U1              )
 OPCODE(FPVectorMulAdd16,                                    U128,           U128,           U128,           U128,           U1              )
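
Note (editorial addition, not part of the patch): VMAXNM and VMINNM follow the IEEE 754-2008 maxNum/minNum rules, which is what the per-lane bit manipulation in EmitFPVectorMinMaxNumeric above implements. As a rough scalar sketch of the intended semantics -- ignoring signalling NaNs, default-NaN (FPCR.DN) substitution and flush-to-zero, all of which the backend handles separately -- the behaviour looks like the following. MaxNumeric/MinNumeric are illustrative names, not dynarmic functions.

#include <algorithm>
#include <cmath>

// If exactly one operand is a NaN, the numeric operand wins; otherwise this
// degenerates to an ordinary max/min (two NaNs still produce a NaN).
template<typename T>
T MaxNumeric(T a, T b) {
    if (std::isnan(a) != std::isnan(b)) {
        return std::isnan(a) ? b : a;
    }
    return std::max(a, b);
}

template<typename T>
T MinNumeric(T a, T b) {
    if (std::isnan(a) != std::isnan(b)) {
        return std::isnan(a) ? b : a;
    }
    return std::min(a, b);
}

For the ASIMD encodings added here the translator passes fpcr_controlled = false, so the operation runs under the standard FPSCR value, matching the other A32 ASIMD floating-point instructions in asimd_three_same.cpp.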