From 6c2c68bce6c3589a576c461bfb343185db42f68d Mon Sep 17 00:00:00 2001
From: Lioncash <mathew1800@gmail.com>
Date: Thu, 11 Apr 2019 16:44:15 -0400
Subject: [PATCH] A64: Implement FCMLA's indexed element variant

With this, all of the instructions introduced with ARMv8.3-CompNum have
an implementation.
---
 src/frontend/A64/decoder/a64.inc              |  2 +-
 .../impl/simd_vector_x_indexed_element.cpp    | 89 +++++++++++++++++++
 2 files changed, 90 insertions(+), 1 deletion(-)

diff --git a/src/frontend/A64/decoder/a64.inc b/src/frontend/A64/decoder/a64.inc
index 5bdca7e8..6d0ae98e 100644
--- a/src/frontend/A64/decoder/a64.inc
+++ b/src/frontend/A64/decoder/a64.inc
@@ -896,7 +896,7 @@ INST(UDOT_elt,               "UDOT (by element)",                         "0Q101
 //INST(SQRDMLSH_elt_2,         "SQRDMLSH (by element)",                     "0Q101111zzLMmmmm1111H0nnnnnddddd")
 //INST(FMULX_elt_3,            "FMULX (by element)",                        "0Q10111100LMmmmm1001H0nnnnnddddd")
 INST(FMULX_elt_4,            "FMULX (by element)",                        "0Q1011111zLMmmmm1001H0nnnnnddddd")
-//INST(FCMLA_elt,              "FCMLA (by element)",                        "0Q101111zzLMmmmm0rr1H0nnnnnddddd")
+INST(FCMLA_elt,              "FCMLA (by element)",                        "0Q101111zzLMmmmm0rr1H0nnnnnddddd")
 
 // Data Processing - FP and SIMD - Cryptographic three register
 INST(SM3TT1A,                "SM3TT1A",                                   "11001110010mmmmm10ii00nnnnnddddd")
diff --git a/src/frontend/A64/translate/impl/simd_vector_x_indexed_element.cpp b/src/frontend/A64/translate/impl/simd_vector_x_indexed_element.cpp
index 2fc62bab..8eda641d 100644
--- a/src/frontend/A64/translate/impl/simd_vector_x_indexed_element.cpp
+++ b/src/frontend/A64/translate/impl/simd_vector_x_indexed_element.cpp
@@ -187,6 +187,95 @@ bool TranslatorVisitor::MUL_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4>
     return MultiplyByElement(*this, Q, size, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::None);
 }
 
+bool TranslatorVisitor::FCMLA_elt(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<2> rot, Imm<1> H, Vec Vn, Vec Vd) { // Emits IR for FCMLA (by element).
+    if (size == 0b00 || size == 0b11) { // Only size 0b01 and 0b10 are accepted.
+        return ReservedValue();
+    }
+
+    if (size == 0b01 && H == 1 && Q == 0) { // Q == 0 holds four 16-bit elements; an H:L index >= 2 would address past them.
+        return ReservedValue();
+    }
+
+    if (size == 0b10 && (L == 1 || Q == 0)) { // 32-bit form: index is H alone (see below); L == 1 and Q == 0 are both rejected.
+        return ReservedValue();
+    }
+
+    const size_t esize = 8U << size.ZeroExtend(); // Element width in bits: 16 or 32 after the checks above.
+
+    // TODO: We don't support the half-precision floating point variant yet.
+    if (esize == 16) {
+        return InterpretThisInstruction();
+    }
+
+    const size_t index = [=] { // Selects an element pair of Vm: elements 2*index and 2*index + 1.
+        if (size == 0b01) { // Currently unreachable (esize == 16 returned above); only matters once the 16-bit form is translated here.
+            return concatenate(H, L).ZeroExtend();
+        }
+        return H.ZeroExtend();
+    }();
+
+    const Vec Vm = concatenate(M, Vmlo).ZeroExtend<Vec>(); // Register number is M:Vmlo.
+
+    const size_t datasize = Q ? 128 : 64;
+    const size_t num_elements = datasize / esize;
+    const size_t num_iterations = num_elements / 2; // Elements are handled as (even, odd) pairs.
+
+    const IR::U128 operand1 = V(datasize, Vn); // Source of element2/element4.
+    const IR::U128 operand2 = V(datasize, Vm); // Source of the indexed pair (element1/element3).
+    const IR::U128 operand3 = V(datasize, Vd); // Current destination contents, fed to FPMulAdd below.
+    IR::U128 result = ir.ZeroVector(); // Elements not written by the loop stay zero.
+
+    IR::U32U64 element1; // element1/element3: from operand2's indexed pair, negated for some rotations.
+    IR::U32U64 element2; // element2/element4: from operand1's current pair (same element for both).
+    IR::U32U64 element3;
+    IR::U32U64 element4;
+    for (size_t e = 0; e < num_iterations; ++e) {
+        const size_t first = e * 2; // Even position of pair e.
+        const size_t second = first + 1; // Odd position of pair e.
+
+        const size_t index_first = index * 2; // Depends only on index, not on e.
+        const size_t index_second = index_first + 1;
+
+        switch (rot.ZeroExtend()) { // rot picks which elements are used and which operand2 elements are negated.
+            case 0b00: // 0 degrees
+                element1 = ir.VectorGetElement(esize, operand2, index_first);
+                element2 = ir.VectorGetElement(esize, operand1, first);
+                element3 = ir.VectorGetElement(esize, operand2, index_second);
+                element4 = ir.VectorGetElement(esize, operand1, first);
+                break;
+            case 0b01: // 90 degrees
+                element1 = ir.FPNeg(ir.VectorGetElement(esize, operand2, index_second));
+                element2 = ir.VectorGetElement(esize, operand1, second);
+                element3 = ir.VectorGetElement(esize, operand2, index_first);
+                element4 = ir.VectorGetElement(esize, operand1, second);
+                break;
+            case 0b10: // 180 degrees
+                element1 = ir.FPNeg(ir.VectorGetElement(esize, operand2, index_first));
+                element2 = ir.VectorGetElement(esize, operand1, first);
+                element3 = ir.FPNeg(ir.VectorGetElement(esize, operand2, index_second));
+                element4 = ir.VectorGetElement(esize, operand1, first);
+                break;
+            case 0b11: // 270 degrees
+                element1 = ir.VectorGetElement(esize, operand2, index_second);
+                element2 = ir.VectorGetElement(esize, operand1, second);
+                element3 = ir.FPNeg(ir.VectorGetElement(esize, operand2, index_first));
+                element4 = ir.VectorGetElement(esize, operand1, second);
+                break;
+        }
+
+        const IR::U32U64 operand3_elem1 = ir.VectorGetElement(esize, operand3, first);
+        const IR::U32U64 operand3_elem2 = ir.VectorGetElement(esize, operand3, second);
+        // Each result element is FPMulAdd(Vd element, Vn element, Vm element); meaning of the trailing `true` is defined by FPMulAdd - TODO confirm.
+        result = ir.VectorSetElement(esize, result, first,
+                                     ir.FPMulAdd(operand3_elem1, element2, element1, true));
+        result = ir.VectorSetElement(esize, result, second,
+                                     ir.FPMulAdd(operand3_elem2, element4, element3, true));
+    }
+
+    ir.SetQ(Vd, result); // Q is always 1 by this point (16-bit form deferred, 32-bit form requires Q == 1), so datasize is 128.
+    return true;
+}
+
 bool TranslatorVisitor::FMLA_elt_4(bool Q, bool sz, Imm<1> L, Imm<1> M, Imm<4> Vmlo, Imm<1> H, Vec Vn, Vec Vd) {
     return FPMultiplyByElement(*this, Q, sz, L, M, Vmlo, H, Vn, Vd, ExtraBehavior::Accumulate);
 }