diff --git a/src/frontend/A64/decoder/a64.inc b/src/frontend/A64/decoder/a64.inc
index c1379aeb..a1a8565a 100644
--- a/src/frontend/A64/decoder/a64.inc
+++ b/src/frontend/A64/decoder/a64.inc
@@ -603,8 +603,8 @@ INST(INS_elt, "INS (element)", "01101
 // Data Processing - FP and SIMD - SIMD Three same extra
 INST(SDOT_vec, "SDOT (vector)", "0Q001110zz0mmmmm100101nnnnnddddd")
 INST(UDOT_vec, "UDOT (vector)", "0Q101110zz0mmmmm100101nnnnnddddd")
-//INST(FCMLA_vec, "FCMLA", "0Q101110zz0mmmmm110rr1nnnnnddddd")
-//INST(FCADD_vec, "FCADD", "0Q101110zz0mmmmm111r01nnnnnddddd")
+INST(FCMLA_vec, "FCMLA", "0Q101110zz0mmmmm110rr1nnnnnddddd")
+INST(FCADD_vec, "FCADD", "0Q101110zz0mmmmm111r01nnnnnddddd")
 
 // Data Processing - FP and SIMD - SIMD Two-register misc
 INST(REV64_asimd, "REV64", "0Q001110zz100000000010nnnnnddddd")
diff --git a/src/frontend/A64/translate/impl/simd_three_same_extra.cpp b/src/frontend/A64/translate/impl/simd_three_same_extra.cpp
index 575d2159..a972a557 100644
--- a/src/frontend/A64/translate/impl/simd_three_same_extra.cpp
+++ b/src/frontend/A64/translate/impl/simd_three_same_extra.cpp
@@ -53,4 +53,140 @@ bool TranslatorVisitor::UDOT_vec(bool Q, Imm<2> size, Vec Vm, Vec Vn, Vec Vd) {
     return DotProduct(*this, Q, size, Vm, Vn, Vd, &IREmitter::ZeroExtendToWord);
 }
 
+// FCMLA (vector).
+//
+// Treats Vn, Vm and Vd as consecutive (first, second) element pairs and, for
+// each pair, emits two FPMulAdd operations whose operands are picked by `rot`.
+// Returns true once the IR has been emitted; encodings rejected below are
+// routed through UnallocatedEncoding() instead.
+bool TranslatorVisitor::FCMLA_vec(bool Q, Imm<2> size, Vec Vm, Imm<2> rot, Vec Vn, Vec Vd) {
+    // size == 0b00 is always rejected; size == 0b11 is rejected unless Q is set.
+    if (size == 0 || (size == 0b11 && !Q)) {
+        return UnallocatedEncoding();
+    }
+
+    const size_t esize = 8U << size.ZeroExtend();
+
+    // TODO: Currently we don't support half-precision floating point
+    if (esize == 16) {
+        return UnallocatedEncoding();
+    }
+
+    const size_t datasize = Q ? 128 : 64;
+    const size_t pair_count = (datasize / esize) / 2;
+
+    const IR::U128 vn = V(datasize, Vn);
+    const IR::U128 vm = V(datasize, Vm);
+    const IR::U128 vd = V(datasize, Vd);
+    IR::U128 result = ir.ZeroVector();
+
+    // Operands handed to FPMulAdd for each slot of a pair, per rotation:
+    //   0b00 (0 degrees):   first:  (Vd[first],  Vn[first],   Vm[first])
+    //                       second: (Vd[second], Vn[first],   Vm[second])
+    //   0b01 (90 degrees):  first:  (Vd[first],  Vn[second], -Vm[second])
+    //                       second: (Vd[second], Vn[second],  Vm[first])
+    //   0b10 (180 degrees): first:  (Vd[first],  Vn[first],  -Vm[first])
+    //                       second: (Vd[second], Vn[first],  -Vm[second])
+    //   0b11 (270 degrees): first:  (Vd[first],  Vn[second],  Vm[second])
+    //                       second: (Vd[second], Vn[second], -Vm[first])
+    const auto rotation = rot.ZeroExtend();
+    const bool use_second_of_vn = (rotation & 0b01) != 0;
+    const bool negate_vm_for_first = rotation == 0b01 || rotation == 0b10;
+    const bool negate_vm_for_second = (rotation & 0b10) != 0;
+
+    // Reads one element of Vm, optionally wrapping it in FPNeg.
+    const auto vm_element = [&](size_t index, bool negate) -> IR::U32U64 {
+        if (negate) {
+            return ir.FPNeg(ir.VectorGetElement(esize, vm, index));
+        }
+        return ir.VectorGetElement(esize, vm, index);
+    };
+
+    for (size_t pair = 0; pair < pair_count; ++pair) {
+        const size_t first = pair * 2;
+        const size_t second = first + 1;
+        const size_t selected = use_second_of_vn ? second : first;
+        const size_t opposite = use_second_of_vn ? first : second;
+
+        // IR is emitted in the order: Vm term, Vn term (first slot), then
+        // Vm term, Vn term (second slot), followed by the two Vd addends.
+        const IR::U32U64 vm_for_first = vm_element(selected, negate_vm_for_first);
+        const IR::U32U64 vn_for_first = ir.VectorGetElement(esize, vn, selected);
+        const IR::U32U64 vm_for_second = vm_element(opposite, negate_vm_for_second);
+        const IR::U32U64 vn_for_second = ir.VectorGetElement(esize, vn, selected);
+
+        const IR::U32U64 addend_first = ir.VectorGetElement(esize, vd, first);
+        const IR::U32U64 addend_second = ir.VectorGetElement(esize, vd, second);
+
+        result = ir.VectorSetElement(esize, result, first,
+                                     ir.FPMulAdd(addend_first, vn_for_first, vm_for_first, true));
+        result = ir.VectorSetElement(esize, result, second,
+                                     ir.FPMulAdd(addend_second, vn_for_second, vm_for_second, true));
+    }
+
+    ir.SetQ(Vd, result);
+    return true;
+}
+
+// FCADD (vector).
+//
+// Treats Vn and Vm as consecutive (first, second) element pairs. For each
+// pair the Vm elements are swapped, one of them is negated as selected by
+// `rot`, and the results are added to the matching Vn elements with FPAdd.
+// Returns true once the IR has been emitted; encodings rejected below are
+// routed through UnallocatedEncoding() instead.
+bool TranslatorVisitor::FCADD_vec(bool Q, Imm<2> size, Vec Vm, Imm<1> rot, Vec Vn, Vec Vd) {
+    // size == 0b00 is always rejected; size == 0b11 is rejected unless Q is set.
+    if (size == 0 || (size == 0b11 && !Q)) {
+        return UnallocatedEncoding();
+    }
+
+    const size_t esize = 8U << size.ZeroExtend();
+
+    // TODO: Currently we don't support half-precision floating point
+    if (esize == 16) {
+        return UnallocatedEncoding();
+    }
+
+    const size_t datasize = Q ? 128 : 64;
+    const size_t pair_count = (datasize / esize) / 2;
+
+    const IR::U128 vn = V(datasize, Vn);
+    const IR::U128 vm = V(datasize, Vm);
+    IR::U128 result = ir.ZeroVector();
+
+    // rot == 0 negates the Vm element added into the first slot of a pair;
+    // rot == 1 negates the Vm element added into the second slot.
+    const bool negate_for_first = rot == 0;
+    const bool negate_for_second = rot == 1;
+
+    // Reads one element of Vm, optionally wrapping it in FPNeg.
+    const auto vm_element = [&](size_t index, bool negate) -> IR::U32U64 {
+        if (negate) {
+            return ir.FPNeg(ir.VectorGetElement(esize, vm, index));
+        }
+        return ir.VectorGetElement(esize, vm, index);
+    };
+
+    for (size_t pair = 0; pair < pair_count; ++pair) {
+        const size_t first = pair * 2;
+        const size_t second = first + 1;
+
+        // The first slot receives Vm[second] and the second slot receives Vm[first].
+        const IR::U32U64 vm_for_first = vm_element(second, negate_for_first);
+        const IR::U32U64 vm_for_second = vm_element(first, negate_for_second);
+
+        const IR::U32U64 vn_first = ir.VectorGetElement(esize, vn, first);
+        const IR::U32U64 vn_second = ir.VectorGetElement(esize, vn, second);
+
+        result = ir.VectorSetElement(esize, result, first,
+                                     ir.FPAdd(vn_first, vm_for_first, true));
+        result = ir.VectorSetElement(esize, result, second,
+                                     ir.FPAdd(vn_second, vm_for_second, true));
+    }
+
+    ir.SetQ(Vd, result);
+    return true;
+}
+
 } // namespace Dynarmic::A64