From eaf545877a746088451c4d12d8535ec60153c987 Mon Sep 17 00:00:00 2001
From: MerryMage <MerryMage@users.noreply.github.com>
Date: Tue, 23 Jan 2018 16:45:28 +0000
Subject: [PATCH] IR: Implement Vector{Lower,}PairedAdd{8,16,32,64}

---
 src/backend_x64/emit_x64_vector.cpp | 164 ++++++++++++++++++++++++++++
 src/frontend/ir/ir_emitter.cpp      |  28 +++++
 src/frontend/ir/ir_emitter.h        |   7 ++
 src/frontend/ir/opcodes.inc         |   7 ++
 4 files changed, 206 insertions(+)

diff --git a/src/backend_x64/emit_x64_vector.cpp b/src/backend_x64/emit_x64_vector.cpp
index 68c2e53c..2621a6a7 100644
--- a/src/backend_x64/emit_x64_vector.cpp
+++ b/src/backend_x64/emit_x64_vector.cpp
@@ -53,6 +53,170 @@ void EmitX64<JST>::EmitVectorAnd(EmitContext& ctx, IR::Inst* inst) {
     EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pand);
 }
 
+template <typename JST>
+void EmitX64<JST>::EmitVectorLowerPairedAdd8(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
+    Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
+    Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
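+    // Concatenate the lower halves of a and b, then add the two bytes within
+    // each 16-bit lane: the shift/add/shift sequence leaves (lo + hi) mod 0x100
+    // in each lane, and packuswb narrows the sums back down to bytes.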
+    code->punpcklqdq(xmm_a, xmm_b);
+    code->movdqa(tmp, xmm_a);
+    code->psllw(xmm_a, 8);
+    code->paddw(xmm_a, tmp);
+    code->pxor(tmp, tmp);
+    code->psrlw(xmm_a, 8);
+    code->packuswb(xmm_a, tmp);
+
+    ctx.reg_alloc.DefineValue(inst, xmm_a);
+}
+
+template <typename JST>
+void EmitX64<JST>::EmitVectorLowerPairedAdd16(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
+    Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
+    Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+    code->punpcklqdq(xmm_a, xmm_b);
+    if (code->DoesCpuSupport(Xbyak::util::Cpu::tSSSE3)) {
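+        // phaddw adds adjacent 16-bit pairs; pairing with a zeroed register
+        // keeps the upper 64 bits of the result clear.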
+        code->pxor(tmp, tmp);
+        code->phaddw(xmm_a, tmp);
+    } else {
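+        // SSE2 fallback: add the two halfwords within each 32-bit lane; the
+        // arithmetic shift sign-extends each sum so packssdw can narrow it
+        // without saturating.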
+        code->movdqa(tmp, xmm_a);
+        code->pslld(xmm_a, 16);
+        code->paddd(xmm_a, tmp);
+        code->pxor(tmp, tmp);
+        code->psrad(xmm_a, 16);
+        code->packssdw(xmm_a, tmp); // Note: packusdw is SSE4.1, hence the arithmetic shift above.
+    }
+
+    ctx.reg_alloc.DefineValue(inst, xmm_a);
+}
+
+template <typename JST>
+void EmitX64<JST>::EmitVectorLowerPairedAdd32(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
+    Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
+    Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+    code->punpcklqdq(xmm_a, xmm_b);
+    if (code->DoesCpuSupport(Xbyak::util::Cpu::tSSSE3)) {
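+        // phaddd adds adjacent 32-bit pairs; pairing with a zeroed register
+        // keeps the upper 64 bits of the result clear.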
+        code->pxor(tmp, tmp);
+        code->phaddd(xmm_a, tmp);
+    } else {
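+        // SSE2 fallback: add the two dwords within each 64-bit lane, shift the
+        // sums down to the low dword, then gather them into the lower half.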
+        code->movdqa(tmp, xmm_a);
+        code->psllq(xmm_a, 32);
+        code->paddq(xmm_a, tmp);
+        code->psrlq(xmm_a, 32);
+        code->pshufd(xmm_a, xmm_a, 0b11011000);
+    }
+
+    ctx.reg_alloc.DefineValue(inst, xmm_a);
+}
+
+template <typename JST>
+void EmitX64<JST>::EmitVectorPairedAdd8(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+    Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]);
+    Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm();
+    Xbyak::Xmm d = ctx.reg_alloc.ScratchXmm();
+
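+    // Add the two bytes within each 16-bit lane of both operands, then
+    // packuswb narrows the sums: a's pairs fill the lower half of the result
+    // and b's pairs fill the upper half.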
+    code->movdqa(c, a);
+    code->movdqa(d, b);
+    code->psllw(a, 8);
+    code->psllw(b, 8);
+    code->paddw(a, c);
+    code->paddw(b, d);
+    code->psrlw(a, 8);
+    code->psrlw(b, 8);
+    code->packuswb(a, b);
+
+    ctx.reg_alloc.DefineValue(inst, a);
+}
+
+template <typename JST>
+void EmitX64<JST>::EmitVectorPairedAdd16(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    if (code->DoesCpuSupport(Xbyak::util::Cpu::tSSSE3)) {
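+        // phaddw places a's pairwise sums in the lower half of the result and
+        // b's in the upper half.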
+        Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+        Xbyak::Xmm b = ctx.reg_alloc.UseXmm(args[1]);
+
+        code->phaddw(a, b);
+
+        ctx.reg_alloc.DefineValue(inst, a);
+    } else {
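+        // SSE2 fallback: add the two halfwords within each 32-bit lane of both
+        // operands, sign-extend the sums with the arithmetic shift, then pack
+        // them back down to 16 bits.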
+        Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+        Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]);
+        Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm();
+        Xbyak::Xmm d = ctx.reg_alloc.ScratchXmm();
+
+        code->movdqa(c, a);
+        code->movdqa(d, b);
+        code->pslld(a, 16);
+        code->pslld(b, 16);
+        code->paddd(a, c);
+        code->paddd(b, d);
+        code->psrad(a, 16);
+        code->psrad(b, 16);
+        code->packssdw(a, b);
+
+        ctx.reg_alloc.DefineValue(inst, a);
+    }
+}
+
+template <typename JST>
+void EmitX64<JST>::EmitVectorPairedAdd32(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    if (code->DoesCpuSupport(Xbyak::util::Cpu::tSSSE3)) {
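+        // phaddd places a's pairwise sums in the lower half of the result and
+        // b's in the upper half.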
+        Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+        Xbyak::Xmm b = ctx.reg_alloc.UseXmm(args[1]);
+
+        code->phaddd(a, b);
+
+        ctx.reg_alloc.DefineValue(inst, a);
+    } else {
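+        // SSE2 fallback: after the shift and add, the high dword of every
+        // 64-bit lane holds a pairwise sum; shufps gathers those high dwords,
+        // a's into the lower half and b's into the upper half.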
+        Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+        Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(args[1]);
+        Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm();
+        Xbyak::Xmm d = ctx.reg_alloc.ScratchXmm();
+
+        code->movdqa(c, a);
+        code->movdqa(d, b);
+        code->psllq(a, 32);
+        code->psllq(b, 32);
+        code->paddq(a, c);
+        code->paddq(b, d);
+        code->shufps(a, b, 0b11011101);
+
+        ctx.reg_alloc.DefineValue(inst, a);
+    }
+}
+
+template <typename JST>
+void EmitX64<JST>::EmitVectorPairedAdd64(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+    Xbyak::Xmm b = ctx.reg_alloc.UseXmm(args[1]);
+    Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm();
+
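+    // Gather the low qwords of a and b into one register and the high qwords
+    // into another; a single paddq then produces both pairwise sums.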
+    code->movdqa(c, a);
+    code->punpcklqdq(a, b);
+    code->punpckhqdq(c, b);
+    code->paddq(a, c);
+
+    ctx.reg_alloc.DefineValue(inst, a);
+}
+
 } // namespace BackendX64
 } // namespace Dynarmic
 
diff --git a/src/frontend/ir/ir_emitter.cpp b/src/frontend/ir/ir_emitter.cpp
index 16b2b77f..97325027 100644
--- a/src/frontend/ir/ir_emitter.cpp
+++ b/src/frontend/ir/ir_emitter.cpp
@@ -656,6 +656,34 @@ U128 IREmitter::VectorAnd(const U128& a, const U128& b) {
     return Inst<U128>(Opcode::VectorAnd, a, b);
 }
 
+U128 IREmitter::VectorLowerPairedAdd8(const U128& a, const U128& b) {
+    return Inst<U128>(Opcode::VectorLowerPairedAdd8, a, b);
+}
+
+U128 IREmitter::VectorLowerPairedAdd16(const U128& a, const U128& b) {
+    return Inst<U128>(Opcode::VectorLowerPairedAdd16, a, b);
+}
+
+U128 IREmitter::VectorLowerPairedAdd32(const U128& a, const U128& b) {
+    return Inst<U128>(Opcode::VectorLowerPairedAdd32, a, b);
+}
+
+U128 IREmitter::VectorPairedAdd8(const U128& a, const U128& b) {
+    return Inst<U128>(Opcode::VectorPairedAdd8, a, b);
+}
+
+U128 IREmitter::VectorPairedAdd16(const U128& a, const U128& b) {
+    return Inst<U128>(Opcode::VectorPairedAdd16, a, b);
+}
+
+U128 IREmitter::VectorPairedAdd32(const U128& a, const U128& b) {
+    return Inst<U128>(Opcode::VectorPairedAdd32, a, b);
+}
+
+U128 IREmitter::VectorPairedAdd64(const U128& a, const U128& b) {
+    return Inst<U128>(Opcode::VectorPairedAdd64, a, b);
+}
+
 U32 IREmitter::FPAbs32(const U32& a) {
     return Inst<U32>(Opcode::FPAbs32, a);
 }
diff --git a/src/frontend/ir/ir_emitter.h b/src/frontend/ir/ir_emitter.h
index da49cac7..72f1c5c3 100644
--- a/src/frontend/ir/ir_emitter.h
+++ b/src/frontend/ir/ir_emitter.h
@@ -188,6 +188,13 @@ public:
     U128 VectorAdd32(const U128& a, const U128& b);
     U128 VectorAdd64(const U128& a, const U128& b);
     U128 VectorAnd(const U128& a, const U128& b);
+    U128 VectorLowerPairedAdd8(const U128& a, const U128& b);
+    U128 VectorLowerPairedAdd16(const U128& a, const U128& b);
+    U128 VectorLowerPairedAdd32(const U128& a, const U128& b);
+    U128 VectorPairedAdd8(const U128& a, const U128& b);
+    U128 VectorPairedAdd16(const U128& a, const U128& b);
+    U128 VectorPairedAdd32(const U128& a, const U128& b);
+    U128 VectorPairedAdd64(const U128& a, const U128& b);
 
     U32 FPAbs32(const U32& a);
     U64 FPAbs64(const U64& a);
diff --git a/src/frontend/ir/opcodes.inc b/src/frontend/ir/opcodes.inc
index 1575b91c..66a51844 100644
--- a/src/frontend/ir/opcodes.inc
+++ b/src/frontend/ir/opcodes.inc
@@ -166,6 +166,13 @@ OPCODE(VectorAdd16,             T::U128,        T::U128,        T::U128
 OPCODE(VectorAdd32,             T::U128,        T::U128,        T::U128                         )
 OPCODE(VectorAdd64,             T::U128,        T::U128,        T::U128                         )
 OPCODE(VectorAnd,               T::U128,        T::U128,        T::U128                         )
+OPCODE(VectorLowerPairedAdd8,   T::U128,        T::U128,        T::U128                         )
+OPCODE(VectorLowerPairedAdd16,  T::U128,        T::U128,        T::U128                         )
+OPCODE(VectorLowerPairedAdd32,  T::U128,        T::U128,        T::U128                         )
+OPCODE(VectorPairedAdd8,        T::U128,        T::U128,        T::U128                         )
+OPCODE(VectorPairedAdd16,       T::U128,        T::U128,        T::U128                         )
+OPCODE(VectorPairedAdd32,       T::U128,        T::U128,        T::U128                         )
+OPCODE(VectorPairedAdd64,       T::U128,        T::U128,        T::U128                         )
 
 // Floating-point operations
 OPCODE(FPAbs32,                 T::U32,         T::U32                                          )