diff --git a/src/backend/x64/emit_x64_vector.cpp b/src/backend/x64/emit_x64_vector.cpp
index f1892a89..9e19f052 100644
--- a/src/backend/x64/emit_x64_vector.cpp
+++ b/src/backend/x64/emit_x64_vector.cpp
@@ -4326,6 +4326,69 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
     ctx.reg_alloc.DefineValue(inst, result);
 }
 
+void EmitX64::EmitVectorTranspose8(EmitContext& ctx, IR::Inst* inst) { // Emits code for VectorTranspose8: each 16-bit word of the result pairs one byte of args[0] (low byte) with one byte of args[1] (high byte). NOTE(review): presumably backs AArch64 TRN1/TRN2 - confirm against the frontend.
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    const Xbyak::Xmm lower = ctx.reg_alloc.UseScratchXmm(args[0]); // Supplies the low byte of every result word; modified in place and handed to DefineValue below.
+    const Xbyak::Xmm upper = ctx.reg_alloc.UseScratchXmm(args[1]); // Supplies the high byte of every result word; also modified in place below.
+    const bool part = args[2].GetImmediateU1(); // false: use the low byte of each source word; true: use the high byte of each source word.
+
+    if (!part) {
+        code.pand(lower, code.MConst(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF)); // Keep only the low byte of each word of lower.
+        code.psllw(upper, 8); // Move the low byte of each word of upper into the high byte; the low byte becomes zero.
+    } else {
+        code.psrlw(lower, 8); // Move the high byte of each word of lower into the low byte; the high byte becomes zero.
+        code.pand(upper, code.MConst(xword, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00)); // Keep only the high byte of each word of upper.
+    }
+    code.por(lower, upper); // The operands now occupy disjoint bytes, so OR merges them without interference.
+
+    ctx.reg_alloc.DefineValue(inst, lower);
+}
+
+void EmitX64::EmitVectorTranspose16(EmitContext& ctx, IR::Inst* inst) { // Emits code for VectorTranspose16: each 32-bit dword of the result pairs one 16-bit half of args[0] (low half) with one 16-bit half of args[1] (high half).
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    const Xbyak::Xmm lower = ctx.reg_alloc.UseScratchXmm(args[0]); // Supplies the low half of every result dword; modified in place and handed to DefineValue below.
+    const Xbyak::Xmm upper = ctx.reg_alloc.UseScratchXmm(args[1]); // Supplies the high half of every result dword; also modified in place below.
+    const bool part = args[2].GetImmediateU1(); // false: use the low half of each source dword; true: use the high half of each source dword.
+
+    if (!part) {
+        code.pand(lower, code.MConst(xword, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF)); // Keep only the low 16 bits of each dword of lower.
+        code.pslld(upper, 16); // Move the low 16 bits of each dword of upper into the high 16 bits; the low half becomes zero.
+    } else {
+        code.psrld(lower, 16); // Move the high 16 bits of each dword of lower into the low 16 bits; the high half becomes zero.
+        code.pand(upper, code.MConst(xword, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000)); // Keep only the high 16 bits of each dword of upper.
+    }
+    code.por(lower, upper); // The operands now occupy disjoint halves, so OR merges them without interference.
+
+    ctx.reg_alloc.DefineValue(inst, lower);
+}
+
+void EmitX64::EmitVectorTranspose32(EmitContext& ctx, IR::Inst* inst) { // Emits code for VectorTranspose32: the result alternates 32-bit lanes taken from args[0] and args[1].
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    const Xbyak::Xmm lower = ctx.reg_alloc.UseScratchXmm(args[0]); // Destination of both shuffles below and the register handed to DefineValue.
+    const Xbyak::Xmm upper = ctx.reg_alloc.UseXmm(args[1]); // Only ever a source operand of shufps below.
+    const bool part = args[2].GetImmediateU1(); // false: use lanes 0 and 2 of each input; true: use lanes 1 and 3.
+
+    code.shufps(lower, upper, !part ? 0b10001000 : 0b11011101); // lower = {lower[0], lower[2], upper[0], upper[2]} when !part, otherwise {lower[1], lower[3], upper[1], upper[3]}.
+    code.pshufd(lower, lower, 0b11011000); // Reorder to {x[0], x[2], x[1], x[3]} so lanes from lower and upper alternate.
+
+    ctx.reg_alloc.DefineValue(inst, lower);
+}
+
+void EmitX64::EmitVectorTranspose64(EmitContext& ctx, IR::Inst* inst) { // Emits code for VectorTranspose64: the result holds one 64-bit lane of args[0] followed by one 64-bit lane of args[1].
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    const Xbyak::Xmm lower = ctx.reg_alloc.UseScratchXmm(args[0]); // Destination of shufpd below and the register handed to DefineValue.
+    const Xbyak::Xmm upper = ctx.reg_alloc.UseXmm(args[1]); // Only ever a source operand of shufpd below.
+    const bool part = args[2].GetImmediateU1(); // false: use lane 0 of each input; true: use lane 1 of each input.
+
+    code.shufpd(lower, upper, !part ? 0b00 : 0b11); // lower = {lower[0], upper[0]} when !part, otherwise {lower[1], upper[1]}.
+
+    ctx.reg_alloc.DefineValue(inst, lower);
+}
+
 static void EmitVectorUnsignedAbsoluteDifference(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
diff --git a/src/frontend/ir/ir_emitter.cpp b/src/frontend/ir/ir_emitter.cpp
index 16b9371e..d59d6af5 100644
--- a/src/frontend/ir/ir_emitter.cpp
+++ b/src/frontend/ir/ir_emitter.cpp
@@ -1761,6 +1761,20 @@ U128 IREmitter::VectorTableLookup(const U128& defaults, const Table& table, cons
     return Inst(Opcode::VectorTableLookup128, defaults, table, indices);
 }
 
+U128 IREmitter::VectorTranspose(size_t esize, const U128& a, const U128& b, bool part) { // Builds the VectorTranspose{8,16,32,64} instruction matching esize, with operands a, b and part wrapped as a 1-bit immediate.
+    switch (esize) {
+    case 8:
+        return Inst(Opcode::VectorTranspose8, a, b, Imm1(part));
+    case 16:
+        return Inst(Opcode::VectorTranspose16, a, b, Imm1(part));
+    case 32:
+        return Inst(Opcode::VectorTranspose32, a, b, Imm1(part));
+    case 64:
+        return Inst(Opcode::VectorTranspose64, a, b, Imm1(part));
+    }
+    UNREACHABLE(); // Reached only when esize is not 8, 16, 32 or 64.
+}
+
 U128 IREmitter::VectorUnsignedAbsoluteDifference(size_t esize, const U128& a, const U128& b) {
     switch (esize) {
     case 8:
diff --git a/src/frontend/ir/ir_emitter.h b/src/frontend/ir/ir_emitter.h
index 03607636..3deed1eb 100644
--- a/src/frontend/ir/ir_emitter.h
+++ b/src/frontend/ir/ir_emitter.h
@@ -301,6 +301,7 @@ public:
     Table VectorTable(std::vector values);
     U64 VectorTableLookup(const U64& defaults, const Table& table, const U64& indices);
     U128 VectorTableLookup(const U128& defaults, const Table& table, const U128& indices);
+    U128 VectorTranspose(size_t esize, const U128& a, const U128& b, bool part); // esize is the element width in bits (8, 16, 32 or 64); part chooses which alternate elements of a and b are interleaved.
     U128 VectorUnsignedAbsoluteDifference(size_t esize, const U128& a, const U128& b);
     U128 VectorUnsignedRecipEstimate(const U128& a);
     U128 VectorUnsignedRecipSqrtEstimate(const U128& a);
diff --git a/src/frontend/ir/opcodes.inc b/src/frontend/ir/opcodes.inc
index 1ede0e8b..fd88d026 100644
--- a/src/frontend/ir/opcodes.inc
+++ b/src/frontend/ir/opcodes.inc
@@ -473,6 +473,10 @@ OPCODE(VectorSub64, U128, U128
 OPCODE(VectorTable, Table, Opaque, Opaque, Opaque, Opaque )
 OPCODE(VectorTableLookup64, U64, U64, Table, U64 )
 OPCODE(VectorTableLookup128, U128, U128, Table, U128 )
+OPCODE(VectorTranspose8, U128, U128, U128, U1 ) // Two U128 operands plus a U1 selector, matching IREmitter::VectorTranspose; presumably the first type column is the result type - confirm against the OPCODE macro.
+OPCODE(VectorTranspose16, U128, U128, U128, U1 )
+OPCODE(VectorTranspose32, U128, U128, U128, U1 )
+OPCODE(VectorTranspose64, U128, U128, U128, U1 )
 OPCODE(VectorUnsignedAbsoluteDifference8, U128, U128, U128 )
 OPCODE(VectorUnsignedAbsoluteDifference16, U128, U128, U128 )
 OPCODE(VectorUnsignedAbsoluteDifference32, U128, U128, U128 )