Add a VectorPolynomialMultiply8 IR opcode with an x64 fallback emitter.

PolynomialMultiply<T>(lhs, rhs) XORs together (rhs << i) for every bit i
that is set in lhs and accumulates the result in a T, i.e. a carry-less
multiply whose result keeps only the low BitSize<T>() bits.
EmitX64::EmitVectorPolynomialMultiply8 applies it element by element to the
two input vectors through EmitTwoArgumentFallback, and
IREmitter::VectorPolynomialMultiply emits the new opcode.

NOTE(review): this patch was rebuilt from a copy in which line breaks,
indentation and every <...> span had been stripped. The following were
inferred and should be confirmed against the upstream commit:
  - template argument lists: <typename T>, Common::BitSize<T>(),
    std::bitset<bit_size>, VectorArray<u8>, PolynomialMultiply<u8>,
    Inst<U128>;
  - include targets in the first hunk (<bitset> follows from the new
    std::bitset use; <algorithm>, <cstdlib>, <type_traits> are presumed);
  - indentation and the column alignment in opcodes.inc.
Because context whitespace is reconstructed, apply with
`git apply --ignore-whitespace`.

diff --git a/src/backend_x64/emit_x64_vector.cpp b/src/backend_x64/emit_x64_vector.cpp
index 206e3b74..23c17d23 100644
--- a/src/backend_x64/emit_x64_vector.cpp
+++ b/src/backend_x64/emit_x64_vector.cpp
@@ -5,6 +5,7 @@
  */
 
 #include <algorithm>
+#include <bitset>
 #include <cstdlib>
 #include <type_traits>
 
@@ -1865,6 +1866,27 @@ void EmitX64::EmitVectorPairedAddUnsignedWiden32(EmitContext& ctx, IR::Inst* ins
     ctx.reg_alloc.DefineValue(inst, a);
 }
 
+template <typename T>
+static T PolynomialMultiply(T lhs, T rhs) {
+    constexpr size_t bit_size = Common::BitSize<T>();
+    const std::bitset<bit_size> operand(lhs);
+
+    T res = 0;
+    for (size_t i = 0; i < bit_size; i++) {
+        if (operand[i]) {
+            res ^= rhs << i;
+        }
+    }
+
+    return res;
+}
+
+void EmitX64::EmitVectorPolynomialMultiply8(EmitContext& ctx, IR::Inst* inst) {
+    EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u8>& result, const VectorArray<u8>& a, const VectorArray<u8>& b) {
+        std::transform(a.begin(), a.end(), b.begin(), result.begin(), PolynomialMultiply<u8>);
+    });
+}
+
 void EmitX64::EmitVectorPopulationCount(EmitContext& ctx, IR::Inst* inst) {
     if (code.DoesCpuSupport(Xbyak::util::Cpu::tAVX512_BITALG)) {
         auto args = ctx.reg_alloc.GetArgumentInfo(inst);
diff --git a/src/frontend/ir/ir_emitter.cpp b/src/frontend/ir/ir_emitter.cpp
index b3001b5f..120775a6 100644
--- a/src/frontend/ir/ir_emitter.cpp
+++ b/src/frontend/ir/ir_emitter.cpp
@@ -1194,6 +1194,10 @@ U128 IREmitter::VectorPairedAddUnsignedWiden(size_t original_esize, const U128&
     return {};
 }
 
+U128 IREmitter::VectorPolynomialMultiply(const U128& a, const U128& b) {
+    return Inst<U128>(Opcode::VectorPolynomialMultiply8, a, b);
+}
+
 U128 IREmitter::VectorPopulationCount(const U128& a) {
     return Inst<U128>(Opcode::VectorPopulationCount, a);
 }
diff --git a/src/frontend/ir/ir_emitter.h b/src/frontend/ir/ir_emitter.h
index 25bdaff1..5e66712a 100644
--- a/src/frontend/ir/ir_emitter.h
+++ b/src/frontend/ir/ir_emitter.h
@@ -238,6 +238,7 @@ public:
     U128 VectorPairedAddLower(size_t esize, const U128& a, const U128& b);
     U128 VectorPairedAddSignedWiden(size_t original_esize, const U128& a);
     U128 VectorPairedAddUnsignedWiden(size_t original_esize, const U128& a);
+    U128 VectorPolynomialMultiply(const U128& a, const U128& b);
     U128 VectorPopulationCount(const U128& a);
     U128 VectorReverseBits(const U128& a);
     U128 VectorRotateLeft(size_t esize, const U128& a, u8 amount);
diff --git a/src/frontend/ir/opcodes.inc b/src/frontend/ir/opcodes.inc
index 9ab17dcd..2e2534b2 100644
--- a/src/frontend/ir/opcodes.inc
+++ b/src/frontend/ir/opcodes.inc
@@ -330,6 +330,7 @@ OPCODE(VectorPairedAdd8, T::U128, T::U128,
 OPCODE(VectorPairedAdd16,                                   T::U128,        T::U128,        T::U128                                         )
 OPCODE(VectorPairedAdd32,                                   T::U128,        T::U128,        T::U128                                         )
 OPCODE(VectorPairedAdd64,                                   T::U128,        T::U128,        T::U128                                         )
+OPCODE(VectorPolynomialMultiply8,                           T::U128,        T::U128,        T::U128                                         )
 OPCODE(VectorPopulationCount,                               T::U128,        T::U128                                                         )
 OPCODE(VectorReverseBits,                                   T::U128,        T::U128                                                         )
 OPCODE(VectorRoundingHalvingAddS8,                          T::U128,        T::U128,        T::U128                                         )