diff --git a/src/backend_x64/emit_x64.cpp b/src/backend_x64/emit_x64.cpp
index 63be3014..1c567917 100644
--- a/src/backend_x64/emit_x64.cpp
+++ b/src/backend_x64/emit_x64.cpp
@@ -1289,6 +1289,52 @@ void EmitX64::EmitPackedAddU8(IR::Block& block, IR::Inst* inst) {
     }
 }
 
+void EmitX64::EmitPackedSubU8(IR::Block& block, IR::Inst* inst) { // Emits x64 code for IR PackedSubU8: per-byte arg0 - arg1, plus GE bits when requested.
+    auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp); // Tested with `if (ge_inst)` below; the GE code is emitted only when this is set.
+
+    IR::Value a = inst->GetArg(0); // Minuend (psubb below computes a - b).
+    IR::Value b = inst->GetArg(1); // Subtrahend.
+
+    Xbyak::Reg32 reg_a = reg_alloc.UseDefGpr(a, inst).cvt32(); // Holds `a` on entry and receives the result at the end.
+    Xbyak::Reg32 reg_b = reg_alloc.UseGpr(b).cvt32();
+    Xbyak::Reg32 reg_ge; // Only assigned when ge_inst is set.
+
+    Xbyak::Xmm xmm_a = reg_alloc.ScratchXmm();
+    Xbyak::Xmm xmm_b = reg_alloc.ScratchXmm();
+    Xbyak::Xmm xmm_ge; // Only assigned when ge_inst is set.
+
+    if (ge_inst) {
+        EraseInstruction(block, ge_inst); // NOTE(review): presumably the pseudo-op is consumed here instead of being emitted on its own - confirm.
+        inst->DecrementRemainingUses();
+
+        reg_ge = reg_alloc.DefGpr(ge_inst).cvt32(); // The GE value is defined as the result of ge_inst.
+        xmm_ge = reg_alloc.ScratchXmm();
+    }
+    // Move both 32-bit operands into XMM registers so the packed byte instructions can be used.
+    code->movd(xmm_a, reg_a);
+    code->movd(xmm_b, reg_b);
+    if (ge_inst) {
+        code->movaps(xmm_ge, xmm_a);
+        code->pmaxub(xmm_ge, xmm_b); // xmm_ge = per-byte unsigned max(a, b).
+        code->pcmpeqb(xmm_ge, xmm_a); // Byte becomes 0xFF where max(a, b) == a, i.e. a >= b unsigned; otherwise 0x00.
+        code->movd(reg_ge, xmm_ge);
+    }
+    code->psubb(xmm_a, xmm_b); // Per-byte wrapping subtraction a - b.
+    code->movd(reg_a, xmm_a);
+    // Compress the four 0x00/0xFF byte masks in reg_ge down to one bit per byte in bits 0..3.
+    if (ge_inst) {
+        if (cpu_info.has(Xbyak::util::Cpu::tBMI2)) {
+            Xbyak::Reg32 tmp = reg_alloc.ScratchGpr().cvt32();
+            code->mov(tmp, 0x80808080); // Mask selecting bit 7 of each byte.
+            code->pext(reg_ge, reg_ge, tmp); // Gathers the four selected bits into the low four bits.
+        } else {
+            code->and_(reg_ge, 0x80808080); // Keep only bit 7 of each byte.
+            code->imul(reg_ge, reg_ge, 0x0204081); // Places those four bits at bits 28..31; the partial products never overlap, so no carries occur.
+            code->shr(reg_ge, 28); // Move bits 28..31 down to bits 0..3.
+        }
+    }
+}
+
 void EmitX64::EmitPackedHalvingAddU8(IR::Block& block, IR::Inst* inst) {
     IR::Value a = inst->GetArg(0);
     IR::Value b = inst->GetArg(1);
diff --git a/src/frontend/ir/ir_emitter.cpp b/src/frontend/ir/ir_emitter.cpp
index 11e1bb2b..ad2ae6a3 100644
--- a/src/frontend/ir/ir_emitter.cpp
+++ b/src/frontend/ir/ir_emitter.cpp
@@ -330,6 +330,12 @@ IREmitter::ResultAndGE IREmitter::PackedAddU8(const Value& a, const Value& b) {
     return {result, ge};
 }
 
+IREmitter::ResultAndGE IREmitter::PackedSubU8(const Value& a, const Value& b) { // Builds a PackedSubU8 instruction and a GetGEFromOp that takes its result.
+    auto result = Inst(Opcode::PackedSubU8, {a, b});
+    auto ge = Inst(Opcode::GetGEFromOp, {result}); // Its only argument is `result`, tying the GE value to the subtraction above.
+    return {result, ge}; // Both values are returned together as a ResultAndGE.
+}
+
 Value IREmitter::PackedHalvingAddU8(const Value& a, const Value& b) {
     return Inst(Opcode::PackedHalvingAddU8, {a, b});
 }
diff --git a/src/frontend/ir/ir_emitter.h b/src/frontend/ir/ir_emitter.h
index 908aef64..bfe7d7b1 100644
--- a/src/frontend/ir/ir_emitter.h
+++ b/src/frontend/ir/ir_emitter.h
@@ -128,6 +128,7 @@ public:
     Value ByteReverseHalf(const Value& a);
     Value ByteReverseDual(const Value& a);
     ResultAndGE PackedAddU8(const Value& a, const Value& b);
+    ResultAndGE PackedSubU8(const Value& a, const Value& b); // Returns the PackedSubU8 result together with its GetGEFromOp value.
     Value PackedHalvingAddU8(const Value& a, const Value& b);
     Value PackedHalvingAddS8(const Value& a, const Value& b);
     Value PackedHalvingSubU8(const Value& a, const Value& b);
diff --git a/src/frontend/ir/opcodes.inc b/src/frontend/ir/opcodes.inc
index a2f5c3c5..b6a2f5d4 100644
--- a/src/frontend/ir/opcodes.inc
+++ b/src/frontend/ir/opcodes.inc
@@ -73,6 +73,7 @@ OPCODE(ByteReverseWord, T::U32, T::U32
 OPCODE(ByteReverseHalf, T::U16, T::U16 )
 OPCODE(ByteReverseDual, T::U64, T::U64 )
 OPCODE(PackedAddU8, T::U32, T::U32, T::U32 )
+OPCODE(PackedSubU8, T::U32, T::U32, T::U32 )
 OPCODE(PackedHalvingAddU8, T::U32, T::U32, T::U32 )
 OPCODE(PackedHalvingAddS8, T::U32, T::U32, T::U32 )
 OPCODE(PackedHalvingSubU8, T::U32, T::U32, T::U32 )
diff --git a/src/frontend/translate/translate_arm/parallel.cpp b/src/frontend/translate/translate_arm/parallel.cpp
index a7ad61ab..6813c8e2 100644
--- a/src/frontend/translate/translate_arm/parallel.cpp
+++ b/src/frontend/translate/translate_arm/parallel.cpp
@@ -58,7 +58,14 @@ bool ArmTranslatorVisitor::arm_USAX(Cond cond, Reg n, Reg d, Reg m) {
 }
 
 bool ArmTranslatorVisitor::arm_USUB8(Cond cond, Reg n, Reg d, Reg m) {
-    return InterpretThisInstruction();
+    if (d == Reg::PC || n == Reg::PC || m == Reg::PC) // Any of the three registers being PC is reported as unpredictable.
+        return UnpredictableInstruction();
+    if (ConditionPassed(cond)) { // IR is emitted only when ConditionPassed(cond) returns true.
+        auto result = ir.PackedSubU8(ir.GetRegister(n), ir.GetRegister(m)); // Register n is the first operand, register m the second.
+        ir.SetRegister(d, result.result);
+        ir.SetGEFlags(result.ge); // Writes the GE value produced alongside the result.
+    }
+    return true;
 }
 
 bool ArmTranslatorVisitor::arm_USUB16(Cond cond, Reg n, Reg d, Reg m) {