A32: Implement v8 ASIMD V{MAX,MIN}NM
This commit is contained in:
parent
4ed34cdcea
commit
685b85ce0e
@ -950,6 +950,123 @@ void EmitX64::EmitFPVectorMin64(EmitContext& ctx, IR::Inst* inst) {
|
||||
EmitFPVectorMinMax<64, false>(code, ctx, inst);
|
||||
}
|
||||
|
||||
template<size_t fsize, bool is_max>
|
||||
static void EmitFPVectorMinMaxNumeric(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
|
||||
using FPT = mp::unsigned_integer_of_size<fsize>;
|
||||
constexpr u8 mantissa_msb_bit = static_cast<u8>(FP::FPInfo<FPT>::explicit_mantissa_width - 1);
|
||||
|
||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||
const bool fpcr_controlled = args[2].GetImmediateU1();
|
||||
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
|
||||
const Xbyak::Xmm xmm_a = ctx.FPCR(fpcr_controlled).FZ() ? ctx.reg_alloc.UseScratchXmm(args[0]) : ctx.reg_alloc.UseXmm(args[0]);
|
||||
const Xbyak::Xmm xmm_b = ctx.FPCR(fpcr_controlled).FZ() ? ctx.reg_alloc.UseScratchXmm(args[1]) : ctx.reg_alloc.UseXmm(args[1]);
|
||||
|
||||
const Xbyak::Xmm eq = ctx.reg_alloc.ScratchXmm();
|
||||
const Xbyak::Xmm tmp0 = ctx.reg_alloc.ScratchXmm();
|
||||
const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm();
|
||||
|
||||
Xbyak::Label end, fallback;
|
||||
|
||||
MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&]{
|
||||
DenormalsAreZero<fsize>(code, ctx.FPCR(fpcr_controlled), {xmm_a, xmm_b}, xmm0);
|
||||
|
||||
if (!code.HasAVX()) {
|
||||
FCODE(vcmpeqp)(xmm0, xmm_a, xmm_b);
|
||||
FCODE(vcmpunordp)(tmp0, xmm_a, xmm_a);
|
||||
FCODE(vcmpunordp)(tmp1, xmm_b, xmm_b);
|
||||
code.pand(tmp0, xmm_a);
|
||||
code.vpandn(tmp1, xmm_b, tmp1);
|
||||
FCODE(orp)(tmp0, tmp1);
|
||||
if constexpr (is_max) {
|
||||
code.vpand(eq, xmm_a, xmm_b);
|
||||
FCODE(vmaxp)(result, xmm_a, xmm_b);
|
||||
} else {
|
||||
code.vpor(eq, xmm_a, xmm_b);
|
||||
FCODE(vminp)(result, xmm_a, xmm_b);
|
||||
}
|
||||
ICODE(psll)(tmp0, static_cast<u8>(fsize - mantissa_msb_bit));
|
||||
|
||||
// At this point:
|
||||
// tmp0 = IsSNaN(xmm_a) || IsQNaN(xmm_b)
|
||||
// xmm0 == (xmm_a == xmm_b)
|
||||
// result = xmm_a {<,>} xmm_b ? xmm_a : xmm_b
|
||||
|
||||
FCODE(blendvp)(result, eq);
|
||||
FCODE(vblendvp)(result, result, xmm_a, tmp0);
|
||||
} else {
|
||||
/*
|
||||
code.movaps(tmp0, xmm_a);
|
||||
code.movaps(tmp1, xmm_b);
|
||||
FCODE(cmpunordp)(tmp0, tmp0);
|
||||
FCODE(cmpunordp)(tmp1, tmp1);
|
||||
*/
|
||||
FCODE(vcmpunordp)(tmp0, xmm_a, xmm_a);
|
||||
FCODE(vcmpunordp)(tmp1, xmm_b, xmm_b);
|
||||
|
||||
code.pand(tmp0, xmm_a);
|
||||
|
||||
/*
|
||||
code.movaps(xmm0, xmm_b);
|
||||
code.pandn(xmm0, tmp1);
|
||||
code.por(tmp0, xmm0);
|
||||
*/
|
||||
code.vpandn(tmp1, xmm_b, tmp1);
|
||||
FCODE(orp)(tmp0, tmp1);
|
||||
|
||||
ICODE(psll)(tmp0, static_cast<u8>(fsize - mantissa_msb_bit));
|
||||
code.psrad(tmp0, 31);
|
||||
if constexpr (fsize == 64) {
|
||||
code.pshufd(tmp0, tmp0, 0b11110101);
|
||||
}
|
||||
|
||||
/*
|
||||
code.movaps(xmm0, xmm_a);
|
||||
FCODE(cmpeqp)(xmm0, xmm_b);
|
||||
*/
|
||||
FCODE(vcmpeqp)(xmm0, xmm_a, xmm_b);
|
||||
|
||||
code.movaps(eq, xmm_a);
|
||||
code.movaps(result, xmm_a);
|
||||
if constexpr (is_max) {
|
||||
code.pand(eq, xmm_b);
|
||||
FCODE(maxp)(result, xmm_b);
|
||||
} else {
|
||||
code.por(eq, xmm_b);
|
||||
FCODE(minp)(result, xmm_b);
|
||||
}
|
||||
|
||||
code.pand(eq, xmm0);
|
||||
code.pandn(xmm0, result);
|
||||
code.por(eq, xmm0);
|
||||
|
||||
code.movaps(result, xmm_a);
|
||||
code.pand(result, tmp0);
|
||||
code.pandn(tmp0, eq);
|
||||
code.por(result, tmp0);
|
||||
}
|
||||
|
||||
ForceToDefaultNaN<fsize>(code, ctx.FPCR(fpcr_controlled), result);
|
||||
});
|
||||
|
||||
ctx.reg_alloc.DefineValue(inst, result);
|
||||
}
|
||||
|
||||
// Handler named for the FPVectorMaxNumeric32 opcode: instantiates the shared numeric min/max
// emitter for 32-bit lanes in maximum mode.
void EmitX64::EmitFPVectorMaxNumeric32(EmitContext& ctx, IR::Inst* inst) {
    constexpr size_t lane_bits = 32;
    constexpr bool want_maximum = true;
    EmitFPVectorMinMaxNumeric<lane_bits, want_maximum>(code, ctx, inst);
}
|
||||
|
||||
// Handler named for the FPVectorMaxNumeric64 opcode: instantiates the shared numeric min/max
// emitter for 64-bit lanes in maximum mode.
void EmitX64::EmitFPVectorMaxNumeric64(EmitContext& ctx, IR::Inst* inst) {
    constexpr size_t lane_bits = 64;
    constexpr bool want_maximum = true;
    EmitFPVectorMinMaxNumeric<lane_bits, want_maximum>(code, ctx, inst);
}
|
||||
|
||||
// Handler named for the FPVectorMinNumeric32 opcode: instantiates the shared numeric min/max
// emitter for 32-bit lanes in minimum mode.
void EmitX64::EmitFPVectorMinNumeric32(EmitContext& ctx, IR::Inst* inst) {
    constexpr size_t lane_bits = 32;
    constexpr bool want_maximum = false;
    EmitFPVectorMinMaxNumeric<lane_bits, want_maximum>(code, ctx, inst);
}
|
||||
|
||||
// Handler named for the FPVectorMinNumeric64 opcode: instantiates the shared numeric min/max
// emitter for 64-bit lanes in minimum mode.
void EmitX64::EmitFPVectorMinNumeric64(EmitContext& ctx, IR::Inst* inst) {
    constexpr size_t lane_bits = 64;
    constexpr bool want_maximum = false;
    EmitFPVectorMinMaxNumeric<lane_bits, want_maximum>(code, ctx, inst);
}
|
||||
|
||||
// Handler named for the FPVectorMul32 opcode: hands Xbyak's mulps to the generic three-operand
// vector emitter, instantiated for 32-bit lanes with DefaultIndexer.
void EmitX64::EmitFPVectorMul32(EmitContext& ctx, IR::Inst* inst) {
    constexpr size_t lane_bits = 32;
    EmitThreeOpVectorOperation<lane_bits, DefaultIndexer>(code, ctx, inst, &Xbyak::CodeGenerator::mulps);
}
|
||||
|
@ -50,6 +50,8 @@ INST(asimd_VCGT_reg_float, "VCGT (register)", "111100110D1znnnndddd111
|
||||
INST(asimd_VACGE, "VACGE", "111100110Doznnnndddd1110NQM1mmmm") // ASIMD
|
||||
INST(asimd_VMAX_float, "VMAX (floating-point)", "111100100D0znnnndddd1111NQM0mmmm") // ASIMD
|
||||
INST(asimd_VMIN_float, "VMIN (floating-point)", "111100100D1znnnndddd1111NQM0mmmm") // ASIMD
|
||||
// VMAXNM / VMINNM: same field layout as the VRECPS / VRSQRTS entries, differing from them only in
// the 8th pattern character ('1' here, '0' there). The character after D distinguishes
// max ('0') from min ('1').
INST(v8_VMAXNM, "VMAXNM", "111100110D0znnnndddd1111NQM1mmmm") // v8
|
||||
INST(v8_VMINNM, "VMINNM", "111100110D1znnnndddd1111NQM1mmmm") // v8
|
||||
INST(asimd_VRECPS, "VRECPS", "111100100D0znnnndddd1111NQM1mmmm") // ASIMD
|
||||
INST(asimd_VRSQRTS, "VRSQRTS", "111100100D1znnnndddd1111NQM1mmmm") // ASIMD
|
||||
|
||||
|
@ -795,6 +795,18 @@ bool ArmTranslatorVisitor::asimd_VMIN_float(bool D, bool sz, size_t Vn, size_t V
|
||||
});
|
||||
}
|
||||
|
||||
bool ArmTranslatorVisitor::v8_VMAXNM(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
|
||||
return FloatingPointInstruction(*this, D, sz, Vn, Vd, N, Q, M, Vm, [this](const auto&, const auto& reg_n, const auto& reg_m) {
|
||||
return ir.FPVectorMaxNumeric(32, reg_n, reg_m, false);
|
||||
});
|
||||
}
|
||||
|
||||
bool ArmTranslatorVisitor::v8_VMINNM(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
|
||||
return FloatingPointInstruction(*this, D, sz, Vn, Vd, N, Q, M, Vm, [this](const auto&, const auto& reg_n, const auto& reg_m) {
|
||||
return ir.FPVectorMinNumeric(32, reg_n, reg_m, false);
|
||||
});
|
||||
}
|
||||
|
||||
bool ArmTranslatorVisitor::asimd_VRECPS(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
|
||||
return FloatingPointInstruction(*this, D, sz, Vn, Vd, N, Q, M, Vm, [this](const auto&, const auto& reg_n, const auto& reg_m) {
|
||||
return ir.FPVectorRecipStepFused(32, reg_n, reg_m, false);
|
||||
|
@ -503,6 +503,8 @@ struct ArmTranslatorVisitor final {
|
||||
bool asimd_VACGE(bool D, bool op, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
|
||||
bool asimd_VMAX_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
|
||||
bool asimd_VMIN_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
|
||||
// VMAXNM / VMINNM translators; they take the same parameter list as the neighbouring
// asimd_VMAX_float / asimd_VMIN_float handlers (presumably the decoded instruction fields).
bool v8_VMAXNM(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
|
||||
bool v8_VMINNM(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
|
||||
bool asimd_VRECPS(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
|
||||
bool asimd_VRSQRTS(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
|
||||
|
||||
|
@ -2426,6 +2426,16 @@ U128 IREmitter::FPVectorMax(size_t esize, const U128& a, const U128& b, bool fpc
|
||||
UNREACHABLE();
|
||||
}
|
||||
|
||||
// Builds the vector floating-point "maximum numeric" IR instruction for the requested lane width.
// esize must be 32 or 64; any other value reaches UNREACHABLE(). fpcr_controlled is attached to
// the instruction as a one-bit immediate operand.
U128 IREmitter::FPVectorMaxNumeric(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) {
    if (esize == 32) {
        return Inst<U128>(Opcode::FPVectorMaxNumeric32, a, b, Imm1(fpcr_controlled));
    }
    if (esize == 64) {
        return Inst<U128>(Opcode::FPVectorMaxNumeric64, a, b, Imm1(fpcr_controlled));
    }
    UNREACHABLE();
}
|
||||
|
||||
U128 IREmitter::FPVectorMin(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) {
|
||||
switch (esize) {
|
||||
case 32:
|
||||
@ -2436,6 +2446,16 @@ U128 IREmitter::FPVectorMin(size_t esize, const U128& a, const U128& b, bool fpc
|
||||
UNREACHABLE();
|
||||
}
|
||||
|
||||
// Builds the vector floating-point "minimum numeric" IR instruction for the requested lane width.
// esize must be 32 or 64; any other value reaches UNREACHABLE(). fpcr_controlled is attached to
// the instruction as a one-bit immediate operand.
U128 IREmitter::FPVectorMinNumeric(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) {
    if (esize == 32) {
        return Inst<U128>(Opcode::FPVectorMinNumeric32, a, b, Imm1(fpcr_controlled));
    }
    if (esize == 64) {
        return Inst<U128>(Opcode::FPVectorMinNumeric64, a, b, Imm1(fpcr_controlled));
    }
    UNREACHABLE();
}
|
||||
|
||||
U128 IREmitter::FPVectorMul(size_t esize, const U128& a, const U128& b, bool fpcr_controlled) {
|
||||
switch (esize) {
|
||||
case 32:
|
||||
|
@ -358,7 +358,9 @@ public:
|
||||
U128 FPVectorGreater(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
|
||||
U128 FPVectorGreaterEqual(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
|
||||
U128 FPVectorMax(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
|
||||
// Emits FPVectorMaxNumeric32 or FPVectorMaxNumeric64 according to esize (32 or 64 only);
// fpcr_controlled is passed along as a U1 immediate.
U128 FPVectorMaxNumeric(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
|
||||
U128 FPVectorMin(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
|
||||
// Emits FPVectorMinNumeric32 or FPVectorMinNumeric64 according to esize (32 or 64 only);
// fpcr_controlled is passed along as a U1 immediate.
U128 FPVectorMinNumeric(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
|
||||
U128 FPVectorMul(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
|
||||
U128 FPVectorMulAdd(size_t esize, const U128& addend, const U128& op1, const U128& op2, bool fpcr_controlled = true);
|
||||
U128 FPVectorMulX(size_t esize, const U128& a, const U128& b, bool fpcr_controlled = true);
|
||||
|
@ -612,8 +612,12 @@ OPCODE(FPVectorGreaterEqual32, U128, U128
|
||||
OPCODE(FPVectorGreaterEqual64, U128, U128, U128, U1 )
|
||||
OPCODE(FPVectorMax32, U128, U128, U128, U1 )
|
||||
OPCODE(FPVectorMax64, U128, U128, U128, U1 )
|
||||
// Numeric maximum: the IR emitter supplies two U128 vectors plus a U1 fpcr_controlled flag
// (presumably the first type column is the result type - TODO confirm).
OPCODE(FPVectorMaxNumeric32, U128, U128, U128, U1 )
|
||||
OPCODE(FPVectorMaxNumeric64, U128, U128, U128, U1 )
|
||||
OPCODE(FPVectorMin32, U128, U128, U128, U1 )
|
||||
OPCODE(FPVectorMin64, U128, U128, U128, U1 )
|
||||
// Numeric minimum: the IR emitter supplies two U128 vectors plus a U1 fpcr_controlled flag
// (presumably the first type column is the result type - TODO confirm).
OPCODE(FPVectorMinNumeric32, U128, U128, U128, U1 )
|
||||
OPCODE(FPVectorMinNumeric64, U128, U128, U128, U1 )
|
||||
OPCODE(FPVectorMul32, U128, U128, U128, U1 )
|
||||
OPCODE(FPVectorMul64, U128, U128, U128, U1 )
|
||||
OPCODE(FPVectorMulAdd16, U128, U128, U128, U128, U1 )
|
||||
|
Loading…
x
Reference in New Issue
Block a user