ryujinx/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs
LDj3SNuD 5e724cf24e
Add Profiled Persistent Translation Cache. (#769)
* Delete DelegateTypes.cs

* Delete DelegateCache.cs

* Add files via upload

* Update Horizon.cs

* Update Program.cs

* Update MainWindow.cs

* Update Aot.cs

* Update RelocEntry.cs

* Update Translator.cs

* Update MemoryManager.cs

* Update InstEmitMemoryHelper.cs

* Update Delegates.cs

* Nit.

* Nit.

* Nit.

* 10 fewer MSIL bytes for us

* Add comment. Nits.

* Update Translator.cs

* Update Aot.cs

* Nits.

* Opt..

* Opt..

* Opt..

* Opt..

* Allow to change compression level.

* Update MemoryManager.cs

* Update Translator.cs

* Manage corner cases during the save phase. Nits.

* Update Aot.cs

* Translator response tweak for Aot disabled. Nit.

* Nit.

* Nits.

* Create DelegateHelpers.cs

* Update Delegates.cs

* Nit.

* Nit.

* Nits.

* Fix due to #784.

* Fixes due to #757 & #841.

* Fix due to #846.

* Fix due to #847.

* Use MethodInfo for managed method calls.

Use IR methods instead of managed methods about Max/Min (S/U).
Follow-ups & Nits.

* Add missing exception messages.

Reintroduce slow path for Fmov_Vi.
Implement slow path for Fmov_Si.

* Switch to the new folder structure.

Nits.

* Impl. index-based relocation information. Impl. cache file version field.

* Nit.

* Address gdkchan comments.

Mainly:
- fixed cache file corruption issue on exit; - exposed a way to disable AOT on the GUI.

* Address AcK77 comment.

* Address Thealexbarney, jduncanator & emmauss comments.

Header magic, CpuId (FI) & Aot -> Ptc.

* Adaptation to the new application reloading system.

Improvements to the call system of managed methods.
Follow-ups.
Nits.

* Get the same boot times as on master when PTC is disabled.

* Profiled Aot.

* A32 support (#897).

* #975 support (1 of 2).

* #975 support (2 of 2).

* Rebase fix & nits.

* Some fixes and nits (still one bug left).

* One fix & nits.

* Tests fix (by gdk) & nits.

* Support translations not only in high quality and rejit.

Nits.

* Added possibility to skip translations and continue execution, using `ESC` key.

* Update SettingsWindow.cs

* Update GLRenderer.cs

* Update Ptc.cs

* Disabled Profiled PTC by default as requested in the past by gdk.

* Fix rejit bug. Increased number of parallel translations. Add stack unwinding stuffs support (1 of 2).

Nits.

* Add stack unwinding stuffs support (2 of 2). Tuned number of parallel translations.

* Restored the ability to assemble jumps with 8-bit offset when Profiled PTC is disabled or during profiling.

Modifications due to rebase.
Nits.

* Limited profiling of the functions to be translated to the addresses belonging to the range of static objects only.

* Nits.

* Nits.

* Update Delegates.cs

* Nit.

* Update InstEmitSimdArithmetic.cs

* Address riperiperi comments.

* Fixed the issue of unjustifiably longer boot times at the second boot than at the first boot, measured at the same time or reference point and with the same number of translated functions.

* Implemented a simple redundant load/save mechanism.

Halved the value of Decoder.MaxInstsPerFunction more appropriate for the current performance of the Translator.
Replaced by Logger.PrintError to Logger.PrintDebug in TexturePool.cs about the supposed invalid texture format to avoid the spawn of the log.
Nits.

* Nit.

Improved Logger.PrintError in TexturePool.cs to avoid log spawn.
Added missing code for FZ handling (in output) for fp max/min instructions (slow paths).

* Add configuration migration for PTC

Co-authored-by: Thog <me@thog.eu>
2020-06-16 20:28:02 +02:00

1207 lines
45 KiB
C#

using ARMeilleure.Decoders;
using ARMeilleure.IntermediateRepresentation;
using ARMeilleure.Translation;
using System;
using System.Diagnostics;
using static ARMeilleure.Instructions.InstEmitFlowHelper;
using static ARMeilleure.Instructions.InstEmitHelper;
using static ARMeilleure.Instructions.InstEmitSimdHelper;
using static ARMeilleure.Instructions.InstEmitSimdHelper32;
using static ARMeilleure.IntermediateRepresentation.OperandHelper;
namespace ARMeilleure.Instructions
{
static partial class InstEmit32
{
public static void Vabs_S(ArmEmitterContext context)
{
OpCode32SimdS op = (OpCode32SimdS)context.CurrOp;
if (Optimizations.FastFP && Optimizations.UseSse2)
{
EmitScalarUnaryOpSimd32(context, (m) =>
{
return EmitFloatAbs(context, m, (op.Size & 1) == 0, false);
});
}
else
{
EmitScalarUnaryOpF32(context, (op1) => EmitUnaryMathCall(context, nameof(Math.Abs), op1));
}
}
public static void Vabs_V(ArmEmitterContext context)
{
OpCode32Simd op = (OpCode32Simd)context.CurrOp;
if (op.F)
{
if (Optimizations.FastFP && Optimizations.UseSse2)
{
EmitVectorUnaryOpSimd32(context, (m) =>
{
return EmitFloatAbs(context, m, (op.Size & 1) == 0, true);
});
}
else
{
EmitVectorUnaryOpF32(context, (op1) => EmitUnaryMathCall(context, nameof(Math.Abs), op1));
}
}
else
{
EmitVectorUnaryOpSx32(context, (op1) => EmitAbs(context, op1));
}
}
private static Operand EmitAbs(ArmEmitterContext context, Operand value)
{
Operand isPositive = context.ICompareGreaterOrEqual(value, Const(value.Type, 0));
return context.ConditionalSelect(isPositive, value, context.Negate(value));
}
public static void Vadd_S(ArmEmitterContext context)
{
if (Optimizations.FastFP && Optimizations.UseSse2)
{
EmitScalarBinaryOpF32(context, Intrinsic.X86Addss, Intrinsic.X86Addsd);
}
else if (Optimizations.FastFP)
{
EmitScalarBinaryOpF32(context, (op1, op2) => context.Add(op1, op2));
}
else
{
EmitScalarBinaryOpF32(context, (op1, op2) => EmitSoftFloatCall(context, nameof(SoftFloat32.FPAdd), op1, op2));
}
}
public static void Vadd_V(ArmEmitterContext context)
{
if (Optimizations.FastFP && Optimizations.UseSse2)
{
EmitVectorBinaryOpF32(context, Intrinsic.X86Addps, Intrinsic.X86Addpd);
}
else if (Optimizations.FastFP)
{
EmitVectorBinaryOpF32(context, (op1, op2) => context.Add(op1, op2));
}
else
{
EmitVectorBinaryOpF32(context, (op1, op2) => EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPAddFpscr), op1, op2));
}
}
public static void Vadd_I(ArmEmitterContext context)
{
if (Optimizations.UseSse2)
{
OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
EmitVectorBinaryOpSimd32(context, (op1, op2) => context.AddIntrinsic(X86PaddInstruction[op.Size], op1, op2));
}
else
{
EmitVectorBinaryOpZx32(context, (op1, op2) => context.Add(op1, op2));
}
}
public static void Vdup(ArmEmitterContext context)
{
OpCode32SimdDupGP op = (OpCode32SimdDupGP)context.CurrOp;
Operand insert = GetIntA32(context, op.Rt);
// Zero extend into an I64, then replicate. Saves the most time over elementwise inserts.
insert = op.Size switch
{
2 => context.Multiply(context.ZeroExtend32(OperandType.I64, insert), Const(0x0000000100000001u)),
1 => context.Multiply(context.ZeroExtend16(OperandType.I64, insert), Const(0x0001000100010001u)),
0 => context.Multiply(context.ZeroExtend8(OperandType.I64, insert), Const(0x0101010101010101u)),
_ => throw new InvalidOperationException($"Invalid Vdup size \"{op.Size}\".")
};
InsertScalar(context, op.Vd, insert);
if (op.Q)
{
InsertScalar(context, op.Vd + 1, insert);
}
}
public static void Vdup_1(ArmEmitterContext context)
{
OpCode32SimdDupElem op = (OpCode32SimdDupElem)context.CurrOp;
Operand insert = EmitVectorExtractZx32(context, op.Vm >> 1, ((op.Vm & 1) << (3 - op.Size)) + op.Index, op.Size);
// Zero extend into an I64, then replicate. Saves the most time over elementwise inserts.
insert = op.Size switch
{
2 => context.Multiply(context.ZeroExtend32(OperandType.I64, insert), Const(0x0000000100000001u)),
1 => context.Multiply(context.ZeroExtend16(OperandType.I64, insert), Const(0x0001000100010001u)),
0 => context.Multiply(context.ZeroExtend8(OperandType.I64, insert), Const(0x0101010101010101u)),
_ => throw new InvalidOperationException($"Invalid Vdup size \"{op.Size}\".")
};
InsertScalar(context, op.Vd, insert);
if (op.Q)
{
InsertScalar(context, op.Vd | 1, insert);
}
}
private static (long, long) MaskHelperByteSequence(int start, int length, int startByte)
{
int end = start + length;
int b = startByte;
long result = 0;
long result2 = 0;
for (int i = 0; i < 8; i++)
{
result |= (long)((i >= end || i < start) ? 0x80 : b++) << (i * 8);
}
for (int i = 8; i < 16; i++)
{
result2 |= (long)((i >= end || i < start) ? 0x80 : b++) << ((i - 8) * 8);
}
return (result2, result);
}
public static void Vext(ArmEmitterContext context)
{
OpCode32SimdExt op = (OpCode32SimdExt)context.CurrOp;
int elems = op.GetBytesCount();
int byteOff = op.Immediate;
if (Optimizations.UseSsse3)
{
EmitVectorBinaryOpSimd32(context, (n, m) =>
{
// Writing low to high of d: start <imm> into n, overlap into m.
// Then rotate n down by <imm>, m up by (elems)-imm.
// Then OR them together for the result.
(long nMaskHigh, long nMaskLow) = MaskHelperByteSequence(0, elems - byteOff, byteOff);
(long mMaskHigh, long mMaskLow) = MaskHelperByteSequence(elems - byteOff, byteOff, 0);
Operand nMask, mMask;
if (!op.Q)
{
// Do the same operation to the bytes in the top doubleword too, as our target could be in either.
nMaskHigh = nMaskLow + 0x0808080808080808L;
mMaskHigh = mMaskLow + 0x0808080808080808L;
}
nMask = X86GetElements(context, nMaskHigh, nMaskLow);
mMask = X86GetElements(context, mMaskHigh, mMaskLow);
Operand nPart = context.AddIntrinsic(Intrinsic.X86Pshufb, n, nMask);
Operand mPart = context.AddIntrinsic(Intrinsic.X86Pshufb, m, mMask);
return context.AddIntrinsic(Intrinsic.X86Por, nPart, mPart);
});
}
else
{
Operand res = GetVecA32(op.Qd);
for (int index = 0; index < elems; index++)
{
Operand extract;
if (byteOff >= elems)
{
extract = EmitVectorExtractZx32(context, op.Qm, op.Im + (byteOff - elems), op.Size);
}
else
{
extract = EmitVectorExtractZx32(context, op.Qn, op.In + byteOff, op.Size);
}
byteOff++;
res = EmitVectorInsert(context, res, extract, op.Id + index, op.Size);
}
context.Copy(GetVecA32(op.Qd), res);
}
}
public static void Vmov_S(ArmEmitterContext context)
{
if (Optimizations.FastFP && Optimizations.UseSse2)
{
EmitScalarUnaryOpF32(context, 0, 0);
}
else
{
EmitScalarUnaryOpF32(context, (op1) => op1);
}
}
public static void Vmovn(ArmEmitterContext context)
{
EmitVectorUnaryNarrowOp32(context, (op1) => op1);
}
public static void Vneg_S(ArmEmitterContext context)
{
OpCode32SimdS op = (OpCode32SimdS)context.CurrOp;
if (Optimizations.UseSse2)
{
EmitScalarUnaryOpSimd32(context, (m) =>
{
if ((op.Size & 1) == 0)
{
Operand mask = X86GetScalar(context, -0f);
return context.AddIntrinsic(Intrinsic.X86Xorps, mask, m);
}
else
{
Operand mask = X86GetScalar(context, -0d);
return context.AddIntrinsic(Intrinsic.X86Xorpd, mask, m);
}
});
}
else
{
EmitScalarUnaryOpF32(context, (op1) => context.Negate(op1));
}
}
public static void Vnmul_S(ArmEmitterContext context)
{
OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp;
if (Optimizations.UseSse2)
{
EmitScalarBinaryOpSimd32(context, (n, m) =>
{
if ((op.Size & 1) == 0)
{
Operand res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
Operand mask = X86GetScalar(context, -0f);
return context.AddIntrinsic(Intrinsic.X86Xorps, mask, res);
}
else
{
Operand res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
Operand mask = X86GetScalar(context, -0d);
return context.AddIntrinsic(Intrinsic.X86Xorpd, mask, res);
}
});
}
else
{
EmitScalarBinaryOpF32(context, (op1, op2) => context.Negate(context.Multiply(op1, op2)));
}
}
public static void Vnmla_S(ArmEmitterContext context)
{
OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp;
if (Optimizations.FastFP && Optimizations.UseSse2)
{
EmitScalarTernaryOpSimd32(context, (d, n, m) =>
{
if ((op.Size & 1) == 0)
{
Operand res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
res = context.AddIntrinsic(Intrinsic.X86Addss, d, res);
Operand mask = X86GetScalar(context, -0f);
return context.AddIntrinsic(Intrinsic.X86Xorps, mask, res);
}
else
{
Operand res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
res = context.AddIntrinsic(Intrinsic.X86Addsd, d, res);
Operand mask = X86GetScalar(context, -0d);
return context.AddIntrinsic(Intrinsic.X86Xorpd, mask, res);
}
});
}
else if (Optimizations.FastFP)
{
EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
{
return context.Negate(context.Add(op1, context.Multiply(op2, op3)));
});
}
else
{
EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
{
return EmitSoftFloatCall(context, nameof(SoftFloat32.FPNegMulAdd), op1, op2, op3);
});
}
}
public static void Vnmls_S(ArmEmitterContext context)
{
OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp;
if (Optimizations.FastFP && Optimizations.UseSse2)
{
EmitScalarTernaryOpSimd32(context, (d, n, m) =>
{
if ((op.Size & 1) == 0)
{
Operand res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
Operand mask = X86GetScalar(context, -0f);
d = context.AddIntrinsic(Intrinsic.X86Xorps, mask, d);
return context.AddIntrinsic(Intrinsic.X86Addss, d, res);
}
else
{
Operand res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
Operand mask = X86GetScalar(context, -0d);
d = context.AddIntrinsic(Intrinsic.X86Xorpd, mask, res);
return context.AddIntrinsic(Intrinsic.X86Addsd, d, res);
}
});
}
else if (Optimizations.FastFP)
{
EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
{
return context.Add(context.Negate(op1), context.Multiply(op2, op3));
});
}
else
{
EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
{
return EmitSoftFloatCall(context, nameof(SoftFloat32.FPNegMulSub), op1, op2, op3);
});
}
}
public static void Vneg_V(ArmEmitterContext context)
{
OpCode32Simd op = (OpCode32Simd)context.CurrOp;
if (op.F)
{
if (Optimizations.UseSse2)
{
EmitVectorUnaryOpSimd32(context, (m) =>
{
if ((op.Size & 1) == 0)
{
Operand mask = X86GetScalar(context, -0f);
return context.AddIntrinsic(Intrinsic.X86Xorps, mask, m);
}
else
{
Operand mask = X86GetScalar(context, -0d);
return context.AddIntrinsic(Intrinsic.X86Xorpd, mask, m);
}
});
}
else
{
EmitVectorUnaryOpF32(context, (op1) => context.Negate(op1));
}
}
else
{
EmitVectorUnaryOpSx32(context, (op1) => context.Negate(op1));
}
}
public static void Vdiv_S(ArmEmitterContext context)
{
if (Optimizations.FastFP && Optimizations.UseSse2)
{
EmitScalarBinaryOpF32(context, Intrinsic.X86Divss, Intrinsic.X86Divsd);
}
else if (Optimizations.FastFP)
{
EmitScalarBinaryOpF32(context, (op1, op2) => context.Divide(op1, op2));
}
else
{
EmitScalarBinaryOpF32(context, (op1, op2) =>
{
return EmitSoftFloatCall(context, nameof(SoftFloat32.FPDiv), op1, op2);
});
}
}
public static void Vmaxnm_S(ArmEmitterContext context)
{
if (Optimizations.FastFP && Optimizations.UseSse41)
{
EmitSse41MaxMinNumOpF32(context, true, true);
}
else
{
EmitScalarBinaryOpF32(context, (op1, op2) => EmitSoftFloatCall(context, nameof(SoftFloat32.FPMaxNum), op1, op2));
}
}
public static void Vmaxnm_V(ArmEmitterContext context)
{
if (Optimizations.FastFP && Optimizations.UseSse41)
{
EmitSse41MaxMinNumOpF32(context, true, false);
}
else
{
EmitVectorBinaryOpSx32(context, (op1, op2) => EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPMaxNumFpscr), op1, op2));
}
}
public static void Vminnm_S(ArmEmitterContext context)
{
if (Optimizations.FastFP && Optimizations.UseSse41)
{
EmitSse41MaxMinNumOpF32(context, false, true);
}
else
{
EmitScalarBinaryOpF32(context, (op1, op2) => EmitSoftFloatCall(context, nameof(SoftFloat32.FPMinNum), op1, op2));
}
}
public static void Vminnm_V(ArmEmitterContext context)
{
if (Optimizations.FastFP && Optimizations.UseSse41)
{
EmitSse41MaxMinNumOpF32(context, false, false);
}
else
{
EmitVectorBinaryOpSx32(context, (op1, op2) => EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPMinNumFpscr), op1, op2));
}
}
public static void Vmax_V(ArmEmitterContext context)
{
if (Optimizations.FastFP && Optimizations.UseSse2)
{
EmitVectorBinaryOpF32(context, Intrinsic.X86Maxps, Intrinsic.X86Maxpd);
}
else
{
EmitVectorBinaryOpF32(context, (op1, op2) =>
{
return EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPMaxFpscr), op1, op2);
});
}
}
public static void Vmax_I(ArmEmitterContext context)
{
OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
if (op.U)
{
if (Optimizations.UseSse2)
{
EmitVectorBinaryOpSimd32(context, (op1, op2) => context.AddIntrinsic(X86PmaxuInstruction[op.Size], op1, op2));
}
else
{
EmitVectorBinaryOpZx32(context, (op1, op2) => context.ConditionalSelect(context.ICompareGreaterUI(op1, op2), op1, op2));
}
}
else
{
if (Optimizations.UseSse2)
{
EmitVectorBinaryOpSimd32(context, (op1, op2) => context.AddIntrinsic(X86PmaxsInstruction[op.Size], op1, op2));
}
else
{
EmitVectorBinaryOpSx32(context, (op1, op2) => context.ConditionalSelect(context.ICompareGreater(op1, op2), op1, op2));
}
}
}
public static void Vmin_V(ArmEmitterContext context)
{
if (Optimizations.FastFP && Optimizations.UseSse2)
{
EmitVectorBinaryOpF32(context, Intrinsic.X86Minps, Intrinsic.X86Minpd);
}
else
{
EmitVectorBinaryOpF32(context, (op1, op2) =>
{
return EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPMinFpscr), op1, op2);
});
}
}
public static void Vmin_I(ArmEmitterContext context)
{
OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
if (op.U)
{
if (Optimizations.UseSse2)
{
EmitVectorBinaryOpSimd32(context, (op1, op2) => context.AddIntrinsic(X86PminuInstruction[op.Size], op1, op2));
}
else
{
EmitVectorBinaryOpZx32(context, (op1, op2) => context.ConditionalSelect(context.ICompareLessUI(op1, op2), op1, op2));
}
}
else
{
if (Optimizations.UseSse2)
{
EmitVectorBinaryOpSimd32(context, (op1, op2) => context.AddIntrinsic(X86PminsInstruction[op.Size], op1, op2));
}
else
{
EmitVectorBinaryOpSx32(context, (op1, op2) => context.ConditionalSelect(context.ICompareLess(op1, op2), op1, op2));
}
}
}
public static void Vmla_S(ArmEmitterContext context)
{
if (Optimizations.FastFP && Optimizations.UseSse2)
{
EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Addss, Intrinsic.X86Addsd);
}
else if (Optimizations.FastFP)
{
EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
{
return context.Add(op1, context.Multiply(op2, op3));
});
}
else
{
EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
{
return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulAdd), op1, op2, op3);
});
}
}
public static void Vmla_V(ArmEmitterContext context)
{
if (Optimizations.FastFP && Optimizations.UseSse2)
{
EmitVectorTernaryOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd, Intrinsic.X86Addps, Intrinsic.X86Addpd);
}
else if (Optimizations.FastFP)
{
EmitVectorTernaryOpF32(context, (op1, op2, op3) => context.Add(op1, context.Multiply(op2, op3)));
}
else
{
EmitVectorTernaryOpF32(context, (op1, op2, op3) =>
{
return EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPMulAddFpscr), op1, op2, op3);
});
}
}
public static void Vmla_I(ArmEmitterContext context)
{
EmitVectorTernaryOpZx32(context, (op1, op2, op3) => context.Add(op1, context.Multiply(op2, op3)));
}
public static void Vmla_1(ArmEmitterContext context)
{
OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;
if (op.F)
{
if (Optimizations.FastFP && Optimizations.UseSse2)
{
EmitVectorsByScalarOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd, Intrinsic.X86Addps, Intrinsic.X86Addpd);
}
else if (Optimizations.FastFP)
{
EmitVectorsByScalarOpF32(context, (op1, op2, op3) => context.Add(op1, context.Multiply(op2, op3)));
}
else
{
EmitVectorsByScalarOpF32(context, (op1, op2, op3) => EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPMulAddFpscr), op1, op2, op3));
}
}
else
{
EmitVectorsByScalarOpI32(context, (op1, op2, op3) => context.Add(op1, context.Multiply(op2, op3)), false);
}
}
public static void Vmls_S(ArmEmitterContext context)
{
if (Optimizations.FastFP && Optimizations.UseSse2)
{
EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Subss, Intrinsic.X86Subsd);
}
else if (Optimizations.FastFP)
{
EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
{
return context.Subtract(op1, context.Multiply(op2, op3));
});
}
else
{
EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
{
return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulSub), op1, op2, op3);
});
}
}
public static void Vmls_V(ArmEmitterContext context)
{
if (Optimizations.FastFP && Optimizations.UseSse2)
{
EmitVectorTernaryOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd, Intrinsic.X86Subps, Intrinsic.X86Subpd);
}
else if (Optimizations.FastFP)
{
EmitVectorTernaryOpF32(context, (op1, op2, op3) => context.Subtract(op1, context.Multiply(op2, op3)));
}
else
{
EmitVectorTernaryOpF32(context, (op1, op2, op3) =>
{
return EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPMulSubFpscr), op1, op2, op3);
});
}
}
public static void Vmls_I(ArmEmitterContext context)
{
EmitVectorTernaryOpZx32(context, (op1, op2, op3) => context.Subtract(op1, context.Multiply(op2, op3)));
}
public static void Vmls_1(ArmEmitterContext context)
{
OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;
if (op.F)
{
if (Optimizations.FastFP && Optimizations.UseSse2)
{
EmitVectorsByScalarOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd, Intrinsic.X86Subps, Intrinsic.X86Subpd);
}
else if (Optimizations.FastFP)
{
EmitVectorsByScalarOpF32(context, (op1, op2, op3) => context.Subtract(op1, context.Multiply(op2, op3)));
}
else
{
EmitVectorsByScalarOpF32(context, (op1, op2, op3) => EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPMulSubFpscr), op1, op2, op3));
}
}
else
{
EmitVectorsByScalarOpI32(context, (op1, op2, op3) => context.Subtract(op1, context.Multiply(op2, op3)), false);
}
}
public static void Vmlsl_I(ArmEmitterContext context)
{
OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
EmitVectorTernaryLongOpI32(context, (opD, op1, op2) => context.Subtract(opD, context.Multiply(op1, op2)), !op.U);
}
public static void Vmul_S(ArmEmitterContext context)
{
if (Optimizations.FastFP && Optimizations.UseSse2)
{
EmitScalarBinaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd);
}
else if (Optimizations.FastFP)
{
EmitScalarBinaryOpF32(context, (op1, op2) => context.Multiply(op1, op2));
}
else
{
EmitScalarBinaryOpF32(context, (op1, op2) =>
{
return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMul), op1, op2);
});
}
}
public static void Vmul_V(ArmEmitterContext context)
{
if (Optimizations.FastFP && Optimizations.UseSse2)
{
EmitVectorBinaryOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd);
}
else if (Optimizations.FastFP)
{
EmitVectorBinaryOpF32(context, (op1, op2) => context.Multiply(op1, op2));
}
else
{
EmitVectorBinaryOpF32(context, (op1, op2) =>
{
return EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPMulFpscr), op1, op2);
});
}
}
public static void Vmul_I(ArmEmitterContext context)
{
OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
if (op.U) // This instruction is always signed, U indicates polynomial mode.
{
EmitVectorBinaryOpZx32(context, (op1, op2) => EmitPolynomialMultiply(context, op1, op2, 8 << op.Size));
}
else
{
EmitVectorBinaryOpSx32(context, (op1, op2) => context.Multiply(op1, op2));
}
}
public static void Vmul_1(ArmEmitterContext context)
{
OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;
if (op.F)
{
if (Optimizations.FastFP && Optimizations.UseSse2)
{
EmitVectorByScalarOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd);
}
else if (Optimizations.FastFP)
{
EmitVectorByScalarOpF32(context, (op1, op2) => context.Multiply(op1, op2));
}
else
{
EmitVectorByScalarOpF32(context, (op1, op2) => EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPMulFpscr), op1, op2));
}
}
else
{
EmitVectorByScalarOpI32(context, (op1, op2) => context.Multiply(op1, op2), false);
}
}
public static void Vmull_1(ArmEmitterContext context)
{
OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;
EmitVectorByScalarLongOpI32(context, (op1, op2) => context.Multiply(op1, op2), !op.U);
}
public static void Vmull_I(ArmEmitterContext context)
{
OpCode32SimdRegLong op = (OpCode32SimdRegLong)context.CurrOp;
if (op.Polynomial)
{
EmitVectorBinaryLongOpI32(context, (op1, op2) => EmitPolynomialMultiply(context, op1, op2, 8 << op.Size), false);
}
else
{
EmitVectorBinaryLongOpI32(context, (op1, op2) => context.Multiply(op1, op2), !op.U);
}
}
public static void Vpadd_V(ArmEmitterContext context)
{
if (Optimizations.FastFP && Optimizations.UseSse2)
{
EmitSse2VectorPairwiseOpF32(context, Intrinsic.X86Addps);
}
else
{
EmitVectorPairwiseOpF32(context, (op1, op2) => context.Add(op1, op2));
}
}
public static void Vpadd_I(ArmEmitterContext context)
{
OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
if (Optimizations.UseSsse3)
{
EmitSsse3VectorPairwiseOp32(context, X86PaddInstruction);
}
else
{
EmitVectorPairwiseOpI32(context, (op1, op2) => context.Add(op1, op2), !op.U);
}
}
public static void Vrev(ArmEmitterContext context)
{
OpCode32SimdRev op = (OpCode32SimdRev)context.CurrOp;
if (Optimizations.UseSsse3)
{
EmitVectorUnaryOpSimd32(context, (op1) =>
{
Operand mask;
switch (op.Size)
{
case 3:
// Rev64
switch (op.Opc)
{
case 0:
mask = X86GetElements(context, 0x08090a0b0c0d0e0fL, 0x0001020304050607L);
return context.AddIntrinsic(Intrinsic.X86Pshufb, op1, mask);
case 1:
mask = X86GetElements(context, 0x09080b0a0d0c0f0eL, 0x0100030205040706L);
return context.AddIntrinsic(Intrinsic.X86Pshufb, op1, mask);
case 2:
return context.AddIntrinsic(Intrinsic.X86Shufps, op1, op1, Const(1 | (0 << 2) | (3 << 4) | (2 << 6)));
}
break;
case 2:
// Rev32
switch (op.Opc)
{
case 0:
mask = X86GetElements(context, 0x0c0d0e0f_08090a0bL, 0x04050607_00010203L);
return context.AddIntrinsic(Intrinsic.X86Pshufb, op1, mask);
case 1:
mask = X86GetElements(context, 0x0d0c0f0e_09080b0aL, 0x05040706_01000302L);
return context.AddIntrinsic(Intrinsic.X86Pshufb, op1, mask);
}
break;
case 1:
// Rev16
mask = X86GetElements(context, 0x0e0f_0c0d_0a0b_0809L, 0x_0607_0405_0203_0001L);
return context.AddIntrinsic(Intrinsic.X86Pshufb, op1, mask);
}
throw new InvalidOperationException("Invalid VREV Opcode + Size combo."); // Should be unreachable.
});
}
else
{
EmitVectorUnaryOpZx32(context, (op1) =>
{
switch (op.Opc)
{
case 0:
switch (op.Size) // Swap bytes.
{
case 1:
return InstEmitAluHelper.EmitReverseBytes16_32Op(context, op1);
case 2:
case 3:
return context.ByteSwap(op1);
}
break;
case 1:
switch (op.Size)
{
case 2:
return context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(op1, Const(0xffff0000)), Const(16)),
context.ShiftLeft(context.BitwiseAnd(op1, Const(0x0000ffff)), Const(16)));
case 3:
return context.BitwiseOr(
context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(op1, Const(0xffff000000000000ul)), Const(48)),
context.ShiftLeft(context.BitwiseAnd(op1, Const(0x000000000000fffful)), Const(48))),
context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(op1, Const(0x0000ffff00000000ul)), Const(16)),
context.ShiftLeft(context.BitwiseAnd(op1, Const(0x00000000ffff0000ul)), Const(16))));
}
break;
case 2:
// Swap upper and lower halves.
return context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(op1, Const(0xffffffff00000000ul)), Const(32)),
context.ShiftLeft(context.BitwiseAnd(op1, Const(0x00000000fffffffful)), Const(32)));
}
throw new InvalidOperationException("Invalid VREV Opcode + Size combo."); // Should be unreachable.
});
}
}
public static void Vrecpe(ArmEmitterContext context)
{
OpCode32SimdSqrte op = (OpCode32SimdSqrte)context.CurrOp;
if (op.F)
{
int sizeF = op.Size & 1;
if (Optimizations.FastFP && Optimizations.UseSse2 && sizeF == 0)
{
EmitVectorUnaryOpF32(context, Intrinsic.X86Rcpps, 0);
}
else
{
EmitVectorUnaryOpF32(context, (op1) =>
{
return EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPRecipEstimateFpscr), op1);
});
}
}
else
{
throw new NotImplementedException("Integer Vrecpe not currently implemented.");
}
}
public static void Vrecps(ArmEmitterContext context)
{
if (Optimizations.FastFP && Optimizations.UseSse2)
{
OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
bool single = (op.Size & 1) == 0;
// (2 - (n*m))
EmitVectorBinaryOpSimd32(context, (n, m) =>
{
if (single)
{
Operand maskTwo = X86GetAllElements(context, 2f);
Operand res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m);
return context.AddIntrinsic(Intrinsic.X86Subps, maskTwo, res);
}
else
{
Operand maskTwo = X86GetAllElements(context, 2d);
Operand res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m);
return context.AddIntrinsic(Intrinsic.X86Subpd, maskTwo, res);
}
});
}
else
{
EmitVectorBinaryOpF32(context, (op1, op2) =>
{
return EmitSoftFloatCall(context, nameof(SoftFloat32.FPRecipStep), op1, op2);
});
}
}
public static void Vrsqrte(ArmEmitterContext context)
{
OpCode32SimdSqrte op = (OpCode32SimdSqrte)context.CurrOp;
if (op.F)
{
int sizeF = op.Size & 1;
if (Optimizations.FastFP && Optimizations.UseSse2 && sizeF == 0)
{
EmitVectorUnaryOpF32(context, Intrinsic.X86Rsqrtps, 0);
}
else
{
EmitVectorUnaryOpF32(context, (op1) =>
{
return EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPRSqrtEstimateFpscr), op1);
});
}
}
else
{
throw new NotImplementedException("Integer Vrsqrte not currently implemented.");
}
}
public static void Vrsqrts(ArmEmitterContext context)
{
if (Optimizations.FastFP && Optimizations.UseSse2)
{
OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
bool single = (op.Size & 1) == 0;
// (3 - (n*m)) / 2
EmitVectorBinaryOpSimd32(context, (n, m) =>
{
if (single)
{
Operand maskHalf = X86GetAllElements(context, 0.5f);
Operand maskThree = X86GetAllElements(context, 3f);
Operand res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m);
res = context.AddIntrinsic(Intrinsic.X86Subps, maskThree, res);
return context.AddIntrinsic(Intrinsic.X86Mulps, maskHalf, res);
}
else
{
Operand maskHalf = X86GetAllElements(context, 0.5d);
Operand maskThree = X86GetAllElements(context, 3d);
Operand res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m);
res = context.AddIntrinsic(Intrinsic.X86Subpd, maskThree, res);
return context.AddIntrinsic(Intrinsic.X86Mulpd, maskHalf, res);
}
});
}
else
{
EmitVectorBinaryOpF32(context, (op1, op2) =>
{
return EmitSoftFloatCall(context, nameof(SoftFloat32.FPRSqrtStep), op1, op2);
});
}
}
public static void Vsel(ArmEmitterContext context)
{
OpCode32SimdSel op = (OpCode32SimdSel)context.CurrOp;
Operand condition = null;
switch (op.Cc)
{
case OpCode32SimdSelMode.Eq:
condition = GetCondTrue(context, Condition.Eq);
break;
case OpCode32SimdSelMode.Ge:
condition = GetCondTrue(context, Condition.Ge);
break;
case OpCode32SimdSelMode.Gt:
condition = GetCondTrue(context, Condition.Gt);
break;
case OpCode32SimdSelMode.Vs:
condition = GetCondTrue(context, Condition.Vs);
break;
}
EmitScalarBinaryOpI32(context, (op1, op2) =>
{
return context.ConditionalSelect(condition, op1, op2);
});
}
public static void Vsqrt_S(ArmEmitterContext context)
{
if (Optimizations.FastFP && Optimizations.UseSse2)
{
EmitScalarUnaryOpF32(context, Intrinsic.X86Sqrtss, Intrinsic.X86Sqrtsd);
}
else
{
EmitScalarUnaryOpF32(context, (op1) =>
{
return EmitSoftFloatCall(context, nameof(SoftFloat32.FPSqrt), op1);
});
}
}
public static void Vsub_S(ArmEmitterContext context)
{
if (Optimizations.FastFP && Optimizations.UseSse2)
{
EmitScalarBinaryOpF32(context, Intrinsic.X86Subss, Intrinsic.X86Subsd);
}
else
{
EmitScalarBinaryOpF32(context, (op1, op2) => context.Subtract(op1, op2));
}
}
public static void Vsub_V(ArmEmitterContext context)
{
if (Optimizations.FastFP && Optimizations.UseSse2)
{
EmitVectorBinaryOpF32(context, Intrinsic.X86Subps, Intrinsic.X86Subpd);
}
else
{
EmitVectorBinaryOpF32(context, (op1, op2) => context.Subtract(op1, op2));
}
}
public static void Vsub_I(ArmEmitterContext context)
{
if (Optimizations.UseSse2)
{
OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
EmitVectorBinaryOpSimd32(context, (op1, op2) => context.AddIntrinsic(X86PsubInstruction[op.Size], op1, op2));
}
else
{
EmitVectorBinaryOpZx32(context, (op1, op2) => context.Subtract(op1, op2));
}
}
private static void EmitSse41MaxMinNumOpF32(ArmEmitterContext context, bool isMaxNum, bool scalar)
{
IOpCode32Simd op = (IOpCode32Simd)context.CurrOp;
Func<Operand, Operand, Operand> genericEmit = (n, m) =>
{
Operand nNum = context.Copy(n);
Operand mNum = context.Copy(m);
Operand nQNaNMask = InstEmit.EmitSse2VectorIsQNaNOpF(context, nNum);
Operand mQNaNMask = InstEmit.EmitSse2VectorIsQNaNOpF(context, mNum);
int sizeF = op.Size & 1;
if (sizeF == 0)
{
Operand negInfMask = X86GetAllElements(context, isMaxNum ? float.NegativeInfinity : float.PositiveInfinity);
Operand nMask = context.AddIntrinsic(Intrinsic.X86Andnps, mQNaNMask, nQNaNMask);
Operand mMask = context.AddIntrinsic(Intrinsic.X86Andnps, nQNaNMask, mQNaNMask);
nNum = context.AddIntrinsic(Intrinsic.X86Blendvps, nNum, negInfMask, nMask);
mNum = context.AddIntrinsic(Intrinsic.X86Blendvps, mNum, negInfMask, mMask);
return context.AddIntrinsic(isMaxNum ? Intrinsic.X86Maxps : Intrinsic.X86Minps, nNum, mNum);
}
else /* if (sizeF == 1) */
{
Operand negInfMask = X86GetAllElements(context, isMaxNum ? double.NegativeInfinity : double.PositiveInfinity);
Operand nMask = context.AddIntrinsic(Intrinsic.X86Andnpd, mQNaNMask, nQNaNMask);
Operand mMask = context.AddIntrinsic(Intrinsic.X86Andnpd, nQNaNMask, mQNaNMask);
nNum = context.AddIntrinsic(Intrinsic.X86Blendvpd, nNum, negInfMask, nMask);
mNum = context.AddIntrinsic(Intrinsic.X86Blendvpd, mNum, negInfMask, mMask);
return context.AddIntrinsic(isMaxNum ? Intrinsic.X86Maxpd : Intrinsic.X86Minpd, nNum, mNum);
}
};
if (scalar)
{
EmitScalarBinaryOpSimd32(context, genericEmit);
}
else
{
EmitVectorBinaryOpSimd32(context, genericEmit);
}
}
private static Operand EmitPolynomialMultiply(ArmEmitterContext context, Operand op1, Operand op2, int eSize)
{
Debug.Assert(eSize <= 32);
Operand result = eSize == 32 ? Const(0L) : Const(0);
if (eSize == 32)
{
op1 = context.ZeroExtend32(OperandType.I64, op1);
op2 = context.ZeroExtend32(OperandType.I64, op2);
}
for (int i = 0; i < eSize; i++)
{
Operand mask = context.BitwiseAnd(op1, Const(op1.Type, 1L << i));
result = context.BitwiseExclusiveOr(result, context.Multiply(op2, mask));
}
return result;
}
}
}