0b52ee6627
* Optimize BIC, BSL, BIT, BIF, XTN, ZIP, DUP (Gp), FMADD (Scalar) and FCVT (Scalar) using SSE intrinsics, some CQ improvements * Remove useless space * Address PR feedback * Revert EmitVectorZero32_128 changes
551 lines
17 KiB
C#
551 lines
17 KiB
C#
using ChocolArm64.Decoder;
|
|
using ChocolArm64.State;
|
|
using ChocolArm64.Translation;
|
|
using System;
|
|
using System.Reflection.Emit;
|
|
using System.Runtime.Intrinsics.X86;
|
|
|
|
using static ChocolArm64.Instruction.AInstEmitSimdHelper;
|
|
|
|
namespace ChocolArm64.Instruction
|
|
{
|
|
static partial class AInstEmit
|
|
{
|
|
public static void Dup_Gp(AILEmitterCtx Context)
|
|
{
|
|
AOpCodeSimdIns Op = (AOpCodeSimdIns)Context.CurrOp;
|
|
|
|
if (AOptimizations.UseSse2)
|
|
{
|
|
Context.EmitLdintzr(Op.Rn);
|
|
|
|
switch (Op.Size)
|
|
{
|
|
case 0: Context.Emit(OpCodes.Conv_U1); break;
|
|
case 1: Context.Emit(OpCodes.Conv_U2); break;
|
|
case 2: Context.Emit(OpCodes.Conv_U4); break;
|
|
}
|
|
|
|
Type[] Types = new Type[] { UIntTypesPerSizeLog2[Op.Size] };
|
|
|
|
Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), Types));
|
|
|
|
EmitStvecWithUnsignedCast(Context, Op.Rd, Op.Size);
|
|
|
|
if (Op.RegisterSize == ARegisterSize.SIMD64)
|
|
{
|
|
EmitVectorZeroUpper(Context, Op.Rd);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
int Bytes = Op.GetBitsCount() >> 3;
|
|
int Elems = Bytes >> Op.Size;
|
|
|
|
for (int Index = 0; Index < Elems; Index++)
|
|
{
|
|
Context.EmitLdintzr(Op.Rn);
|
|
|
|
EmitVectorInsert(Context, Op.Rd, Index, Op.Size);
|
|
}
|
|
|
|
if (Op.RegisterSize == ARegisterSize.SIMD64)
|
|
{
|
|
EmitVectorZeroUpper(Context, Op.Rd);
|
|
}
|
|
}
|
|
}
|
|
|
|
public static void Dup_S(AILEmitterCtx Context)
|
|
{
|
|
AOpCodeSimdIns Op = (AOpCodeSimdIns)Context.CurrOp;
|
|
|
|
EmitVectorExtractZx(Context, Op.Rn, Op.DstIndex, Op.Size);
|
|
|
|
EmitScalarSet(Context, Op.Rd, Op.Size);
|
|
}
|
|
|
|
public static void Dup_V(AILEmitterCtx Context)
|
|
{
|
|
AOpCodeSimdIns Op = (AOpCodeSimdIns)Context.CurrOp;
|
|
|
|
int Bytes = Op.GetBitsCount() >> 3;
|
|
int Elems = Bytes >> Op.Size;
|
|
|
|
for (int Index = 0; Index < Elems; Index++)
|
|
{
|
|
EmitVectorExtractZx(Context, Op.Rn, Op.DstIndex, Op.Size);
|
|
|
|
EmitVectorInsert(Context, Op.Rd, Index, Op.Size);
|
|
}
|
|
|
|
if (Op.RegisterSize == ARegisterSize.SIMD64)
|
|
{
|
|
EmitVectorZeroUpper(Context, Op.Rd);
|
|
}
|
|
}
|
|
|
|
public static void Ext_V(AILEmitterCtx Context)
|
|
{
|
|
AOpCodeSimdExt Op = (AOpCodeSimdExt)Context.CurrOp;
|
|
|
|
Context.EmitLdvec(Op.Rd);
|
|
Context.EmitStvectmp();
|
|
|
|
int Bytes = Op.GetBitsCount() >> 3;
|
|
|
|
int Position = Op.Imm4;
|
|
|
|
for (int Index = 0; Index < Bytes; Index++)
|
|
{
|
|
int Reg = Op.Imm4 + Index < Bytes ? Op.Rn : Op.Rm;
|
|
|
|
if (Position == Bytes)
|
|
{
|
|
Position = 0;
|
|
}
|
|
|
|
EmitVectorExtractZx(Context, Reg, Position++, 0);
|
|
EmitVectorInsertTmp(Context, Index, 0);
|
|
}
|
|
|
|
Context.EmitLdvectmp();
|
|
Context.EmitStvec(Op.Rd);
|
|
|
|
if (Op.RegisterSize == ARegisterSize.SIMD64)
|
|
{
|
|
EmitVectorZeroUpper(Context, Op.Rd);
|
|
}
|
|
}
|
|
|
|
public static void Fcsel_S(AILEmitterCtx Context)
|
|
{
|
|
AOpCodeSimdFcond Op = (AOpCodeSimdFcond)Context.CurrOp;
|
|
|
|
AILLabel LblTrue = new AILLabel();
|
|
AILLabel LblEnd = new AILLabel();
|
|
|
|
Context.EmitCondBranch(LblTrue, Op.Cond);
|
|
|
|
EmitVectorExtractF(Context, Op.Rm, 0, Op.Size);
|
|
|
|
Context.Emit(OpCodes.Br_S, LblEnd);
|
|
|
|
Context.MarkLabel(LblTrue);
|
|
|
|
EmitVectorExtractF(Context, Op.Rn, 0, Op.Size);
|
|
|
|
Context.MarkLabel(LblEnd);
|
|
|
|
EmitScalarSetF(Context, Op.Rd, Op.Size);
|
|
}
|
|
|
|
public static void Fmov_Ftoi(AILEmitterCtx Context)
|
|
{
|
|
AOpCodeSimdCvt Op = (AOpCodeSimdCvt)Context.CurrOp;
|
|
|
|
EmitVectorExtractZx(Context, Op.Rn, 0, 3);
|
|
|
|
EmitIntZeroUpperIfNeeded(Context);
|
|
|
|
Context.EmitStintzr(Op.Rd);
|
|
}
|
|
|
|
public static void Fmov_Ftoi1(AILEmitterCtx Context)
|
|
{
|
|
AOpCodeSimdCvt Op = (AOpCodeSimdCvt)Context.CurrOp;
|
|
|
|
EmitVectorExtractZx(Context, Op.Rn, 1, 3);
|
|
|
|
EmitIntZeroUpperIfNeeded(Context);
|
|
|
|
Context.EmitStintzr(Op.Rd);
|
|
}
|
|
|
|
public static void Fmov_Itof(AILEmitterCtx Context)
|
|
{
|
|
AOpCodeSimdCvt Op = (AOpCodeSimdCvt)Context.CurrOp;
|
|
|
|
Context.EmitLdintzr(Op.Rn);
|
|
|
|
EmitIntZeroUpperIfNeeded(Context);
|
|
|
|
EmitScalarSet(Context, Op.Rd, 3);
|
|
}
|
|
|
|
public static void Fmov_Itof1(AILEmitterCtx Context)
|
|
{
|
|
AOpCodeSimdCvt Op = (AOpCodeSimdCvt)Context.CurrOp;
|
|
|
|
Context.EmitLdintzr(Op.Rn);
|
|
|
|
EmitIntZeroUpperIfNeeded(Context);
|
|
|
|
EmitVectorInsert(Context, Op.Rd, 1, 3);
|
|
}
|
|
|
|
public static void Fmov_S(AILEmitterCtx Context)
|
|
{
|
|
AOpCodeSimd Op = (AOpCodeSimd)Context.CurrOp;
|
|
|
|
EmitVectorExtractF(Context, Op.Rn, 0, Op.Size);
|
|
|
|
EmitScalarSetF(Context, Op.Rd, Op.Size);
|
|
}
|
|
|
|
public static void Fmov_Si(AILEmitterCtx Context)
|
|
{
|
|
AOpCodeSimdFmov Op = (AOpCodeSimdFmov)Context.CurrOp;
|
|
|
|
Context.EmitLdc_I8(Op.Imm);
|
|
|
|
EmitScalarSet(Context, Op.Rd, Op.Size + 2);
|
|
}
|
|
|
|
public static void Fmov_V(AILEmitterCtx Context)
|
|
{
|
|
AOpCodeSimdImm Op = (AOpCodeSimdImm)Context.CurrOp;
|
|
|
|
int Elems = Op.RegisterSize == ARegisterSize.SIMD128 ? 4 : 2;
|
|
|
|
for (int Index = 0; Index < (Elems >> Op.Size); Index++)
|
|
{
|
|
Context.EmitLdc_I8(Op.Imm);
|
|
|
|
EmitVectorInsert(Context, Op.Rd, Index, Op.Size + 2);
|
|
}
|
|
|
|
if (Op.RegisterSize == ARegisterSize.SIMD64)
|
|
{
|
|
EmitVectorZeroUpper(Context, Op.Rd);
|
|
}
|
|
}
|
|
|
|
public static void Ins_Gp(AILEmitterCtx Context)
|
|
{
|
|
AOpCodeSimdIns Op = (AOpCodeSimdIns)Context.CurrOp;
|
|
|
|
Context.EmitLdintzr(Op.Rn);
|
|
|
|
EmitVectorInsert(Context, Op.Rd, Op.DstIndex, Op.Size);
|
|
}
|
|
|
|
public static void Ins_V(AILEmitterCtx Context)
|
|
{
|
|
AOpCodeSimdIns Op = (AOpCodeSimdIns)Context.CurrOp;
|
|
|
|
EmitVectorExtractZx(Context, Op.Rn, Op.SrcIndex, Op.Size);
|
|
|
|
EmitVectorInsert(Context, Op.Rd, Op.DstIndex, Op.Size);
|
|
}
|
|
|
|
public static void Movi_V(AILEmitterCtx Context)
|
|
{
|
|
EmitVectorImmUnaryOp(Context, () => { });
|
|
}
|
|
|
|
public static void Mvni_V(AILEmitterCtx Context)
|
|
{
|
|
EmitVectorImmUnaryOp(Context, () => Context.Emit(OpCodes.Not));
|
|
}
|
|
|
|
public static void Tbl_V(AILEmitterCtx Context)
|
|
{
|
|
AOpCodeSimdTbl Op = (AOpCodeSimdTbl)Context.CurrOp;
|
|
|
|
Context.EmitLdvec(Op.Rm);
|
|
|
|
for (int Index = 0; Index < Op.Size; Index++)
|
|
{
|
|
Context.EmitLdvec((Op.Rn + Index) & 0x1f);
|
|
}
|
|
|
|
switch (Op.Size)
|
|
{
|
|
case 1: AVectorHelper.EmitCall(Context,
|
|
nameof(AVectorHelper.Tbl1_V64),
|
|
nameof(AVectorHelper.Tbl1_V128)); break;
|
|
|
|
case 2: AVectorHelper.EmitCall(Context,
|
|
nameof(AVectorHelper.Tbl2_V64),
|
|
nameof(AVectorHelper.Tbl2_V128)); break;
|
|
|
|
case 3: AVectorHelper.EmitCall(Context,
|
|
nameof(AVectorHelper.Tbl3_V64),
|
|
nameof(AVectorHelper.Tbl3_V128)); break;
|
|
|
|
case 4: AVectorHelper.EmitCall(Context,
|
|
nameof(AVectorHelper.Tbl4_V64),
|
|
nameof(AVectorHelper.Tbl4_V128)); break;
|
|
|
|
default: throw new InvalidOperationException();
|
|
}
|
|
|
|
Context.EmitStvec(Op.Rd);
|
|
}
|
|
|
|
public static void Trn1_V(AILEmitterCtx Context)
|
|
{
|
|
EmitVectorTranspose(Context, Part: 0);
|
|
}
|
|
|
|
public static void Trn2_V(AILEmitterCtx Context)
|
|
{
|
|
EmitVectorTranspose(Context, Part: 1);
|
|
}
|
|
|
|
public static void Umov_S(AILEmitterCtx Context)
|
|
{
|
|
AOpCodeSimdIns Op = (AOpCodeSimdIns)Context.CurrOp;
|
|
|
|
EmitVectorExtractZx(Context, Op.Rn, Op.DstIndex, Op.Size);
|
|
|
|
Context.EmitStintzr(Op.Rd);
|
|
}
|
|
|
|
public static void Uzp1_V(AILEmitterCtx Context)
|
|
{
|
|
EmitVectorUnzip(Context, Part: 0);
|
|
}
|
|
|
|
public static void Uzp2_V(AILEmitterCtx Context)
|
|
{
|
|
EmitVectorUnzip(Context, Part: 1);
|
|
}
|
|
|
|
public static void Xtn_V(AILEmitterCtx Context)
|
|
{
|
|
AOpCodeSimd Op = (AOpCodeSimd)Context.CurrOp;
|
|
|
|
int Elems = 8 >> Op.Size;
|
|
|
|
int Part = Op.RegisterSize == ARegisterSize.SIMD128 ? Elems : 0;
|
|
|
|
if (AOptimizations.UseSse41 && Op.Size < 2)
|
|
{
|
|
void EmitZeroVector()
|
|
{
|
|
switch (Op.Size)
|
|
{
|
|
case 0: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorInt16Zero)); break;
|
|
case 1: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorInt32Zero)); break;
|
|
}
|
|
}
|
|
|
|
//For XTN, first operand is source, second operand is 0.
|
|
//For XTN2, first operand is 0, second operand is source.
|
|
if (Part != 0)
|
|
{
|
|
EmitZeroVector();
|
|
}
|
|
|
|
EmitLdvecWithSignedCast(Context, Op.Rn, Op.Size + 1);
|
|
|
|
//Set mask to discard the upper half of the wide elements.
|
|
switch (Op.Size)
|
|
{
|
|
case 0: Context.EmitLdc_I4(0x00ff); break;
|
|
case 1: Context.EmitLdc_I4(0x0000ffff); break;
|
|
}
|
|
|
|
Type WideType = IntTypesPerSizeLog2[Op.Size + 1];
|
|
|
|
Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), new Type[] { WideType }));
|
|
|
|
WideType = VectorIntTypesPerSizeLog2[Op.Size + 1];
|
|
|
|
Type[] WideTypes = new Type[] { WideType, WideType };
|
|
|
|
Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.And), WideTypes));
|
|
|
|
if (Part == 0)
|
|
{
|
|
EmitZeroVector();
|
|
}
|
|
|
|
//Pack values with signed saturation, the signed saturation shouldn't
|
|
//saturate anything since the upper bits were masked off.
|
|
Type SseType = Op.Size == 0 ? typeof(Sse2) : typeof(Sse41);
|
|
|
|
Context.EmitCall(SseType.GetMethod(nameof(Sse2.PackUnsignedSaturate), WideTypes));
|
|
|
|
if (Part != 0)
|
|
{
|
|
//For XTN2, we additionally need to discard the upper bits
|
|
//of the target register and OR the result with it.
|
|
EmitVectorZeroUpper(Context, Op.Rd);
|
|
|
|
EmitLdvecWithUnsignedCast(Context, Op.Rd, Op.Size);
|
|
|
|
Type NarrowType = VectorUIntTypesPerSizeLog2[Op.Size];
|
|
|
|
Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Or), new Type[] { NarrowType, NarrowType }));
|
|
}
|
|
|
|
EmitStvecWithUnsignedCast(Context, Op.Rd, Op.Size);
|
|
}
|
|
else
|
|
{
|
|
if (Part != 0)
|
|
{
|
|
Context.EmitLdvec(Op.Rd);
|
|
Context.EmitStvectmp();
|
|
}
|
|
|
|
for (int Index = 0; Index < Elems; Index++)
|
|
{
|
|
EmitVectorExtractZx(Context, Op.Rn, Index, Op.Size + 1);
|
|
|
|
EmitVectorInsertTmp(Context, Part + Index, Op.Size);
|
|
}
|
|
|
|
Context.EmitLdvectmp();
|
|
Context.EmitStvec(Op.Rd);
|
|
|
|
if (Part == 0)
|
|
{
|
|
EmitVectorZeroUpper(Context, Op.Rd);
|
|
}
|
|
}
|
|
}
|
|
|
|
public static void Zip1_V(AILEmitterCtx Context)
|
|
{
|
|
EmitVectorZip(Context, Part: 0);
|
|
}
|
|
|
|
public static void Zip2_V(AILEmitterCtx Context)
|
|
{
|
|
EmitVectorZip(Context, Part: 1);
|
|
}
|
|
|
|
private static void EmitIntZeroUpperIfNeeded(AILEmitterCtx Context)
|
|
{
|
|
if (Context.CurrOp.RegisterSize == ARegisterSize.Int32)
|
|
{
|
|
Context.Emit(OpCodes.Conv_U4);
|
|
Context.Emit(OpCodes.Conv_U8);
|
|
}
|
|
}
|
|
|
|
private static void EmitVectorTranspose(AILEmitterCtx Context, int Part)
|
|
{
|
|
AOpCodeSimdReg Op = (AOpCodeSimdReg)Context.CurrOp;
|
|
|
|
int Words = Op.GetBitsCount() >> 4;
|
|
int Pairs = Words >> Op.Size;
|
|
|
|
for (int Index = 0; Index < Pairs; Index++)
|
|
{
|
|
int Idx = Index << 1;
|
|
|
|
EmitVectorExtractZx(Context, Op.Rn, Idx + Part, Op.Size);
|
|
EmitVectorExtractZx(Context, Op.Rm, Idx + Part, Op.Size);
|
|
|
|
EmitVectorInsertTmp(Context, Idx + 1, Op.Size);
|
|
EmitVectorInsertTmp(Context, Idx, Op.Size);
|
|
}
|
|
|
|
Context.EmitLdvectmp();
|
|
Context.EmitStvec(Op.Rd);
|
|
|
|
if (Op.RegisterSize == ARegisterSize.SIMD64)
|
|
{
|
|
EmitVectorZeroUpper(Context, Op.Rd);
|
|
}
|
|
}
|
|
|
|
private static void EmitVectorUnzip(AILEmitterCtx Context, int Part)
|
|
{
|
|
AOpCodeSimdReg Op = (AOpCodeSimdReg)Context.CurrOp;
|
|
|
|
int Words = Op.GetBitsCount() >> 4;
|
|
int Pairs = Words >> Op.Size;
|
|
|
|
for (int Index = 0; Index < Pairs; Index++)
|
|
{
|
|
int Idx = Index << 1;
|
|
|
|
EmitVectorExtractZx(Context, Op.Rn, Idx + Part, Op.Size);
|
|
EmitVectorExtractZx(Context, Op.Rm, Idx + Part, Op.Size);
|
|
|
|
EmitVectorInsertTmp(Context, Pairs + Index, Op.Size);
|
|
EmitVectorInsertTmp(Context, Index, Op.Size);
|
|
}
|
|
|
|
Context.EmitLdvectmp();
|
|
Context.EmitStvec(Op.Rd);
|
|
|
|
if (Op.RegisterSize == ARegisterSize.SIMD64)
|
|
{
|
|
EmitVectorZeroUpper(Context, Op.Rd);
|
|
}
|
|
}
|
|
|
|
private static void EmitVectorZip(AILEmitterCtx Context, int Part)
|
|
{
|
|
AOpCodeSimdReg Op = (AOpCodeSimdReg)Context.CurrOp;
|
|
|
|
if (AOptimizations.UseSse2)
|
|
{
|
|
EmitLdvecWithUnsignedCast(Context, Op.Rn, Op.Size);
|
|
EmitLdvecWithUnsignedCast(Context, Op.Rm, Op.Size);
|
|
|
|
Type[] Types = new Type[]
|
|
{
|
|
VectorUIntTypesPerSizeLog2[Op.Size],
|
|
VectorUIntTypesPerSizeLog2[Op.Size]
|
|
};
|
|
|
|
string Name = Part == 0 || (Part != 0 && Op.RegisterSize == ARegisterSize.SIMD64)
|
|
? nameof(Sse2.UnpackLow)
|
|
: nameof(Sse2.UnpackHigh);
|
|
|
|
Context.EmitCall(typeof(Sse2).GetMethod(Name, Types));
|
|
|
|
if (Op.RegisterSize == ARegisterSize.SIMD64 && Part != 0)
|
|
{
|
|
Context.EmitLdc_I4(8);
|
|
|
|
Type[] ShTypes = new Type[] { VectorUIntTypesPerSizeLog2[Op.Size], typeof(byte) };
|
|
|
|
Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), ShTypes));
|
|
}
|
|
|
|
EmitStvecWithUnsignedCast(Context, Op.Rd, Op.Size);
|
|
|
|
if (Op.RegisterSize == ARegisterSize.SIMD64 && Part == 0)
|
|
{
|
|
EmitVectorZeroUpper(Context, Op.Rd);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
int Words = Op.GetBitsCount() >> 4;
|
|
int Pairs = Words >> Op.Size;
|
|
|
|
int Base = Part != 0 ? Pairs : 0;
|
|
|
|
for (int Index = 0; Index < Pairs; Index++)
|
|
{
|
|
int Idx = Index << 1;
|
|
|
|
EmitVectorExtractZx(Context, Op.Rn, Base + Index, Op.Size);
|
|
EmitVectorExtractZx(Context, Op.Rm, Base + Index, Op.Size);
|
|
|
|
EmitVectorInsertTmp(Context, Idx + 1, Op.Size);
|
|
EmitVectorInsertTmp(Context, Idx, Op.Size);
|
|
}
|
|
|
|
Context.EmitLdvectmp();
|
|
Context.EmitStvec(Op.Rd);
|
|
|
|
if (Op.RegisterSize == ARegisterSize.SIMD64)
|
|
{
|
|
EmitVectorZeroUpper(Context, Op.Rd);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|