Implement Fast Paths for most A32 SIMD instructions (#952)

* Begin work on A32 SIMD Intrinsics

* More instructions, some cleanup.

* Intrinsics for Move instructions (zip etc)

These pass the existing tests.

* Intrinsics for some of Cvt

While doing this I noticed that the int/fp conversion was incorrect in the
slow path. I'll fix this in the original repo.

* Intrinsics for more Arithmetic instructions.

* Intrinsics for Vext

* Fix VEXT Intrinsic for double words.

* Use InsertPs to move scalar values.

* Cleanup, fix VPADD.f32 and signed-integer VMIN.

* Cleanup, add SSE2 support for scalar insert.

This works similarly to the IR scalar insert, but it operates directly on a
V128 value (see the sketch after this list).

* Minor cleanup.

* Enable the intrinsic for FP64 to integer conversion (see the note after this list).

* Address feedback, apart from splitting out the intrinsic float abs.

Also: treat bad VREV encodings as undefined rather than throwing during translation.

* Move float abs to helper, fix bug with cvt

* Rename opc2 & 3 to match A32 docs, use ArgumentOutOfRangeException appropriately.

* Get the name of the variable at compile time rather than from a string literal.

* Use correct double sign mask.
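
A note on the SSE2 scalar insert mentioned above: the helper itself is not part of this excerpt, but the general technique can be sketched. SSE2 has no 32-bit element insert (PINSRD and INSERTPS are SSE4.1), so a 32-bit lane can be written as two 16-bit PINSRW inserts. The snippet below is a standalone illustration using .NET's System.Runtime.Intrinsics, not the emitter helper added by this PR; the type and method names are invented for the example.

using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

static class ScalarInsertSketch
{
    // Write a 32-bit value into lane `index` (0..3) of a 128-bit vector using
    // only SSE2: PINSRW inserts 16 bits at a time, so the lane is filled with
    // two word inserts. `index` should be a JIT-time constant for good codegen.
    public static Vector128<uint> Insert32(Vector128<uint> vector, uint value, byte index)
    {
        Vector128<ushort> words = vector.AsUInt16();

        words = Sse2.Insert(words, (ushort)value, (byte)(index * 2));
        words = Sse2.Insert(words, (ushort)(value >> 16), (byte)(index * 2 + 1));

        return words.AsUInt32();
    }
}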
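
On the FP64-to-integer conversion fast path, the natural x86 operation is CVTTPD2DQ (truncating packed double to int32); whether the PR emits exactly this sequence is not shown in this excerpt. One difference worth keeping in mind when comparing it with the slow path: the x86 instruction returns the "integer indefinite" value 0x80000000 for NaN or out-of-range inputs, while the A32 VCVT to integer saturates, so a faithful fast path needs extra fix-up around the raw conversion. The call below only demonstrates the raw conversion through .NET intrinsics and is not the PR's implementation.

using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

static class Fp64ToInt32Sketch
{
    // CVTTPD2DQ: truncate two doubles to two int32 lanes (the upper two lanes are zeroed).
    // NaN or out-of-range inputs become int.MinValue (0x80000000) rather than the
    // saturated result the A32 VCVT would produce.
    public static Vector128<int> TruncateToInt32(Vector128<double> value)
    {
        return Sse2.ConvertToVector128Int32WithTruncation(value);
    }
}
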
jduncanator 2020-03-05 11:41:33 +11:00 committed by GitHub
parent d9ed827696
commit 68e15c1a74
12 changed files with 2077 additions and 400 deletions

@@ -1,4 +1,5 @@
 using ARMeilleure.Decoders;
+using ARMeilleure.IntermediateRepresentation;
 using ARMeilleure.Translation;
 
 using static ARMeilleure.Instructions.InstEmitSimdHelper32;
@@ -9,7 +10,14 @@ namespace ARMeilleure.Instructions
     {
         public static void Vand_I(ArmEmitterContext context)
         {
-            EmitVectorBinaryOpZx32(context, (op1, op2) => context.BitwiseAnd(op1, op2));
+            if (Optimizations.UseSse2)
+            {
+                EmitVectorBinaryOpF32(context, Intrinsic.X86Pand, Intrinsic.X86Pand);
+            }
+            else
+            {
+                EmitVectorBinaryOpZx32(context, (op1, op2) => context.BitwiseAnd(op1, op2));
+            }
         }
 
         public static void Vbif(ArmEmitterContext context)
@@ -24,33 +32,64 @@ namespace ARMeilleure.Instructions
 
         public static void Vbsl(ArmEmitterContext context)
         {
-            EmitVectorTernaryOpZx32(context, (op1, op2, op3) =>
-            {
-                return context.BitwiseExclusiveOr(
-                    context.BitwiseAnd(op1,
-                        context.BitwiseExclusiveOr(op2, op3)), op3);
-            });
+            if (Optimizations.UseSse2)
+            {
+                EmitVectorTernaryOpSimd32(context, (d, n, m) =>
+                {
+                    Operand res = context.AddIntrinsic(Intrinsic.X86Pxor, n, m);
+                    res = context.AddIntrinsic(Intrinsic.X86Pand, res, d);
+                    return context.AddIntrinsic(Intrinsic.X86Pxor, res, m);
+                });
+            }
+            else
+            {
+                EmitVectorTernaryOpZx32(context, (op1, op2, op3) =>
+                {
+                    return context.BitwiseExclusiveOr(
+                        context.BitwiseAnd(op1,
+                            context.BitwiseExclusiveOr(op2, op3)), op3);
+                });
+            }
         }
 
         public static void Vorr_I(ArmEmitterContext context)
        {
-            EmitVectorBinaryOpZx32(context, (op1, op2) => context.BitwiseOr(op1, op2));
+            if (Optimizations.UseSse2)
+            {
+                EmitVectorBinaryOpF32(context, Intrinsic.X86Por, Intrinsic.X86Por);
+            }
+            else
+            {
+                EmitVectorBinaryOpZx32(context, (op1, op2) => context.BitwiseOr(op1, op2));
+            }
        }
 
         private static void EmitBifBit(ArmEmitterContext context, bool notRm)
         {
             OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
 
-            EmitVectorTernaryOpZx32(context, (d, n, m) =>
-            {
-                if (notRm)
-                {
-                    m = context.BitwiseNot(m);
-                }
-                return context.BitwiseExclusiveOr(
-                    context.BitwiseAnd(m,
-                        context.BitwiseExclusiveOr(d, n)), d);
-            });
+            if (Optimizations.UseSse2)
+            {
+                EmitVectorTernaryOpSimd32(context, (d, n, m) =>
+                {
+                    Operand res = context.AddIntrinsic(Intrinsic.X86Pxor, n, d);
+                    res = context.AddIntrinsic((notRm) ? Intrinsic.X86Pandn : Intrinsic.X86Pand, m, res);
+                    return context.AddIntrinsic(Intrinsic.X86Pxor, d, res);
+                });
+            }
+            else
+            {
+                EmitVectorTernaryOpZx32(context, (d, n, m) =>
+                {
+                    if (notRm)
+                    {
+                        m = context.BitwiseNot(m);
+                    }
+                    return context.BitwiseExclusiveOr(
+                        context.BitwiseAnd(m,
+                            context.BitwiseExclusiveOr(d, n)), d);
+                });
+            }
         }
     }
 }
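
Both paths of Vbsl above compute the same bitwise select: with d as the selector, ((n XOR m) AND d) XOR m picks bits of n where d is 1 and bits of m where d is 0, which is exactly VBSL. EmitBifBit uses the mirrored form ((d XOR n) AND m) XOR d for VBIT, and complements m (PANDN on the SSE2 path) for VBIF. The snippet below is only a standalone sanity check of that identity on 64-bit masks; it is not part of the PR.

using System;

static class BitSelectIdentityCheck
{
    static void Main()
    {
        var rng = new Random(1234);

        for (int i = 0; i < 10_000; i++)
        {
            ulong d = NextUInt64(rng);
            ulong n = NextUInt64(rng);
            ulong m = NextUInt64(rng);

            // Reference VBSL: take n where d has 1 bits, m where d has 0 bits.
            ulong expected = (n & d) | (m & ~d);

            // Form used by both the IR fallback and the SSE2 fast path.
            ulong actual = ((n ^ m) & d) ^ m;

            if (expected != actual) throw new Exception("select identity does not hold");
        }

        Console.WriteLine("VBSL select identity holds.");
    }

    static ulong NextUInt64(Random rng)
    {
        Span<byte> bytes = stackalloc byte[8];
        rng.NextBytes(bytes);
        return BitConverter.ToUInt64(bytes);
    }
}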