Implement Fast Paths for most A32 SIMD instructions (#952)

* Begin work on A32 SIMD Intrinsics

* More instructions, some cleanup.

* Intrinsics for Move instructions (zip etc)

These pass the existing tests.

* Intrinsics for some of Cvt

While doing this I noticed that the int/fp conversion was incorrect in the
slow path. I'll fix this in the original repo.

* Intrinsics for more Arithmetic instructions.

* Intrinsics for Vext

* Fix VEXT Intrinsic for double words.

* Use InsertPs to move scalar values.

* Cleanup, fix VPADD.f32 and signed-integer VMIN.

* Cleanup, add SSE2 support for scalar insert.

This works similarly to the IR scalar insert, but it operates directly on a
V128 value (see the sketch after this list).

* Minor cleanup.

* Enable the intrinsic for FP64 to integer conversion (see the note after this list).

* Address feedback, apart from splitting out the intrinsic float abs.

Also: treat bad VREV encodings as undefined rather than throwing during translation.

* Move float abs to helper, fix bug with cvt

* Rename opc2 & 3 to match A32 docs, use ArgumentOutOfRangeException appropriately.

* Get the name of the variable at compile time rather than from a string literal.

* Use correct double sign mask.
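
A note on the SSE2 scalar insert mentioned above: the helper itself is not part of this excerpt, but the general technique can be sketched. SSE2 has no 32-bit element insert (PINSRD and INSERTPS are SSE4.1), so a 32-bit lane can be written as two 16-bit PINSRW inserts. The snippet below is a standalone illustration using .NET's System.Runtime.Intrinsics, not the emitter helper added by this PR; the type and method names are invented for the example.

using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

static class ScalarInsertSketch
{
    // Write a 32-bit value into lane `index` (0..3) of a 128-bit vector using
    // only SSE2: PINSRW inserts 16 bits at a time, so the lane is filled with
    // two word inserts. `index` should be a JIT-time constant for good codegen.
    public static Vector128<uint> Insert32(Vector128<uint> vector, uint value, byte index)
    {
        Vector128<ushort> words = vector.AsUInt16();

        words = Sse2.Insert(words, (ushort)value, (byte)(index * 2));
        words = Sse2.Insert(words, (ushort)(value >> 16), (byte)(index * 2 + 1));

        return words.AsUInt32();
    }
}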
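
On the FP64-to-integer conversion fast path, the natural x86 operation is CVTTPD2DQ (truncating packed double to int32); whether the PR emits exactly this sequence is not shown in this excerpt. One difference worth keeping in mind when comparing it with the slow path: the x86 instruction returns the "integer indefinite" value 0x80000000 for NaN or out-of-range inputs, while the A32 VCVT to integer saturates, so a faithful fast path needs extra fix-up around the raw conversion. The call below only demonstrates the raw conversion through .NET intrinsics and is not the PR's implementation.

using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

static class Fp64ToInt32Sketch
{
    // CVTTPD2DQ: truncate two doubles to two int32 lanes (the upper two lanes are zeroed).
    // NaN or out-of-range inputs become int.MinValue (0x80000000) rather than the
    // saturated result the A32 VCVT would produce.
    public static Vector128<int> TruncateToInt32(Vector128<double> value)
    {
        return Sse2.ConvertToVector128Int32WithTruncation(value);
    }
}
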
jduncanator 2020-03-05 11:41:33 +11:00 committed by GitHub
parent d9ed827696
commit 68e15c1a74
12 changed files with 2077 additions and 400 deletions

@@ -1,4 +1,5 @@
 using ARMeilleure.Decoders;
+using ARMeilleure.IntermediateRepresentation;
 using ARMeilleure.Translation;
 
 using static ARMeilleure.Instructions.InstEmitSimdHelper32;
@@ -9,7 +10,14 @@ namespace ARMeilleure.Instructions
     {
         public static void Vand_I(ArmEmitterContext context)
         {
-            EmitVectorBinaryOpZx32(context, (op1, op2) => context.BitwiseAnd(op1, op2));
+            if (Optimizations.UseSse2)
+            {
+                EmitVectorBinaryOpF32(context, Intrinsic.X86Pand, Intrinsic.X86Pand);
+            }
+            else
+            {
+                EmitVectorBinaryOpZx32(context, (op1, op2) => context.BitwiseAnd(op1, op2));
+            }
         }
 
         public static void Vbif(ArmEmitterContext context)
@@ -24,33 +32,64 @@ namespace ARMeilleure.Instructions
 
         public static void Vbsl(ArmEmitterContext context)
         {
-            EmitVectorTernaryOpZx32(context, (op1, op2, op3) =>
-            {
-                return context.BitwiseExclusiveOr(
-                    context.BitwiseAnd(op1,
-                        context.BitwiseExclusiveOr(op2, op3)), op3);
-            });
+            if (Optimizations.UseSse2)
+            {
+                EmitVectorTernaryOpSimd32(context, (d, n, m) =>
+                {
+                    Operand res = context.AddIntrinsic(Intrinsic.X86Pxor, n, m);
+                    res = context.AddIntrinsic(Intrinsic.X86Pand, res, d);
+                    return context.AddIntrinsic(Intrinsic.X86Pxor, res, m);
+                });
+            }
+            else
+            {
+                EmitVectorTernaryOpZx32(context, (op1, op2, op3) =>
+                {
+                    return context.BitwiseExclusiveOr(
+                        context.BitwiseAnd(op1,
+                            context.BitwiseExclusiveOr(op2, op3)), op3);
+                });
+            }
         }
 
         public static void Vorr_I(ArmEmitterContext context)
        {
-            EmitVectorBinaryOpZx32(context, (op1, op2) => context.BitwiseOr(op1, op2));
+            if (Optimizations.UseSse2)
+            {
+                EmitVectorBinaryOpF32(context, Intrinsic.X86Por, Intrinsic.X86Por);
+            }
+            else
+            {
+                EmitVectorBinaryOpZx32(context, (op1, op2) => context.BitwiseOr(op1, op2));
+            }
        }
 
         private static void EmitBifBit(ArmEmitterContext context, bool notRm)
         {
             OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
 
-            EmitVectorTernaryOpZx32(context, (d, n, m) =>
-            {
-                if (notRm)
-                {
-                    m = context.BitwiseNot(m);
-                }
-                return context.BitwiseExclusiveOr(
-                    context.BitwiseAnd(m,
-                        context.BitwiseExclusiveOr(d, n)), d);
-            });
+            if (Optimizations.UseSse2)
+            {
+                EmitVectorTernaryOpSimd32(context, (d, n, m) =>
+                {
+                    Operand res = context.AddIntrinsic(Intrinsic.X86Pxor, n, d);
+                    res = context.AddIntrinsic((notRm) ? Intrinsic.X86Pandn : Intrinsic.X86Pand, m, res);
+                    return context.AddIntrinsic(Intrinsic.X86Pxor, d, res);
+                });
+            }
+            else
+            {
+                EmitVectorTernaryOpZx32(context, (d, n, m) =>
+                {
+                    if (notRm)
+                    {
+                        m = context.BitwiseNot(m);
+                    }
+                    return context.BitwiseExclusiveOr(
+                        context.BitwiseAnd(m,
+                            context.BitwiseExclusiveOr(d, n)), d);
+                });
+            }
         }
     }
 }
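
Both paths of Vbsl above compute the same bitwise select: with d as the selector, ((n XOR m) AND d) XOR m picks bits of n where d is 1 and bits of m where d is 0, which is exactly VBSL. EmitBifBit uses the mirrored form ((d XOR n) AND m) XOR d for VBIT, and complements m (PANDN on the SSE2 path) for VBIF. The snippet below is only a standalone sanity check of that identity on 64-bit masks; it is not part of the PR.

using System;

static class BitSelectIdentityCheck
{
    static void Main()
    {
        var rng = new Random(1234);

        for (int i = 0; i < 10_000; i++)
        {
            ulong d = NextUInt64(rng);
            ulong n = NextUInt64(rng);
            ulong m = NextUInt64(rng);

            // Reference VBSL: take n where d has 1 bits, m where d has 0 bits.
            ulong expected = (n & d) | (m & ~d);

            // Form used by both the IR fallback and the SSE2 fast path.
            ulong actual = ((n ^ m) & d) ^ m;

            if (expected != actual) throw new Exception("select identity does not hold");
        }

        Console.WriteLine("VBSL select identity holds.");
    }

    static ulong NextUInt64(Random rng)
    {
        Span<byte> bytes = stackalloc byte[8];
        rng.NextBytes(bytes);
        return BitConverter.ToUInt64(bytes);
    }
}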