From 0f8f40486d1b3215c845325744bd545149223805 Mon Sep 17 00:00:00 2001
From: Merry <MerryMage@users.noreply.github.com>
Date: Sun, 8 Jul 2018 20:54:47 +0100
Subject: [PATCH] ChocolArm64: More accurate implementation of Frecpe & Frecps
 (#228)

* ChocolArm64: More accurate implementation of Frecpe

* ChocolArm64: Handle infinities and zeros in Frecps
---
 .../Instruction/AInstEmitSimdArithmetic.cs    | 100 ++-------------
 .../Instruction/AInstEmitSimdHelper.cs        |  20 +++
 ChocolArm64/Instruction/ASoftFloat.cs         | 120 ++++++++++++++++++
 Ryujinx.Tests/Cpu/CpuTestSimdArithmetic.cs    |  39 +++---
 4 files changed, 170 insertions(+), 109 deletions(-)

diff --git a/ChocolArm64/Instruction/AInstEmitSimdArithmetic.cs b/ChocolArm64/Instruction/AInstEmitSimdArithmetic.cs
index b96b71be..39331f96 100644
--- a/ChocolArm64/Instruction/AInstEmitSimdArithmetic.cs
+++ b/ChocolArm64/Instruction/AInstEmitSimdArithmetic.cs
@@ -641,106 +641,34 @@ namespace ChocolArm64.Instruction
 
         public static void Frecpe_S(AILEmitterCtx Context)
         {
-            EmitFrecpe(Context, 0, Scalar: true);
+            EmitScalarUnaryOpF(Context, () =>
+            {
+                EmitUnarySoftFloatCall(Context, nameof(ASoftFloat.RecipEstimate));
+            });
         }
 
         public static void Frecpe_V(AILEmitterCtx Context)
         {
-            AOpCodeSimd Op = (AOpCodeSimd)Context.CurrOp;
-
-            int SizeF = Op.Size & 1;
-
-            int Bytes = Context.CurrOp.GetBitsCount() >> 3;
-
-            for (int Index = 0; Index < Bytes >> SizeF + 2; Index++)
+            EmitVectorUnaryOpF(Context, () =>
             {
-                EmitFrecpe(Context, Index, Scalar: false);
-            }
-
-            if (Op.RegisterSize == ARegisterSize.SIMD64)
-            {
-                EmitVectorZeroUpper(Context, Op.Rd);
-            }
-        }
-
-        private static void EmitFrecpe(AILEmitterCtx Context, int Index, bool Scalar)
-        {
-            AOpCodeSimd Op = (AOpCodeSimd)Context.CurrOp;
-
-            int SizeF = Op.Size & 1;
-
-            if (SizeF == 0)
-            {
-                Context.EmitLdc_R4(1);
-            }
-            else /* if (SizeF == 1) */
-            {
-                Context.EmitLdc_R8(1);
-            }
-
-            EmitVectorExtractF(Context, Op.Rn, Index, SizeF);
-
-            Context.Emit(OpCodes.Div);
-
-            if (Scalar)
-            {
-                EmitVectorZeroAll(Context, Op.Rd);
-            }
-
-            EmitVectorInsertF(Context, Op.Rd, Index, SizeF);
+                EmitUnarySoftFloatCall(Context, nameof(ASoftFloat.RecipEstimate));
+            });
         }
 
         public static void Frecps_S(AILEmitterCtx Context)
         {
-            EmitFrecps(Context, 0, Scalar: true);
+            EmitScalarBinaryOpF(Context, () =>
+            {
+                EmitBinarySoftFloatCall(Context, nameof(ASoftFloat.RecipStep));
+            });
         }
 
         public static void Frecps_V(AILEmitterCtx Context)
         {
-            AOpCodeSimd Op = (AOpCodeSimd)Context.CurrOp;
-
-            int SizeF = Op.Size & 1;
-
-            int Bytes = Context.CurrOp.GetBitsCount() >> 3;
-
-            for (int Index = 0; Index < Bytes >> SizeF + 2; Index++)
+            EmitVectorBinaryOpF(Context, () =>
             {
-                EmitFrecps(Context, Index, Scalar: false);
-            }
-
-            if (Op.RegisterSize == ARegisterSize.SIMD64)
-            {
-                EmitVectorZeroUpper(Context, Op.Rd);
-            }
-        }
-
-        private static void EmitFrecps(AILEmitterCtx Context, int Index, bool Scalar)
-        {
-            AOpCodeSimdReg Op = (AOpCodeSimdReg)Context.CurrOp;
-
-            int SizeF = Op.Size & 1;
-
-            if (SizeF == 0)
-            {
-                Context.EmitLdc_R4(2);
-            }
-            else /* if (SizeF == 1) */
-            {
-                Context.EmitLdc_R8(2);
-            }
-
-            EmitVectorExtractF(Context, Op.Rn, Index, SizeF);
-            EmitVectorExtractF(Context, Op.Rm, Index, SizeF);
-
-            Context.Emit(OpCodes.Mul);
-            Context.Emit(OpCodes.Sub);
-
-            if (Scalar)
-            {
-                EmitVectorZeroAll(Context, Op.Rd);
-            }
-
-            EmitVectorInsertF(Context, Op.Rd, Index, SizeF);
+                EmitBinarySoftFloatCall(Context, nameof(ASoftFloat.RecipStep));
+            });
         }
 
         public static void Frinta_S(AILEmitterCtx Context)
diff --git a/ChocolArm64/Instruction/AInstEmitSimdHelper.cs b/ChocolArm64/Instruction/AInstEmitSimdHelper.cs
index 0f6ea42c..d895ec9c 100644
--- a/ChocolArm64/Instruction/AInstEmitSimdHelper.cs
+++ b/ChocolArm64/Instruction/AInstEmitSimdHelper.cs
@@ -253,6 +253,26 @@ namespace ChocolArm64.Instruction
             Context.EmitCall(MthdInfo);
         }
 
+        // Emits a call to an ASoftFloat method that takes two floating-point arguments.
+        // Name is the method's name; the overload with two float parameters is used when bit 0
+        // of the current opcode's Size is clear, the one with two double parameters otherwise.
+        public static void EmitBinarySoftFloatCall(AILEmitterCtx Context, string Name)
+        {
+            IAOpCodeSimd OpCode = (IAOpCodeSimd)Context.CurrOp;
+
+            // Only bit 0 of Size takes part in the choice of operand type.
+            bool IsDouble = (OpCode.Size & 1) != 0;
+
+            Type ArgType = IsDouble ? typeof(double) : typeof(float);
+
+            // Look the overload up by its exact (ArgType, ArgType) parameter list.
+            MethodInfo Target = typeof(ASoftFloat).GetMethod(
+                Name,
+                new Type[] { ArgType, ArgType });
+
+            Context.EmitCall(Target);
+        }
+
         public static void EmitScalarBinaryOpByElemF(AILEmitterCtx Context, Action Emit)
         {
             AOpCodeSimdRegElemF Op = (AOpCodeSimdRegElemF)Context.CurrOp;
diff --git a/ChocolArm64/Instruction/ASoftFloat.cs b/ChocolArm64/Instruction/ASoftFloat.cs
index 1bd71665..e63c82be 100644
--- a/ChocolArm64/Instruction/ASoftFloat.cs
+++ b/ChocolArm64/Instruction/ASoftFloat.cs
@@ -7,8 +7,10 @@ namespace ChocolArm64.Instruction
         static ASoftFloat()
         {
             InvSqrtEstimateTable = BuildInvSqrtEstimateTable();
+            RecipEstimateTable = BuildRecipEstimateTable();
         }
 
+        private static readonly byte[] RecipEstimateTable;
         private static readonly byte[] InvSqrtEstimateTable;
 
         private static byte[] BuildInvSqrtEstimateTable()
@@ -38,6 +40,22 @@ namespace ChocolArm64.Instruction
             return Table;
         }
 
+        // Fills the 256-entry table read by RecipEstimate. For each index the divisor is
+        // ((index | 0x100) * 2) + 1 and the low byte of ((0x80000 / divisor) + 1) / 2 is stored.
+        private static byte[] BuildRecipEstimateTable()
+        {
+            byte[] Estimates = new byte[256];
+
+            for (uint Entry = 0; Entry < 256; Entry++)
+            {
+                ulong Divisor = ((Entry | 0x100ul) << 1) + 1;
+                ulong Halved = ((0x80000 / Divisor) + 1) >> 1;
+
+                Estimates[Entry] = (byte)(Halved & 0xFF);
+            }
+            return Estimates;
+        }
+
         public static float InvSqrtEstimate(float x)
         {
             return (float)InvSqrtEstimate((double)x);
@@ -105,5 +123,107 @@ namespace ChocolArm64.Instruction
             ulong result = x_sign | (result_exp << 52) | fraction;
             return BitConverter.Int64BitsToDouble((long)result);
         }
+
+        // Single-precision overload: widens x to double, delegates, then narrows the result.
+        public static float RecipEstimate(float x)
+        {
+            return (float)RecipEstimate((double)x);
+        }
+
+        // Reciprocal estimate built from the top 8 fraction bits of x via RecipEstimateTable.
+        public static double RecipEstimate(double x)
+        {
+            ulong x_bits = (ulong)BitConverter.DoubleToInt64Bits(x);
+            ulong x_sign = x_bits & 0x8000000000000000;
+            ulong x_exp = (x_bits >> 52) & 0x7FF;
+            ulong scaled = x_bits & ((1ul << 52) - 1);
+
+            if (x_exp >= 2045)
+            {
+                if (x_exp == 0x7ff && scaled != 0)
+                {
+                    // NaN
+                    return BitConverter.Int64BitsToDouble((long)(x_bits | 0x0008000000000000));
+                }
+
+                // Infinity, or Out of range -> Zero
+                return BitConverter.Int64BitsToDouble((long)x_sign);
+            }
+
+            if (x_exp == 0)
+            {
+                // Zero, or a denormal below 2^-1024 (fraction bits 51 and 50 clear) -> Infinity
+                if ((scaled >> 50) == 0)
+                {
+                    return BitConverter.Int64BitsToDouble((long)(x_sign | 0x7ff0000000000000));
+                }
+
+                // Remaining denormals: shift the leading one up to bit 52 (the implicit bit)
+                if ((scaled & (1ul << 51)) == 0)
+                {
+                    x_exp = ~0ul;
+                    scaled <<= 2;
+                }
+                else
+                {
+                    scaled <<= 1;
+                }
+            }
+
+            scaled = (scaled >> 44) & 0xFF;
+
+            ulong result_exp = (2045 - x_exp) & 0x7FF;
+            ulong fraction = (ulong)RecipEstimateTable[scaled] << 44;
+
+            if (result_exp == 0)
+            {
+                fraction >>= 1;
+                fraction |= 1ul << 51;
+            }
+            else if (result_exp == 0x7FF)
+            {
+                result_exp = 0;
+                fraction >>= 2;
+                fraction |= 1ul << 50;
+            }
+
+            ulong result = x_sign | (result_exp << 52) | fraction;
+            return BitConverter.Int64BitsToDouble((long)result);
+        }
+
+        // Single-precision overload: the step is evaluated in double precision and narrowed.
+        public static float RecipStep(float op1, float op2)
+        {
+            return (float)RecipStep((double)op1, (double)op2);
+        }
+
+        // Reciprocal step: returns 2.0 - (op1 * op2). Infinity times zero, in either operand
+        // order, is defined to give 2.0 instead of the NaN plain arithmetic would produce.
+        public static double RecipStep(double op1, double op2)
+        {
+            // Negate once up front so the result is simply 2.0 + op1 * op2.
+            op1 = -op1;
+
+            ulong op1_bits = (ulong)BitConverter.DoubleToInt64Bits(op1);
+            ulong op2_bits = (ulong)BitConverter.DoubleToInt64Bits(op2);
+
+            ulong op1_other = op1_bits & 0x7FFFFFFFFFFFFFFF;
+            ulong op2_other = op2_bits & 0x7FFFFFFFFFFFFFFF;
+
+            bool inf1 = op1_other == 0x7ff0000000000000;
+            bool inf2 = op2_other == 0x7ff0000000000000;
+            bool zero1 = op1_other == 0;
+            bool zero2 = op2_other == 0;
+
+            if ((inf1 && zero2) || (zero1 && inf2))
+            {
+                return 2.0;
+            }
+
+            // No separate infinity branch: an infinity times a non-zero finite value or another
+            // infinity already yields the correctly signed infinity here, and a NaN operand
+            // must come out as NaN even when the other operand is infinite.
+            return 2.0 + op1 * op2;
+        }
     }
 }
\ No newline at end of file
diff --git a/Ryujinx.Tests/Cpu/CpuTestSimdArithmetic.cs b/Ryujinx.Tests/Cpu/CpuTestSimdArithmetic.cs
index 98be2fc5..2a0f5ed9 100644
--- a/Ryujinx.Tests/Cpu/CpuTestSimdArithmetic.cs
+++ b/Ryujinx.Tests/Cpu/CpuTestSimdArithmetic.cs
@@ -163,26 +163,18 @@ namespace Ryujinx.Tests.Cpu
             Assert.That(Sse41.Extract(ThreadState.V6, (byte)0), Is.EqualTo(A * B));
         }
 
-        [Test, Description("FRECPE D0, D1")]
-        public void Frecpe_S([Random(100)] double A)
+        [TestCase(0x00000000u, 0x7F800000u)]
+        [TestCase(0x80000000u, 0xFF800000u)]
+        [TestCase(0x00FFF000u, 0x7E000000u)]
+        [TestCase(0x41200000u, 0x3DCC8000u)]
+        [TestCase(0xC1200000u, 0xBDCC8000u)]
+        [TestCase(0x001FFFFFu, 0x7F800000u)]
+        [TestCase(0x007FF000u, 0x7E800000u)]
+        public void Frecpe_S(uint A, uint Result)
         {
-            AThreadState ThreadState = SingleOpcode(0x5EE1D820, V1: MakeVectorE0(A));
-
-            Assert.That(VectorExtractDouble(ThreadState.V0, 0), Is.EqualTo(1 / A));
-        }
-
-        [Test, Description("FRECPE V2.4S, V0.4S")]
-        public void Frecpe_V([Random(100)] float A)
-        {
-            AThreadState ThreadState = SingleOpcode(0x4EA1D802, V0: Sse.SetAllVector128(A));
-
-            Assert.Multiple(() =>
-            {
-                Assert.That(Sse41.Extract(ThreadState.V2, (byte)0), Is.EqualTo(1 / A));
-                Assert.That(Sse41.Extract(ThreadState.V2, (byte)1), Is.EqualTo(1 / A));
-                Assert.That(Sse41.Extract(ThreadState.V2, (byte)2), Is.EqualTo(1 / A));
-                Assert.That(Sse41.Extract(ThreadState.V2, (byte)3), Is.EqualTo(1 / A));
-            });
+            Vector128<float> V1 = MakeVectorE0(A);
+            AThreadState ThreadState = SingleOpcode(0x5EA1D820, V1: V1);
+            Assert.AreEqual(Result, GetVectorE0(ThreadState.V0));
         }
 
         [Test, Description("FRECPS D0, D1, D2")]
@@ -202,12 +194,13 @@ namespace Ryujinx.Tests.Cpu
                 V2: Sse.SetAllVector128(A),
                 V0: Sse.SetAllVector128(B));
 
+            float Result = (float)(2 - ((double)A * (double)B));
             Assert.Multiple(() =>
             {
-                Assert.That(Sse41.Extract(ThreadState.V4, (byte)0), Is.EqualTo(2 - (A * B)));
-                Assert.That(Sse41.Extract(ThreadState.V4, (byte)1), Is.EqualTo(2 - (A * B)));
-                Assert.That(Sse41.Extract(ThreadState.V4, (byte)2), Is.EqualTo(2 - (A * B)));
-                Assert.That(Sse41.Extract(ThreadState.V4, (byte)3), Is.EqualTo(2 - (A * B)));
+                Assert.That(Sse41.Extract(ThreadState.V4, (byte)0), Is.EqualTo(Result));
+                Assert.That(Sse41.Extract(ThreadState.V4, (byte)1), Is.EqualTo(Result));
+                Assert.That(Sse41.Extract(ThreadState.V4, (byte)2), Is.EqualTo(Result));
+                Assert.That(Sse41.Extract(ThreadState.V4, (byte)3), Is.EqualTo(Result));
             });
         }