CPU (A64): Add Pmull_V Inst. with Clmul fast path for the "1/2D -> 1Q" variant & Sse fast path and slow path for both the "8/16B -> 8H" and "1/2D -> 1Q" variants; with Test. (#1817)

* Add Pmull_V Sse fast path only, both "8/16B -> 8H" and "1/2D -> 1Q" variants; with Test. * Add Clmul fast path for the 128 bits variant. * Small optimisation (save 60 instructions) for the Sse fast path about the 128 bits variant. * Add slow path, both variants. Fix V128 Shl/Shr when shift = 0. * A32: Add Vmull_I P64 variant (slow path); not tested. * A32: Add Vmull_I_P8_P64 Test and fix P64 variant.
2021-01-04 23:45:54 +01:00 · 2021-01-04 23:45:54 +01:00 · 430ba6da65
commit 430ba6da65
parent a03ab0c4a0
11 changed files with 264 additions and 25 deletions
--- a/ARMeilleure/Instructions/InstEmitSimdHelper32.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdHelper32.cs
@ -1167,5 +1167,27 @@ namespace ARMeilleure.Instructions

            return res;
        }
+
+        public static Operand EmitPolynomialMultiply(ArmEmitterContext context, Operand op1, Operand op2, int eSize)
+        {
+            Debug.Assert(eSize <= 32);
+
+            Operand result = eSize == 32 ? Const(0L) : Const(0);
+
+            if (eSize == 32)
+            {
+                op1 = context.ZeroExtend32(OperandType.I64, op1);
+                op2 = context.ZeroExtend32(OperandType.I64, op2);
+            }
+
+            for (int i = 0; i < eSize; i++)
+            {
+                Operand mask = context.BitwiseAnd(op1, Const(op1.Type, 1L << i));
+
+                result = context.BitwiseExclusiveOr(result, context.Multiply(op2, mask));
+            }
+
+            return result;
+        }
    }
 }