Implement VCNT instruction (#1963)

* Implement VCNT based on AArch64 CNT
  Add tests
* Update PTC version
* Address LDj's comments
* Explicit size in encoding
* Tighter tests
* Replace SoftFallback with IR helper
  Co-authored-by: LDj3SNuD <35856442+LDj3SNuD@users.noreply.github.com>
* Reduce one BitwiseAnd from IR fallback
  Based on popcount64b from https://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation
* Rename parameter and add assert
  Co-authored-by: LDj3SNuD <35856442+LDj3SNuD@users.noreply.github.com>

Co-authored-by: LDj3SNuD <35856442+LDj3SNuD@users.noreply.github.com>
Co-authored-by: LDj3SNuD <35856442+LDj3SNuD@users.noreply.github.com>
2021-02-22 20:56:13 +05:30 · 2021-02-22 20:56:13 +05:30 · 9bda7b4699
commit 9bda7b4699
parent dc0adb533d
9 changed files with 81 additions and 11 deletions
--- a/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs
@ -135,6 +135,34 @@ namespace ARMeilleure.Instructions
            EmitVectorBinaryWideOpI32(context, (op1, op2) => context.Add(op1, op2), !op.U);
        }

+        // Emits the translation of the A32 VCNT instruction: for each of the
+        // op.GetBytesCount() elements, the number of set bits in the source
+        // element (from Qm) is written to the matching element of Qd.
+        public static void Vcnt(ArmEmitterContext context)
+        {
+            OpCode32SimdCmpZ op = (OpCode32SimdCmpZ)context.CurrOp;
+
+            // Start from the current Qd value so that elements not rewritten
+            // by the loop below are carried over unchanged.
+            Operand result = GetVecA32(op.Qd);
+
+            int byteCount = op.GetBytesCount();
+
+            for (int i = 0; i < byteCount; i++)
+            {
+                // NOTE(review): the Zx32 helper presumably zero-extends the element,
+                // so no stray upper bits reach the popcount - confirm.
+                Operand source = EmitVectorExtractZx32(context, op.Qm, op.Im + i, op.Size);
+
+                // Use the x86 POPCNT intrinsic when enabled, otherwise the IR bit-count helper.
+                Operand bitCount = Optimizations.UsePopCnt
+                    ? context.AddIntrinsicInt(Intrinsic.X86Popcnt, source)
+                    : EmitCountSetBits8(context, source);
+
+                result = EmitVectorInsert(context, result, bitCount, op.Id + i, op.Size);
+            }
+
+            context.Copy(GetVecA32(op.Qd), result);
+        }
+
        public static void Vdup(ArmEmitterContext context)
        {
            OpCode32SimdDupGP op = (OpCode32SimdDupGP)context.CurrOp;