Implement VCNT instruction (#1963)

* Implement VCNT based on AArch64 CNT

Add tests

* Update PTC version

* Address LDj's comments

* Explicit size in encoding
* Tighter tests
* Replace SoftFallback with IR helper

Co-authored-by: LDj3SNuD <35856442+LDj3SNuD@users.noreply.github.com>

* Reduce one BitwiseAnd from IR fallback

Based on popcount64b from https://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation

* Rename parameter and add assert

Co-authored-by: LDj3SNuD <35856442+LDj3SNuD@users.noreply.github.com>

Co-authored-by: LDj3SNuD <35856442+LDj3SNuD@users.noreply.github.com>
This commit is contained in:
mageven 2021-02-22 20:56:13 +05:30 committed by GitHub
parent dc0adb533d
commit 9bda7b4699
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 81 additions and 11 deletions

View file

@ -135,6 +135,34 @@ namespace ARMeilleure.Instructions
EmitVectorBinaryWideOpI32(context, (op1, op2) => context.Add(op1, op2), !op.U);
}
// Emits IR for the A32 VCNT instruction: every element of the source vector
// register is replaced by a count of its set bits and written to the
// destination vector register.
public static void Vcnt(ArmEmitterContext context)
{
    OpCode32SimdCmpZ op = (OpCode32SimdCmpZ)context.CurrOp;

    // Start from the current destination value so that lanes this
    // instruction does not write are carried over unchanged.
    Operand result = GetVecA32(op.Qd);

    int elementCount = op.GetBytesCount();

    for (int i = 0; i < elementCount; i++)
    {
        // Zero-extended source element; op.Im is the base element index of
        // the source within the Q register.
        Operand source = EmitVectorExtractZx32(context, op.Qm, op.Im + i, op.Size);

        // Use the host popcount intrinsic when enabled, otherwise fall back
        // to the IR helper that counts set bits.
        Operand bitCount = Optimizations.UsePopCnt
            ? context.AddIntrinsicInt(Intrinsic.X86Popcnt, source)
            : EmitCountSetBits8(context, source);

        // op.Id is the base element index of the destination within the Q register.
        result = EmitVectorInsert(context, result, bitCount, op.Id + i, op.Size);
    }

    context.Copy(GetVecA32(op.Qd), result);
}
public static void Vdup(ArmEmitterContext context)
{
OpCode32SimdDupGP op = (OpCode32SimdDupGP)context.CurrOp;