CPU (A64): Add Pmull_V Inst. with Clmul fast path for the "1/2D -> 1Q" variant & Sse fast path and slow path for both the "8/16B -> 8H" and "1/2D -> 1Q" variants; with Test. (#1817)
* Add Pmull_V Sse fast path only, both "8/16B -> 8H" and "1/2D -> 1Q" variants; with Test. * Add Clmul fast path for the 128 bits variant. * Small optimisation (save 60 instructions) for the Sse fast path about the 128 bits variant. * Add slow path, both variants. Fix V128 Shl/Shr when shift = 0. * A32: Add Vmull_I P64 variant (slow path); not tested. * A32: Add Vmull_I_P8_P64 Test and fix P64 variant.
This commit is contained in:
parent
a03ab0c4a0
commit
430ba6da65
11 changed files with 264 additions and 25 deletions
|
@ -413,6 +413,8 @@ namespace ARMeilleure.Decoders
|
|||
SetA64("0x001110101xxxxx000111xxxxxxxxxx", InstName.Orr_V, InstEmit.Orr_V, OpCodeSimdReg.Create);
|
||||
SetA64("0x00111100000xxx0xx101xxxxxxxxxx", InstName.Orr_Vi, InstEmit.Orr_Vi, OpCodeSimdImm.Create);
|
||||
SetA64("0x00111100000xxx10x101xxxxxxxxxx", InstName.Orr_Vi, InstEmit.Orr_Vi, OpCodeSimdImm.Create);
|
||||
SetA64("0x001110001xxxxx111000xxxxxxxxxx", InstName.Pmull_V, InstEmit.Pmull_V, OpCodeSimdReg.Create);
|
||||
SetA64("0x001110111xxxxx111000xxxxxxxxxx", InstName.Pmull_V, InstEmit.Pmull_V, OpCodeSimdReg.Create);
|
||||
SetA64("0x101110<<1xxxxx010000xxxxxxxxxx", InstName.Raddhn_V, InstEmit.Raddhn_V, OpCodeSimdReg.Create);
|
||||
SetA64("0x10111001100000010110xxxxxxxxxx", InstName.Rbit_V, InstEmit.Rbit_V, OpCodeSimd.Create);
|
||||
SetA64("0x00111000100000000110xxxxxxxxxx", InstName.Rev16_V, InstEmit.Rev16_V, OpCodeSimd.Create);
|
||||
|
@ -886,7 +888,7 @@ namespace ARMeilleure.Decoders
|
|||
SetA32("111100110x00xxxxxxxx1101xxx1xxxx", InstName.Vmul, InstEmit32.Vmul_V, OpCode32SimdReg.Create);
|
||||
SetA32("1111001x1x<<xxxxxxx01010x1x0xxxx", InstName.Vmull, InstEmit32.Vmull_1, OpCode32SimdRegElemLong.Create);
|
||||
SetA32("1111001x1x<<xxxxxxx01100x0x0xxxx", InstName.Vmull, InstEmit32.Vmull_I, OpCode32SimdRegLong.Create);
|
||||
SetA32("111100101x00xxxxxxx01110x0x0xxxx", InstName.Vmull, InstEmit32.Vmull_I, OpCode32SimdRegLong.Create); // Polynomial
|
||||
SetA32("111100101xx0xxxxxxx01110x0x0xxxx", InstName.Vmull, InstEmit32.Vmull_I, OpCode32SimdRegLong.Create); // P8/P64
|
||||
SetA32("111100111x110000xxxx01011xx0xxxx", InstName.Vmvn, InstEmit32.Vmvn_I, OpCode32SimdBinary.Create);
|
||||
SetA32("1111001x1x000xxxxxxx0xx00x11xxxx", InstName.Vmvn, InstEmit32.Vmvn_II, OpCode32SimdImm.Create); // D/Q vector I32.
|
||||
SetA32("1111001x1x000xxxxxxx10x00x11xxxx", InstName.Vmvn, InstEmit32.Vmvn_II, OpCode32SimdImm.Create);
|
||||
|
|
|
@ -10,6 +10,7 @@ using System.Diagnostics;
|
|||
|
||||
using static ARMeilleure.Instructions.InstEmitHelper;
|
||||
using static ARMeilleure.Instructions.InstEmitSimdHelper;
|
||||
using static ARMeilleure.Instructions.InstEmitSimdHelper32;
|
||||
using static ARMeilleure.IntermediateRepresentation.OperandHelper;
|
||||
|
||||
namespace ARMeilleure.Instructions
|
||||
|
@ -1928,6 +1929,112 @@ namespace ARMeilleure.Instructions
|
|||
}
|
||||
}
|
||||
|
||||
// Emits the Pmull_V instruction (polynomial multiply of vector elements into a wider result).
// Two element layouts are handled: op.Size == 0 ("8/16B -> 8H") and op.Size == 3 ("1/2D -> 1Q").
// A Simd64 register size reads the lower 64 bits of each source, any other size reads the upper 64 bits.
// Three strategies are available, picked in this order:
//   1. Pclmulqdq, only for the op.Size == 3 layout;
//   2. an Sse4.1 shift/mask/xor sequence, for both layouts;
//   3. a software path (scalar IR for 8-bit elements, SoftFallback call for 64-bit elements).
public static void Pmull_V(ArmEmitterContext context)
{
    OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;

    Operand product;

    if (Optimizations.UsePclmulqdq && op.Size == 3)
    {
        product = EmitPmullClmul(context, op);
    }
    else if (Optimizations.UseSse41)
    {
        product = EmitPmullSse41(context, op);
    }
    else
    {
        product = EmitPmullSoftware(context, op);
    }

    context.Copy(GetVec(op.Rd), product);
}

// "1/2D -> 1Q" layout through a single Pclmulqdq intrinsic.
private static Operand EmitPmullClmul(ArmEmitterContext context, OpCodeSimdReg op)
{
    Operand lhs = GetVec(op.Rn);
    Operand rhs = GetVec(op.Rm);

    // The immediate picks which quadword of each source is multiplied:
    // both low quadwords for Simd64, both high quadwords otherwise.
    int quadwordSelector;

    if (op.RegisterSize == RegisterSize.Simd64)
    {
        quadwordSelector = 0b0000_0000;
    }
    else
    {
        quadwordSelector = 0b0001_0001;
    }

    return context.AddIntrinsic(Intrinsic.X86Pclmulqdq, lhs, rhs, Const(quadwordSelector));
}

// Both layouts through a shift-and-xor sequence: for every bit of the first source,
// the second source is shifted left by the bit position, kept only where the bit is set,
// and xor-ed into the accumulator.
private static Operand EmitPmullSse41(ArmEmitterContext context, OpCodeSimdReg op)
{
    Operand lhs = GetVec(op.Rn);
    Operand rhs = GetVec(op.Rm);

    // Bring the selected 64 bits of each source into the low half, with the upper half cleared.
    if (op.RegisterSize != RegisterSize.Simd64)
    {
        // Simd128: byte-shift right by 8 so the upper half lands in the lower half.
        lhs = context.AddIntrinsic(Intrinsic.X86Psrldq, lhs, Const(8));
        rhs = context.AddIntrinsic(Intrinsic.X86Psrldq, rhs, Const(8));
    }
    else
    {
        lhs = context.VectorZeroUpper64(lhs);
        rhs = context.VectorZeroUpper64(rhs);
    }

    Operand accumulator = context.VectorZero();

    if (op.Size != 0)
    {
        // op.Size == 3: one 64-bit element per source, 128-bit result.
        Operand allZero = context.VectorZero();

        for (int bit = 0; bit < 64; bit++)
        {
            // Copy the low quadword of lhs into both halves, isolate the current bit in each
            // half, then negate it (0 - bit) to obtain an all-zeros or all-ones 128-bit mask.
            Operand selector = context.AddIntrinsic(Intrinsic.X86Movlhps, lhs, lhs);
            selector = context.AddIntrinsic(Intrinsic.X86Psllq, selector, Const(63 - bit));
            selector = context.AddIntrinsic(Intrinsic.X86Psrlq, selector, Const(63));
            selector = context.AddIntrinsic(Intrinsic.X86Psubq, allZero, selector);

            Operand term = context.AddIntrinsic(Intrinsic.X86Pand, EmitSse2Sll_128(context, rhs, bit), selector);

            accumulator = context.AddIntrinsic(Intrinsic.X86Pxor, accumulator, term);
        }

        return accumulator;
    }

    // op.Size == 0: eight 8-bit elements, zero-extended to 16 bits so every product fits its lane.
    lhs = context.AddIntrinsic(Intrinsic.X86Pmovzxbw, lhs);
    rhs = context.AddIntrinsic(Intrinsic.X86Pmovzxbw, rhs);

    for (int bit = 0; bit < 8; bit++)
    {
        // Move the current bit to the top of each 16-bit lane, then arithmetic-shift it
        // back down so the whole lane is filled with that bit.
        Operand selector = context.AddIntrinsic(
            Intrinsic.X86Psraw,
            context.AddIntrinsic(Intrinsic.X86Psllw, lhs, Const(15 - bit)),
            Const(15));

        Operand term = context.AddIntrinsic(
            Intrinsic.X86Pand,
            context.AddIntrinsic(Intrinsic.X86Psllw, rhs, Const(bit)),
            selector);

        accumulator = context.AddIntrinsic(Intrinsic.X86Pxor, accumulator, term);
    }

    return accumulator;
}

// Path used when neither of the x86 fast paths is selected.
private static Operand EmitPmullSoftware(ArmEmitterContext context, OpCodeSimdReg op)
{
    Operand lhs = GetVec(op.Rn);
    Operand rhs = GetVec(op.Rm);

    bool lowerHalf = op.RegisterSize == RegisterSize.Simd64;

    if (op.Size != 0)
    {
        // op.Size == 3: multiply one 64-bit element of each source in managed code.
        int quadword = lowerHalf ? 0 : 1;

        Operand lhsElem = context.VectorExtract(OperandType.I64, lhs, quadword);
        Operand rhsElem = context.VectorExtract(OperandType.I64, rhs, quadword);

        return context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.PolynomialMult64_128)), lhsElem, rhsElem);
    }

    // op.Size == 0: multiply eight byte pairs, inserting each product into the result at the same index.
    Operand result = context.VectorZero();

    int firstByte = lowerHalf ? 0 : 8;

    for (int lane = 0; lane < 8; lane++)
    {
        Operand lhsElem = context.VectorExtract8(lhs, firstByte + lane);
        Operand rhsElem = context.VectorExtract8(rhs, firstByte + lane);

        Operand laneProduct = EmitPolynomialMultiply(context, lhsElem, rhsElem, 8);

        result = EmitVectorInsert(context, result, laneProduct, lane, 1);
    }

    return result;
}
|
||||
|
||||
public static void Raddhn_V(ArmEmitterContext context)
|
||||
{
|
||||
EmitHighNarrow(context, (op1, op2) => context.Add(op1, op2), round: true);
|
||||
|
@ -3690,5 +3797,23 @@ namespace ARMeilleure.Instructions
|
|||
|
||||
context.Copy(GetVec(op.Rd), res);
|
||||
}
|
||||
|
||||
// Emits a logical left shift of a whole 128-bit vector by "shift" bits (0 to 63), built from
// 64-bit lane shifts. Returns "op" itself, emitting nothing, when "shift" is zero.
private static Operand EmitSse2Sll_128(ArmEmitterContext context, Operand op, int shift)
{
    // The upper part of op is assumed to be zero.
    Debug.Assert(0 <= shift && shift < 64);

    if (shift != 0)
    {
        // Move the low quadword into the upper half (byte shift by 8), then shift it right by
        // (64 - shift) so that only the bits crossing the 64-bit boundary remain up there.
        Operand carried = context.AddIntrinsic(
            Intrinsic.X86Psrlq,
            context.AddIntrinsic(Intrinsic.X86Pslldq, op, Const(8)),
            Const(64 - shift));

        // Per-lane shift; relies on the upper half of op being zero, as asserted by the contract above.
        Operand shifted = context.AddIntrinsic(Intrinsic.X86Psllq, op, Const(shift));

        return context.AddIntrinsic(Intrinsic.X86Por, carried, shifted);
    }

    return op;
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -920,7 +920,19 @@ namespace ARMeilleure.Instructions
|
|||
|
||||
if (op.Polynomial)
|
||||
{
|
||||
EmitVectorBinaryLongOpI32(context, (op1, op2) => EmitPolynomialMultiply(context, op1, op2, 8 << op.Size), false);
|
||||
if (op.Size == 0) // P8
|
||||
{
|
||||
EmitVectorBinaryLongOpI32(context, (op1, op2) => EmitPolynomialMultiply(context, op1, op2, 8 << op.Size), false);
|
||||
}
|
||||
else /* if (op.Size == 2) // P64 */
|
||||
{
|
||||
Operand ne = context.VectorExtract(OperandType.I64, GetVec(op.Qn), op.Vn & 1);
|
||||
Operand me = context.VectorExtract(OperandType.I64, GetVec(op.Qm), op.Vm & 1);
|
||||
|
||||
Operand res = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.PolynomialMult64_128)), ne, me);
|
||||
|
||||
context.Copy(GetVecA32(op.Qd), res);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -1366,27 +1378,5 @@ namespace ARMeilleure.Instructions
|
|||
EmitVectorBinaryOpSimd32(context, genericEmit);
|
||||
}
|
||||
}
|
||||
|
||||
// Emits IR computing the polynomial (xor-accumulated) product of op1 and op2.
// Only the lowest eSize bits of op1 are examined; eSize must not exceed 32.
// When eSize is 32 both operands are first zero-extended to 64 bits and the accumulator is 64-bit.
private static Operand EmitPolynomialMultiply(ArmEmitterContext context, Operand op1, Operand op2, int eSize)
{
    Debug.Assert(eSize <= 32);

    Operand accumulator;

    if (eSize == 32)
    {
        accumulator = Const(0L);

        op1 = context.ZeroExtend32(OperandType.I64, op1);
        op2 = context.ZeroExtend32(OperandType.I64, op2);
    }
    else
    {
        accumulator = Const(0);
    }

    int bit = 0;

    while (bit < eSize)
    {
        // Either zero or (1 << bit); multiplying op2 by it yields zero or op2 shifted left by "bit".
        Operand isolatedBit = context.BitwiseAnd(op1, Const(op1.Type, 1L << bit));
        Operand partialProduct = context.Multiply(op2, isolatedBit);

        accumulator = context.BitwiseExclusiveOr(accumulator, partialProduct);

        bit++;
    }

    return accumulator;
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1167,5 +1167,27 @@ namespace ARMeilleure.Instructions
|
|||
|
||||
return res;
|
||||
}
|
||||
|
||||
// Emits IR computing the polynomial (xor-accumulated) product of op1 and op2.
// Only the lowest eSize bits of op1 are examined; eSize must not exceed 32.
// When eSize is 32 both operands are first zero-extended to 64 bits and the accumulator is 64-bit.
public static Operand EmitPolynomialMultiply(ArmEmitterContext context, Operand op1, Operand op2, int eSize)
{
    Debug.Assert(eSize <= 32);

    bool widen = eSize == 32;

    Operand product = widen ? Const(0L) : Const(0);

    if (widen)
    {
        op1 = context.ZeroExtend32(OperandType.I64, op1);
        op2 = context.ZeroExtend32(OperandType.I64, op2);
    }

    for (int bit = 0; bit < eSize; bit++)
    {
        // Either zero or (1 << bit); multiplying op2 by it yields zero or op2 shifted left by "bit".
        Operand isolatedBit = context.BitwiseAnd(op1, Const(op1.Type, 1L << bit));
        Operand partialProduct = context.Multiply(op2, isolatedBit);

        product = context.BitwiseExclusiveOr(product, partialProduct);
    }

    return product;
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -296,6 +296,7 @@ namespace ARMeilleure.Instructions
|
|||
Orn_V,
|
||||
Orr_V,
|
||||
Orr_Vi,
|
||||
Pmull_V,
|
||||
Raddhn_V,
|
||||
Rbit_V,
|
||||
Rev16_V,
|
||||
|
|
|
@ -1260,5 +1260,22 @@ namespace ARMeilleure.Instructions
|
|||
: (uint)(value >> 32);
|
||||
}
|
||||
#endregion
|
||||
|
||||
// Polynomial (xor-accumulated, carry-less) multiplication of two 64-bit values into a 128-bit result:
// for every set bit of op1, op2 shifted left by that bit position is xor-ed into the result.
public static V128 PolynomialMult64_128(ulong op1, ulong op2)
{
    // op2 widened to 128 bits: first element is op2, second element is zero.
    V128 multiplicand = new V128(op2, 0);

    V128 product = V128.Zero;

    ulong remainingBits = op1;

    for (int shift = 0; shift < 64; shift++, remainingBits >>= 1)
    {
        if ((remainingBits & 1UL) != 0)
        {
            product ^= multiplicand << shift;
        }
    }

    return product;
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -189,6 +189,11 @@ namespace ARMeilleure.State
|
|||
/// </remarks>
|
||||
public static V128 operator <<(V128 x, int shift)
|
||||
{
|
||||
if (shift == 0)
|
||||
{
|
||||
return new V128(x._e0, x._e1);
|
||||
}
|
||||
|
||||
ulong shiftOut = x._e0 >> (64 - shift);
|
||||
|
||||
return new V128(x._e0 << shift, (x._e1 << shift) | shiftOut);
|
||||
|
@ -205,6 +210,11 @@ namespace ARMeilleure.State
|
|||
/// </remarks>
|
||||
public static V128 operator >>(V128 x, int shift)
|
||||
{
|
||||
if (shift == 0)
|
||||
{
|
||||
return new V128(x._e0, x._e1);
|
||||
}
|
||||
|
||||
ulong shiftOut = x._e1 & ((1UL << shift) - 1);
|
||||
|
||||
return new V128((x._e0 >> shift) | (shiftOut << (64 - shift)), x._e1 >> shift);
|
||||
|
|
|
@ -171,6 +171,7 @@ namespace ARMeilleure.Translation
|
|||
SetDelegateInfo(typeof(SoftFallback).GetMethod(nameof(SoftFallback.HashUpper)));
|
||||
SetDelegateInfo(typeof(SoftFallback).GetMethod(nameof(SoftFallback.InverseMixColumns)));
|
||||
SetDelegateInfo(typeof(SoftFallback).GetMethod(nameof(SoftFallback.MixColumns)));
|
||||
SetDelegateInfo(typeof(SoftFallback).GetMethod(nameof(SoftFallback.PolynomialMult64_128)));
|
||||
SetDelegateInfo(typeof(SoftFallback).GetMethod(nameof(SoftFallback.Round)));
|
||||
SetDelegateInfo(typeof(SoftFallback).GetMethod(nameof(SoftFallback.RoundF)));
|
||||
SetDelegateInfo(typeof(SoftFallback).GetMethod(nameof(SoftFallback.SatF32ToS32)));
|
||||
|
|
|
@ -22,7 +22,7 @@ namespace ARMeilleure.Translation.PTC
|
|||
{
|
||||
private const string HeaderMagic = "PTChd";
|
||||
|
||||
private const int InternalVersion = 1814; //! To be incremented manually for each change to the ARMeilleure project.
|
||||
private const int InternalVersion = 1817; //! To be incremented manually for each change to the ARMeilleure project.
|
||||
|
||||
private const string ActualDir = "0";
|
||||
private const string BackupDir = "1";
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue