From f695a215ad84607a2df8f31f2138918926eb3f0c Mon Sep 17 00:00:00 2001 From: riperiperi Date: Wed, 25 Mar 2020 06:20:29 +0000 Subject: [PATCH] Add Fast Paths for Crypto instructions (A32/A64) (#1026) * Add Fast Paths for Crypto instructions (A32/A64) * Replace additional XOR with passing in const zero. --- ARMeilleure/CodeGen/X86/Assembler.cs | 5 ++ ARMeilleure/CodeGen/X86/IntrinsicTable.cs | 5 ++ ARMeilleure/CodeGen/X86/X86Instruction.cs | 5 ++ .../Instructions/InstEmitSimdCrypto.cs | 54 +++++++++++++++++-- .../Instructions/InstEmitSimdCrypto32.cs | 54 +++++++++++++++++-- .../IntermediateRepresentation/Intrinsic.cs | 5 ++ ARMeilleure/Optimizations.cs | 2 + 7 files changed, 122 insertions(+), 8 deletions(-) diff --git a/ARMeilleure/CodeGen/X86/Assembler.cs b/ARMeilleure/CodeGen/X86/Assembler.cs index 5088e6f0..de361677 100644 --- a/ARMeilleure/CodeGen/X86/Assembler.cs +++ b/ARMeilleure/CodeGen/X86/Assembler.cs @@ -74,6 +74,11 @@ namespace ARMeilleure.CodeGen.X86 Add(X86Instruction.Addps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f58, InstructionFlags.Vex)); Add(X86Instruction.Addsd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f58, InstructionFlags.Vex | InstructionFlags.PrefixF2)); Add(X86Instruction.Addss, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f58, InstructionFlags.Vex | InstructionFlags.PrefixF3)); + Add(X86Instruction.Aesdec, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38de, InstructionFlags.Vex | InstructionFlags.Prefix66)); + Add(X86Instruction.Aesdeclast, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38df, InstructionFlags.Vex | InstructionFlags.Prefix66)); + Add(X86Instruction.Aesenc, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38dc, InstructionFlags.Vex | InstructionFlags.Prefix66)); + Add(X86Instruction.Aesenclast, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38dd, InstructionFlags.Vex | InstructionFlags.Prefix66)); + Add(X86Instruction.Aesimc, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38db, InstructionFlags.Vex | InstructionFlags.Prefix66)); Add(X86Instruction.And, new InstructionInfo(0x00000021, 0x04000083, 0x04000081, BadOp, 0x00000023, InstructionFlags.None)); Add(X86Instruction.Andnpd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f55, InstructionFlags.Vex | InstructionFlags.Prefix66)); Add(X86Instruction.Andnps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f55, InstructionFlags.Vex)); diff --git a/ARMeilleure/CodeGen/X86/IntrinsicTable.cs b/ARMeilleure/CodeGen/X86/IntrinsicTable.cs index c003eff3..5382e3ea 100644 --- a/ARMeilleure/CodeGen/X86/IntrinsicTable.cs +++ b/ARMeilleure/CodeGen/X86/IntrinsicTable.cs @@ -17,6 +17,11 @@ namespace ARMeilleure.CodeGen.X86 Add(Intrinsic.X86Addps, new IntrinsicInfo(X86Instruction.Addps, IntrinsicType.Binary)); Add(Intrinsic.X86Addsd, new IntrinsicInfo(X86Instruction.Addsd, IntrinsicType.Binary)); Add(Intrinsic.X86Addss, new IntrinsicInfo(X86Instruction.Addss, IntrinsicType.Binary)); + Add(Intrinsic.X86Aesdec, new IntrinsicInfo(X86Instruction.Aesdec, IntrinsicType.Binary)); + Add(Intrinsic.X86Aesdeclast, new IntrinsicInfo(X86Instruction.Aesdeclast, IntrinsicType.Binary)); + Add(Intrinsic.X86Aesenc, new IntrinsicInfo(X86Instruction.Aesenc, IntrinsicType.Binary)); + Add(Intrinsic.X86Aesenclast, new IntrinsicInfo(X86Instruction.Aesenclast, IntrinsicType.Binary)); + Add(Intrinsic.X86Aesimc, new IntrinsicInfo(X86Instruction.Aesimc, IntrinsicType.Unary)); Add(Intrinsic.X86Andnpd, new IntrinsicInfo(X86Instruction.Andnpd, IntrinsicType.Binary)); Add(Intrinsic.X86Andnps, new IntrinsicInfo(X86Instruction.Andnps, IntrinsicType.Binary)); Add(Intrinsic.X86Andpd, new IntrinsicInfo(X86Instruction.Andpd, IntrinsicType.Binary)); diff --git a/ARMeilleure/CodeGen/X86/X86Instruction.cs b/ARMeilleure/CodeGen/X86/X86Instruction.cs index a6dbf1a5..e4682595 100644 --- a/ARMeilleure/CodeGen/X86/X86Instruction.cs +++ b/ARMeilleure/CodeGen/X86/X86Instruction.cs @@ -7,6 +7,11 @@ namespace ARMeilleure.CodeGen.X86 Addps, Addsd, Addss, + Aesdec, + Aesdeclast, + Aesenc, + Aesenclast, + Aesimc, And, Andnpd, Andnps, diff --git a/ARMeilleure/Instructions/InstEmitSimdCrypto.cs b/ARMeilleure/Instructions/InstEmitSimdCrypto.cs index 2b61fada..5b470567 100644 --- a/ARMeilleure/Instructions/InstEmitSimdCrypto.cs +++ b/ARMeilleure/Instructions/InstEmitSimdCrypto.cs @@ -15,7 +15,17 @@ namespace ARMeilleure.Instructions Operand d = GetVec(op.Rd); Operand n = GetVec(op.Rn); - context.Copy(d, context.Call(new _V128_V128_V128(SoftFallback.Decrypt), d, n)); + Operand res; + if (Optimizations.UseAesni) + { + res = context.AddIntrinsic(Intrinsic.X86Aesdeclast, context.AddIntrinsic(Intrinsic.X86Xorpd, d, n), context.VectorZero()); + } + else + { + res = context.Call(new _V128_V128_V128(SoftFallback.Decrypt), d, n); + } + + context.Copy(d, res); } public static void Aese_V(ArmEmitterContext context) @@ -25,7 +35,17 @@ namespace ARMeilleure.Instructions Operand d = GetVec(op.Rd); Operand n = GetVec(op.Rn); - context.Copy(d, context.Call(new _V128_V128_V128(SoftFallback.Encrypt), d, n)); + Operand res; + if (Optimizations.UseAesni) + { + res = context.AddIntrinsic(Intrinsic.X86Aesenclast, context.AddIntrinsic(Intrinsic.X86Xorpd, d, n), context.VectorZero()); + } + else + { + res = context.Call(new _V128_V128_V128(SoftFallback.Encrypt), d, n); + } + + context.Copy(d, res); } public static void Aesimc_V(ArmEmitterContext context) @@ -34,7 +54,17 @@ namespace ARMeilleure.Instructions Operand n = GetVec(op.Rn); - context.Copy(GetVec(op.Rd), context.Call(new _V128_V128(SoftFallback.InverseMixColumns), n)); + Operand res; + if (Optimizations.UseAesni) + { + res = context.AddIntrinsic(Intrinsic.X86Aesimc, n); + } + else + { + res = context.Call(new _V128_V128(SoftFallback.InverseMixColumns), n); + } + + context.Copy(GetVec(op.Rd), res); } public static void Aesmc_V(ArmEmitterContext context) @@ -43,7 +73,23 @@ namespace ARMeilleure.Instructions Operand n = GetVec(op.Rn); - context.Copy(GetVec(op.Rd), context.Call(new _V128_V128(SoftFallback.MixColumns), n)); + Operand res; + if (Optimizations.UseAesni) + { + Operand roundKey = context.VectorZero(); + + // Inverse Shift Rows, Inverse Sub Bytes, xor 0 so nothing happens + res = context.AddIntrinsic(Intrinsic.X86Aesdeclast, n, roundKey); + + // Shift Rows, Sub Bytes, Mix Columns (!), xor 0 so nothing happens + res = context.AddIntrinsic(Intrinsic.X86Aesenc, res, roundKey); + } + else + { + res = context.Call(new _V128_V128(SoftFallback.MixColumns), n); + } + + context.Copy(GetVec(op.Rd), res); } } } diff --git a/ARMeilleure/Instructions/InstEmitSimdCrypto32.cs b/ARMeilleure/Instructions/InstEmitSimdCrypto32.cs index 1cfce3ad..f62fd307 100644 --- a/ARMeilleure/Instructions/InstEmitSimdCrypto32.cs +++ b/ARMeilleure/Instructions/InstEmitSimdCrypto32.cs @@ -15,7 +15,17 @@ namespace ARMeilleure.Instructions Operand d = GetVecA32(op.Qd); Operand n = GetVecA32(op.Qm); - context.Copy(d, context.Call(new _V128_V128_V128(SoftFallback.Decrypt), d, n)); + Operand res; + if (Optimizations.UseAesni) + { + res = context.AddIntrinsic(Intrinsic.X86Aesdeclast, context.AddIntrinsic(Intrinsic.X86Xorpd, d, n), context.VectorZero()); + } + else + { + res = context.Call(new _V128_V128_V128(SoftFallback.Decrypt), d, n); + } + + context.Copy(d, res); } public static void Aese_V(ArmEmitterContext context) @@ -25,7 +35,17 @@ namespace ARMeilleure.Instructions Operand d = GetVecA32(op.Qd); Operand n = GetVecA32(op.Qm); - context.Copy(d, context.Call(new _V128_V128_V128(SoftFallback.Encrypt), d, n)); + Operand res; + if (Optimizations.UseAesni) + { + res = context.AddIntrinsic(Intrinsic.X86Aesenclast, context.AddIntrinsic(Intrinsic.X86Xorpd, d, n), context.VectorZero()); + } + else + { + res = context.Call(new _V128_V128_V128(SoftFallback.Encrypt), d, n); + } + + context.Copy(d, res); } public static void Aesimc_V(ArmEmitterContext context) @@ -34,7 +54,17 @@ namespace ARMeilleure.Instructions Operand n = GetVecA32(op.Qm); - context.Copy(GetVec(op.Qd), context.Call(new _V128_V128(SoftFallback.InverseMixColumns), n)); + Operand res; + if (Optimizations.UseAesni) + { + res = context.AddIntrinsic(Intrinsic.X86Aesimc, n); + } + else + { + res = context.Call(new _V128_V128(SoftFallback.InverseMixColumns), n); + } + + context.Copy(GetVecA32(op.Qd), res); } public static void Aesmc_V(ArmEmitterContext context) @@ -43,7 +73,23 @@ namespace ARMeilleure.Instructions Operand n = GetVecA32(op.Qm); - context.Copy(GetVec(op.Qd), context.Call(new _V128_V128(SoftFallback.MixColumns), n)); + Operand res; + if (Optimizations.UseAesni) + { + Operand roundKey = context.VectorZero(); + + // Inverse Shift Rows, Inverse Sub Bytes, xor 0 so nothing happens. + res = context.AddIntrinsic(Intrinsic.X86Aesdeclast, n, roundKey); + + // Shift Rows, Sub Bytes, Mix Columns (!), xor 0 so nothing happens. + res = context.AddIntrinsic(Intrinsic.X86Aesenc, res, roundKey); + } + else + { + res = context.Call(new _V128_V128(SoftFallback.MixColumns), n); + } + + context.Copy(GetVecA32(op.Qd), res); } } } diff --git a/ARMeilleure/IntermediateRepresentation/Intrinsic.cs b/ARMeilleure/IntermediateRepresentation/Intrinsic.cs index c60e80cf..28ec9f32 100644 --- a/ARMeilleure/IntermediateRepresentation/Intrinsic.cs +++ b/ARMeilleure/IntermediateRepresentation/Intrinsic.cs @@ -6,6 +6,11 @@ namespace ARMeilleure.IntermediateRepresentation X86Addps, X86Addsd, X86Addss, + X86Aesdec, + X86Aesdeclast, + X86Aesenc, + X86Aesenclast, + X86Aesimc, X86Andnpd, X86Andnps, X86Andpd, diff --git a/ARMeilleure/Optimizations.cs b/ARMeilleure/Optimizations.cs index 28af0936..b486c5d2 100644 --- a/ARMeilleure/Optimizations.cs +++ b/ARMeilleure/Optimizations.cs @@ -16,6 +16,7 @@ namespace ARMeilleure public static bool UseSse42IfAvailable { get; set; } = true; public static bool UsePopCntIfAvailable { get; set; } = true; public static bool UseAvxIfAvailable { get; set; } = true; + public static bool UseAesniIfAvailable { get; set; } = true; public static bool ForceLegacySse { @@ -31,5 +32,6 @@ namespace ARMeilleure internal static bool UseSse42 => UseSse42IfAvailable && HardwareCapabilities.SupportsSse42; internal static bool UsePopCnt => UsePopCntIfAvailable && HardwareCapabilities.SupportsPopcnt; internal static bool UseAvx => UseAvxIfAvailable && HardwareCapabilities.SupportsAvx && !ForceLegacySse; + internal static bool UseAesni => UseAesniIfAvailable && HardwareCapabilities.SupportsAesni; } } \ No newline at end of file