From a11784fcbf7a19b9d36e755cc92a27fe994008c7 Mon Sep 17 00:00:00 2001
From: merry
Date: Thu, 12 Jan 2023 07:05:18 +0000
Subject: [PATCH] Arm64: Cpu feature detection (#4264)

* Arm64: Cpu feature detection

* Ptc: Add Arm64 feature info

* nits

* simplify CheckSysctlName

* restore some macos flags

* feedback
---
 .../CodeGen/Arm64/HardwareCapabilities.cs | 185 ++++++++++++++++++
 .../Instructions/InstEmitSimdArithmetic.cs | 2 +-
 ARMeilleure/Optimizations.cs | 44 +++--
 ARMeilleure/Translation/PTC/Ptc.cs | 37 +++-
 4 files changed, 237 insertions(+), 31 deletions(-)
 create mode 100644 ARMeilleure/CodeGen/Arm64/HardwareCapabilities.cs

// Reviewer notes. Free text between the diffstat and the first "diff --git"
// header is not part of any hunk, so these lines do not alter what the patch applies.
//
// CodeGen/Arm64/HardwareCapabilities.cs (new file)
//   * The static constructor returns immediately unless ArmBase.Arm64.IsSupported.
//   * On Linux it stores getauxval(AT_HWCAP = 16) and getauxval(AT_HWCAP2 = 26) in
//     LinuxFeatureInfoHwCap / LinuxFeatureInfoHwCap2.
//   * On macOS it calls CheckSysctlName for each entry of _sysctlNames and, for entry i,
//     sets bit (1 << i) of MacOsFeatureInfo. The order of _sysctlNames therefore has to
//     stay in step with the bit positions declared in MacOsFeatureFlags.
//   * CheckSysctlName returns true only when sysctlbyname returns 0, the reported size
//     equals sizeof(int), and the value read is non-zero; otherwise it returns false.
//   * Each Supports* property is the OR of one Linux HWCAP flag and one macOS flag. When
//     neither OS branch runs, both flag sets keep their initial value 0, so every
//     Supports* property is false.
//
// Instructions/InstEmitSimdArithmetic.cs
//   * The condition "Optimizations.UseAdvSimd && false" becomes "Optimizations.UseArm64Pmull".
//
// Optimizations.cs
//   * UseAdvSimd now tests Arm64HardwareCapabilities.SupportsAdvSimd instead of
//     AdvSimd.IsSupported; UseArm64PmullIfAvailable / UseArm64Pmull are added.
//   * The x86 checks are unchanged apart from going through the X86HardwareCapabilities alias.
//
// Translation/PTC/Ptc.cs
//   * FeatureInfo fields widen from uint to ulong (size comment 16 -> 32), the OuterHeader
//     size comment goes 58 -> 74, and InternalVersion goes 4114 -> 4264.
//   * GetFeatureInfo selects on RuntimeInformation.ProcessArchitecture:
//     Arm64 -> (HwCap, HwCap2, MacOsFeatureInfo, 0); X64 -> the four x86 feature words;
//     anything else -> all zeros.
//
// NOTE(review): in the Optimizations.cs hunk the removed and added lines for
// UseAdvSimdIfAvailable are textually identical in this copy, which suggests column
// alignment whitespace was lost. Leading indentation below is restored by convention only.
// Confirm against the upstream commit, or apply with whitespace-tolerant options.

diff --git a/ARMeilleure/CodeGen/Arm64/HardwareCapabilities.cs b/ARMeilleure/CodeGen/Arm64/HardwareCapabilities.cs
new file mode 100644
index 00000000..99ff299e
--- /dev/null
+++ b/ARMeilleure/CodeGen/Arm64/HardwareCapabilities.cs
@@ -0,0 +1,185 @@
+using System;
+using System.Linq;
+using System.Reflection;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics.Arm;
+using System.Runtime.Versioning;
+
+namespace ARMeilleure.CodeGen.Arm64
+{
+    static partial class HardwareCapabilities
+    {
+        static HardwareCapabilities()
+        {
+            if (!ArmBase.Arm64.IsSupported)
+            {
+                return;
+            }
+
+            if (OperatingSystem.IsLinux())
+            {
+                LinuxFeatureInfoHwCap = (LinuxFeatureFlagsHwCap)getauxval(AT_HWCAP);
+                LinuxFeatureInfoHwCap2 = (LinuxFeatureFlagsHwCap2)getauxval(AT_HWCAP2);
+            }
+
+            if (OperatingSystem.IsMacOS())
+            {
+                for (int i = 0; i < _sysctlNames.Length; i++)
+                {
+                    if (CheckSysctlName(_sysctlNames[i]))
+                    {
+                        MacOsFeatureInfo |= (MacOsFeatureFlags)(1 << i);
+                    }
+                }
+            }
+        }
+
+#region Linux
+
+        private const ulong AT_HWCAP = 16;
+        private const ulong AT_HWCAP2 = 26;
+
+        [LibraryImport("libc", SetLastError = true)]
+        private static partial ulong getauxval(ulong type);
+
+        [Flags]
+        public enum LinuxFeatureFlagsHwCap : ulong
+        {
+            Fp = 1 << 0,
+            Asimd = 1 << 1,
+            Evtstrm = 1 << 2,
+            Aes = 1 << 3,
+            Pmull = 1 << 4,
+            Sha1 = 1 << 5,
+            Sha2 = 1 << 6,
+            Crc32 = 1 << 7,
+            Atomics = 1 << 8,
+            FpHp = 1 << 9,
+            AsimdHp = 1 << 10,
+            CpuId = 1 << 11,
+            AsimdRdm = 1 << 12,
+            Jscvt = 1 << 13,
+            Fcma = 1 << 14,
+            Lrcpc = 1 << 15,
+            DcpOp = 1 << 16,
+            Sha3 = 1 << 17,
+            Sm3 = 1 << 18,
+            Sm4 = 1 << 19,
+            AsimdDp = 1 << 20,
+            Sha512 = 1 << 21,
+            Sve = 1 << 22,
+            AsimdFhm = 1 << 23,
+            Dit = 1 << 24,
+            Uscat = 1 << 25,
+            Ilrcpc = 1 << 26,
+            FlagM = 1 << 27,
+            Ssbs = 1 << 28,
+            Sb = 1 << 29,
+            Paca = 1 << 30,
+            Pacg = 1UL << 31
+        }
+
+        [Flags]
+        public enum LinuxFeatureFlagsHwCap2 : ulong
+        {
+            Dcpodp = 1 << 0,
+            Sve2 = 1 << 1,
+            SveAes = 1 << 2,
+            SvePmull = 1 << 3,
+            SveBitperm = 1 << 4,
+            SveSha3 = 1 << 5,
+            SveSm4 = 1 << 6,
+            FlagM2 = 1 << 7,
+            Frint = 1 << 8,
+            SveI8mm = 1 << 9,
+            SveF32mm = 1 << 10,
+            SveF64mm = 1 << 11,
+            SveBf16 = 1 << 12,
+            I8mm = 1 << 13,
+            Bf16 = 1 << 14,
+            Dgh = 1 << 15,
+            Rng = 1 << 16,
+            Bti = 1 << 17,
+            Mte = 1 << 18,
+            Ecv = 1 << 19,
+            Afp = 1 << 20,
+            Rpres = 1 << 21,
+            Mte3 = 1 << 22,
+            Sme = 1 << 23,
+            Sme_i16i64 = 1 << 24,
+            Sme_f64f64 = 1 << 25,
+            Sme_i8i32 = 1 << 26,
+            Sme_f16f32 = 1 << 27,
+            Sme_b16f32 = 1 << 28,
+            Sme_f32f32 = 1 << 29,
+            Sme_fa64 = 1 << 30,
+            Wfxt = 1UL << 31,
+            Ebf16 = 1UL << 32,
+            Sve_Ebf16 = 1UL << 33,
+            Cssc = 1UL << 34,
+            Rprfm = 1UL << 35,
+            Sve2p1 = 1UL << 36
+        }
+
+        public static LinuxFeatureFlagsHwCap LinuxFeatureInfoHwCap { get; } = 0;
+        public static LinuxFeatureFlagsHwCap2 LinuxFeatureInfoHwCap2 { get; } = 0;
+
+#endregion
+
+#region macOS
+
+        [LibraryImport("libSystem.dylib", SetLastError = true)]
+        private static unsafe partial int sysctlbyname([MarshalAs(UnmanagedType.LPStr)] string name, out int oldValue, ref ulong oldSize, IntPtr newValue, ulong newValueSize);
+
+        [SupportedOSPlatform("macos")]
+        private static bool CheckSysctlName(string name)
+        {
+            ulong size = sizeof(int);
+            if (sysctlbyname(name, out int val, ref size, IntPtr.Zero, 0) == 0 && size == sizeof(int))
+            {
+                return val != 0;
+            }
+            return false;
+        }
+
+        private static string[] _sysctlNames = new string[]
+        {
+            "hw.optional.floatingpoint",
+            "hw.optional.AdvSIMD",
+            "hw.optional.arm.FEAT_FP16",
+            "hw.optional.arm.FEAT_AES",
+            "hw.optional.arm.FEAT_PMULL",
+            "hw.optional.arm.FEAT_LSE",
+            "hw.optional.armv8_crc32",
+            "hw.optional.arm.FEAT_SHA1",
+            "hw.optional.arm.FEAT_SHA256"
+        };
+
+        [Flags]
+        public enum MacOsFeatureFlags
+        {
+            Fp = 1 << 0,
+            AdvSimd = 1 << 1,
+            Fp16 = 1 << 2,
+            Aes = 1 << 3,
+            Pmull = 1 << 4,
+            Lse = 1 << 5,
+            Crc32 = 1 << 6,
+            Sha1 = 1 << 7,
+            Sha256 = 1 << 8
+        }
+
+        public static MacOsFeatureFlags MacOsFeatureInfo { get; } = 0;
+
+#endregion
+
+        public static bool SupportsAdvSimd => LinuxFeatureInfoHwCap.HasFlag(LinuxFeatureFlagsHwCap.Asimd) || MacOsFeatureInfo.HasFlag(MacOsFeatureFlags.AdvSimd);
+        public static bool SupportsAes => LinuxFeatureInfoHwCap.HasFlag(LinuxFeatureFlagsHwCap.Aes) || MacOsFeatureInfo.HasFlag(MacOsFeatureFlags.Aes);
+        public static bool SupportsPmull => LinuxFeatureInfoHwCap.HasFlag(LinuxFeatureFlagsHwCap.Pmull) || MacOsFeatureInfo.HasFlag(MacOsFeatureFlags.Pmull);
+        public static bool SupportsLse => LinuxFeatureInfoHwCap.HasFlag(LinuxFeatureFlagsHwCap.Atomics) || MacOsFeatureInfo.HasFlag(MacOsFeatureFlags.Lse);
+        public static bool SupportsCrc32 => LinuxFeatureInfoHwCap.HasFlag(LinuxFeatureFlagsHwCap.Crc32) || MacOsFeatureInfo.HasFlag(MacOsFeatureFlags.Crc32);
+        public static bool SupportsSha1 => LinuxFeatureInfoHwCap.HasFlag(LinuxFeatureFlagsHwCap.Sha1) || MacOsFeatureInfo.HasFlag(MacOsFeatureFlags.Sha1);
+        public static bool SupportsSha256 => LinuxFeatureInfoHwCap.HasFlag(LinuxFeatureFlagsHwCap.Sha2) || MacOsFeatureInfo.HasFlag(MacOsFeatureFlags.Sha256);
+    }
+}
diff --git a/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs b/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs
index 3e65db23..d0bb68e4 100644
--- a/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs
@@ -2556,7 +2556,7 @@ namespace ARMeilleure.Instructions
         {
             OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
 
-            if (Optimizations.UseAdvSimd && false) // Not supported by all Arm CPUs.
+            if (Optimizations.UseArm64Pmull)
             {
                 InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64PmullV);
             }
diff --git a/ARMeilleure/Optimizations.cs b/ARMeilleure/Optimizations.cs
index 0810d96c..9044314f 100644
--- a/ARMeilleure/Optimizations.cs
+++ b/ARMeilleure/Optimizations.cs
@@ -1,8 +1,10 @@
-using ARMeilleure.CodeGen.X86;
 using System.Runtime.Intrinsics.Arm;
 
 namespace ARMeilleure
 {
+    using Arm64HardwareCapabilities = ARMeilleure.CodeGen.Arm64.HardwareCapabilities;
+    using X86HardwareCapabilities = ARMeilleure.CodeGen.X86.HardwareCapabilities;
+
     public static class Optimizations
     {
         public static bool FastFP { get; set; } = true;
@@ -10,7 +12,8 @@ namespace ARMeilleure
         public static bool AllowLcqInFunctionTable { get; set; } = true;
         public static bool UseUnmanagedDispatchLoop { get; set; } = true;
 
-        public static bool UseAdvSimdIfAvailable { get; set; } = true;
+        public static bool UseAdvSimdIfAvailable { get; set; } = true;
+        public static bool UseArm64PmullIfAvailable { get; set; } = true;
 
         public static bool UseSseIfAvailable { get; set; } = true;
         public static bool UseSse2IfAvailable { get; set; } = true;
@@ -29,25 +32,26 @@ namespace ARMeilleure
 
         public static bool ForceLegacySse
         {
-            get => HardwareCapabilities.ForceLegacySse;
-            set => HardwareCapabilities.ForceLegacySse = value;
+            get => X86HardwareCapabilities.ForceLegacySse;
+            set => X86HardwareCapabilities.ForceLegacySse = value;
         }
 
-        internal static bool UseAdvSimd => UseAdvSimdIfAvailable && AdvSimd.IsSupported;
+        internal static bool UseAdvSimd => UseAdvSimdIfAvailable && Arm64HardwareCapabilities.SupportsAdvSimd;
+        internal static bool UseArm64Pmull => UseArm64PmullIfAvailable && Arm64HardwareCapabilities.SupportsPmull;
 
-        internal static bool UseSse => UseSseIfAvailable && HardwareCapabilities.SupportsSse;
-        internal static bool UseSse2 => UseSse2IfAvailable && HardwareCapabilities.SupportsSse2;
-        internal static bool UseSse3 => UseSse3IfAvailable && HardwareCapabilities.SupportsSse3;
-        internal static bool UseSsse3 => UseSsse3IfAvailable && HardwareCapabilities.SupportsSsse3;
-        internal static bool UseSse41 => UseSse41IfAvailable && HardwareCapabilities.SupportsSse41;
-        internal static bool UseSse42 => UseSse42IfAvailable && HardwareCapabilities.SupportsSse42;
-        internal static bool UsePopCnt => UsePopCntIfAvailable && HardwareCapabilities.SupportsPopcnt;
-        internal static bool UseAvx => UseAvxIfAvailable && HardwareCapabilities.SupportsAvx && !ForceLegacySse;
-        internal static bool UseF16c => UseF16cIfAvailable && HardwareCapabilities.SupportsF16c;
-        internal static bool UseFma => UseFmaIfAvailable && HardwareCapabilities.SupportsFma;
-        internal static bool UseAesni => UseAesniIfAvailable && HardwareCapabilities.SupportsAesni;
-        internal static bool UsePclmulqdq => UsePclmulqdqIfAvailable && HardwareCapabilities.SupportsPclmulqdq;
-        internal static bool UseSha => UseShaIfAvailable && HardwareCapabilities.SupportsSha;
-        internal static bool UseGfni => UseGfniIfAvailable && HardwareCapabilities.SupportsGfni;
+        internal static bool UseSse => UseSseIfAvailable && X86HardwareCapabilities.SupportsSse;
+        internal static bool UseSse2 => UseSse2IfAvailable && X86HardwareCapabilities.SupportsSse2;
+        internal static bool UseSse3 => UseSse3IfAvailable && X86HardwareCapabilities.SupportsSse3;
+        internal static bool UseSsse3 => UseSsse3IfAvailable && X86HardwareCapabilities.SupportsSsse3;
+        internal static bool UseSse41 => UseSse41IfAvailable && X86HardwareCapabilities.SupportsSse41;
+        internal static bool UseSse42 => UseSse42IfAvailable && X86HardwareCapabilities.SupportsSse42;
+        internal static bool UsePopCnt => UsePopCntIfAvailable && X86HardwareCapabilities.SupportsPopcnt;
+        internal static bool UseAvx => UseAvxIfAvailable && X86HardwareCapabilities.SupportsAvx && !ForceLegacySse;
+        internal static bool UseF16c => UseF16cIfAvailable && X86HardwareCapabilities.SupportsF16c;
+        internal static bool UseFma => UseFmaIfAvailable && X86HardwareCapabilities.SupportsFma;
+        internal static bool UseAesni => UseAesniIfAvailable && X86HardwareCapabilities.SupportsAesni;
+        internal static bool UsePclmulqdq => UsePclmulqdqIfAvailable && X86HardwareCapabilities.SupportsPclmulqdq;
+        internal static bool UseSha => UseShaIfAvailable && X86HardwareCapabilities.SupportsSha;
+        internal static bool UseGfni => UseGfniIfAvailable && X86HardwareCapabilities.SupportsGfni;
     }
-}
\ No newline at end of file
+}
diff --git a/ARMeilleure/Translation/PTC/Ptc.cs b/ARMeilleure/Translation/PTC/Ptc.cs
index 6f57e188..a59bc588 100644
--- a/ARMeilleure/Translation/PTC/Ptc.cs
+++ b/ARMeilleure/Translation/PTC/Ptc.cs
@@ -1,7 +1,6 @@
 using ARMeilleure.CodeGen;
 using ARMeilleure.CodeGen.Linking;
 using ARMeilleure.CodeGen.Unwinding;
-using ARMeilleure.CodeGen.X86;
 using ARMeilleure.Common;
 using ARMeilleure.Memory;
 using Ryujinx.Common;
@@ -22,12 +21,15 @@ using static ARMeilleure.Translation.PTC.PtcFormatter;
 
 namespace ARMeilleure.Translation.PTC
 {
+    using Arm64HardwareCapabilities = ARMeilleure.CodeGen.Arm64.HardwareCapabilities;
+    using X86HardwareCapabilities = ARMeilleure.CodeGen.X86.HardwareCapabilities;
+
     class Ptc : IPtcLoadState
     {
         private const string OuterHeaderMagicString = "PTCohd\0\0";
         private const string InnerHeaderMagicString = "PTCihd\0\0";
 
-        private const uint InternalVersion = 4114; //! To be incremented manually for each change to the ARMeilleure project.
+        private const uint InternalVersion = 4264; //! To be incremented manually for each change to the ARMeilleure project.
 
         private const string ActualDir = "0";
         private const string BackupDir = "1";
@@ -952,11 +954,26 @@ namespace ARMeilleure.Translation.PTC
 
         private static FeatureInfo GetFeatureInfo()
         {
-            return new FeatureInfo(
-                (uint)HardwareCapabilities.FeatureInfo1Ecx,
-                (uint)HardwareCapabilities.FeatureInfo1Edx,
-                (uint)HardwareCapabilities.FeatureInfo7Ebx,
-                (uint)HardwareCapabilities.FeatureInfo7Ecx);
+            if (RuntimeInformation.ProcessArchitecture == Architecture.Arm64)
+            {
+                return new FeatureInfo(
+                    (ulong)Arm64HardwareCapabilities.LinuxFeatureInfoHwCap,
+                    (ulong)Arm64HardwareCapabilities.LinuxFeatureInfoHwCap2,
+                    (ulong)Arm64HardwareCapabilities.MacOsFeatureInfo,
+                    0);
+            }
+            else if (RuntimeInformation.ProcessArchitecture == Architecture.X64)
+            {
+                return new FeatureInfo(
+                    (ulong)X86HardwareCapabilities.FeatureInfo1Ecx,
+                    (ulong)X86HardwareCapabilities.FeatureInfo1Edx,
+                    (ulong)X86HardwareCapabilities.FeatureInfo7Ebx,
+                    (ulong)X86HardwareCapabilities.FeatureInfo7Ecx);
+            }
+            else
+            {
+                return new FeatureInfo(0, 0, 0, 0);
+            }
         }
 
         private byte GetMemoryManagerMode()
@@ -976,7 +993,7 @@ namespace ARMeilleure.Translation.PTC
             return osPlatform;
         }
 
-        [StructLayout(LayoutKind.Sequential, Pack = 1/*, Size = 58*/)]
+        [StructLayout(LayoutKind.Sequential, Pack = 1/*, Size = 74*/)]
         private struct OuterHeader
         {
             public ulong Magic;
@@ -1007,8 +1024,8 @@ namespace ARMeilleure.Translation.PTC
             }
         }
 
-        [StructLayout(LayoutKind.Sequential, Pack = 1/*, Size = 16*/)]
-        private record struct FeatureInfo(uint FeatureInfo0, uint FeatureInfo1, uint FeatureInfo2, uint FeatureInfo3);
+        [StructLayout(LayoutKind.Sequential, Pack = 1/*, Size = 32*/)]
+        private record struct FeatureInfo(ulong FeatureInfo0, ulong FeatureInfo1, ulong FeatureInfo2, ulong FeatureInfo3);
 
         [StructLayout(LayoutKind.Sequential, Pack = 1/*, Size = 128*/)]
         private struct InnerHeader