diff --git a/ARMeilleure/CodeGen/Arm64/IntrinsicTable.cs b/ARMeilleure/CodeGen/Arm64/IntrinsicTable.cs
index 53ef152e..a309d56d 100644
--- a/ARMeilleure/CodeGen/Arm64/IntrinsicTable.cs
+++ b/ARMeilleure/CodeGen/Arm64/IntrinsicTable.cs
@@ -226,6 +226,8 @@ namespace ARMeilleure.CodeGen.Arm64
             Add(Intrinsic.Arm64MlsVe, new IntrinsicInfo(0x2f004000u, IntrinsicType.VectorTernaryRdByElem));
             Add(Intrinsic.Arm64MlsV, new IntrinsicInfo(0x2e209400u, IntrinsicType.VectorTernaryRd));
             Add(Intrinsic.Arm64MoviV, new IntrinsicInfo(0x0f000400u, IntrinsicType.VectorMovi));
+            Add(Intrinsic.Arm64MrsFpcr, new IntrinsicInfo(0xd53b4400u, IntrinsicType.GetRegister));
+            Add(Intrinsic.Arm64MsrFpcr, new IntrinsicInfo(0xd51b4400u, IntrinsicType.SetRegister));
             Add(Intrinsic.Arm64MrsFpsr, new IntrinsicInfo(0xd53b4420u, IntrinsicType.GetRegister));
             Add(Intrinsic.Arm64MsrFpsr, new IntrinsicInfo(0xd51b4420u, IntrinsicType.SetRegister));
             Add(Intrinsic.Arm64MulVe, new IntrinsicInfo(0x0f008000u, IntrinsicType.VectorBinaryByElem));
diff --git a/ARMeilleure/CodeGen/X86/AssemblerTable.cs b/ARMeilleure/CodeGen/X86/AssemblerTable.cs
index b47b3ecd..e6a2ff07 100644
--- a/ARMeilleure/CodeGen/X86/AssemblerTable.cs
+++ b/ARMeilleure/CodeGen/X86/AssemblerTable.cs
@@ -268,11 +268,13 @@ namespace ARMeilleure.CodeGen.X86
             Add(X86Instruction.Vblendvps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3a4a, InstructionFlags.Vex | InstructionFlags.Prefix66));
             Add(X86Instruction.Vcvtph2ps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3813, InstructionFlags.Vex | InstructionFlags.Prefix66));
             Add(X86Instruction.Vcvtps2ph, new InstructionInfo(0x000f3a1d, BadOp, BadOp, BadOp, BadOp, InstructionFlags.Vex | InstructionFlags.Prefix66));
+            Add(X86Instruction.Vfmadd231pd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38b8, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW));
             Add(X86Instruction.Vfmadd231ps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38b8, InstructionFlags.Vex | InstructionFlags.Prefix66));
             Add(X86Instruction.Vfmadd231sd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38b9, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW));
             Add(X86Instruction.Vfmadd231ss, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38b9, InstructionFlags.Vex | InstructionFlags.Prefix66));
             Add(X86Instruction.Vfmsub231sd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38bb, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW));
             Add(X86Instruction.Vfmsub231ss, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38bb, InstructionFlags.Vex | InstructionFlags.Prefix66));
+            Add(X86Instruction.Vfnmadd231pd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38bc, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW));
             Add(X86Instruction.Vfnmadd231ps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38bc, InstructionFlags.Vex | InstructionFlags.Prefix66));
             Add(X86Instruction.Vfnmadd231sd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38bd, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW));
             Add(X86Instruction.Vfnmadd231ss, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38bd, InstructionFlags.Vex | InstructionFlags.Prefix66));
diff --git a/ARMeilleure/CodeGen/X86/CodeGenerator.cs b/ARMeilleure/CodeGen/X86/CodeGenerator.cs
index 8b5a3fc5..e7179b51 100644
--- a/ARMeilleure/CodeGen/X86/CodeGenerator.cs
+++ b/ARMeilleure/CodeGen/X86/CodeGenerator.cs
@@ -249,10 +249,9 @@ namespace ARMeilleure.CodeGen.X86
                     case IntrinsicType.Mxcsr:
                     {
                         Operand offset = operation.GetSource(0);
-                        Operand bits = operation.GetSource(1);
 
-                        Debug.Assert(offset.Kind == OperandKind.Constant && bits.Kind == OperandKind.Constant);
-                        Debug.Assert(offset.Type == OperandType.I32 && bits.Type == OperandType.I32);
+                        Debug.Assert(offset.Kind == OperandKind.Constant);
+                        Debug.Assert(offset.Type == OperandType.I32);
 
                         int offs = offset.AsInt32() + context.CallArgsRegionSize;
@@ -261,21 +260,23 @@ namespace ARMeilleure.CodeGen.X86
 
                         Debug.Assert(HardwareCapabilities.SupportsSse || HardwareCapabilities.SupportsVexEncoding);
 
-                        context.Assembler.Stmxcsr(memOp);
-
-                        if (operation.Intrinsic == Intrinsic.X86Mxcsrmb)
+                        if (operation.Intrinsic == Intrinsic.X86Ldmxcsr)
                         {
-                            context.Assembler.Or(memOp, bits, OperandType.I32);
+                            Operand bits = operation.GetSource(1);
+                            Debug.Assert(bits.Type == OperandType.I32);
+
+                            context.Assembler.Mov(memOp, bits, OperandType.I32);
+                            context.Assembler.Ldmxcsr(memOp);
                         }
-                        else /* if (intrinOp.Intrinsic == Intrinsic.X86Mxcsrub) */
+                        else if (operation.Intrinsic == Intrinsic.X86Stmxcsr)
                         {
-                            Operand notBits = Const(~bits.AsInt32());
+                            Operand dest = operation.Destination;
+                            Debug.Assert(dest.Type == OperandType.I32);
 
-                            context.Assembler.And(memOp, notBits, OperandType.I32);
+                            context.Assembler.Stmxcsr(memOp);
+                            context.Assembler.Mov(dest, memOp, OperandType.I32);
                         }
 
-                        context.Assembler.Ldmxcsr(memOp);
-
                         break;
                     }
diff --git a/ARMeilleure/CodeGen/X86/IntrinsicTable.cs b/ARMeilleure/CodeGen/X86/IntrinsicTable.cs
index c788fa44..e3d94b7a 100644
--- a/ARMeilleure/CodeGen/X86/IntrinsicTable.cs
+++ b/ARMeilleure/CodeGen/X86/IntrinsicTable.cs
@@ -60,6 +60,7 @@ namespace ARMeilleure.CodeGen.X86
             Add(Intrinsic.X86Haddpd, new IntrinsicInfo(X86Instruction.Haddpd, IntrinsicType.Binary));
             Add(Intrinsic.X86Haddps, new IntrinsicInfo(X86Instruction.Haddps, IntrinsicType.Binary));
             Add(Intrinsic.X86Insertps, new IntrinsicInfo(X86Instruction.Insertps, IntrinsicType.TernaryImm));
+            Add(Intrinsic.X86Ldmxcsr, new IntrinsicInfo(X86Instruction.None, IntrinsicType.Mxcsr));
             Add(Intrinsic.X86Maxpd, new IntrinsicInfo(X86Instruction.Maxpd, IntrinsicType.Binary));
             Add(Intrinsic.X86Maxps, new IntrinsicInfo(X86Instruction.Maxps, IntrinsicType.Binary));
             Add(Intrinsic.X86Maxsd, new IntrinsicInfo(X86Instruction.Maxsd, IntrinsicType.Binary));
@@ -75,8 +76,6 @@ namespace ARMeilleure.CodeGen.X86
             Add(Intrinsic.X86Mulps, new IntrinsicInfo(X86Instruction.Mulps, IntrinsicType.Binary));
             Add(Intrinsic.X86Mulsd, new IntrinsicInfo(X86Instruction.Mulsd, IntrinsicType.Binary));
             Add(Intrinsic.X86Mulss, new IntrinsicInfo(X86Instruction.Mulss, IntrinsicType.Binary));
-            Add(Intrinsic.X86Mxcsrmb, new IntrinsicInfo(X86Instruction.None, IntrinsicType.Mxcsr)); // Mask bits.
-            Add(Intrinsic.X86Mxcsrub, new IntrinsicInfo(X86Instruction.None, IntrinsicType.Mxcsr)); // Unmask bits.
             Add(Intrinsic.X86Paddb, new IntrinsicInfo(X86Instruction.Paddb, IntrinsicType.Binary));
             Add(Intrinsic.X86Paddd, new IntrinsicInfo(X86Instruction.Paddd, IntrinsicType.Binary));
             Add(Intrinsic.X86Paddq, new IntrinsicInfo(X86Instruction.Paddq, IntrinsicType.Binary));
@@ -160,6 +159,7 @@ namespace ARMeilleure.CodeGen.X86
             Add(Intrinsic.X86Sqrtps, new IntrinsicInfo(X86Instruction.Sqrtps, IntrinsicType.Unary));
             Add(Intrinsic.X86Sqrtsd, new IntrinsicInfo(X86Instruction.Sqrtsd, IntrinsicType.Unary));
             Add(Intrinsic.X86Sqrtss, new IntrinsicInfo(X86Instruction.Sqrtss, IntrinsicType.Unary));
+            Add(Intrinsic.X86Stmxcsr, new IntrinsicInfo(X86Instruction.None, IntrinsicType.Mxcsr));
             Add(Intrinsic.X86Subpd, new IntrinsicInfo(X86Instruction.Subpd, IntrinsicType.Binary));
             Add(Intrinsic.X86Subps, new IntrinsicInfo(X86Instruction.Subps, IntrinsicType.Binary));
             Add(Intrinsic.X86Subsd, new IntrinsicInfo(X86Instruction.Subsd, IntrinsicType.Binary));
@@ -170,11 +170,13 @@ namespace ARMeilleure.CodeGen.X86
             Add(Intrinsic.X86Unpcklps, new IntrinsicInfo(X86Instruction.Unpcklps, IntrinsicType.Binary));
             Add(Intrinsic.X86Vcvtph2ps, new IntrinsicInfo(X86Instruction.Vcvtph2ps, IntrinsicType.Unary));
             Add(Intrinsic.X86Vcvtps2ph, new IntrinsicInfo(X86Instruction.Vcvtps2ph, IntrinsicType.BinaryImm));
+            Add(Intrinsic.X86Vfmadd231pd, new IntrinsicInfo(X86Instruction.Vfmadd231pd, IntrinsicType.Fma));
             Add(Intrinsic.X86Vfmadd231ps, new IntrinsicInfo(X86Instruction.Vfmadd231ps, IntrinsicType.Fma));
             Add(Intrinsic.X86Vfmadd231sd, new IntrinsicInfo(X86Instruction.Vfmadd231sd, IntrinsicType.Fma));
             Add(Intrinsic.X86Vfmadd231ss, new IntrinsicInfo(X86Instruction.Vfmadd231ss, IntrinsicType.Fma));
             Add(Intrinsic.X86Vfmsub231sd, new IntrinsicInfo(X86Instruction.Vfmsub231sd, IntrinsicType.Fma));
             Add(Intrinsic.X86Vfmsub231ss, new IntrinsicInfo(X86Instruction.Vfmsub231ss, IntrinsicType.Fma));
+            Add(Intrinsic.X86Vfnmadd231pd, new IntrinsicInfo(X86Instruction.Vfnmadd231pd, IntrinsicType.Fma));
             Add(Intrinsic.X86Vfnmadd231ps, new IntrinsicInfo(X86Instruction.Vfnmadd231ps, IntrinsicType.Fma));
             Add(Intrinsic.X86Vfnmadd231sd, new IntrinsicInfo(X86Instruction.Vfnmadd231sd, IntrinsicType.Fma));
             Add(Intrinsic.X86Vfnmadd231ss, new IntrinsicInfo(X86Instruction.Vfnmadd231ss, IntrinsicType.Fma));
diff --git a/ARMeilleure/CodeGen/X86/Mxcsr.cs b/ARMeilleure/CodeGen/X86/Mxcsr.cs
new file mode 100644
index 00000000..c61eac31
--- /dev/null
+++ b/ARMeilleure/CodeGen/X86/Mxcsr.cs
@@ -0,0 +1,15 @@
+using System;
+
+namespace ARMeilleure.CodeGen.X86
+{
+    [Flags]
+    enum Mxcsr
+    {
+        Ftz = 1 << 15, // Flush To Zero.
+        Rhi = 1 << 14, // Round Mode high bit.
+        Rlo = 1 << 13, // Round Mode low bit.
+        Um = 1 << 11, // Underflow Mask.
+        Dm = 1 << 8, // Denormal Mask.
+        Daz = 1 << 6 // Denormals Are Zero.
+    }
+}
diff --git a/ARMeilleure/CodeGen/X86/PreAllocator.cs b/ARMeilleure/CodeGen/X86/PreAllocator.cs
index 72f56514..cb742d67 100644
--- a/ARMeilleure/CodeGen/X86/PreAllocator.cs
+++ b/ARMeilleure/CodeGen/X86/PreAllocator.cs
@@ -120,12 +120,18 @@ namespace ARMeilleure.CodeGen.X86
                     break;
 
                 case Instruction.Extended:
-                    if (node.Intrinsic == Intrinsic.X86Mxcsrmb || node.Intrinsic == Intrinsic.X86Mxcsrub)
+                    if (node.Intrinsic == Intrinsic.X86Ldmxcsr)
                     {
                         int stackOffset = stackAlloc.Allocate(OperandType.I32);
 
                         node.SetSources(new Operand[] { Const(stackOffset), node.GetSource(0) });
                     }
+                    else if (node.Intrinsic == Intrinsic.X86Stmxcsr)
+                    {
+                        int stackOffset = stackAlloc.Allocate(OperandType.I32);
+
+                        node.SetSources(new Operand[] { Const(stackOffset) });
+                    }
                     break;
             }
         }
diff --git a/ARMeilleure/CodeGen/X86/X86Instruction.cs b/ARMeilleure/CodeGen/X86/X86Instruction.cs
index ecfc432d..9a85c516 100644
--- a/ARMeilleure/CodeGen/X86/X86Instruction.cs
+++ b/ARMeilleure/CodeGen/X86/X86Instruction.cs
@@ -208,11 +208,13 @@ namespace ARMeilleure.CodeGen.X86
         Vblendvps,
         Vcvtph2ps,
         Vcvtps2ph,
+        Vfmadd231pd,
         Vfmadd231ps,
         Vfmadd231sd,
         Vfmadd231ss,
         Vfmsub231sd,
         Vfmsub231ss,
+        Vfnmadd231pd,
         Vfnmadd231ps,
         Vfnmadd231sd,
         Vfnmadd231ss,
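Note: the new Mxcsr flags combine with the X86Ldmxcsr/X86Stmxcsr intrinsics (lowered through the 4-byte stack slot the PreAllocator hunk reserves, since both instructions only take memory operands) to let generated code rewrite the host control word directly. A minimal host-side sketch of the same bit manipulation, illustrative C# only and not part of the patch; the real logic is emitted as IR by EnterArmFpMode in the InstEmitSimdHelper.cs hunk further down:

    using System;

    [Flags]
    enum Mxcsr // Same layout as the new ARMeilleure.CodeGen.X86.Mxcsr enum.
    {
        Ftz = 1 << 15, Rhi = 1 << 14, Rlo = 1 << 13,
        Um = 1 << 11, Dm = 1 << 8, Daz = 1 << 6,
    }

    static class MxcsrDemo
    {
        // Rebuild an MXCSR value the way EnterArmFpMode does: clear FTZ/DAZ and
        // both rounding-control bits, then set them from the guest's FPCR flags.
        public static int Compose(int mxcsr, bool fz, bool rmode0, bool rmode1)
        {
            mxcsr &= ~(int)(Mxcsr.Ftz | Mxcsr.Daz | Mxcsr.Rhi | Mxcsr.Rlo);

            if (fz)
            {
                // Underflow/denormal exceptions must be masked for FTZ/DAZ.
                mxcsr |= (int)(Mxcsr.Ftz | Mxcsr.Daz | Mxcsr.Um | Mxcsr.Dm);
            }

            if (rmode0) mxcsr |= (int)Mxcsr.Rhi;
            if (rmode1) mxcsr |= (int)Mxcsr.Rlo;

            return mxcsr;
        }
    }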
diff --git a/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs b/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs
index d0bb68e4..7e7f26b1 100644
--- a/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs
@@ -615,14 +615,11 @@ namespace ARMeilleure.Instructions
             {
                 return EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
                 {
-                    return EmitSseOrAvxHandleFzModeOpF(context, (op1, op2) =>
-                    {
-                        IOpCodeSimd op = (IOpCodeSimd)context.CurrOp;
+                    IOpCodeSimd op = (IOpCodeSimd)context.CurrOp;
 
-                        Intrinsic addInst = (op.Size & 1) == 0 ? Intrinsic.X86Addps : Intrinsic.X86Addpd;
+                    Intrinsic addInst = (op.Size & 1) == 0 ? Intrinsic.X86Addps : Intrinsic.X86Addpd;
 
-                        return context.AddIntrinsic(addInst, op1, op2);
-                    }, scalar: false, op1, op2);
+                    return context.AddIntrinsic(addInst, op1, op2);
                 }, scalar: false, op1, op2);
             });
         }
@@ -696,17 +693,33 @@ namespace ARMeilleure.Instructions
             Operand n = GetVec(op.Rn);
             Operand m = GetVec(op.Rm);
 
+            Operand res;
+
             if (op.Size == 0)
             {
-                Operand res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
-                res = context.AddIntrinsic(Intrinsic.X86Addss, a, res);
+                if (Optimizations.UseFma)
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Vfmadd231ss, a, n, m);
+                }
+                else
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
+                    res = context.AddIntrinsic(Intrinsic.X86Addss, a, res);
+                }
 
                 context.Copy(d, context.VectorZeroUpper96(res));
             }
             else /* if (op.Size == 1) */
             {
-                Operand res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
-                res = context.AddIntrinsic(Intrinsic.X86Addsd, a, res);
+                if (Optimizations.UseFma)
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Vfmadd231sd, a, n, m);
+                }
+                else
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
+                    res = context.AddIntrinsic(Intrinsic.X86Addsd, a, res);
+                }
 
                 context.Copy(d, context.VectorZeroUpper64(res));
             }
@@ -730,10 +743,7 @@ namespace ARMeilleure.Instructions
             {
                 EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
                 {
-                    return EmitSseOrAvxHandleFzModeOpF(context, (op1, op2) =>
-                    {
-                        return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: true);
-                    }, scalar: true, op1, op2);
+                    return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: true);
                 }, scalar: true);
             }
             else
@@ -755,10 +765,7 @@ namespace ARMeilleure.Instructions
             {
                 EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
                 {
-                    return EmitSseOrAvxHandleFzModeOpF(context, (op1, op2) =>
-                    {
-                        return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: true);
-                    }, scalar: false, op1, op2);
+                    return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: true);
                 }, scalar: false);
             }
             else
@@ -886,10 +893,7 @@ namespace ARMeilleure.Instructions
             {
                 return EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
                 {
-                    return EmitSseOrAvxHandleFzModeOpF(context, (op1, op2) =>
-                    {
-                        return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: true);
-                    }, scalar: false, op1, op2);
+                    return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: true);
                 }, scalar: false, op1, op2);
             });
         }
@@ -914,10 +918,7 @@ namespace ARMeilleure.Instructions
             {
                 return EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
                 {
-                    return EmitSseOrAvxHandleFzModeOpF(context, (op1, op2) =>
-                    {
-                        return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: true);
-                    }, scalar: false, op1, op2);
+                    return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: true);
                 }, scalar: false, op1, op2);
             });
         }
@@ -940,10 +941,7 @@ namespace ARMeilleure.Instructions
             {
                 EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
                 {
-                    return EmitSseOrAvxHandleFzModeOpF(context, (op1, op2) =>
-                    {
-                        return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: false);
-                    }, scalar: true, op1, op2);
+                    return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: false);
                 }, scalar: true);
             }
             else
@@ -965,10 +963,7 @@ namespace ARMeilleure.Instructions
             {
                 EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
                 {
-                    return EmitSseOrAvxHandleFzModeOpF(context, (op1, op2) =>
-                    {
-                        return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: false);
-                    }, scalar: false, op1, op2);
+                    return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: false);
                 }, scalar: false);
             }
             else
@@ -1096,10 +1091,7 @@ namespace ARMeilleure.Instructions
             {
                 return EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
                 {
-                    return EmitSseOrAvxHandleFzModeOpF(context, (op1, op2) =>
-                    {
-                        return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: false);
-                    }, scalar: false, op1, op2);
+                    return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: false);
                 }, scalar: false, op1, op2);
             });
         }
@@ -1124,10 +1116,7 @@ namespace ARMeilleure.Instructions
             {
                 return EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
                 {
-                    return EmitSseOrAvxHandleFzModeOpF(context, (op1, op2) =>
-                    {
-                        return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: false);
-                    }, scalar: false, op1, op2);
+                    return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: false);
                 }, scalar: false, op1, op2);
             });
         }
@@ -1146,6 +1135,37 @@ namespace ARMeilleure.Instructions
             {
                 InstEmitSimdHelperArm64.EmitScalarTernaryOpFRdByElem(context, Intrinsic.Arm64FmlaSe);
             }
+            else if (Optimizations.UseFma)
+            {
+                OpCodeSimdRegElemF op = (OpCodeSimdRegElemF)context.CurrOp;
+
+                Operand d = GetVec(op.Rd);
+                Operand n = GetVec(op.Rn);
+                Operand m = GetVec(op.Rm);
+
+                int sizeF = op.Size & 1;
+
+                if (sizeF == 0)
+                {
+                    int shuffleMask = op.Index | op.Index << 2 | op.Index << 4 | op.Index << 6;
+
+                    Operand res = context.AddIntrinsic(Intrinsic.X86Shufps, m, m, Const(shuffleMask));
+
+                    res = context.AddIntrinsic(Intrinsic.X86Vfmadd231ss, d, n, res);
+
+                    context.Copy(d, context.VectorZeroUpper96(res));
+                }
+                else /* if (sizeF == 1) */
+                {
+                    int shuffleMask = op.Index | op.Index << 1;
+
+                    Operand res = context.AddIntrinsic(Intrinsic.X86Shufpd, m, m, Const(shuffleMask));
+
+                    res = context.AddIntrinsic(Intrinsic.X86Vfmadd231sd, d, n, res);
+
+                    context.Copy(d, context.VectorZeroUpper64(res));
+                }
+            }
             else
             {
                 EmitScalarTernaryOpByElemF(context, (op1, op2, op3) =>
@@ -1171,11 +1191,19 @@ namespace ARMeilleure.Instructions
 
             int sizeF = op.Size & 1;
 
+            Operand res;
+
             if (sizeF == 0)
             {
-                Operand res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m);
-
-                res = context.AddIntrinsic(Intrinsic.X86Addps, d, res);
+                if (Optimizations.UseFma)
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Vfmadd231ps, d, n, m);
+                }
+                else
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m);
+                    res = context.AddIntrinsic(Intrinsic.X86Addps, d, res);
+                }
 
                 if (op.RegisterSize == RegisterSize.Simd64)
                 {
@@ -1186,9 +1214,15 @@ namespace ARMeilleure.Instructions
             }
             else /* if (sizeF == 1) */
             {
-                Operand res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m);
-
-                res = context.AddIntrinsic(Intrinsic.X86Addpd, d, res);
+                if (Optimizations.UseFma)
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Vfmadd231pd, d, n, m);
+                }
+                else
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m);
+                    res = context.AddIntrinsic(Intrinsic.X86Addpd, d, res);
+                }
 
                 context.Copy(d, res);
             }
@@ -1224,8 +1258,15 @@ namespace ARMeilleure.Instructions
 
                 Operand res = context.AddIntrinsic(Intrinsic.X86Shufps, m, m, Const(shuffleMask));
 
-                res = context.AddIntrinsic(Intrinsic.X86Mulps, n, res);
-                res = context.AddIntrinsic(Intrinsic.X86Addps, d, res);
+                if (Optimizations.UseFma)
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Vfmadd231ps, d, n, res);
+                }
+                else
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Mulps, n, res);
+                    res = context.AddIntrinsic(Intrinsic.X86Addps, d, res);
+                }
 
                 if (op.RegisterSize == RegisterSize.Simd64)
                 {
@@ -1240,8 +1281,15 @@ namespace ARMeilleure.Instructions
 
                 Operand res = context.AddIntrinsic(Intrinsic.X86Shufpd, m, m, Const(shuffleMask));
 
-                res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, res);
-                res = context.AddIntrinsic(Intrinsic.X86Addpd, d, res);
+                if (Optimizations.UseFma)
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Vfmadd231pd, d, n, res);
+                }
+                else
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, res);
+                    res = context.AddIntrinsic(Intrinsic.X86Addpd, d, res);
+                }
 
                 context.Copy(d, res);
             }
@@ -1261,6 +1309,37 @@ namespace ARMeilleure.Instructions
             {
                 InstEmitSimdHelperArm64.EmitScalarTernaryOpFRdByElem(context, Intrinsic.Arm64FmlsSe);
             }
+            else if (Optimizations.UseFma)
+            {
+                OpCodeSimdRegElemF op = (OpCodeSimdRegElemF)context.CurrOp;
+
+                Operand d = GetVec(op.Rd);
+                Operand n = GetVec(op.Rn);
+                Operand m = GetVec(op.Rm);
+
+                int sizeF = op.Size & 1;
+
+                if (sizeF == 0)
+                {
+                    int shuffleMask = op.Index | op.Index << 2 | op.Index << 4 | op.Index << 6;
+
+                    Operand res = context.AddIntrinsic(Intrinsic.X86Shufps, m, m, Const(shuffleMask));
+
+                    res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231ss, d, n, res);
+
+                    context.Copy(d, context.VectorZeroUpper96(res));
+                }
+                else /* if (sizeF == 1) */
+                {
+                    int shuffleMask = op.Index | op.Index << 1;
+
+                    Operand res = context.AddIntrinsic(Intrinsic.X86Shufpd, m, m, Const(shuffleMask));
+
+                    res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231sd, d, n, res);
+
+                    context.Copy(d, context.VectorZeroUpper64(res));
+                }
+            }
             else
             {
                 EmitScalarTernaryOpByElemF(context, (op1, op2, op3) =>
@@ -1286,11 +1365,19 @@ namespace ARMeilleure.Instructions
 
             int sizeF = op.Size & 1;
 
+            Operand res;
+
             if (sizeF == 0)
             {
-                Operand res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m);
-
-                res = context.AddIntrinsic(Intrinsic.X86Subps, d, res);
+                if (Optimizations.UseFma)
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231ps, d, n, m);
+                }
+                else
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m);
+                    res = context.AddIntrinsic(Intrinsic.X86Subps, d, res);
+                }
 
                 if (op.RegisterSize == RegisterSize.Simd64)
                 {
@@ -1301,9 +1388,15 @@ namespace ARMeilleure.Instructions
             }
             else /* if (sizeF == 1) */
             {
-                Operand res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m);
-
-                res = context.AddIntrinsic(Intrinsic.X86Subpd, d, res);
+                if (Optimizations.UseFma)
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231pd, d, n, m);
+                }
+                else
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m);
+                    res = context.AddIntrinsic(Intrinsic.X86Subpd, d, res);
+                }
 
                 context.Copy(d, res);
             }
@@ -1339,8 +1432,15 @@ namespace ARMeilleure.Instructions
 
                 Operand res = context.AddIntrinsic(Intrinsic.X86Shufps, m, m, Const(shuffleMask));
 
-                res = context.AddIntrinsic(Intrinsic.X86Mulps, n, res);
-                res = context.AddIntrinsic(Intrinsic.X86Subps, d, res);
+                if (Optimizations.UseFma)
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231ps, d, n, res);
+                }
+                else
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Mulps, n, res);
+                    res = context.AddIntrinsic(Intrinsic.X86Subps, d, res);
+                }
 
                 if (op.RegisterSize == RegisterSize.Simd64)
                 {
@@ -1355,8 +1455,15 @@ namespace ARMeilleure.Instructions
 
                 Operand res = context.AddIntrinsic(Intrinsic.X86Shufpd, m, m, Const(shuffleMask));
 
-                res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, res);
-                res = context.AddIntrinsic(Intrinsic.X86Subpd, d, res);
+                if (Optimizations.UseFma)
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231pd, d, n, res);
+                }
+                else
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, res);
+                    res = context.AddIntrinsic(Intrinsic.X86Subpd, d, res);
+                }
 
                 context.Copy(d, res);
             }
@@ -1385,17 +1492,33 @@ namespace ARMeilleure.Instructions
             Operand n = GetVec(op.Rn);
             Operand m = GetVec(op.Rm);
 
+            Operand res;
+
             if (op.Size == 0)
             {
-                Operand res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
-                res = context.AddIntrinsic(Intrinsic.X86Subss, a, res);
+                if (Optimizations.UseFma)
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231ss, a, n, m);
+                }
+                else
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
+                    res = context.AddIntrinsic(Intrinsic.X86Subss, a, res);
+                }
 
                 context.Copy(d, context.VectorZeroUpper96(res));
             }
             else /* if (op.Size == 1) */
             {
-                Operand res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
-                res = context.AddIntrinsic(Intrinsic.X86Subsd, a, res);
+                if (Optimizations.UseFma)
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231sd, a, n, m);
+                }
+                else
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
+                    res = context.AddIntrinsic(Intrinsic.X86Subsd, a, res);
+                }
 
                 context.Copy(d, context.VectorZeroUpper64(res));
             }
@@ -1669,25 +1792,39 @@ namespace ARMeilleure.Instructions
             Operand n = GetVec(op.Rn);
             Operand m = GetVec(op.Rm);
 
+            Operand res;
+
             if (op.Size == 0)
             {
-                Operand mask = X86GetScalar(context, -0f);
+                if (Optimizations.UseFma)
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Vfnmsub231ss, a, n, m);
+                }
+                else
+                {
+                    Operand mask = X86GetScalar(context, -0f);
+                    Operand aNeg = context.AddIntrinsic(Intrinsic.X86Xorps, mask, a);
 
-                Operand aNeg = context.AddIntrinsic(Intrinsic.X86Xorps, mask, a);
-
-                Operand res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
-                res = context.AddIntrinsic(Intrinsic.X86Subss, aNeg, res);
+                    res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
+                    res = context.AddIntrinsic(Intrinsic.X86Subss, aNeg, res);
+                }
 
                 context.Copy(d, context.VectorZeroUpper96(res));
             }
             else /* if (op.Size == 1) */
             {
-                Operand mask = X86GetScalar(context, -0d);
+                if (Optimizations.UseFma)
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Vfnmsub231sd, a, n, m);
+                }
+                else
+                {
+                    Operand mask = X86GetScalar(context, -0d);
+                    Operand aNeg = context.AddIntrinsic(Intrinsic.X86Xorpd, mask, a);
 
-                Operand aNeg = context.AddIntrinsic(Intrinsic.X86Xorpd, mask, a);
-
-                Operand res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
-                res = context.AddIntrinsic(Intrinsic.X86Subsd, aNeg, res);
+                    res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
+                    res = context.AddIntrinsic(Intrinsic.X86Subsd, aNeg, res);
+                }
 
                 context.Copy(d, context.VectorZeroUpper64(res));
             }
@@ -1716,25 +1853,39 @@ namespace ARMeilleure.Instructions
             Operand n = GetVec(op.Rn);
             Operand m = GetVec(op.Rm);
 
+            Operand res;
+
             if (op.Size == 0)
             {
-                Operand mask = X86GetScalar(context, -0f);
+                if (Optimizations.UseFma)
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Vfmsub231ss, a, n, m);
+                }
+                else
+                {
+                    Operand mask = X86GetScalar(context, -0f);
+                    Operand aNeg = context.AddIntrinsic(Intrinsic.X86Xorps, mask, a);
 
-                Operand aNeg = context.AddIntrinsic(Intrinsic.X86Xorps, mask, a);
-
-                Operand res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
-                res = context.AddIntrinsic(Intrinsic.X86Addss, aNeg, res);
+                    res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
+                    res = context.AddIntrinsic(Intrinsic.X86Addss, aNeg, res);
+                }
 
                 context.Copy(d, context.VectorZeroUpper96(res));
             }
             else /* if (op.Size == 1) */
             {
-                Operand mask = X86GetScalar(context, -0d);
+                if (Optimizations.UseFma)
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Vfmsub231sd, a, n, m);
+                }
+                else
+                {
+                    Operand mask = X86GetScalar(context, -0d);
+                    Operand aNeg = context.AddIntrinsic(Intrinsic.X86Xorpd, mask, a);
 
-                Operand aNeg = context.AddIntrinsic(Intrinsic.X86Xorpd, mask, a);
-
-                Operand res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
-                res = context.AddIntrinsic(Intrinsic.X86Addsd, aNeg, res);
+                    res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
+                    res = context.AddIntrinsic(Intrinsic.X86Addsd, aNeg, res);
+                }
 
                 context.Copy(d, context.VectorZeroUpper64(res));
             }
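Note: the Optimizations.UseFma fast paths above can replace the mul+sub/add pairs directly because ARM's FMADD/FMSUB/FMLA family rounds once, which x86 FMA matches, while a separate SSE multiply followed by an add rounds twice. A standalone sketch of that difference using .NET's hardware intrinsics (an illustration, not ARMeilleure code):

    using System.Runtime.Intrinsics;
    using System.Runtime.Intrinsics.X86;

    static class FusedVsUnfused
    {
        // Computes n * m + a both ways for the low element. The FMA form keeps
        // the full-precision product before the add (one rounding, like ARM
        // FMADD / vfmadd231ss); MultiplyScalar + AddScalar rounds twice.
        public static (Vector128<float> Fused, Vector128<float> Unfused) MultiplyAdd(
            Vector128<float> a, Vector128<float> n, Vector128<float> m)
        {
            Vector128<float> fused = Fma.IsSupported
                ? Fma.MultiplyAddScalar(n, m, a)
                : default;

            Vector128<float> unfused = Sse.AddScalar(a, Sse.MultiplyScalar(n, m));

            return (fused, unfused);
        }
    }

For most inputs both results agree; the fallback path is kept for hosts without FMA, at the cost of occasional one-ULP differences from hardware.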
@@ -1830,13 +1981,22 @@ namespace ARMeilleure.Instructions
 
             int sizeF = op.Size & 1;
 
+            Operand res;
+
             if (sizeF == 0)
             {
                 Operand mask = X86GetScalar(context, 2f);
 
-                Operand res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
+                if (Optimizations.UseFma)
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231ss, mask, n, m);
+                }
+                else
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
+                    res = context.AddIntrinsic(Intrinsic.X86Subss, mask, res);
+                }
 
-                res = context.AddIntrinsic(Intrinsic.X86Subss, mask, res);
                 res = EmitSse41RecipStepSelectOpF(context, n, m, res, mask, scalar: true, sizeF);
 
                 context.Copy(GetVec(op.Rd), context.VectorZeroUpper96(res));
@@ -1845,9 +2005,16 @@ namespace ARMeilleure.Instructions
             {
                 Operand mask = X86GetScalar(context, 2d);
 
-                Operand res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
+                if (Optimizations.UseFma)
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231sd, mask, n, m);
+                }
+                else
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
+                    res = context.AddIntrinsic(Intrinsic.X86Subsd, mask, res);
+                }
 
-                res = context.AddIntrinsic(Intrinsic.X86Subsd, mask, res);
                 res = EmitSse41RecipStepSelectOpF(context, n, m, res, mask, scalar: true, sizeF);
 
                 context.Copy(GetVec(op.Rd), context.VectorZeroUpper64(res));
@@ -1877,14 +2044,23 @@ namespace ARMeilleure.Instructions
 
             int sizeF = op.Size & 1;
 
+            Operand res;
+
             if (sizeF == 0)
             {
                 Operand mask = X86GetAllElements(context, 2f);
 
-                Operand res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m);
-                res = EmitSse41RecipStepSelectOpF(context, n, m, res, mask, scalar: false, sizeF);
+                if (Optimizations.UseFma)
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231ps, mask, n, m);
+                }
+                else
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m);
+                    res = context.AddIntrinsic(Intrinsic.X86Subps, mask, res);
+                }
 
-                res = context.AddIntrinsic(Intrinsic.X86Subps, mask, res);
+                res = EmitSse41RecipStepSelectOpF(context, n, m, res, mask, scalar: false, sizeF);
 
                 if (op.RegisterSize == RegisterSize.Simd64)
                 {
@@ -1897,10 +2073,17 @@ namespace ARMeilleure.Instructions
             {
                 Operand mask = X86GetAllElements(context, 2d);
 
-                Operand res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m);
-                res = EmitSse41RecipStepSelectOpF(context, n, m, res, mask, scalar: false, sizeF);
+                if (Optimizations.UseFma)
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231pd, mask, n, m);
+                }
+                else
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m);
+                    res = context.AddIntrinsic(Intrinsic.X86Subpd, mask, res);
+                }
 
-                res = context.AddIntrinsic(Intrinsic.X86Subpd, mask, res);
+                res = EmitSse41RecipStepSelectOpF(context, n, m, res, mask, scalar: false, sizeF);
 
                 context.Copy(GetVec(op.Rd), res);
             }
@@ -2113,20 +2296,32 @@ namespace ARMeilleure.Instructions
 
         public static void Frintx_S(ArmEmitterContext context)
         {
-            // TODO Arm64: Fast path. Should we set host FPCR?
-            EmitScalarUnaryOpF(context, (op1) =>
+            if (Optimizations.UseAdvSimd)
             {
-                return EmitRoundByRMode(context, op1);
-            });
+                InstEmitSimdHelperArm64.EmitScalarUnaryOpF(context, Intrinsic.Arm64FrintxS);
+            }
+            else
+            {
+                EmitScalarUnaryOpF(context, (op1) =>
+                {
+                    return EmitRoundByRMode(context, op1);
+                });
+            }
         }
 
         public static void Frintx_V(ArmEmitterContext context)
         {
-            // TODO Arm64: Fast path. Should we set host FPCR?
-            EmitVectorUnaryOpF(context, (op1) =>
+            if (Optimizations.UseAdvSimd)
             {
-                return EmitRoundByRMode(context, op1);
-            });
+                InstEmitSimdHelperArm64.EmitVectorUnaryOpF(context, Intrinsic.Arm64FrintxV);
+            }
+            else
+            {
+                EmitVectorUnaryOpF(context, (op1) =>
+                {
+                    return EmitRoundByRMode(context, op1);
+                });
+            }
         }
 
         public static void Frintz_S(ArmEmitterContext context)
@@ -2237,16 +2432,25 @@ namespace ARMeilleure.Instructions
 
             int sizeF = op.Size & 1;
 
+            Operand res;
+
             if (sizeF == 0)
             {
                 Operand maskHalf = X86GetScalar(context, 0.5f);
                 Operand maskThree = X86GetScalar(context, 3f);
                 Operand maskOneHalf = X86GetScalar(context, 1.5f);
 
-                Operand res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
+                if (Optimizations.UseFma)
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231ss, maskThree, n, m);
+                }
+                else
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
+                    res = context.AddIntrinsic(Intrinsic.X86Subss, maskThree, res);
+                }
 
-                res = context.AddIntrinsic(Intrinsic.X86Subss, maskThree, res);
-                res = context.AddIntrinsic(Intrinsic.X86Mulss, maskHalf, res);
+                res = context.AddIntrinsic(Intrinsic.X86Mulss, maskHalf, res);
                 res = EmitSse41RecipStepSelectOpF(context, n, m, res, maskOneHalf, scalar: true, sizeF);
 
                 context.Copy(GetVec(op.Rd), context.VectorZeroUpper96(res));
@@ -2257,10 +2461,17 @@ namespace ARMeilleure.Instructions
                 Operand maskThree = X86GetScalar(context, 3d);
                 Operand maskOneHalf = X86GetScalar(context, 1.5d);
 
-                Operand res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
+                if (Optimizations.UseFma)
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231sd, maskThree, n, m);
+                }
+                else
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
+                    res = context.AddIntrinsic(Intrinsic.X86Subsd, maskThree, res);
+                }
 
-                res = context.AddIntrinsic(Intrinsic.X86Subsd, maskThree, res);
-                res = context.AddIntrinsic(Intrinsic.X86Mulsd, maskHalf, res);
+                res = context.AddIntrinsic(Intrinsic.X86Mulsd, maskHalf, res);
                 res = EmitSse41RecipStepSelectOpF(context, n, m, res, maskOneHalf, scalar: true, sizeF);
 
                 context.Copy(GetVec(op.Rd), context.VectorZeroUpper64(res));
@@ -2290,15 +2501,24 @@ namespace ARMeilleure.Instructions
 
             int sizeF = op.Size & 1;
 
+            Operand res;
+
             if (sizeF == 0)
             {
                 Operand maskHalf = X86GetAllElements(context, 0.5f);
                 Operand maskThree = X86GetAllElements(context, 3f);
                 Operand maskOneHalf = X86GetAllElements(context, 1.5f);
 
-                Operand res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m);
+                if (Optimizations.UseFma)
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231ps, maskThree, n, m);
+                }
+                else
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m);
+                    res = context.AddIntrinsic(Intrinsic.X86Subps, maskThree, res);
+                }
 
-                res = context.AddIntrinsic(Intrinsic.X86Subps, maskThree, res);
                 res = context.AddIntrinsic(Intrinsic.X86Mulps, maskHalf, res);
                 res = EmitSse41RecipStepSelectOpF(context, n, m, res, maskOneHalf, scalar: false, sizeF);
 
@@ -2315,9 +2535,16 @@ namespace ARMeilleure.Instructions
                 Operand maskThree = X86GetAllElements(context, 3d);
                 Operand maskOneHalf = X86GetAllElements(context, 1.5d);
 
-                Operand res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m);
+                if (Optimizations.UseFma)
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231pd, maskThree, n, m);
+                }
+                else
+                {
+                    res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m);
+                    res = context.AddIntrinsic(Intrinsic.X86Subpd, maskThree, res);
+                }
 
-                res = context.AddIntrinsic(Intrinsic.X86Subpd, maskThree, res);
                 res = context.AddIntrinsic(Intrinsic.X86Mulpd, maskHalf, res);
                 res = EmitSse41RecipStepSelectOpF(context, n, m, res, maskOneHalf, scalar: false, sizeF);
 
@@ -4728,53 +4955,6 @@ namespace ARMeilleure.Instructions
             }
         }
 
-        public static Operand EmitSseOrAvxHandleFzModeOpF(
-            ArmEmitterContext context,
-            Func2I emit,
-            bool scalar,
-            Operand n = default,
-            Operand m = default)
-        {
-            Operand nCopy = n == default ? context.Copy(GetVec(((OpCodeSimdReg)context.CurrOp).Rn)) : n;
-            Operand mCopy = m == default ? context.Copy(GetVec(((OpCodeSimdReg)context.CurrOp).Rm)) : m;
-
-            EmitSseOrAvxEnterFtzAndDazModesOpF(context, out Operand isTrue);
-
-            Operand res = emit(nCopy, mCopy);
-
-            EmitSseOrAvxExitFtzAndDazModesOpF(context, isTrue);
-
-            if (n != default || m != default)
-            {
-                return res;
-            }
-
-            int sizeF = ((IOpCodeSimd)context.CurrOp).Size & 1;
-
-            if (sizeF == 0)
-            {
-                if (scalar)
-                {
-                    res = context.VectorZeroUpper96(res);
-                }
-                else if (((OpCodeSimdReg)context.CurrOp).RegisterSize == RegisterSize.Simd64)
-                {
-                    res = context.VectorZeroUpper64(res);
-                }
-            }
-            else /* if (sizeF == 1) */
-            {
-                if (scalar)
-                {
-                    res = context.VectorZeroUpper64(res);
-                }
-            }
-
-            context.Copy(GetVec(((OpCodeSimdReg)context.CurrOp).Rd), res);
-
-            return default;
-        }
-
         private static Operand EmitSse2VectorMaxMinOpF(ArmEmitterContext context, Operand n, Operand m, bool isMax)
         {
             IOpCodeSimd op = (IOpCodeSimd)context.CurrOp;
@@ -4834,10 +5014,7 @@ namespace ARMeilleure.Instructions
 
             Operand res = EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
             {
-                return EmitSseOrAvxHandleFzModeOpF(context, (op1, op2) =>
-                {
-                    return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: isMaxNum);
-                }, scalar: scalar, op1, op2);
+                return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: isMaxNum);
             }, scalar: scalar, nCopy, mCopy);
 
             if (n != default || m != default)
@@ -4872,10 +5049,7 @@ namespace ARMeilleure.Instructions
 
             Operand res = EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
             {
-                return EmitSseOrAvxHandleFzModeOpF(context, (op1, op2) =>
-                {
-                    return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: isMaxNum);
-                }, scalar: scalar, op1, op2);
+                return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: isMaxNum);
             }, scalar: scalar, nCopy, mCopy);
 
             if (n != default || m != default)
diff --git a/ARMeilleure/Instructions/InstEmitSimdCvt32.cs b/ARMeilleure/Instructions/InstEmitSimdCvt32.cs
index 5fdc3b5a..33ae83df 100644
--- a/ARMeilleure/Instructions/InstEmitSimdCvt32.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdCvt32.cs
@@ -356,9 +356,11 @@ namespace ARMeilleure.Instructions
                     ? typeof(SoftFloat64_16).GetMethod(nameof(SoftFloat64_16.FPConvert))
                     : typeof(SoftFloat32_16).GetMethod(nameof(SoftFloat32_16.FPConvert));
 
+                context.ExitArmFpMode();
                 context.StoreToContext();
                 Operand res = context.Call(method, src);
                 context.LoadFromContext();
+                context.EnterArmFpMode();
 
                 InsertScalar16(context, op.Vd, op.T, res);
             }
@@ -372,9 +374,11 @@ namespace ARMeilleure.Instructions
                     ? typeof(SoftFloat16_64).GetMethod(nameof(SoftFloat16_64.FPConvert))
                     : typeof(SoftFloat16_32).GetMethod(nameof(SoftFloat16_32.FPConvert));
 
+                context.ExitArmFpMode();
                 context.StoreToContext();
                 Operand res = context.Call(method, src);
                 context.LoadFromContext();
+                context.EnterArmFpMode();
 
                 InsertScalar(context, op.Vd, res);
             }
@@ -542,10 +546,17 @@ namespace ARMeilleure.Instructions
         // VRINTX (floating-point).
         public static void Vrintx_S(ArmEmitterContext context)
         {
-            EmitScalarUnaryOpF32(context, (op1) =>
+            if (Optimizations.UseAdvSimd)
             {
-                return EmitRoundByRMode(context, op1);
-            });
+                InstEmitSimdHelper32Arm64.EmitScalarUnaryOpF32(context, Intrinsic.Arm64FrintxS);
+            }
+            else
+            {
+                EmitScalarUnaryOpF32(context, (op1) =>
+                {
+                    return EmitRoundByRMode(context, op1);
+                });
+            }
         }
 
         private static Operand EmitFPConvert(ArmEmitterContext context, Operand value, OperandType type, bool signed)
diff --git a/ARMeilleure/Instructions/InstEmitSimdHelper.cs b/ARMeilleure/Instructions/InstEmitSimdHelper.cs
index 0e7af794..c44c9b4d 100644
--- a/ARMeilleure/Instructions/InstEmitSimdHelper.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdHelper.cs
@@ -1,3 +1,4 @@
+using ARMeilleure.CodeGen.X86;
 using ARMeilleure.Decoders;
 using ARMeilleure.IntermediateRepresentation;
 using ARMeilleure.State;
@@ -158,6 +159,75 @@ namespace ARMeilleure.Instructions
         };
 #endregion
 
+        public static void EnterArmFpMode(EmitterContext context, Func<FPState, Operand> getFpFlag)
+        {
+            if (Optimizations.UseSse2)
+            {
+                Operand mxcsr = context.AddIntrinsicInt(Intrinsic.X86Stmxcsr);
+
+                Operand fzTrue = getFpFlag(FPState.FzFlag);
+                Operand r0True = getFpFlag(FPState.RMode0Flag);
+                Operand r1True = getFpFlag(FPState.RMode1Flag);
+
+                mxcsr = context.BitwiseAnd(mxcsr, Const(~(int)(Mxcsr.Ftz | Mxcsr.Daz | Mxcsr.Rhi | Mxcsr.Rlo)));
+
+                mxcsr = context.BitwiseOr(mxcsr, context.ConditionalSelect(fzTrue, Const((int)(Mxcsr.Ftz | Mxcsr.Daz | Mxcsr.Um | Mxcsr.Dm)), Const(0)));
+
+                // X86 round modes in order: nearest, negative, positive, zero
+                // ARM round modes in order: nearest, positive, negative, zero
+                // Read the bits backwards to correct this.
+
+                mxcsr = context.BitwiseOr(mxcsr, context.ConditionalSelect(r0True, Const((int)Mxcsr.Rhi), Const(0)));
+                mxcsr = context.BitwiseOr(mxcsr, context.ConditionalSelect(r1True, Const((int)Mxcsr.Rlo), Const(0)));
+
+                context.AddIntrinsicNoRet(Intrinsic.X86Ldmxcsr, mxcsr);
+            }
+            else if (Optimizations.UseAdvSimd)
+            {
+                Operand fpcr = context.AddIntrinsicInt(Intrinsic.Arm64MrsFpcr);
+
+                Operand fzTrue = getFpFlag(FPState.FzFlag);
+                Operand r0True = getFpFlag(FPState.RMode0Flag);
+                Operand r1True = getFpFlag(FPState.RMode1Flag);
+
+                fpcr = context.BitwiseAnd(fpcr, Const(~(int)(FPCR.Fz | FPCR.RMode0 | FPCR.RMode1)));
+
+                fpcr = context.BitwiseOr(fpcr, context.ConditionalSelect(fzTrue, Const((int)FPCR.Fz), Const(0)));
+                fpcr = context.BitwiseOr(fpcr, context.ConditionalSelect(r0True, Const((int)FPCR.RMode0), Const(0)));
+                fpcr = context.BitwiseOr(fpcr, context.ConditionalSelect(r1True, Const((int)FPCR.RMode1), Const(0)));
+
+                context.AddIntrinsicNoRet(Intrinsic.Arm64MsrFpcr, fpcr);
+
+                // TODO: Restore FPSR
+            }
+        }
+
+        public static void ExitArmFpMode(EmitterContext context, Action<FPState, Operand> setFpFlag)
+        {
+            if (Optimizations.UseSse2)
+            {
+                Operand mxcsr = context.AddIntrinsicInt(Intrinsic.X86Stmxcsr);
+
+                // Unset round mode (to nearest) and ftz.
+                mxcsr = context.BitwiseAnd(mxcsr, Const(~(int)(Mxcsr.Ftz | Mxcsr.Daz | Mxcsr.Rhi | Mxcsr.Rlo)));
+
+                context.AddIntrinsicNoRet(Intrinsic.X86Ldmxcsr, mxcsr);
+
+                // Status flags would be stored here if they were used.
+            }
+            else if (Optimizations.UseAdvSimd)
+            {
+                Operand fpcr = context.AddIntrinsicInt(Intrinsic.Arm64MrsFpcr);
+
+                // Unset round mode (to nearest) and fz.
+                fpcr = context.BitwiseAnd(fpcr, Const(~(int)(FPCR.Fz | FPCR.RMode0 | FPCR.RMode1)));
+
+                context.AddIntrinsicNoRet(Intrinsic.Arm64MsrFpcr, fpcr);
+
+                // TODO: Store FPSR
+            }
+        }
+
         public static int GetImmShl(OpCodeSimdShImm op)
         {
             return op.Imm - (8 << op.Size);
@@ -465,9 +535,11 @@ namespace ARMeilleure.Instructions
                 ? typeof(SoftFloat32).GetMethod(name)
                 : typeof(SoftFloat64).GetMethod(name);
 
+            context.ExitArmFpMode();
             context.StoreToContext();
             Operand res = context.Call(info, callArgs);
             context.LoadFromContext();
+            context.EnterArmFpMode();
 
             return res;
         }
@@ -1358,39 +1430,6 @@ namespace ARMeilleure.Instructions
             }
         }
 
-        [Flags]
-        public enum Mxcsr
-        {
-            Ftz = 1 << 15, // Flush To Zero.
-            Um = 1 << 11, // Underflow Mask.
-            Dm = 1 << 8, // Denormal Mask.
-            Daz = 1 << 6 // Denormals Are Zero.
-        }
-
-        public static void EmitSseOrAvxEnterFtzAndDazModesOpF(ArmEmitterContext context, out Operand isTrue)
-        {
-            isTrue = GetFpFlag(FPState.FzFlag);
-
-            Operand lblTrue = Label();
-            context.BranchIfFalse(lblTrue, isTrue);
-
-            context.AddIntrinsicNoRet(Intrinsic.X86Mxcsrmb, Const((int)(Mxcsr.Ftz | Mxcsr.Um | Mxcsr.Dm | Mxcsr.Daz)));
-
-            context.MarkLabel(lblTrue);
-        }
-
-        public static void EmitSseOrAvxExitFtzAndDazModesOpF(ArmEmitterContext context, Operand isTrue = default)
-        {
-            isTrue = isTrue == default ? GetFpFlag(FPState.FzFlag) : isTrue;
-
-            Operand lblTrue = Label();
-            context.BranchIfFalse(lblTrue, isTrue);
-
-            context.AddIntrinsicNoRet(Intrinsic.X86Mxcsrub, Const((int)(Mxcsr.Ftz | Mxcsr.Daz)));
-
-            context.MarkLabel(lblTrue);
-        }
-
         public enum CmpCondition
         {
             // Legacy Sse.
diff --git a/ARMeilleure/Instructions/InstEmitSimdHelper32.cs b/ARMeilleure/Instructions/InstEmitSimdHelper32.cs
index 84b01d05..36d27d42 100644
--- a/ARMeilleure/Instructions/InstEmitSimdHelper32.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdHelper32.cs
@@ -1197,9 +1197,11 @@ namespace ARMeilleure.Instructions
             Array.Resize(ref callArgs, callArgs.Length + 1);
             callArgs[callArgs.Length - 1] = Const(1);
 
+            context.ExitArmFpMode();
             context.StoreToContext();
             Operand res = context.Call(info, callArgs);
             context.LoadFromContext();
+            context.EnterArmFpMode();
 
             return res;
         }
diff --git a/ARMeilleure/Instructions/InstEmitSystem.cs b/ARMeilleure/Instructions/InstEmitSystem.cs
index 1345bbf1..f668b83b 100644
--- a/ARMeilleure/Instructions/InstEmitSystem.cs
+++ b/ARMeilleure/Instructions/InstEmitSystem.cs
@@ -192,6 +192,8 @@ namespace ARMeilleure.Instructions
                     SetFpFlag(context, (FPState)flag, context.BitwiseAnd(context.ShiftRightUI(fpcr, Const(flag)), Const(1)));
                 }
             }
+
+            context.UpdateArmFpMode();
         }
 
         private static void EmitSetFpsr(ArmEmitterContext context)
@@ -210,6 +212,8 @@ namespace ARMeilleure.Instructions
                     SetFpFlag(context, (FPState)flag, context.BitwiseAnd(context.ShiftRightUI(fpsr, Const(flag)), Const(1)));
                 }
             }
+
+            context.UpdateArmFpMode();
         }
     }
 }
diff --git a/ARMeilleure/Instructions/InstEmitSystem32.cs b/ARMeilleure/Instructions/InstEmitSystem32.cs
index e07db412..2f6cf19d 100644
--- a/ARMeilleure/Instructions/InstEmitSystem32.cs
+++ b/ARMeilleure/Instructions/InstEmitSystem32.cs
@@ -321,6 +321,8 @@ namespace ARMeilleure.Instructions
                     SetFpFlag(context, (FPState)flag, context.BitwiseAnd(context.ShiftRightUI(fpscr, Const(flag)), Const(1)));
                 }
             }
+
+            context.UpdateArmFpMode();
         }
     }
 }
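Note: the "read the bits backwards" comment in EnterArmFpMode follows from the two encodings. ARM FPCR.RMode is {0: nearest, 1: toward +inf, 2: toward -inf, 3: toward zero}, while the x86 MXCSR RC field is {0: nearest, 1: toward -inf, 2: toward +inf, 3: toward zero}, so swapping the two bits converts one encoding to the other; that is why RMode0Flag feeds Mxcsr.Rhi and RMode1Flag feeds Mxcsr.Rlo. A standalone sketch of the mapping (hypothetical helper, not patch code):

    static class RoundModeDemo
    {
        // Swap the two rounding-mode bits to convert ARM FPCR.RMode
        // (0 nearest, 1 +inf, 2 -inf, 3 zero) into the x86 MXCSR RC field
        // (0 nearest, 1 -inf, 2 +inf, 3 zero). The same swap also converts back.
        public static int ArmToX86RoundMode(int armRMode)
        {
            return ((armRMode & 1) << 1) | ((armRMode >> 1) & 1);
        }
    }

Checking all four values: 0 -> 0, 1 -> 2, 2 -> 1, 3 -> 3, which matches the table above.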
diff --git a/ARMeilleure/IntermediateRepresentation/Intrinsic.cs b/ARMeilleure/IntermediateRepresentation/Intrinsic.cs
index b629345e..f5a776fa 100644
--- a/ARMeilleure/IntermediateRepresentation/Intrinsic.cs
+++ b/ARMeilleure/IntermediateRepresentation/Intrinsic.cs
@@ -53,6 +53,7 @@ namespace ARMeilleure.IntermediateRepresentation
         X86Haddpd,
         X86Haddps,
         X86Insertps,
+        X86Ldmxcsr,
         X86Maxpd,
         X86Maxps,
         X86Maxsd,
@@ -68,8 +69,6 @@ namespace ARMeilleure.IntermediateRepresentation
         X86Mulps,
         X86Mulsd,
         X86Mulss,
-        X86Mxcsrmb,
-        X86Mxcsrub,
         X86Paddb,
         X86Paddd,
         X86Paddq,
@@ -153,6 +152,7 @@ namespace ARMeilleure.IntermediateRepresentation
         X86Sqrtps,
         X86Sqrtsd,
         X86Sqrtss,
+        X86Stmxcsr,
         X86Subpd,
         X86Subps,
         X86Subsd,
@@ -163,11 +163,13 @@ namespace ARMeilleure.IntermediateRepresentation
         X86Unpcklps,
         X86Vcvtph2ps,
         X86Vcvtps2ph,
+        X86Vfmadd231pd,
         X86Vfmadd231ps,
         X86Vfmadd231sd,
         X86Vfmadd231ss,
         X86Vfmsub231sd,
         X86Vfmsub231ss,
+        X86Vfnmadd231pd,
         X86Vfnmadd231ps,
         X86Vfnmadd231sd,
         X86Vfnmadd231ss,
@@ -394,6 +396,8 @@ namespace ARMeilleure.IntermediateRepresentation
         Arm64MlsVe,
         Arm64MlsV,
         Arm64MoviV,
+        Arm64MrsFpcr,
+        Arm64MsrFpcr,
         Arm64MrsFpsr,
         Arm64MsrFpsr,
         Arm64MulVe,
diff --git a/ARMeilleure/Translation/ArmEmitterContext.cs b/ARMeilleure/Translation/ArmEmitterContext.cs
index 238f8508..565d2aad 100644
--- a/ARMeilleure/Translation/ArmEmitterContext.cs
+++ b/ARMeilleure/Translation/ArmEmitterContext.cs
@@ -188,6 +188,21 @@ namespace ARMeilleure.Translation
             }
         }
 
+        public void EnterArmFpMode()
+        {
+            InstEmitSimdHelper.EnterArmFpMode(this, InstEmitHelper.GetFpFlag);
+        }
+
+        public void UpdateArmFpMode()
+        {
+            EnterArmFpMode();
+        }
+
+        public void ExitArmFpMode()
+        {
+            InstEmitSimdHelper.ExitArmFpMode(this, (flag, value) => InstEmitHelper.SetFpFlag(this, flag, value));
+        }
+
         public Operand TryGetComparisonResult(Condition condition)
         {
             if (_optOpLastCompare == null || _optOpLastCompare != _optOpLastFlagSet)
diff --git a/ARMeilleure/Translation/DispatcherFunction.cs b/ARMeilleure/Translation/DispatcherFunction.cs
index e3ea21f6..7d5a3388 100644
--- a/ARMeilleure/Translation/DispatcherFunction.cs
+++ b/ARMeilleure/Translation/DispatcherFunction.cs
@@ -3,4 +3,5 @@ namespace ARMeilleure.Translation
 {
     delegate void DispatcherFunction(IntPtr nativeContext, ulong startAddress);
+    delegate ulong WrapperFunction(IntPtr nativeContext, ulong startAddress);
 }
diff --git a/ARMeilleure/Translation/PTC/Ptc.cs b/ARMeilleure/Translation/PTC/Ptc.cs
index 17f68706..5970c4ff 100644
--- a/ARMeilleure/Translation/PTC/Ptc.cs
+++ b/ARMeilleure/Translation/PTC/Ptc.cs
@@ -30,7 +30,7 @@ namespace ARMeilleure.Translation.PTC
         private const string OuterHeaderMagicString = "PTCohd\0\0";
         private const string InnerHeaderMagicString = "PTCihd\0\0";
 
-        private const uint InternalVersion = 4485; //! To be incremented manually for each change to the ARMeilleure project.
+        private const uint InternalVersion = 4626; //! To be incremented manually for each change to the ARMeilleure project.
 
         private const string ActualDir = "0";
         private const string BackupDir = "1";
diff --git a/ARMeilleure/Translation/TranslatedFunction.cs b/ARMeilleure/Translation/TranslatedFunction.cs
index 71eec08a..f007883e 100644
--- a/ARMeilleure/Translation/TranslatedFunction.cs
+++ b/ARMeilleure/Translation/TranslatedFunction.cs
@@ -25,5 +25,10 @@ namespace ARMeilleure.Translation
         {
             return _func(context.NativeContextPtr);
         }
+
+        public ulong Execute(WrapperFunction dispatcher, State.ExecutionContext context)
+        {
+            return dispatcher(context.NativeContextPtr, (ulong)FuncPointer);
+        }
     }
 }
\ No newline at end of file
diff --git a/ARMeilleure/Translation/Translator.cs b/ARMeilleure/Translation/Translator.cs
index 0c05b2b4..f349c5eb 100644
--- a/ARMeilleure/Translation/Translator.cs
+++ b/ARMeilleure/Translation/Translator.cs
@@ -183,7 +183,7 @@ namespace ARMeilleure.Translation
 
                 Statistics.StartTimer();
 
-                ulong nextAddr = func.Execute(context);
+                ulong nextAddr = func.Execute(Stubs.ContextWrapper, context);
 
                 Statistics.StopTimer(address);
 
@@ -194,7 +194,7 @@ namespace ARMeilleure.Translation
             {
                 TranslatedFunction func = Translate(address, context.ExecutionMode, highCq: false, singleStep: true);
 
-                address = func.Execute(context);
+                address = func.Execute(Stubs.ContextWrapper, context);
 
                 EnqueueForDeletion(address, func);
diff --git a/ARMeilleure/Translation/TranslatorStubs.cs b/ARMeilleure/Translation/TranslatorStubs.cs
index 6ed84de8..69648df4 100644
--- a/ARMeilleure/Translation/TranslatorStubs.cs
+++ b/ARMeilleure/Translation/TranslatorStubs.cs
@@ -21,6 +21,7 @@ namespace ARMeilleure.Translation
         private readonly Translator _translator;
         private readonly Lazy<IntPtr> _dispatchStub;
         private readonly Lazy<DispatcherFunction> _dispatchLoop;
+        private readonly Lazy<WrapperFunction> _contextWrapper;
 
         /// <summary>
         /// Gets the dispatch stub.
         /// </summary>
@@ -64,6 +65,20 @@ namespace ARMeilleure.Translation
             }
         }
 
+        /// <summary>
+        /// Gets the context wrapper function.
+        /// </summary>
+        /// <exception cref="ObjectDisposedException"><see cref="TranslatorStubs"/> instance was disposed</exception>
+        public WrapperFunction ContextWrapper
+        {
+            get
+            {
+                ObjectDisposedException.ThrowIf(_disposed, this);
+
+                return _contextWrapper.Value;
+            }
+        }
+
         /// <summary>
         /// Initializes a new instance of the <see cref="TranslatorStubs"/> class with the specified
        /// <see cref="Translator"/> instance.
         /// </summary>
@@ -77,6 +92,7 @@ namespace ARMeilleure.Translation
             _translator = translator;
             _dispatchStub = new(GenerateDispatchStub, isThreadSafe: true);
             _dispatchLoop = new(GenerateDispatchLoop, isThreadSafe: true);
+            _contextWrapper = new(GenerateContextWrapper, isThreadSafe: true);
         }
 
@@ -202,6 +218,32 @@ namespace ARMeilleure.Translation
             return Marshal.GetFunctionPointerForDelegate(func);
         }
 
+        /// <summary>
+        /// Emits code that syncs FP state before executing guest code, or returns it to normal.
+        /// </summary>
+        /// <param name="context">Emitter context for the method</param>
+        /// <param name="nativeContext">Pointer to the native context</param>
+        /// <param name="enter">True if entering guest code, false otherwise</param>
+        private void EmitSyncFpContext(EmitterContext context, Operand nativeContext, bool enter)
+        {
+            if (enter)
+            {
+                InstEmitSimdHelper.EnterArmFpMode(context, (flag) =>
+                {
+                    Operand flagAddress = context.Add(nativeContext, Const((ulong)NativeContext.GetRegisterOffset(new Register((int)flag, RegisterType.FpFlag))));
+                    return context.Load(OperandType.I32, flagAddress);
+                });
+            }
+            else
+            {
+                InstEmitSimdHelper.ExitArmFpMode(context, (flag, value) =>
+                {
+                    Operand flagAddress = context.Add(nativeContext, Const((ulong)NativeContext.GetRegisterOffset(new Register((int)flag, RegisterType.FpFlag))));
+                    context.Store(flagAddress, value);
+                });
+            }
+        }
+
         /// <summary>
         /// Generates a <see cref="DispatcherFunction"/> function.
         /// </summary>
@@ -221,6 +263,8 @@ namespace ARMeilleure.Translation
             Operand runningAddress = context.Add(nativeContext, Const((ulong)NativeContext.GetRunningOffset()));
             Operand dispatchAddress = context.Add(nativeContext, Const((ulong)NativeContext.GetDispatchAddressOffset()));
 
+            EmitSyncFpContext(context, nativeContext, true);
+
             context.MarkLabel(beginLbl);
             context.Store(dispatchAddress, guestAddress);
             context.Copy(guestAddress, context.Call(Const((ulong)DispatchStub), OperandType.I64, nativeContext));
@@ -229,6 +273,9 @@ namespace ARMeilleure.Translation
             context.Branch(beginLbl);
 
             context.MarkLabel(endLbl);
+
+            EmitSyncFpContext(context, nativeContext, false);
+
             context.Return();
 
             var cfg = context.GetControlFlowGraph();
@@ -237,5 +284,29 @@ namespace ARMeilleure.Translation
 
             return Compiler.Compile(cfg, argTypes, retType, CompilerOptions.HighCq, RuntimeInformation.ProcessArchitecture).Map<DispatcherFunction>();
         }
+
+        /// <summary>
+        /// Generates a <see cref="WrapperFunction"/> function.
+        /// </summary>
+        /// <returns><see cref="WrapperFunction"/> function</returns>
+        private WrapperFunction GenerateContextWrapper()
+        {
+            var context = new EmitterContext();
+
+            Operand nativeContext = context.LoadArgument(OperandType.I64, 0);
+            Operand guestMethod = context.LoadArgument(OperandType.I64, 1);
+
+            EmitSyncFpContext(context, nativeContext, true);
+            Operand returnValue = context.Call(guestMethod, OperandType.I64, nativeContext);
+            EmitSyncFpContext(context, nativeContext, false);
+
+            context.Return(returnValue);
+
+            var cfg = context.GetControlFlowGraph();
+
+            var retType = OperandType.I64;
+            var argTypes = new[] { OperandType.I64, OperandType.I64 };
+
+            return Compiler.Compile(cfg, argTypes, retType, CompilerOptions.HighCq, RuntimeInformation.ProcessArchitecture).Map<WrapperFunction>();
+        }
     }
 }
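Note: with the ContextWrapper in place, every entry into guest code is bracketed by FP-state sync: Translator.cs calls func.Execute(Stubs.ContextWrapper, context), which per TranslatedFunction.cs expands to ContextWrapper(context.NativeContextPtr, (ulong)FuncPointer). A managed analogue of the generated wrapper body, with hypothetical stand-ins since the real wrapper is JIT-compiled native code rather than C#:

    using System;

    static class ContextWrapperDemo
    {
        // Mirrors the emitted sequence in GenerateContextWrapper: sync host FP
        // state from the guest context, call the guest method, restore defaults,
        // and pass the next guest address straight through.
        public static ulong Invoke(
            IntPtr nativeContext,
            Func<IntPtr, ulong> guestMethod,
            Action enterArmFpMode,
            Action exitArmFpMode)
        {
            enterArmFpMode();                           // host MXCSR/FPCR <- guest FPCR flags
            ulong nextAddress = guestMethod(nativeContext);
            exitArmFpMode();                            // host FP state -> .NET defaults
            return nextAddress;
        }
    }

The dispatch loop gets the same treatment inline (EmitSyncFpContext at its entry and exit), so FP state only needs to change once per guest entry rather than per instruction.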
diff --git a/ARMeilleure/Translation/TranslatorTestMethods.cs b/ARMeilleure/Translation/TranslatorTestMethods.cs
new file mode 100644
index 00000000..ab96019a
--- /dev/null
+++ b/ARMeilleure/Translation/TranslatorTestMethods.cs
@@ -0,0 +1,148 @@
+using ARMeilleure.CodeGen.X86;
+using ARMeilleure.IntermediateRepresentation;
+using ARMeilleure.State;
+using ARMeilleure.Translation;
+using System;
+using System.Runtime.InteropServices;
+using static ARMeilleure.IntermediateRepresentation.Operand.Factory;
+
+namespace ARMeilleure.Translation
+{
+    public static class TranslatorTestMethods
+    {
+        public delegate int FpFlagsPInvokeTest(IntPtr managedMethod);
+
+        private static bool SetPlatformFtz(EmitterContext context, bool ftz)
+        {
+            if (Optimizations.UseSse2)
+            {
+                Operand mxcsr = context.AddIntrinsicInt(Intrinsic.X86Stmxcsr);
+
+                if (ftz)
+                {
+                    mxcsr = context.BitwiseOr(mxcsr, Const((int)(Mxcsr.Ftz | Mxcsr.Um | Mxcsr.Dm)));
+                }
+                else
+                {
+                    mxcsr = context.BitwiseAnd(mxcsr, Const(~(int)Mxcsr.Ftz));
+                }
+
+                context.AddIntrinsicNoRet(Intrinsic.X86Ldmxcsr, mxcsr);
+
+                return true;
+            }
+            else if (Optimizations.UseAdvSimd)
+            {
+                Operand fpcr = context.AddIntrinsicInt(Intrinsic.Arm64MrsFpcr);
+
+                if (ftz)
+                {
+                    fpcr = context.BitwiseOr(fpcr, Const((int)FPCR.Fz));
+                }
+                else
+                {
+                    fpcr = context.BitwiseAnd(fpcr, Const(~(int)FPCR.Fz));
+                }
+
+                context.AddIntrinsicNoRet(Intrinsic.Arm64MsrFpcr, fpcr);
+
+                return true;
+            }
+            else
+            {
+                return false;
+            }
+        }
+
+        private static Operand FpBitsToInt(EmitterContext context, Operand fp)
+        {
+            Operand vec = context.VectorInsert(context.VectorZero(), fp, 0);
+            return context.VectorExtract(OperandType.I32, vec, 0);
+        }
+
+        public static FpFlagsPInvokeTest GenerateFpFlagsPInvokeTest()
+        {
+            EmitterContext context = new EmitterContext();
+
+            Operand methodAddress = context.Copy(context.LoadArgument(OperandType.I64, 0));
+
+            // Verify that default dotnet fp state does not flush to zero.
+            // This is required for SoftFloat to function.
+
+            // Denormal + zero != 0
+
+            Operand denormal = ConstF(BitConverter.Int32BitsToSingle(1)); // 1.40129846432e-45
+            Operand zeroF = ConstF(0f);
+            Operand zero = Const(0);
+
+            Operand result = context.Add(zeroF, denormal);
+
+            // Must not be zero.
+
+            Operand correct1Label = Label();
+
+            context.BranchIfFalse(correct1Label, context.ICompareEqual(FpBitsToInt(context, result), zero));
+
+            context.Return(Const(1));
+
+            context.MarkLabel(correct1Label);
+
+            // Set flush to zero flag. If unsupported by the backend, just return true.
+
+            if (!SetPlatformFtz(context, true))
+            {
+                context.Return(Const(0));
+            }
+
+            // Denormal + zero == 0
+
+            Operand resultFz = context.Add(zeroF, denormal);
+
+            // Must equal zero.
+
+            Operand correct2Label = Label();
+
+            context.BranchIfTrue(correct2Label, context.ICompareEqual(FpBitsToInt(context, resultFz), zero));
+
+            SetPlatformFtz(context, false);
+
+            context.Return(Const(2));
+
+            context.MarkLabel(correct2Label);
+
+            // Call a managed method. This method should not change Fz state.
+
+            context.Call(methodAddress, OperandType.None);
+
+            // Denormal + zero == 0
+
+            Operand resultFz2 = context.Add(zeroF, denormal);
+
+            // Must equal zero.
+
+            Operand correct3Label = Label();
+
+            context.BranchIfTrue(correct3Label, context.ICompareEqual(FpBitsToInt(context, resultFz2), zero));
+
+            SetPlatformFtz(context, false);
+
+            context.Return(Const(3));
+
+            context.MarkLabel(correct3Label);
+
+            // Success.
+
+            SetPlatformFtz(context, false);
+
+            context.Return(Const(0));
+
+            // Compile and return the function.
+
+            ControlFlowGraph cfg = context.GetControlFlowGraph();
+
+            OperandType[] argTypes = new OperandType[] { OperandType.I64 };
+
+            return Compiler.Compile(cfg, argTypes, OperandType.I32, CompilerOptions.HighCq, RuntimeInformation.ProcessArchitecture).Map<FpFlagsPInvokeTest>();
+        }
+    }
+}
diff --git a/Ryujinx.Tests/Cpu/EnvironmentTests.cs b/Ryujinx.Tests/Cpu/EnvironmentTests.cs
new file mode 100644
index 00000000..d374c08a
--- /dev/null
+++ b/Ryujinx.Tests/Cpu/EnvironmentTests.cs
@@ -0,0 +1,91 @@
+using ARMeilleure.Translation;
+using NUnit.Framework;
+using Ryujinx.Cpu.Jit;
+using Ryujinx.Tests.Memory;
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+namespace Ryujinx.Tests.Cpu
+{
+    internal class EnvironmentTests
+    {
+        private static Translator _translator;
+
+        private void EnsureTranslator()
+        {
+            // Create a translator, as one is needed to register the signal handler or emit methods.
+            _translator ??= new Translator(new JitMemoryAllocator(), new MockMemoryManager(), true);
+        }
+
+        [MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.NoOptimization)]
+        private float GetDenormal()
+        {
+            return BitConverter.Int32BitsToSingle(1);
+        }
+
+        [MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.NoOptimization)]
+        private float GetZero()
+        {
+            return BitConverter.Int32BitsToSingle(0);
+        }
+
+        /// <summary>
+        /// This test ensures that managed methods do not reset floating point control flags.
+        /// This is used to avoid changing control flags when running methods that don't require it, such as SVC calls, software memory...
+        /// </summary>
+        [Test]
+        public void FpFlagsPInvoke()
+        {
+            EnsureTranslator();
+
+            // Subnormal results are not flushed to zero by default.
+            // This operation should not be allowed to do constant propagation, hence the methods that explicitly disallow inlining.
+            Assert.AreNotEqual(GetDenormal() + GetZero(), 0f);
+
+            bool methodCalled = false;
+            bool isFz = false;
+
+            var managedMethod = () =>
+            {
+                // Floating point math should not modify fp flags.
+                float test = 2f * 3.5f;
+
+                if (test < 4f)
+                {
+                    throw new System.Exception("Sanity check.");
+                }
+
+                isFz = GetDenormal() + GetZero() == 0f;
+
+                try
+                {
+                    if (test >= 4f)
+                    {
+                        throw new System.Exception("Always throws.");
+                    }
+                }
+                catch
+                {
+                    // Exception handling should not modify fp flags.
+
+                    methodCalled = true;
+                }
+            };
+
+            var method = TranslatorTestMethods.GenerateFpFlagsPInvokeTest();
+
+            // This method sets flush-to-zero and then calls the managed method.
+            // Before and after setting the flags, it ensures subnormal addition works as expected.
+            // It returns a positive result if any tests fail, and 0 on success (or if the platform cannot change FP flags).
+            int result = method(Marshal.GetFunctionPointerForDelegate(managedMethod));
+
+            // Subnormal results are not flushed to zero by default, which we should have returned to on exiting the method.
+            Assert.AreNotEqual(GetDenormal() + GetZero(), 0f);
+
+            Assert.True(result == 0);
+            Assert.True(methodCalled);
+            Assert.True(isFz);
+        }
+    }
+}