ryujinx/Ryujinx.Graphics/VDec/VideoDecoder.cs

281 lines
12 KiB
C#
Raw Normal View History

Add a new JIT compiler for CPU code (#693) * Start of the ARMeilleure project * Refactoring around the old IRAdapter, now renamed to PreAllocator * Optimize the LowestBitSet method * Add CLZ support and fix CLS implementation * Add missing Equals and GetHashCode overrides on some structs, misc small tweaks * Implement the ByteSwap IR instruction, and some refactoring on the assembler * Implement the DivideUI IR instruction and fix 64-bits IDIV * Correct constant operand type on CSINC * Move division instructions implementation to InstEmitDiv * Fix destination type for the ConditionalSelect IR instruction * Implement UMULH and SMULH, with new IR instructions * Fix some issues with shift instructions * Fix constant types for BFM instructions * Fix up new tests using the new V128 struct * Update tests * Move DIV tests to a separate file * Add support for calls, and some instructions that depends on them * Start adding support for SIMD & FP types, along with some of the related ARM instructions * Fix some typos and the divide instruction with FP operands * Fix wrong method call on Clz_V * Implement ARM FP & SIMD move instructions, Saddlv_V, and misc. fixes * Implement SIMD logical instructions and more misc. fixes * Fix PSRAD x86 instruction encoding, TRN, UABD and UABDL implementations * Implement float conversion instruction, merge in LDj3SNuD fixes, and some other misc. 
fixes * Implement SIMD shift instruction and fix Dup_V * Add SCVTF and UCVTF (vector, fixed-point) variants to the opcode table * Fix check with tolerance on tester * Implement FP & SIMD comparison instructions, and some fixes * Update FCVT (Scalar) encoding on the table to support the Half-float variants * Support passing V128 structs, some cleanup on the register allocator, merge LDj3SNuD fixes * Use old memory access methods, made a start on SIMD memory insts support, some fixes * Fix float constant passed to functions, save and restore non-volatile XMM registers, other fixes * Fix arguments count with struct return values, other fixes * More instructions * Misc. fixes and integrate LDj3SNuD fixes * Update tests * Add a faster linear scan allocator, unwinding support on windows, and other changes * Update Ryujinx.HLE * Update Ryujinx.Graphics * Fix V128 return pointer passing, RCX is clobbered * Update Ryujinx.Tests * Update ITimeZoneService * Stop using GetFunctionPointer as that can't be called from native code, misc. 
fixes and tweaks * Use generic GetFunctionPointerForDelegate method and other tweaks * Some refactoring on the code generator, assert on invalid operations and use a separate enum for intrinsics * Remove some unused code on the assembler * Fix REX.W prefix regression on float conversion instructions, add some sort of profiler * Add hardware capability detection * Fix regression on Sha1h and revert Fcm** changes * Add SSE2-only paths on vector extract and insert, some refactoring on the pre-allocator * Fix silly mistake introduced on last commit on CpuId * Generate inline stack probes when the stack allocation is too large * Initial support for the System-V ABI * Support multiple destination operands * Fix SSE2 VectorInsert8 path, and other fixes * Change placement of XMM callee save and restore code to match other compilers * Rename Dest to Destination and Inst to Instruction * Fix a regression related to calls and the V128 type * Add an extra space on comments to match code style * Some refactoring * Fix vector insert FP32 SSE2 path * Port over the ARM32 instructions * Avoid memory protection races on JIT Cache * Another fix on VectorInsert FP32 (thanks to LDj3SNuD * Float operands don't need to use the same register when VEX is supported * Add a new register allocator, higher quality code for hot code (tier up), and other tweaks * Some nits, small improvements on the pre allocator * CpuThreadState is gone * Allow changing CPU emulators with a config entry * Add runtime identifiers on the ARMeilleure project * Allow switching between CPUs through a config entry (pt. 
2) * Change win10-x64 to win-x64 on projects * Update the Ryujinx project to use ARMeilleure * Ensure that the selected register is valid on the hybrid allocator * Allow exiting on returns to 0 (should fix test regression) * Remove register assignments for most used variables on the hybrid allocator * Do not use fixed registers as spill temp * Add missing namespace and remove unneeded using * Address PR feedback * Fix types, etc * Enable AssumeStrictAbiCompliance by default * Ensure that Spill and Fill don't load or store any more than necessary
2019-08-08 14:56:22 -04:00
using ARMeilleure.Memory;
using Ryujinx.Graphics.Gal;
using Ryujinx.Graphics.Memory;
using Ryujinx.Graphics.Texture;
using Ryujinx.Graphics.Vic;
using System;
namespace Ryujinx.Graphics.VDec
{
unsafe class VideoDecoder
{
// GPU instance handed to the constructor. It is not read anywhere in the visible part of this class.
private NvGpu _gpu;
// Codec back-ends created in the constructor; Execute forwards frame data to one of them.
private H264Decoder _h264Decoder;
private Vp9Decoder _vp9Decoder;
// Codec selected by SetVideoCodec; decides which branch Execute takes.
private VideoCodec _currentVideoCodec;
// Every address below is stored by its matching Set*Addr method as the raw method
// argument shifted left by 8 bits (see GetAddress).
// Base address of the decoder context that Execute reads the frame size and headers from.
private long _decoderContextAddress;
// Address Execute reads the compressed frame bytes from.
private long _frameDataAddress;
// Luma surface addresses; Execute translates them to physical addresses and uses them as VP9 frame keys.
private long _vpxCurrLumaAddress;
private long _vpxRef0LumaAddress;
private long _vpxRef1LumaAddress;
private long _vpxRef2LumaAddress;
// Chroma surface addresses; stored by their setters but not read in the visible part of this class.
private long _vpxCurrChromaAddress;
private long _vpxRef0ChromaAddress;
private long _vpxRef1ChromaAddress;
private long _vpxRef2ChromaAddress;
// Base address of the VP9 probability tables that Execute reads.
private long _vpxProbTablesAddress;
/// <summary>
/// Creates a video decoder bound to <paramref name="gpu"/> and instantiates
/// the H.264 and VP9 decoder back-ends.
/// </summary>
/// <param name="gpu">GPU instance kept in a field for later use.</param>
public VideoDecoder(NvGpu gpu)
{
    _h264Decoder = new H264Decoder();
    _vp9Decoder  = new Vp9Decoder();

    _gpu = gpu;
}
/// <summary>
/// Dispatches a method call to the handler that matches <paramref name="methodOffset"/>.
/// </summary>
/// <param name="vmm">GPU virtual memory manager forwarded to the handler.</param>
/// <param name="methodOffset">Method offset, interpreted as a <see cref="VideoDecoderMeth"/> value.</param>
/// <param name="arguments">Method arguments forwarded to the handler.</param>
public void Process(NvGpuVmm vmm, int methodOffset, int[] arguments)
{
    // Offsets without a matching case fall through the switch and are ignored.
    switch ((VideoDecoderMeth)methodOffset)
    {
        case VideoDecoderMeth.SetVideoCodec:
            SetVideoCodec(vmm, arguments);
            break;

        case VideoDecoderMeth.Execute:
            Execute(vmm, arguments);
            break;

        case VideoDecoderMeth.SetDecoderCtxAddr:
            SetDecoderCtxAddr(vmm, arguments);
            break;

        case VideoDecoderMeth.SetFrameDataAddr:
            SetFrameDataAddr(vmm, arguments);
            break;

        case VideoDecoderMeth.SetVpxCurrLumaAddr:
            SetVpxCurrLumaAddr(vmm, arguments);
            break;

        case VideoDecoderMeth.SetVpxRef0LumaAddr:
            SetVpxRef0LumaAddr(vmm, arguments);
            break;

        case VideoDecoderMeth.SetVpxRef1LumaAddr:
            SetVpxRef1LumaAddr(vmm, arguments);
            break;

        case VideoDecoderMeth.SetVpxRef2LumaAddr:
            SetVpxRef2LumaAddr(vmm, arguments);
            break;

        case VideoDecoderMeth.SetVpxCurrChromaAddr:
            SetVpxCurrChromaAddr(vmm, arguments);
            break;

        case VideoDecoderMeth.SetVpxRef0ChromaAddr:
            SetVpxRef0ChromaAddr(vmm, arguments);
            break;

        case VideoDecoderMeth.SetVpxRef1ChromaAddr:
            SetVpxRef1ChromaAddr(vmm, arguments);
            break;

        case VideoDecoderMeth.SetVpxRef2ChromaAddr:
            SetVpxRef2ChromaAddr(vmm, arguments);
            break;

        case VideoDecoderMeth.SetVpxProbTablesAddr:
            SetVpxProbTablesAddr(vmm, arguments);
            break;
    }
}
/// <summary>
/// Selects the codec used by later <c>Execute</c> calls from the first method argument.
/// </summary>
/// <param name="vmm">Unused.</param>
/// <param name="arguments">Method arguments; element 0 is cast to <see cref="VideoCodec"/>.</param>
private void SetVideoCodec(NvGpuVmm vmm, int[] arguments)
    => _currentVideoCodec = (VideoCodec)arguments[0];
/// <summary>
/// Decodes one frame with the currently selected codec.
/// </summary>
/// <param name="vmm">GPU virtual memory manager the decoder inputs are read from.</param>
/// <param name="arguments">Unused.</param>
/// <exception cref="System.NotImplementedException">The selected codec is neither H.264 nor VP9.</exception>
private void Execute(NvGpuVmm vmm, int[] arguments)
{
    switch (_currentVideoCodec)
    {
        case VideoCodec.H264:
            DecodeH264Frame(vmm);
            break;

        case VideoCodec.Vp9:
            DecodeVp9Frame(vmm);
            break;

        default:
            ThrowUnimplementedCodec();
            break;
    }
}

// Reads the H.264 parameter sets, scaling matrices and frame data, then hands them to the H.264 decoder.
private void DecodeH264Frame(NvGpuVmm vmm)
{
    long contextBase = _decoderContextAddress;

    // The frame size is stored inside the decoder context at offset 0x48.
    int bitstreamSize = vmm.ReadInt32(contextBase + 0x48);

    H264ParameterSets parameterSets = MemoryHelper.Read<H264ParameterSets>(
        vmm.Memory,
        vmm.GetPhysicalAddress(contextBase + 0x58));

    H264Matrices scalingMatrices = new H264Matrices()
    {
        ScalingMatrix4 = vmm.ReadBytes(contextBase + 0x1c0, 6 * 16),
        ScalingMatrix8 = vmm.ReadBytes(contextBase + 0x220, 2 * 64)
    };

    byte[] bitstream = vmm.ReadBytes(_frameDataAddress, bitstreamSize);

    _h264Decoder.Decode(parameterSets, scalingMatrices, bitstream);
}

// Reads the VP9 frame keys, header, probability tables and frame data, then hands them to the VP9 decoder.
private void DecodeVp9Frame(NvGpuVmm vmm)
{
    long contextBase = _decoderContextAddress;

    // The frame size is stored inside the decoder context at offset 0x30.
    int bitstreamSize = vmm.ReadInt32(contextBase + 0x30);

    // Frames are keyed by the physical address of their luma surface.
    Vp9FrameKeys frameKeys = new Vp9FrameKeys()
    {
        CurrKey = vmm.GetPhysicalAddress(_vpxCurrLumaAddress),
        Ref0Key = vmm.GetPhysicalAddress(_vpxRef0LumaAddress),
        Ref1Key = vmm.GetPhysicalAddress(_vpxRef1LumaAddress),
        Ref2Key = vmm.GetPhysicalAddress(_vpxRef2LumaAddress)
    };

    Vp9FrameHeader frameHeader = MemoryHelper.Read<Vp9FrameHeader>(
        vmm.Memory,
        vmm.GetPhysicalAddress(contextBase + 0x48));

    long probsBase = _vpxProbTablesAddress;

    // Each table is a fixed-size byte range at a fixed offset from the probability tables base.
    Vp9ProbabilityTables probabilityTables = new Vp9ProbabilityTables()
    {
        SegmentationTreeProbs = vmm.ReadBytes(probsBase + 0x387, 0x7),
        SegmentationPredProbs = vmm.ReadBytes(probsBase + 0x38e, 0x3),
        Tx8x8Probs            = vmm.ReadBytes(probsBase + 0x470, 0x2),
        Tx16x16Probs          = vmm.ReadBytes(probsBase + 0x472, 0x4),
        Tx32x32Probs          = vmm.ReadBytes(probsBase + 0x476, 0x6),
        CoefProbs             = vmm.ReadBytes(probsBase + 0x5a0, 0x900),
        SkipProbs             = vmm.ReadBytes(probsBase + 0x537, 0x3),
        InterModeProbs        = vmm.ReadBytes(probsBase + 0x400, 0x1c),
        InterpFilterProbs     = vmm.ReadBytes(probsBase + 0x52a, 0x8),
        IsInterProbs          = vmm.ReadBytes(probsBase + 0x41c, 0x4),
        CompModeProbs         = vmm.ReadBytes(probsBase + 0x532, 0x5),
        SingleRefProbs        = vmm.ReadBytes(probsBase + 0x580, 0xa),
        CompRefProbs          = vmm.ReadBytes(probsBase + 0x58a, 0x5),
        YModeProbs0           = vmm.ReadBytes(probsBase + 0x480, 0x20),
        YModeProbs1           = vmm.ReadBytes(probsBase + 0x47c, 0x4),
        PartitionProbs        = vmm.ReadBytes(probsBase + 0x4e0, 0x40),
        MvJointProbs          = vmm.ReadBytes(probsBase + 0x53b, 0x3),
        MvSignProbs           = vmm.ReadBytes(probsBase + 0x53e, 0x3),
        MvClassProbs          = vmm.ReadBytes(probsBase + 0x54c, 0x14),
        MvClass0BitProbs      = vmm.ReadBytes(probsBase + 0x540, 0x3),
        MvBitsProbs           = vmm.ReadBytes(probsBase + 0x56c, 0x14),
        MvClass0FrProbs       = vmm.ReadBytes(probsBase + 0x560, 0xc),
        MvFrProbs             = vmm.ReadBytes(probsBase + 0x542, 0x6),
        MvClass0HpProbs       = vmm.ReadBytes(probsBase + 0x548, 0x2),
        MvHpProbs             = vmm.ReadBytes(probsBase + 0x54a, 0x2)
    };

    byte[] bitstream = vmm.ReadBytes(_frameDataAddress, bitstreamSize);

    _vp9Decoder.Decode(frameKeys, frameHeader, probabilityTables, bitstream);
}
/// <summary>
/// Stores the decoder context address computed by <c>GetAddress</c> from the method arguments.
/// </summary>
/// <param name="vmm">Unused.</param>
/// <param name="arguments">Method arguments; element 0 carries the address.</param>
private void SetDecoderCtxAddr(NvGpuVmm vmm, int[] arguments)
    => _decoderContextAddress = GetAddress(arguments);
/// <summary>
/// Stores the frame data address computed by <c>GetAddress</c> from the method arguments.
/// </summary>
/// <param name="vmm">Unused.</param>
/// <param name="arguments">Method arguments; element 0 carries the address.</param>
private void SetFrameDataAddr(NvGpuVmm vmm, int[] arguments)
    => _frameDataAddress = GetAddress(arguments);
/// <summary>
/// Stores the current-frame luma address computed by <c>GetAddress</c> from the method arguments.
/// </summary>
/// <param name="vmm">Unused.</param>
/// <param name="arguments">Method arguments; element 0 carries the address.</param>
private void SetVpxCurrLumaAddr(NvGpuVmm vmm, int[] arguments)
    => _vpxCurrLumaAddress = GetAddress(arguments);
/// <summary>
/// Stores the reference 0 luma address computed by <c>GetAddress</c> from the method arguments.
/// </summary>
/// <param name="vmm">Unused.</param>
/// <param name="arguments">Method arguments; element 0 carries the address.</param>
private void SetVpxRef0LumaAddr(NvGpuVmm vmm, int[] arguments)
    => _vpxRef0LumaAddress = GetAddress(arguments);
/// <summary>
/// Stores the reference 1 luma address computed by <c>GetAddress</c> from the method arguments.
/// </summary>
/// <param name="vmm">Unused.</param>
/// <param name="arguments">Method arguments; element 0 carries the address.</param>
private void SetVpxRef1LumaAddr(NvGpuVmm vmm, int[] arguments)
    => _vpxRef1LumaAddress = GetAddress(arguments);
/// <summary>
/// Stores the reference 2 luma address computed by <c>GetAddress</c> from the method arguments.
/// </summary>
/// <param name="vmm">Unused.</param>
/// <param name="arguments">Method arguments; element 0 carries the address.</param>
private void SetVpxRef2LumaAddr(NvGpuVmm vmm, int[] arguments)
    => _vpxRef2LumaAddress = GetAddress(arguments);
/// <summary>
/// Stores the current-frame chroma address computed by <c>GetAddress</c> from the method arguments.
/// </summary>
/// <param name="vmm">Unused.</param>
/// <param name="arguments">Method arguments; element 0 carries the address.</param>
private void SetVpxCurrChromaAddr(NvGpuVmm vmm, int[] arguments)
    => _vpxCurrChromaAddress = GetAddress(arguments);
/// <summary>
/// Stores the reference 0 chroma address computed by <c>GetAddress</c> from the method arguments.
/// </summary>
/// <param name="vmm">Unused.</param>
/// <param name="arguments">Method arguments; element 0 carries the address.</param>
private void SetVpxRef0ChromaAddr(NvGpuVmm vmm, int[] arguments)
    => _vpxRef0ChromaAddress = GetAddress(arguments);
/// <summary>
/// Stores the reference 1 chroma address computed by <c>GetAddress</c> from the method arguments.
/// </summary>
/// <param name="vmm">Unused.</param>
/// <param name="arguments">Method arguments; element 0 carries the address.</param>
private void SetVpxRef1ChromaAddr(NvGpuVmm vmm, int[] arguments)
    => _vpxRef1ChromaAddress = GetAddress(arguments);
/// <summary>
/// Stores the reference 2 chroma address computed by <c>GetAddress</c> from the method arguments.
/// </summary>
/// <param name="vmm">Unused.</param>
/// <param name="arguments">Method arguments; element 0 carries the address.</param>
private void SetVpxRef2ChromaAddr(NvGpuVmm vmm, int[] arguments)
    => _vpxRef2ChromaAddress = GetAddress(arguments);
/// <summary>
/// Stores the VP9 probability tables address computed by <c>GetAddress</c> from the method arguments.
/// </summary>
/// <param name="vmm">Unused.</param>
/// <param name="arguments">Method arguments; element 0 carries the address.</param>
private void SetVpxProbTablesAddr(NvGpuVmm vmm, int[] arguments)
    => _vpxProbTablesAddress = GetAddress(arguments);
/// <summary>
/// Converts the first method argument into an address by treating it as an
/// unsigned 32-bit value and shifting it left by 8 bits.
/// </summary>
/// <param name="arguments">Method arguments; only element 0 is read.</param>
/// <returns>The unsigned value of <c>arguments[0]</c> multiplied by 256.</returns>
private static long GetAddress(int[] arguments)
{
    // Reinterpret as unsigned first so a negative argument is not sign-extended when widened.
    uint rawValue = (uint)arguments[0];

    long widened = rawValue;

    return widened << 8;
}
/// <summary>
/// Copies the decoded frame to the output surface using the routine that
/// matches the surface pixel format.
/// </summary>
/// <param name="vmm">GPU virtual memory manager the surface is written through.</param>
/// <param name="outputConfig">Output surface description, including its pixel format.</param>
/// <exception cref="System.NotImplementedException">The pixel format is neither Rgba8 nor Yuv420P.</exception>
internal void CopyPlanes(NvGpuVmm vmm, SurfaceOutputConfig outputConfig)
{
    if (outputConfig.PixelFormat == SurfacePixelFormat.Rgba8)
    {
        CopyPlanesRgba8(vmm, outputConfig);
    }
    else if (outputConfig.PixelFormat == SurfacePixelFormat.Yuv420P)
    {
        CopyPlanesYuv420P(vmm, outputConfig);
    }
    else
    {
        ThrowUnimplementedPixelFormat(outputConfig.PixelFormat);
    }
}
/// <summary>
/// Fetches a frame through <c>FFmpegWrapper.GetFrameRgba</c> and writes its data to the
/// output surface as a block-linear RGBA8 unorm 2D texture.
/// </summary>
/// <param name="vmm">GPU virtual memory manager the texture is written through.</param>
/// <param name="outputConfig">Output surface description (dimensions, GOB block height and address).</param>
private void CopyPlanesRgba8(NvGpuVmm vmm, SurfaceOutputConfig outputConfig)
{
    FFmpegFrame frame = FFmpegWrapper.GetFrameRgba();

    // True only when both dimensions are zero.
    // NOTE(review): presumably that means no frame is available yet - confirm against FFmpegWrapper.
    if ((frame.Width | frame.Height) == 0)
    {
        return;
    }

    GalImage image = new GalImage(
        outputConfig.SurfaceWidth,
        outputConfig.SurfaceHeight, 1, 1, 1,
        outputConfig.GobBlockHeight, 1,
        GalMemoryLayout.BlockLinear,
        GalImageFormat.Rgba8 | GalImageFormat.Unorm,
        GalTextureTarget.TwoD);

    // The RGBA output is written at the address the surface configuration calls the luma address.
    ImageUtils.WriteTexture(vmm, image, vmm.GetPhysicalAddress(outputConfig.SurfaceLumaAddress), frame.Data);
}
/// <summary>
/// Fetches a frame through <c>FFmpegWrapper.GetFrame</c> and copies its luma plane and
/// its two chroma planes (interleaved) to the output surface, byte by byte.
/// </summary>
/// <param name="vmm">GPU virtual memory manager the bytes are written through.</param>
/// <param name="outputConfig">Output surface description (width and plane addresses).</param>
private void CopyPlanesYuv420P(NvGpuVmm vmm, SurfaceOutputConfig outputConfig)
{
    FFmpegFrame frame = FFmpegWrapper.GetFrame();

    // True only when both dimensions are zero.
    // NOTE(review): presumably that means no frame is available yet - confirm against FFmpegWrapper.
    if ((frame.Width | frame.Height) == 0)
    {
        return;
    }

    int lumaWidth    = frame.Width;
    int lumaHeight   = frame.Height;
    int chromaWidth  = lumaWidth  / 2;
    int chromaHeight = lumaHeight / 2;

    // Destination rows start at multiples of the surface width rounded up to 256 bytes.
    int dstStride = (outputConfig.SurfaceWidth + 0xff) & ~0xff;

    // Luma plane: source rows are read as tightly packed (stride equals frame width).
    for (int row = 0; row < lumaHeight; row++)
    {
        int srcRow = row * lumaWidth;
        int dstRow = row * dstStride;

        for (int col = 0; col < lumaWidth; col++)
        {
            vmm.WriteByte(outputConfig.SurfaceLumaAddress + dstRow + col, frame.LumaPtr[srcRow + col]);
        }
    }

    // Chroma planes: each B/R sample pair is written to adjacent bytes of a single
    // destination plane located at SurfaceChromaUAddress.
    for (int row = 0; row < chromaHeight; row++)
    {
        int srcRow = row * chromaWidth;
        int dstRow = row * dstStride;

        for (int col = 0; col < chromaWidth; col++)
        {
            int pairOffset = col * 2;

            vmm.WriteByte(outputConfig.SurfaceChromaUAddress + dstRow + pairOffset,     frame.ChromaBPtr[srcRow + col]);
            vmm.WriteByte(outputConfig.SurfaceChromaUAddress + dstRow + pairOffset + 1, frame.ChromaRPtr[srcRow + col]);
        }
    }
}
/// <summary>
/// Throws to report that the currently selected codec has no decoder.
/// </summary>
/// <exception cref="NotImplementedException">Always thrown; the message names the current codec.</exception>
private void ThrowUnimplementedCodec()
{
    string message = $"Codec \"{_currentVideoCodec}\" is not supported!";

    throw new NotImplementedException(message);
}
/// <summary>
/// Throws to report that no copy routine exists for the given pixel format.
/// </summary>
/// <param name="pixelFormat">The unsupported pixel format, included in the message.</param>
/// <exception cref="NotImplementedException">Always thrown.</exception>
private void ThrowUnimplementedPixelFormat(SurfacePixelFormat pixelFormat)
{
    string message = $"Pixel format \"{pixelFormat}\" is not supported!";

    throw new NotImplementedException(message);
}
}
}