Ryujinx/Ryujinx.Graphics.Gpu/Shader/DiskCache/ParallelDiskCacheLoader.cs
riperiperi c48a75979f
Fix Multithreaded Compilation of Shader Cache on OpenGL (#3540)
This was broken by the Vulkan changes - OpenGL was building host caches at boot on one thread, which is very notably slower than when it is multithreaded.

This was caused by trying to get the program binary immediately after compilation started, which blocks. Now it does it after compilation has completed.
2022-08-03 19:37:56 -03:00

708 lines
29 KiB
C#

using Ryujinx.Common.Logging;
using Ryujinx.Graphics.GAL;
using Ryujinx.Graphics.Shader;
using Ryujinx.Graphics.Shader.Translation;
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.IO;
using System.Threading;
using static Ryujinx.Graphics.Gpu.Shader.ShaderCache;
namespace Ryujinx.Graphics.Gpu.Shader.DiskCache
{
class ParallelDiskCacheLoader
{
private const int ThreadCount = 8;
private readonly GpuContext _context;
private readonly ShaderCacheHashTable _graphicsCache;
private readonly ComputeShaderCacheHashTable _computeCache;
private readonly DiskCacheHostStorage _hostStorage;
private readonly CancellationToken _cancellationToken;
private readonly Action<ShaderCacheState, int, int> _stateChangeCallback;
/// <summary>
/// Indicates if the cache should be loaded.
/// </summary>
public bool Active => !_cancellationToken.IsCancellationRequested;
private bool _needsHostRegen;
/// <summary>
/// Number of shaders that failed to compile from the cache.
/// </summary>
public int ErrorCount { get; private set; }
/// <summary>
/// Program validation entry.
/// </summary>
private struct ProgramEntry
{
/// <summary>
/// Cached shader program.
/// </summary>
public readonly CachedShaderProgram CachedProgram;
/// <summary>
/// Optional binary code. If not null, it is used instead of the backend host binary.
/// </summary>
public readonly byte[] BinaryCode;
/// <summary>
/// Program index.
/// </summary>
public readonly int ProgramIndex;
/// <summary>
/// Indicates if the program is a compute shader.
/// </summary>
public readonly bool IsCompute;
/// <summary>
/// Indicates if the program is a host binary shader.
/// </summary>
public readonly bool IsBinary;
/// <summary>
/// Creates a new program validation entry.
/// </summary>
/// <param name="cachedProgram">Cached shader program</param>
/// <param name="binaryCode">Optional binary code. If not null, it is used instead of the backend host binary</param>
/// <param name="programIndex">Program index</param>
/// <param name="isCompute">Indicates if the program is a compute shader</param>
/// <param name="isBinary">Indicates if the program is a host binary shader</param>
public ProgramEntry(
CachedShaderProgram cachedProgram,
byte[] binaryCode,
int programIndex,
bool isCompute,
bool isBinary)
{
CachedProgram = cachedProgram;
BinaryCode = binaryCode;
ProgramIndex = programIndex;
IsCompute = isCompute;
IsBinary = isBinary;
}
}
/// <summary>
/// Translated shader compilation entry.
/// </summary>
private struct ProgramCompilation
{
/// <summary>
/// Translated shader stages.
/// </summary>
public readonly ShaderProgram[] TranslatedStages;
/// <summary>
/// Cached shaders.
/// </summary>
public readonly CachedShaderStage[] Shaders;
/// <summary>
/// Specialization state.
/// </summary>
public readonly ShaderSpecializationState SpecializationState;
/// <summary>
/// Program index.
/// </summary>
public readonly int ProgramIndex;
/// <summary>
/// Indicates if the program is a compute shader.
/// </summary>
public readonly bool IsCompute;
/// <summary>
/// Creates a new translated shader compilation entry.
/// </summary>
/// <param name="translatedStages">Translated shader stages</param>
/// <param name="shaders">Cached shaders</param>
/// <param name="specState">Specialization state</param>
/// <param name="programIndex">Program index</param>
/// <param name="isCompute">Indicates if the program is a compute shader</param>
public ProgramCompilation(
ShaderProgram[] translatedStages,
CachedShaderStage[] shaders,
ShaderSpecializationState specState,
int programIndex,
bool isCompute)
{
TranslatedStages = translatedStages;
Shaders = shaders;
SpecializationState = specState;
ProgramIndex = programIndex;
IsCompute = isCompute;
}
}
/// <summary>
/// Program translation entry.
/// </summary>
private struct AsyncProgramTranslation
{
/// <summary>
/// Guest code for each active stage.
/// </summary>
public readonly GuestCodeAndCbData?[] GuestShaders;
/// <summary>
/// Specialization state.
/// </summary>
public readonly ShaderSpecializationState SpecializationState;
/// <summary>
/// Program index.
/// </summary>
public readonly int ProgramIndex;
/// <summary>
/// Indicates if the program is a compute shader.
/// </summary>
public readonly bool IsCompute;
/// <summary>
/// Creates a new program translation entry.
/// </summary>
/// <param name="guestShaders">Guest code for each active stage</param>
/// <param name="specState">Specialization state</param>
/// <param name="programIndex">Program index</param>
/// <param name="isCompute">Indicates if the program is a compute shader</param>
public AsyncProgramTranslation(
GuestCodeAndCbData?[] guestShaders,
ShaderSpecializationState specState,
int programIndex,
bool isCompute)
{
GuestShaders = guestShaders;
SpecializationState = specState;
ProgramIndex = programIndex;
IsCompute = isCompute;
}
}
private readonly Queue<ProgramEntry> _validationQueue;
private readonly ConcurrentQueue<ProgramCompilation> _compilationQueue;
private readonly BlockingCollection<AsyncProgramTranslation> _asyncTranslationQueue;
private readonly SortedList<int, (CachedShaderProgram, byte[])> _programList;
private int _backendParallelCompileThreads;
private int _compiledCount;
private int _totalCount;
/// <summary>
/// Creates a new parallel disk cache loader.
/// </summary>
/// <param name="context">GPU context</param>
/// <param name="graphicsCache">Graphics shader cache</param>
/// <param name="computeCache">Compute shader cache</param>
/// <param name="hostStorage">Disk cache host storage</param>
/// <param name="cancellationToken">Cancellation token</param>
/// <param name="stateChangeCallback">Function to be called when there is a state change, reporting state, compiled and total shaders count</param>
public ParallelDiskCacheLoader(
GpuContext context,
ShaderCacheHashTable graphicsCache,
ComputeShaderCacheHashTable computeCache,
DiskCacheHostStorage hostStorage,
CancellationToken cancellationToken,
Action<ShaderCacheState, int, int> stateChangeCallback)
{
_context = context;
_graphicsCache = graphicsCache;
_computeCache = computeCache;
_hostStorage = hostStorage;
_cancellationToken = cancellationToken;
_stateChangeCallback = stateChangeCallback;
_validationQueue = new Queue<ProgramEntry>();
_compilationQueue = new ConcurrentQueue<ProgramCompilation>();
_asyncTranslationQueue = new BlockingCollection<AsyncProgramTranslation>(ThreadCount);
_programList = new SortedList<int, (CachedShaderProgram, byte[])>();
_backendParallelCompileThreads = Math.Min(Environment.ProcessorCount, 8); // Must be kept in sync with the backend code.
}
/// <summary>
/// Loads all shaders from the cache.
/// </summary>
public void LoadShaders()
{
Thread[] workThreads = new Thread[ThreadCount];
for (int index = 0; index < ThreadCount; index++)
{
workThreads[index] = new Thread(ProcessAsyncQueue)
{
Name = $"GPU.AsyncTranslationThread.{index}"
};
}
int programCount = _hostStorage.GetProgramCount();
_compiledCount = 0;
_totalCount = programCount;
_stateChangeCallback(ShaderCacheState.Start, 0, programCount);
Logger.Info?.Print(LogClass.Gpu, $"Loading {programCount} shaders from the cache...");
for (int index = 0; index < ThreadCount; index++)
{
workThreads[index].Start(_cancellationToken);
}
try
{
_hostStorage.LoadShaders(_context, this);
}
catch (DiskCacheLoadException diskCacheLoadException)
{
Logger.Warning?.Print(LogClass.Gpu, $"Error loading the shader cache. {diskCacheLoadException.Message}");
// If we can't even access the file, then we also can't rebuild.
if (diskCacheLoadException.Result != DiskCacheLoadResult.NoAccess)
{
_needsHostRegen = true;
}
}
catch (InvalidDataException invalidDataException)
{
Logger.Warning?.Print(LogClass.Gpu, $"Error decompressing the shader cache file. {invalidDataException.Message}");
_needsHostRegen = true;
}
catch (IOException ioException)
{
Logger.Warning?.Print(LogClass.Gpu, $"Error reading the shader cache file. {ioException.Message}");
_needsHostRegen = true;
}
_asyncTranslationQueue.CompleteAdding();
for (int index = 0; index < ThreadCount; index++)
{
workThreads[index].Join();
}
CheckCompilationBlocking();
if (_needsHostRegen && Active)
{
// Rebuild both shared and host cache files.
// Rebuilding shared is required because the shader information returned by the translator
// might have changed, and so we have to reconstruct the file with the new information.
try
{
_hostStorage.ClearSharedCache();
_hostStorage.ClearHostCache(_context);
if (_programList.Count != 0)
{
Logger.Info?.Print(LogClass.Gpu, $"Rebuilding {_programList.Count} shaders...");
using var streams = _hostStorage.GetOutputStreams(_context);
foreach (var kv in _programList)
{
if (!Active)
{
break;
}
(CachedShaderProgram program, byte[] binaryCode) = kv.Value;
_hostStorage.AddShader(_context, program, binaryCode, streams);
}
Logger.Info?.Print(LogClass.Gpu, $"Rebuilt {_programList.Count} shaders successfully.");
}
else
{
_hostStorage.ClearGuestCache();
Logger.Info?.Print(LogClass.Gpu, "Shader cache deleted due to corruption.");
}
}
catch (DiskCacheLoadException diskCacheLoadException)
{
Logger.Warning?.Print(LogClass.Gpu, $"Error deleting the shader cache. {diskCacheLoadException.Message}");
}
catch (IOException ioException)
{
Logger.Warning?.Print(LogClass.Gpu, $"Error deleting the shader cache file. {ioException.Message}");
}
}
Logger.Info?.Print(LogClass.Gpu, "Shader cache loaded.");
_stateChangeCallback(ShaderCacheState.Loaded, programCount, programCount);
}
/// <summary>
/// Enqueues a host program for compilation.
/// </summary>
/// <param name="cachedProgram">Cached program</param>
/// <param name="binaryCode">Host binary code</param>
/// <param name="programIndex">Program index</param>
/// <param name="isCompute">Indicates if the program is a compute shader</param>
public void QueueHostProgram(CachedShaderProgram cachedProgram, byte[] binaryCode, int programIndex, bool isCompute)
{
EnqueueForValidation(new ProgramEntry(cachedProgram, binaryCode, programIndex, isCompute, isBinary: true));
}
/// <summary>
/// Enqueues a guest program for compilation.
/// </summary>
/// <param name="guestShaders">Guest code for each active stage</param>
/// <param name="specState">Specialization state</param>
/// <param name="programIndex">Program index</param>
/// <param name="isCompute">Indicates if the program is a compute shader</param>
public void QueueGuestProgram(GuestCodeAndCbData?[] guestShaders, ShaderSpecializationState specState, int programIndex, bool isCompute)
{
try
{
AsyncProgramTranslation asyncTranslation = new AsyncProgramTranslation(guestShaders, specState, programIndex, isCompute);
_asyncTranslationQueue.Add(asyncTranslation, _cancellationToken);
}
catch (OperationCanceledException)
{
}
}
/// <summary>
/// Check the state of programs that have already been compiled,
/// and add to the cache if the compilation was successful.
/// </summary>
public void CheckCompilation()
{
ProcessCompilationQueue();
// Process programs that already finished compiling.
// If not yet compiled, do nothing. This avoids blocking to wait for shader compilation.
while (_validationQueue.TryPeek(out ProgramEntry entry))
{
ProgramLinkStatus result = entry.CachedProgram.HostProgram.CheckProgramLink(false);
if (result != ProgramLinkStatus.Incomplete)
{
ProcessCompiledProgram(ref entry, result);
_validationQueue.Dequeue();
}
else
{
break;
}
}
}
/// <summary>
/// Waits until all programs finishes compiling, then adds the ones
/// with successful compilation to the cache.
/// </summary>
private void CheckCompilationBlocking()
{
ProcessCompilationQueue();
while (_validationQueue.TryDequeue(out ProgramEntry entry) && Active)
{
ProcessCompiledProgram(ref entry, entry.CachedProgram.HostProgram.CheckProgramLink(true), asyncCompile: false);
}
}
/// <summary>
/// Process a compiled program result.
/// </summary>
/// <param name="entry">Compiled program entry</param>
/// <param name="result">Compilation result</param>
/// <param name="asyncCompile">For failed host compilations, indicates if a guest compilation should be done asynchronously</param>
private void ProcessCompiledProgram(ref ProgramEntry entry, ProgramLinkStatus result, bool asyncCompile = true)
{
if (result == ProgramLinkStatus.Success)
{
// Compilation successful, add to memory cache.
if (entry.IsCompute)
{
_computeCache.Add(entry.CachedProgram);
}
else
{
_graphicsCache.Add(entry.CachedProgram);
}
if (!entry.IsBinary)
{
_needsHostRegen = true;
}
// Fetch the binary code from the backend if it isn't already present.
byte[] binaryCode = entry.BinaryCode ?? entry.CachedProgram.HostProgram.GetBinary();
_programList.Add(entry.ProgramIndex, (entry.CachedProgram, binaryCode));
SignalCompiled();
}
else if (entry.IsBinary)
{
// If this is a host binary and compilation failed,
// we still have a chance to recompile from the guest binary.
CachedShaderProgram program = entry.CachedProgram;
GuestCodeAndCbData?[] guestShaders = new GuestCodeAndCbData?[program.Shaders.Length];
for (int index = 0; index < program.Shaders.Length; index++)
{
CachedShaderStage shader = program.Shaders[index];
if (shader != null)
{
guestShaders[index] = new GuestCodeAndCbData(shader.Code, shader.Cb1Data);
}
}
if (asyncCompile)
{
QueueGuestProgram(guestShaders, program.SpecializationState, entry.ProgramIndex, entry.IsCompute);
}
else
{
RecompileFromGuestCode(guestShaders, program.SpecializationState, entry.ProgramIndex, entry.IsCompute);
ProcessCompilationQueue();
}
}
else
{
// Failed to compile from both host and guest binary.
ErrorCount++;
SignalCompiled();
}
}
/// <summary>
/// Processes the queue of translated guest programs that should be compiled on the host.
/// </summary>
private void ProcessCompilationQueue()
{
while (_compilationQueue.TryDequeue(out ProgramCompilation compilation) && Active)
{
ShaderSource[] shaderSources = new ShaderSource[compilation.TranslatedStages.Length];
int fragmentOutputMap = -1;
for (int index = 0; index < compilation.TranslatedStages.Length; index++)
{
ShaderProgram shader = compilation.TranslatedStages[index];
shaderSources[index] = CreateShaderSource(shader);
if (shader.Info.Stage == ShaderStage.Fragment)
{
fragmentOutputMap = shader.Info.FragmentOutputMap;
}
}
ShaderInfo shaderInfo = compilation.SpecializationState.PipelineState.HasValue
? new ShaderInfo(fragmentOutputMap, compilation.SpecializationState.PipelineState.Value, fromCache: true)
: new ShaderInfo(fragmentOutputMap, fromCache: true);
IProgram hostProgram = _context.Renderer.CreateProgram(shaderSources, shaderInfo);
CachedShaderProgram program = new CachedShaderProgram(hostProgram, compilation.SpecializationState, compilation.Shaders);
// Vulkan's binary code is the SPIR-V used for compilation, so it is ready immediately. Other APIs get this after compilation.
byte[] binaryCode = _context.Capabilities.Api == TargetApi.Vulkan ? ShaderBinarySerializer.Pack(shaderSources) : null;
EnqueueForValidation(new ProgramEntry(program, binaryCode, compilation.ProgramIndex, compilation.IsCompute, isBinary: false));
}
}
/// <summary>
/// Enqueues a program for validation, which will check if the program was compiled successfully.
/// </summary>
/// <param name="newEntry">Program entry to be validated</param>
private void EnqueueForValidation(ProgramEntry newEntry)
{
_validationQueue.Enqueue(newEntry);
// Do not allow more than N shader compilation in-flight, where N is the maximum number of threads
// the driver will be using for parallel compilation.
// Submitting more seems to cause NVIDIA OpenGL driver to crash.
if (_validationQueue.Count >= _backendParallelCompileThreads && _validationQueue.TryDequeue(out ProgramEntry entry))
{
ProcessCompiledProgram(ref entry, entry.CachedProgram.HostProgram.CheckProgramLink(true), asyncCompile: false);
}
}
/// <summary>
/// Processses the queue of programs that should be translated from guest code.
/// </summary>
/// <param name="state">Cancellation token</param>
private void ProcessAsyncQueue(object state)
{
CancellationToken ct = (CancellationToken)state;
try
{
foreach (AsyncProgramTranslation asyncCompilation in _asyncTranslationQueue.GetConsumingEnumerable(ct))
{
RecompileFromGuestCode(
asyncCompilation.GuestShaders,
asyncCompilation.SpecializationState,
asyncCompilation.ProgramIndex,
asyncCompilation.IsCompute);
}
}
catch (OperationCanceledException)
{
}
}
/// <summary>
/// Recompiles a program from guest code.
/// </summary>
/// <param name="guestShaders">Guest code for each active stage</param>
/// <param name="specState">Specialization state</param>
/// <param name="programIndex">Program index</param>
/// <param name="isCompute">Indicates if the program is a compute shader</param>
private void RecompileFromGuestCode(GuestCodeAndCbData?[] guestShaders, ShaderSpecializationState specState, int programIndex, bool isCompute)
{
try
{
if (isCompute)
{
RecompileComputeFromGuestCode(guestShaders, specState, programIndex);
}
else
{
RecompileGraphicsFromGuestCode(guestShaders, specState, programIndex);
}
}
catch (DiskCacheLoadException diskCacheLoadException)
{
Logger.Error?.Print(LogClass.Gpu, $"Error translating guest shader. {diskCacheLoadException.Message}");
ErrorCount++;
SignalCompiled();
}
}
/// <summary>
/// Recompiles a graphics program from guest code.
/// </summary>
/// <param name="guestShaders">Guest code for each active stage</param>
/// <param name="specState">Specialization state</param>
/// <param name="programIndex">Program index</param>
private void RecompileGraphicsFromGuestCode(GuestCodeAndCbData?[] guestShaders, ShaderSpecializationState specState, int programIndex)
{
ShaderSpecializationState newSpecState = new ShaderSpecializationState(
ref specState.GraphicsState,
specState.PipelineState,
specState.TransformFeedbackDescriptors);
ResourceCounts counts = new ResourceCounts();
TranslatorContext[] translatorContexts = new TranslatorContext[Constants.ShaderStages + 1];
TranslatorContext nextStage = null;
TargetApi api = _context.Capabilities.Api;
for (int stageIndex = Constants.ShaderStages - 1; stageIndex >= 0; stageIndex--)
{
if (guestShaders[stageIndex + 1].HasValue)
{
GuestCodeAndCbData shader = guestShaders[stageIndex + 1].Value;
byte[] guestCode = shader.Code;
byte[] cb1Data = shader.Cb1Data;
DiskCacheGpuAccessor gpuAccessor = new DiskCacheGpuAccessor(_context, guestCode, cb1Data, specState, newSpecState, counts, stageIndex);
TranslatorContext currentStage = DecodeGraphicsShader(gpuAccessor, api, DefaultFlags, 0);
if (nextStage != null)
{
currentStage.SetNextStage(nextStage);
}
if (stageIndex == 0 && guestShaders[0].HasValue)
{
byte[] guestCodeA = guestShaders[0].Value.Code;
byte[] cb1DataA = guestShaders[0].Value.Cb1Data;
DiskCacheGpuAccessor gpuAccessorA = new DiskCacheGpuAccessor(_context, guestCodeA, cb1DataA, specState, newSpecState, counts, 0);
translatorContexts[0] = DecodeGraphicsShader(gpuAccessorA, api, DefaultFlags | TranslationFlags.VertexA, 0);
}
translatorContexts[stageIndex + 1] = currentStage;
nextStage = currentStage;
}
}
CachedShaderStage[] shaders = new CachedShaderStage[guestShaders.Length];
List<ShaderProgram> translatedStages = new List<ShaderProgram>();
for (int stageIndex = 0; stageIndex < Constants.ShaderStages; stageIndex++)
{
TranslatorContext currentStage = translatorContexts[stageIndex + 1];
if (currentStage != null)
{
ShaderProgram program;
byte[] guestCode = guestShaders[stageIndex + 1].Value.Code;
byte[] cb1Data = guestShaders[stageIndex + 1].Value.Cb1Data;
if (stageIndex == 0 && guestShaders[0].HasValue)
{
program = currentStage.Translate(translatorContexts[0]);
byte[] guestCodeA = guestShaders[0].Value.Code;
byte[] cb1DataA = guestShaders[0].Value.Cb1Data;
shaders[0] = new CachedShaderStage(null, guestCodeA, cb1DataA);
shaders[1] = new CachedShaderStage(program.Info, guestCode, cb1Data);
}
else
{
program = currentStage.Translate();
shaders[stageIndex + 1] = new CachedShaderStage(program.Info, guestCode, cb1Data);
}
if (program != null)
{
translatedStages.Add(program);
}
}
}
_compilationQueue.Enqueue(new ProgramCompilation(translatedStages.ToArray(), shaders, newSpecState, programIndex, isCompute: false));
}
/// <summary>
/// Recompiles a compute program from guest code.
/// </summary>
/// <param name="guestShaders">Guest code for each active stage</param>
/// <param name="specState">Specialization state</param>
/// <param name="programIndex">Program index</param>
private void RecompileComputeFromGuestCode(GuestCodeAndCbData?[] guestShaders, ShaderSpecializationState specState, int programIndex)
{
GuestCodeAndCbData shader = guestShaders[0].Value;
ResourceCounts counts = new ResourceCounts();
ShaderSpecializationState newSpecState = new ShaderSpecializationState(ref specState.ComputeState);
DiskCacheGpuAccessor gpuAccessor = new DiskCacheGpuAccessor(_context, shader.Code, shader.Cb1Data, specState, newSpecState, counts, 0);
TranslatorContext translatorContext = DecodeComputeShader(gpuAccessor, _context.Capabilities.Api, 0);
ShaderProgram program = translatorContext.Translate();
CachedShaderStage[] shaders = new[] { new CachedShaderStage(program.Info, shader.Code, shader.Cb1Data) };
_compilationQueue.Enqueue(new ProgramCompilation(new[] { program }, shaders, newSpecState, programIndex, isCompute: true));
}
/// <summary>
/// Signals that compilation of a program has been finished successfully,
/// or that it failed and guest recompilation has also been attempted.
/// </summary>
private void SignalCompiled()
{
_stateChangeCallback(ShaderCacheState.Loading, ++_compiledCount, _totalCount);
}
}
}