add: base for pipeline caching

2025-04-20 15:51:15 +02:00 · 2024-10-14 20:00:37 +02:00 · 2024-10-14 20:00:37 +02:00 · e9e510d2cd
commit e9e510d2cd
parent 8b783e63dc
2 changed files with 430 additions and 1 deletions
--- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp
+++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp
@ -6,6 +6,14 @@
 #include "Cafe/HW/Latte/Core/FetchShader.h"
 #include "Cafe/HW/Latte/ISA/RegDefines.h"
 #include "Cafe/HW/Latte/Core/LatteConst.h"
+#include "Cafe/HW/Latte/Core/LatteCachedFBO.h"
+#include "Cafe/HW/Latte/Common/RegisterSerializer.h"
+#include "Cafe/HW/Latte/Core/LatteShaderCache.h"
+#include "Cemu/FileCache/FileCache.h"
+#include "HW/Latte/Core/LatteShader.h"
+#include "util/helpers/helpers.h"
+#include "config/ActiveSettings.h"
+#include <openssl/sha.h>

 uint64 MetalPipelineCache::CalculatePipelineHash(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* lastUsedFBO, class CachedFBOMtl* activeFBO, const LatteContextRegister& lcr)
 {
@ -129,3 +137,373 @@ MTL::RenderPipelineState* MetalPipelineCache::GetRenderPipelineState(const Latte

    return pipeline;
 }
+
+// Bookkeeping for the pipeline cache loader. There is a single file-scope instance.
+struct
+{
+	// next file index to read from the cache and the highest index that will be visited
+	uint32 pipelineLoadIndex{ 0 };
+	uint32 pipelineMaxFileIndex{ 0 };
+
+	// progress counters; pipelinesLoaded is incremented from the compiler threads
+	std::atomic_uint32_t pipelinesQueued{ 0 };
+	std::atomic_uint32_t pipelinesLoaded{ 0 };
+} g_mtlCacheState;
+
+// Prepares loading of the transferable pipeline cache of the given title.
+// Resets the loader state, starts the compiler threads and opens (or creates) the cache file.
+// Returns the file count reported by the cache, or 0 if the cache file could not be opened.
+uint32 MetalPipelineCache::BeginLoading(uint64 cacheTitleId)
+{
+	std::error_code ec;
+	fs::create_directories(ActiveSettings::GetCachePath("shaderCache/transferable"), ec);
+	const auto pathCacheFile = ActiveSettings::GetCachePath("shaderCache/transferable/{:016x}_mtlpipeline.bin", cacheTitleId);
+
+	// init cache loader state
+	g_mtlCacheState.pipelineLoadIndex = 0;
+	g_mtlCacheState.pipelineMaxFileIndex = 0;
+	g_mtlCacheState.pipelinesLoaded = 0;
+	g_mtlCacheState.pipelinesQueued = 0;
+
+	// start async compilation threads
+	m_compilationCount.store(0);
+	m_compilationQueue.clear();
+
+	// one thread per physical core, limited to the range [1, 8]
+	uint32 cpuCoreCount = GetPhysicalCoreCount();
+	m_numCompilationThreads = std::clamp(cpuCoreCount, 1u, 8u);
+	// TODO: uncomment?
+	//if (VulkanRenderer::GetInstance()->GetDisableMultithreadedCompilation())
+	//	m_numCompilationThreads = 1;
+
+	// threads are detached; EndLoading() is what asks them to exit
+	for (uint32 i = 0; i < m_numCompilationThreads; i++)
+	{
+		std::thread compileThread(&MetalPipelineCache::CompilerThread, this);
+		compileThread.detach();
+	}
+
+	// open cache file or create it
+	cemu_assert_debug(s_cache == nullptr);
+	s_cache = FileCache::Open(pathCacheFile, true, LatteShaderCache_getPipelineCacheExtraVersion(cacheTitleId));
+	if (!s_cache)
+	{
+		cemuLog_log(LogType::Force, "Failed to open or create Metal pipeline cache file: {}", _pathToUtf8(pathCacheFile));
+		return 0;
+	}
+
+	s_cache->UseCompression(false);
+	g_mtlCacheState.pipelineMaxFileIndex = s_cache->GetMaximumFileIndex();
+	return s_cache->GetFileCount();
+}
+
+// Advances cache loading by at most one step.
+// pipelinesLoadedTotal receives the number of pipelines processed so far,
+// pipelinesMissingShaders is currently always set to 0.
+// Returns true while there is still work pending and false once loading is done.
+bool MetalPipelineCache::UpdateLoading(uint32& pipelinesLoadedTotal, uint32& pipelinesMissingShaders)
+{
+	pipelinesLoadedTotal = g_mtlCacheState.pipelinesLoaded;
+	pipelinesMissingShaders = 0;
+
+	// BeginLoading() leaves s_cache null when the cache file could not be opened.
+	// Nothing can be queued in that case, so report completion instead of dereferencing it.
+	if (!s_cache)
+		return false;
+
+	while (g_mtlCacheState.pipelineLoadIndex <= g_mtlCacheState.pipelineMaxFileIndex)
+	{
+		if (m_compilationQueue.size() >= 50)
+		{
+			std::this_thread::sleep_for(std::chrono::milliseconds(10));
+			return true; // queue up to 50 entries at a time
+		}
+
+		uint64 fileNameA, fileNameB;
+		std::vector<uint8> fileData;
+		const bool hasFile = s_cache->GetFileByIndex(g_mtlCacheState.pipelineLoadIndex, &fileNameA, &fileNameB, fileData);
+		g_mtlCacheState.pipelineLoadIndex++;
+		if (hasFile)
+		{
+			// queue for async compilation
+			g_mtlCacheState.pipelinesQueued++;
+			m_compilationQueue.push(std::move(fileData));
+			return true;
+		}
+		// no file stored at this index, try the next one
+	}
+	if (g_mtlCacheState.pipelinesLoaded != g_mtlCacheState.pipelinesQueued)
+	{
+		std::this_thread::sleep_for(std::chrono::milliseconds(10));
+		return true; // pipelines still compiling
+	}
+	return false; // done
+}
+
+// Asks all compiler threads to exit. The cache file is left open so that new pipelines can still be written.
+void MetalPipelineCache::EndLoading()
+{
+	// a thread count of zero is the shutdown signal checked by CompilerThread()
+	const uint32 numThreadsToStop = m_numCompilationThreads;
+	m_numCompilationThreads = 0;
+
+	// push one empty workload per thread so each thread gets an item to pop and then re-checks the thread count
+	for (uint32 remaining = numThreadsToStop; remaining != 0; remaining--)
+		m_compilationQueue.push({});
+}
+
+// Releases the cache file object, if there is one.
+void MetalPipelineCache::Close()
+{
+	// deleting a null pointer is a no-op, so no explicit check is required
+	delete s_cache;
+	s_cache = nullptr;
+}
+
+// Description of one pipeline as stored in the cache file:
+// the shaders it uses (referenced by hash) plus a compacted GPU register state.
+struct CachedPipeline
+{
+	// Hash pair identifying a shader. isPresent is false until set() has been called.
+	struct ShaderHash
+	{
+		uint64 baseHash;
+		uint64 auxHash;
+		bool isPresent{};
+
+		// stores both hashes and marks the shader as present
+		void set(uint64 baseHash, uint64 auxHash)
+		{
+			this->isPresent = true;
+			this->baseHash = baseHash;
+			this->auxHash = auxHash;
+		}
+	};
+
+	// one hash pair per shader stage
+	ShaderHash vsHash; // includes fetch shader
+	ShaderHash gsHash;
+	ShaderHash psHash;
+
+	Latte::GPUCompactedRegisterState gpuState;
+};
+
+// Deserializes one cached pipeline description and looks up the shaders it references.
+// fileData holds the serialized entry as read from the cache file.
+// Returns early (after releasing its temporary state) if the data cannot be deserialized
+// or if one of the referenced shaders is not available.
+// The actual pipeline compilation is not implemented yet, see the TODO below.
+void MetalPipelineCache::LoadPipelineFromCache(std::span<uint8> fileData)
+{
+	static FSpinlock s_spinlockSharedInternal;
+
+	// deserialize file
+	LatteContextRegister* lcr = new LatteContextRegister();
+	s_spinlockSharedInternal.lock();
+	CachedPipeline* cachedPipeline = new CachedPipeline();
+	s_spinlockSharedInternal.unlock();
+
+	// Releases the temporary state. Must run on every exit path, otherwise both allocations leak.
+	auto releaseState = [&]()
+	{
+		s_spinlockSharedInternal.lock();
+		delete lcr;
+		delete cachedPipeline;
+		s_spinlockSharedInternal.unlock();
+	};
+
+	MemStreamReader streamReader(fileData.data(), fileData.size());
+	if (!DeserializePipeline(streamReader, *cachedPipeline))
+	{
+		// failed to deserialize
+		releaseState();
+		return;
+	}
+	// restored register view from compacted state
+	Latte::LoadGPURegisterState(*lcr, cachedPipeline->gpuState);
+
+	LatteDecompilerShader* vertexShader = nullptr;
+	LatteDecompilerShader* geometryShader = nullptr;
+	LatteDecompilerShader* pixelShader = nullptr;
+	// find vertex shader
+	if (cachedPipeline->vsHash.isPresent)
+	{
+		vertexShader = LatteSHRC_FindVertexShader(cachedPipeline->vsHash.baseHash, cachedPipeline->vsHash.auxHash);
+		if (!vertexShader)
+		{
+			cemuLog_logDebug(LogType::Force, "Vertex shader not found in cache");
+			releaseState();
+			return;
+		}
+	}
+	// find geometry shader
+	if (cachedPipeline->gsHash.isPresent)
+	{
+		geometryShader = LatteSHRC_FindGeometryShader(cachedPipeline->gsHash.baseHash, cachedPipeline->gsHash.auxHash);
+		if (!geometryShader)
+		{
+			cemuLog_logDebug(LogType::Force, "Geometry shader not found in cache");
+			releaseState();
+			return;
+		}
+	}
+	// find pixel shader
+	if (cachedPipeline->psHash.isPresent)
+	{
+		pixelShader = LatteSHRC_FindPixelShader(cachedPipeline->psHash.baseHash, cachedPipeline->psHash.auxHash);
+		if (!pixelShader)
+		{
+			cemuLog_logDebug(LogType::Force, "Pixel shader not found in cache");
+			releaseState();
+			return;
+		}
+	}
+
+	if (!pixelShader)
+	{
+		cemu_assert_debug(false);
+		releaseState();
+		return;
+	}
+
+	// create pipeline info
+	m_pipelineIsCachedLock.lock();
+	m_pipelineIsCachedLock.unlock();
+	// NOTE(review): placeholder until the compile step below is enabled. A bare `throw;` with no
+	// exception currently being handled calls std::terminate, so reaching this line ends the process.
+	throw;
+	// TODO: uncomment
+	/*
+	// compile
+	{
+		MetalPipelineCompiler pp(m_mtlr);
+		if (!pp.InitFromState(fetchShader, vertexShader, geometryShader, pixelShader, activeFBO, activeFBO, *lcr))
+		{
+			s_spinlockSharedInternal.lock();
+			delete lcr;
+			delete cachedPipeline;
+			s_spinlockSharedInternal.unlock();
+			return;
+		}
+		pp.Compile(true, true);
+		// destroy pp early
+	}
+	// on success, calculate pipeline hash and flag as present in cache
+	uint64 pipelineBaseHash = vertexShader->baseHash;
+	uint64 pipelineStateHash = CalculatePipelineHash(fetchShader, vertexShader, geometryShader, pixelShader, activeFBO, activeFBO, *lcr);
+	m_pipelineIsCachedLock.lock();
+	m_pipelineIsCached.emplace(pipelineBaseHash, pipelineStateHash);
+	m_pipelineIsCachedLock.unlock();
+	*/
+
+	// clean up
+	releaseState();
+}
+
+// Returns true if the given hash pair has been recorded in m_pipelineIsCached.
+bool MetalPipelineCache::HasPipelineCached(uint64 baseHash, uint64 pipelineStateHash)
+{
+	const PipelineHash lookupKey(baseHash, pipelineStateHash);
+	return m_pipelineIsCached.count(lookupKey) != 0;
+}
+
+// Jobs waiting to be written to the cache file; consumed by WorkerThread()
+ConcurrentQueue<CachedPipeline*> g_mtlPipelineCachingQueue;
+
+// Records the hash pair as cached and queues a snapshot of the current pipeline state
+// (active shader hashes + GPU registers) for writing to the cache file.
+void MetalPipelineCache::AddCurrentStateToCache(uint64 baseHash, uint64 pipelineStateHash)
+{
+	m_pipelineIsCached.emplace(baseHash, pipelineStateHash);
+
+	// the writer thread is started on first use and runs detached
+	if (m_pipelineCacheStoreThread == nullptr)
+	{
+		m_pipelineCacheStoreThread = new std::thread(&MetalPipelineCache::WorkerThread, this);
+		m_pipelineCacheStoreThread->detach();
+	}
+
+	// Each cached pipeline stores:
+	// - the active shaders (referenced by hash)
+	// - an almost-complete register state of the GPU (minus some ALU uniform constants which aren't relevant)
+	CachedPipeline* pendingJob = new CachedPipeline();
+	auto activeVS = LatteSHRC_GetActiveVertexShader();
+	auto activeGS = LatteSHRC_GetActiveGeometryShader();
+	auto activePS = LatteSHRC_GetActivePixelShader();
+	if (activeVS != nullptr)
+		pendingJob->vsHash.set(activeVS->baseHash, activeVS->auxHash);
+	if (activeGS != nullptr)
+		pendingJob->gsHash.set(activeGS->baseHash, activeGS->auxHash);
+	if (activePS != nullptr)
+		pendingJob->psHash.set(activePS->baseHash, activePS->auxHash);
+	Latte::StoreGPURegisterState(LatteGPUState.contextNew, pendingJob->gpuState);
+
+	// hand the job over to the writer thread, which deletes it when done
+	g_mtlPipelineCachingQueue.push(pendingJob);
+}
+
+// Writes a cached pipeline into memWriter. Layout:
+//   uint8  version (0x01)
+//   uint8  presence mask (bit 0 = vertex, bit 1 = geometry, bit 2 = pixel shader)
+//   for each present shader, in that order: uint64 baseHash, uint64 auxHash
+//   serialized GPU register state
+// Always returns true.
+bool MetalPipelineCache::SerializePipeline(MemStreamWriter& memWriter, CachedPipeline& cachedPipeline)
+{
+	const CachedPipeline::ShaderHash& vs = cachedPipeline.vsHash;
+	const CachedPipeline::ShaderHash& gs = cachedPipeline.gsHash;
+	const CachedPipeline::ShaderHash& ps = cachedPipeline.psHash;
+
+	memWriter.writeBE<uint8>(0x01); // version
+
+	const uint8 presentMask = (vs.isPresent ? 1 : 0) | (gs.isPresent ? 2 : 0) | (ps.isPresent ? 4 : 0);
+	memWriter.writeBE<uint8>(presentMask);
+
+	// hashes are only written for the shaders flagged in the mask
+	auto writeHashIfPresent = [&memWriter](const CachedPipeline::ShaderHash& shaderHash)
+	{
+		if (!shaderHash.isPresent)
+			return;
+		memWriter.writeBE<uint64>(shaderHash.baseHash);
+		memWriter.writeBE<uint64>(shaderHash.auxHash);
+	};
+	writeHashIfPresent(vs);
+	writeHashIfPresent(gs);
+	writeHashIfPresent(ps);
+
+	Latte::SerializeRegisterState(cachedPipeline.gpuState, memWriter);
+	return true;
+}
+
+// Reads a cached pipeline from memReader into cachedPipeline (counterpart of SerializePipeline).
+// Returns false if the version byte is not 1, if the register state cannot be deserialized,
+// or if the reader reports an error (for example because the data is truncated).
+bool MetalPipelineCache::DeserializePipeline(MemStreamReader& memReader, CachedPipeline& cachedPipeline)
+{
+	// version
+	if (memReader.readBE<uint8>() != 1)
+	{
+		cemuLog_log(LogType::Force, "Cached Metal pipeline corrupted or has unknown version");
+		return false;
+	}
+
+	// reads one hash pair in the order it was written (base hash first)
+	auto readHash = [&memReader](CachedPipeline::ShaderHash& shaderHash)
+	{
+		const uint64 baseHash = memReader.readBE<uint64>();
+		const uint64 auxHash = memReader.readBE<uint64>();
+		shaderHash.set(baseHash, auxHash);
+	};
+
+	// shader hashes, only stored for the shaders flagged in the mask
+	const uint8 presentMask = memReader.readBE<uint8>();
+	if (presentMask & 1)
+		readHash(cachedPipeline.vsHash);
+	if (presentMask & 2)
+		readHash(cachedPipeline.gsHash);
+	if (presentMask & 4)
+		readHash(cachedPipeline.psHash);
+
+	// deserialize GPU state
+	if (!Latte::DeserializeRegisterState(cachedPipeline.gpuState, memReader))
+		return false;
+
+	// reject the entry in all build types instead of only asserting in debug builds
+	if (memReader.hasError())
+		return false;
+	return true;
+}
+
+// Entry point of the compiler threads started by BeginLoading().
+// Processes queued cache entries until m_numCompilationThreads becomes zero. Always returns 0.
+int MetalPipelineCache::CompilerThread()
+{
+	SetThreadName("plCacheCompiler");
+	while (m_numCompilationThreads != 0)
+	{
+		std::vector<uint8> workload = m_compilationQueue.pop();
+		// empty workloads carry no pipeline; they only make the loop condition get re-checked
+		if (!workload.empty())
+		{
+			LoadPipelineFromCache(workload);
+			g_mtlCacheState.pipelinesLoaded++;
+		}
+	}
+	return 0;
+}
+
+// Entry point of the writer thread started by AddCurrentStateToCache().
+// Takes jobs from g_mtlPipelineCachingQueue, serializes them and adds them to the cache file. Never returns.
+void MetalPipelineCache::WorkerThread()
+{
+	SetThreadName("plCacheWriter");
+	for (;;)
+	{
+		CachedPipeline* job;
+		g_mtlPipelineCachingQueue.pop(job);
+		// without a cache file the job is simply dropped
+		if (s_cache)
+		{
+			// serialize
+			MemStreamWriter memWriter(1024 * 4);
+			SerializePipeline(memWriter, *job);
+			auto blob = memWriter.getResult();
+			// the file name is taken from the first 16 bytes of the SHA-256 of the serialized data
+			uint8 hash[SHA256_DIGEST_LENGTH];
+			SHA256(blob.data(), blob.size(), hash);
+			uint64 nameA = *(uint64be*)(hash + 0);
+			uint64 nameB = *(uint64be*)(hash + 8);
+			s_cache->AddFileAsync({ nameA, nameB }, blob.data(), blob.size());
+		}
+		// the job is owned by this thread in either case
+		delete job;
+	}
+}
--- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h
+++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h
@ -1,18 +1,54 @@
 #pragma once

 #include "Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h"
+#include "util/helpers/ConcurrentQueue.h"
+#include "util/helpers/fspinlock.h"

 // TODO: binary archives
 class MetalPipelineCache
 {
 public:
-    static uint64 CalculatePipelineHash(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* lastUsedFBO, class CachedFBOMtl* activeFBO, const LatteContextRegister& lcr);
+    // Pair of 64-bit hashes, used as the key type of m_pipelineIsCached.
+	struct PipelineHash
+	{
+		PipelineHash(uint64 h0, uint64 h1) : h0(h0), h1(h1) {}
+
+		uint64 h0;
+		uint64 h1;
+
+		// two keys are equal only if both halves match
+		bool operator==(const PipelineHash& r) const
+		{
+			if (h0 != r.h0)
+				return false;
+			return h1 == r.h1;
+		}
+
+		// hasher for unordered containers: XOR of both halves
+		struct HashFunc
+		{
+			size_t operator()(const PipelineHash& v) const
+			{
+				static_assert(sizeof(uint64) == sizeof(size_t));
+				const size_t combined = v.h0 ^ v.h1;
+				return combined;
+			}
+		};
+	};

    MetalPipelineCache(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer} {}
    ~MetalPipelineCache();

    MTL::RenderPipelineState* GetRenderPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* lastUsedFBO, class CachedFBOMtl* activeFBO, const LatteContextRegister& lcr);

+    // Cache loading
+	// Resets the loader state, starts the compiler threads and opens (or creates) the cache file of the title.
+	uint32 BeginLoading(uint64 cacheTitleId); // returns count of pipelines stored in cache
+	// Performs one loading step. Returns true while work is still pending, false once loading is done.
+	// pipelinesMissingShaders is currently always set to 0.
+	bool UpdateLoading(uint32& pipelinesLoadedTotal, uint32& pipelinesMissingShaders);
+	// Signals the compiler threads to shut down; the cache file stays open for writing new pipelines.
+	void EndLoading();
+	// Processes one serialized cache entry; called by the compiler threads for each queued entry.
+	void LoadPipelineFromCache(std::span<uint8> fileData);
+       void Close(); // called on title exit
+
+	// Returns true if the hash pair is present in m_pipelineIsCached.
+	bool HasPipelineCached(uint64 baseHash, uint64 pipelineStateHash);
+	// Records the hash pair and queues the active shader hashes and GPU register state for writing to the cache file.
+	void AddCurrentStateToCache(uint64 baseHash, uint64 pipelineStateHash);
+
+	// pipeline serialization for file
+	bool SerializePipeline(class MemStreamWriter& memWriter, struct CachedPipeline& cachedPipeline);
+	bool DeserializePipeline(class MemStreamReader& memReader, struct CachedPipeline& cachedPipeline);
+
    // Debug
    size_t GetPipelineCacheSize() const { return m_pipelineCache.size(); }

@ -20,4 +56,19 @@ private:
    class MetalRenderer* m_mtlr;

    std::map<uint64, MTL::RenderPipelineState*> m_pipelineCache;
+
+	// Writer thread, created on the first call to AddCurrentStateToCache().
+	// Must start out null because that function tests the pointer before creating the thread.
+	std::thread* m_pipelineCacheStoreThread{ nullptr };
+
+	// hash pairs of the pipelines known to be cached
+	std::unordered_set<PipelineHash, PipelineHash::HashFunc> m_pipelineIsCached;
+	FSpinlock m_pipelineIsCachedLock;
+	// Cache file, opened by BeginLoading() and deleted by Close().
+	// Must start out null because both of them, and WorkerThread(), test the pointer.
+	class FileCache* s_cache{ nullptr };
+
+	// number of running compiler threads; zero tells the threads to exit
+	std::atomic_uint32_t m_numCompilationThreads{ 0 };
+	ConcurrentQueue<std::vector<uint8>> m_compilationQueue;
+	std::atomic_uint32_t m_compilationCount{ 0 };
+
+    static uint64 CalculatePipelineHash(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* lastUsedFBO, class CachedFBOMtl* activeFBO, const LatteContextRegister& lcr);
+
+    int CompilerThread();
+	void WorkerThread();
 };