diff --git a/src/Cafe/CMakeLists.txt b/src/Cafe/CMakeLists.txt
index 9b5f8d3e..b30f8efe 100644
--- a/src/Cafe/CMakeLists.txt
+++ b/src/Cafe/CMakeLists.txt
@@ -534,6 +534,7 @@ if(APPLE)
 endif()
 
 if(ENABLE_METAL)
+    # TODO: sort alphabetically
     target_sources(CemuCafe PRIVATE
         HW/Latte/Renderer/Metal/MetalRenderer.cpp
         HW/Latte/Renderer/Metal/MetalRenderer.h
@@ -555,11 +556,15 @@ if(ENABLE_METAL)
         HW/Latte/Renderer/Metal/RendererShaderMtl.h
         HW/Latte/Renderer/Metal/CachedFBOMtl.cpp
         HW/Latte/Renderer/Metal/CachedFBOMtl.h
+        HW/Latte/Renderer/Metal/MetalAttachmentsInfo.cpp
+        HW/Latte/Renderer/Metal/MetalAttachmentsInfo.h
         HW/Latte/Renderer/Metal/MetalBufferAllocator.h
         HW/Latte/Renderer/Metal/MetalMemoryManager.cpp
         HW/Latte/Renderer/Metal/MetalMemoryManager.h
         HW/Latte/Renderer/Metal/MetalOutputShaderCache.cpp
         HW/Latte/Renderer/Metal/MetalOutputShaderCache.h
+        HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp
+        HW/Latte/Renderer/Metal/MetalPipelineCompiler.h
         HW/Latte/Renderer/Metal/MetalPipelineCache.cpp
         HW/Latte/Renderer/Metal/MetalPipelineCache.h
         HW/Latte/Renderer/Metal/MetalDepthStencilCache.cpp
diff --git a/src/Cafe/HW/Latte/Core/LatteShader.cpp b/src/Cafe/HW/Latte/Core/LatteShader.cpp
index bc1279c3..9e3e6b1f 100644
--- a/src/Cafe/HW/Latte/Core/LatteShader.cpp
+++ b/src/Cafe/HW/Latte/Core/LatteShader.cpp
@@ -209,11 +209,9 @@ void LatteShader_free(LatteDecompilerShader* shader)
 	delete shader;
 }
 
-// both vertex and geometry/pixel shader depend on PS inputs
-// we prepare the PS import info in advance
-void LatteShader_UpdatePSInputs(uint32* contextRegisters)
+void LatteShader_CreatePSInputTable(LatteShaderPSInputTable* psInputTable, uint32* contextRegisters)
 {
-	// PS control
+    // PS control
 	uint32 psControl0 = contextRegisters[mmSPI_PS_IN_CONTROL_0];
 	uint32 spi0_positionEnable = (psControl0 >> 8) & 1;
 	uint32 spi0_positionCentroid = (psControl0 >> 9) & 1;
@@ -242,12 +240,12 @@ void LatteShader_UpdatePSInputs(uint32* contextRegisters)
 	{
 		key += std::rotr<uint64>(spi0_paramGen, 7);
 		key += std::rotr<uint64>(spi0_paramGenAddr, 3);
-		_activePSImportTable.paramGen = spi0_paramGen;
-		_activePSImportTable.paramGenGPR = spi0_paramGenAddr;
+		psInputTable->paramGen = spi0_paramGen;
+		psInputTable->paramGenGPR = spi0_paramGenAddr;
 	}
 	else
 	{
-		_activePSImportTable.paramGen = 0;
+		psInputTable->paramGen = 0;
 	}
 
 	// semantic imports from vertex shader
@@ -281,9 +279,9 @@ void LatteShader_UpdatePSInputs(uint32* contextRegisters)
 		key = std::rotl<uint64>(key, 7);
 		if (spi0_positionEnable && f == spi0_positionAddr)
 		{
-			_activePSImportTable.import[f].semanticId = LATTE_ANALYZER_IMPORT_INDEX_SPIPOSITION;
-			_activePSImportTable.import[f].isFlat = false;
-			_activePSImportTable.import[f].isNoPerspective = false;
+			psInputTable->import[f].semanticId = LATTE_ANALYZER_IMPORT_INDEX_SPIPOSITION;
+			psInputTable->import[f].isFlat = false;
+			psInputTable->import[f].isNoPerspective = false;
 			key += (uint64)0x33;
 		}
 		else
@@ -296,13 +294,20 @@ void LatteShader_UpdatePSInputs(uint32* contextRegisters)
 			semanticMask[psSemanticId >> 3] |= (1 << (psSemanticId & 7));
 #endif
 
-			_activePSImportTable.import[f].semanticId = psSemanticId;
-			_activePSImportTable.import[f].isFlat = (psInputControl&(1 << 10)) != 0;
-			_activePSImportTable.import[f].isNoPerspective = (psInputControl&(1 << 12)) != 0;
+			psInputTable->import[f].semanticId = psSemanticId;
+			psInputTable->import[f].isFlat = (psInputControl&(1 << 10)) != 0;
+			psInputTable->import[f].isNoPerspective = (psInputControl&(1 << 12)) != 0;
 		}
 	}
-	_activePSImportTable.key = key;
-	_activePSImportTable.count = numPSInputs;
+	psInputTable->key = key;
+	psInputTable->count = numPSInputs;
+}
+
+// Both the vertex and the geometry/pixel shader depend on the PS inputs, so the PS import info
+// is prepared in advance: this rebuilds the active table (_activePSImportTable) from contextRegisters.
+void LatteShader_UpdatePSInputs(uint32* contextRegisters)
+{
+	LatteShader_CreatePSInputTable(&_activePSImportTable, contextRegisters);
 }
 
 void LatteShader_CreateRendererShader(LatteDecompilerShader* shader, bool compileAsync)
diff --git a/src/Cafe/HW/Latte/Core/LatteShader.h b/src/Cafe/HW/Latte/Core/LatteShader.h
index f8dc6d1a..85d53b01 100644
--- a/src/Cafe/HW/Latte/Core/LatteShader.h
+++ b/src/Cafe/HW/Latte/Core/LatteShader.h
@@ -84,6 +84,7 @@ struct LatteShaderPSInputTable
 	}
 };
 
+void LatteShader_CreatePSInputTable(LatteShaderPSInputTable* psInputTable, uint32* contextRegisters); // fills *psInputTable (imports, paramGen, key, count) from the given context registers
 void LatteShader_UpdatePSInputs(uint32* contextRegisters);
 LatteShaderPSInputTable* LatteSHRC_GetPSInputTable();
 
@@ -126,4 +127,4 @@ void LatteShaderCache_writeSeparableGeometryShader(uint64 shaderBaseHash, uint64
 void LatteShaderCache_writeSeparablePixelShader(uint64 shaderBaseHash, uint64 shaderAuxHash, uint8* pixelShader, uint32 pixelShaderSize, uint32* contextRegisters, bool usesGeometryShader);
 
 // todo - refactor this
-sint32 LatteDecompiler_getTextureSamplerBaseIndex(LatteConst::ShaderType shaderType);
\ No newline at end of file
+sint32 LatteDecompiler_getTextureSamplerBaseIndex(LatteConst::ShaderType shaderType);
diff --git a/src/Cafe/HW/Latte/Core/LatteShaderCache.cpp b/src/Cafe/HW/Latte/Core/LatteShaderCache.cpp
index 4659ff10..126dcc50 100644
--- a/src/Cafe/HW/Latte/Core/LatteShaderCache.cpp
+++ b/src/Cafe/HW/Latte/Core/LatteShaderCache.cpp
@@ -64,7 +64,7 @@ FileCache* s_shaderCacheGeneric = nullptr;	// contains hardware and version inde
 #define SHADER_CACHE_TYPE_PIXEL					(2)
 
 bool LatteShaderCache_readSeparableShader(uint8* shaderInfoData, sint32 shaderInfoSize);
-void LatteShaderCache_LoadVulkanPipelineCache(uint64 cacheTitleId);
+void LatteShaderCache_LoadPipelineCache(uint64 cacheTitleId);
 bool LatteShaderCache_updatePipelineLoadingProgress();
 void LatteShaderCache_ShowProgress(const std::function <bool(void)>& loadUpdateFunc, bool isPipelines);
 
@@ -160,18 +160,11 @@ bool LoadTGAFile(const std::vector<uint8>& buffer, TGAFILE *tgaFile)
 void LatteShaderCache_finish()
 {
     if (g_renderer->GetType() == RendererAPI::Vulkan)
-	{
 		RendererShaderVk::ShaderCacheLoading_end();
-	}
 	else if (g_renderer->GetType() == RendererAPI::OpenGL)
-	{
 		RendererShaderGL::ShaderCacheLoading_end();
-	}
 	else if (g_renderer->GetType() == RendererAPI::Metal)
-	{
 	    RendererShaderMtl::ShaderCacheLoading_end();
-		MetalPipelineCache::ShaderCacheLoading_end();
-	}
 }
 
 uint32 LatteShaderCache_getShaderCacheExtraVersion(uint64 titleId)
@@ -251,18 +244,11 @@ void LatteShaderCache_Load()
 	fs::create_directories(ActiveSettings::GetCachePath("shaderCache/precompiled"), ec);
 	// initialize renderer specific caches
 	if (g_renderer->GetType() == RendererAPI::Vulkan)
-	{
 		RendererShaderVk::ShaderCacheLoading_begin(cacheTitleId);
-	}
 	else if (g_renderer->GetType() == RendererAPI::OpenGL)
-	{
 		RendererShaderGL::ShaderCacheLoading_begin(cacheTitleId);
-	}
 	else if (g_renderer->GetType() == RendererAPI::Metal)
-	{
 	    RendererShaderMtl::ShaderCacheLoading_begin(cacheTitleId);
-		MetalPipelineCache::ShaderCacheLoading_begin(cacheTitleId);
-	}
 	// get cache file name
 	const auto pathGeneric = ActiveSettings::GetCachePath("shaderCache/transferable/{:016x}_shaders.bin", cacheTitleId);
 	const auto pathGenericPre1_25_0 = ActiveSettings::GetCachePath("shaderCache/transferable/{:016x}.bin", cacheTitleId); // before 1.25.0
@@ -361,9 +347,9 @@ void LatteShaderCache_Load()
 	cemuLog_log(LogType::Force, "Shader cache loaded with {} shaders. Commited mem {}MB. Took {}ms", numLoadedShaders, (sint32)(memCommited/1024/1024), timeLoad);
 #endif
 	LatteShaderCache_finish();
-	// if Vulkan then also load pipeline cache
-	if (g_renderer->GetType() == RendererAPI::Vulkan)
-        LatteShaderCache_LoadVulkanPipelineCache(cacheTitleId);
+	// if Vulkan or Metal then also load pipeline cache
+	if (g_renderer->GetType() == RendererAPI::Vulkan || g_renderer->GetType() == RendererAPI::Metal)
+        LatteShaderCache_LoadPipelineCache(cacheTitleId);
 
 
 	g_renderer->BeginFrame(true);
@@ -518,13 +504,18 @@ void LatteShaderCache_ShowProgress(const std::function <bool(void)>& loadUpdateF
 	}
 }
 
-void LatteShaderCache_LoadVulkanPipelineCache(uint64 cacheTitleId)
+void LatteShaderCache_LoadPipelineCache(uint64 cacheTitleId) // loads the pipeline cache of the active backend (Vulkan or Metal) and shows the loading progress
 {
-	auto& pipelineCache = VulkanPipelineStableCache::GetInstance();
-	g_shaderCacheLoaderState.pipelineFileCount = pipelineCache.BeginLoading(cacheTitleId);
+	if (g_renderer->GetType() == RendererAPI::Vulkan) // BeginLoading's return value is stored as the pipeline file count
+	    g_shaderCacheLoaderState.pipelineFileCount = VulkanPipelineStableCache::GetInstance().BeginLoading(cacheTitleId);
+	else if (g_renderer->GetType() == RendererAPI::Metal) // for any other backend pipelineFileCount is left untouched
+		g_shaderCacheLoaderState.pipelineFileCount = MetalPipelineCache::GetInstance().BeginLoading(cacheTitleId);
 	g_shaderCacheLoaderState.loadedPipelines = 0;
 	LatteShaderCache_ShowProgress(LatteShaderCache_updatePipelineLoadingProgress, true);
-	pipelineCache.EndLoading();
+	if (g_renderer->GetType() == RendererAPI::Vulkan) // finish loading on the cache of the same backend that BeginLoading was called on
+	    VulkanPipelineStableCache::GetInstance().EndLoading();
+	else if (g_renderer->GetType() == RendererAPI::Metal)
+		MetalPipelineCache::GetInstance().EndLoading();
     if(Latte_GetStopSignal())
         LatteThread_Exit();
 }
@@ -532,7 +523,12 @@ void LatteShaderCache_LoadVulkanPipelineCache(uint64 cacheTitleId)
 bool LatteShaderCache_updatePipelineLoadingProgress()
 {
 	uint32 pipelinesMissingShaders = 0;
-	return VulkanPipelineStableCache::GetInstance().UpdateLoading(g_shaderCacheLoaderState.loadedPipelines, pipelinesMissingShaders);
+	switch (g_renderer->GetType()) // forward to the pipeline cache of the active backend
+	{
+	case RendererAPI::Vulkan: return VulkanPipelineStableCache::GetInstance().UpdateLoading(g_shaderCacheLoaderState.loadedPipelines, pipelinesMissingShaders);
+	case RendererAPI::Metal: return MetalPipelineCache::GetInstance().UpdateLoading(g_shaderCacheLoaderState.loadedPipelines, pipelinesMissingShaders);
+	default: return false; // any other backend: nothing is forwarded
+	}
 }
 
 uint64 LatteShaderCache_getShaderNameInTransferableCache(uint64 baseHash, uint32 shaderType)
@@ -791,22 +787,17 @@ void LatteShaderCache_Close()
         s_shaderCacheGeneric = nullptr;
     }
     if (g_renderer->GetType() == RendererAPI::Vulkan)
-	{
 		RendererShaderVk::ShaderCacheLoading_Close();
-	}
 	else if (g_renderer->GetType() == RendererAPI::OpenGL)
-	{
 		RendererShaderGL::ShaderCacheLoading_Close();
-	}
 	else if (g_renderer->GetType() == RendererAPI::Metal)
-	{
 	    RendererShaderMtl::ShaderCacheLoading_Close();
-		MetalPipelineCache::ShaderCacheLoading_Close();
-	}
 
-    // if Vulkan then also close pipeline cache
+    // if Vulkan or Metal then also close pipeline cache
     if (g_renderer->GetType() == RendererAPI::Vulkan)
         VulkanPipelineStableCache::GetInstance().Close();
+    else if (g_renderer->GetType() == RendererAPI::Metal)
+        MetalPipelineCache::GetInstance().Close();
 }
 
 #include <wx/msgdlg.h>
diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp
index 14287050..c6a5012b 100644
--- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp
+++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.cpp
@@ -65,7 +65,7 @@ LatteTextureMtl::LatteTextureMtl(class MetalRenderer* mtlRenderer, Latte::E_DIM
 	}
 	else if (textureType == MTL::TextureTypeCube)
 	{
-	    // Do notjing
+	    // Do nothing
 	}
 	else if (textureType == MTL::TextureTypeCubeArray)
 	{
@@ -81,13 +81,10 @@ LatteTextureMtl::LatteTextureMtl(class MetalRenderer* mtlRenderer, Latte::E_DIM
 
 	MTL::TextureUsage usage = MTL::TextureUsageShaderRead | MTL::TextureUsagePixelFormatView;
 	if (!Latte::IsCompressedFormat(format))
-	{
 		usage |= MTL::TextureUsageRenderTarget;
-	}
 	desc->setUsage(usage);
 
 	m_texture = mtlRenderer->GetDevice()->newTexture(desc);
-
 	desc->release();
 }
 
diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalAttachmentsInfo.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalAttachmentsInfo.cpp
new file mode 100644
index 00000000..88a2dfac
--- /dev/null
+++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalAttachmentsInfo.cpp
@@ -0,0 +1,54 @@
+#include "Cafe/HW/Latte/Renderer/Metal/MetalAttachmentsInfo.h"
+#include "Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.h"
+#include "Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h"
+#include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h"
+
+// Captures the attachment formats of an existing framebuffer object.
+// Color slots without a bound texture keep their default-initialized format.
+MetalAttachmentsInfo::MetalAttachmentsInfo(class CachedFBOMtl* fbo)
+{
+	// Color attachments
+	for (uint8 i = 0; i < LATTE_NUM_COLOR_TARGET; i++)
+	{
+		const auto& colorBuffer = fbo->colorBuffer[i];
+		auto texture = static_cast<LatteTextureViewMtl*>(colorBuffer.texture);
+		if (!texture)
+			continue;
+
+		colorFormats[i] = texture->format;
+	}
+
+	// Depth stencil attachment
+	if (fbo->depthBuffer.texture)
+	{
+		auto texture = static_cast<LatteTextureViewMtl*>(fbo->depthBuffer.texture);
+		depthFormat = texture->format;
+		hasStencil = fbo->depthBuffer.hasStencil;
+	}
+}
+
+// Derives the attachment formats from register state instead of a framebuffer object.
+// Only color buffers enabled in the active color buffer mask are recorded.
+MetalAttachmentsInfo::MetalAttachmentsInfo(const LatteContextRegister& lcr, const LatteDecompilerShader* pixelShader)
+{
+	uint8 cbMask = LatteMRT::GetActiveColorBufferMask(pixelShader, lcr);
+	bool dbMask = LatteMRT::GetActiveDepthBufferMask(lcr);
+
+	// Color attachments
+	// bounded by LATTE_NUM_COLOR_TARGET (the size of colorFormats) rather than a hard-coded 8
+	for (uint8 i = 0; i < LATTE_NUM_COLOR_TARGET; i++)
+	{
+		if ((cbMask & (1 << i)) == 0)
+			continue;
+
+		colorFormats[i] = LatteMRT::GetColorBufferFormat(i, lcr);
+	}
+
+	// Depth stencil attachment
+	if (dbMask)
+	{
+		Latte::E_GX2SURFFMT format = LatteMRT::GetDepthBufferFormat(lcr);
+		depthFormat = format;
+		hasStencil = GetMtlPixelFormatInfo(format, true).hasStencil;
+	}
+}
diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalAttachmentsInfo.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalAttachmentsInfo.h
new file mode 100644
index 00000000..c8ebe7c1
--- /dev/null
+++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalAttachmentsInfo.h
@@ -0,0 +1,15 @@
+#pragma once
+
+#include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h"
+
+class MetalAttachmentsInfo // formats of the color and depth/stencil attachments, built either from an FBO or from register state
+{
+public:
+    MetalAttachmentsInfo() = default; // all formats INVALID_FORMAT, no stencil (see member initializers)
+    MetalAttachmentsInfo(class CachedFBOMtl* fbo); // not explicit: a CachedFBOMtl* converts implicitly
+    MetalAttachmentsInfo(const LatteContextRegister& lcr, const class LatteDecompilerShader* pixelShader);
+
+    Latte::E_GX2SURFFMT colorFormats[LATTE_NUM_COLOR_TARGET] = {Latte::E_GX2SURFFMT::INVALID_FORMAT}; // NOTE(review): only element 0 is set explicitly, the rest are value-initialized; presumably relies on INVALID_FORMAT being 0 - confirm
+    Latte::E_GX2SURFFMT depthFormat = Latte::E_GX2SURFFMT::INVALID_FORMAT; // stays INVALID_FORMAT when no depth buffer is present
+    bool hasStencil = false;
+};
diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp
index 29459539..d49060fb 100644
--- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp
+++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.cpp
@@ -1,535 +1,186 @@
-#include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h"
 #include "Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h"
 #include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h"
-#include "Cafe/HW/Latte/Core/LatteShader.h"
-#include "Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.h"
 #include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h"
-#include "Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h"
-#include "Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h"
 
 #include "Cafe/HW/Latte/Core/FetchShader.h"
 #include "Cafe/HW/Latte/ISA/RegDefines.h"
-#include "Cemu/Logging/CemuLogging.h"
-#include "HW/Latte/Core/LatteConst.h"
+#include "Cafe/HW/Latte/Core/LatteConst.h"
+#include "Cafe/HW/Latte/Common/RegisterSerializer.h"
+#include "Cafe/HW/Latte/Core/LatteShaderCache.h"
+#include "Cemu/FileCache/FileCache.h"
+#include "Common/precompiled.h"
+#include "Cafe/HW/Latte/Core/LatteShader.h"
+#include "Cafe/HW/Latte/ISA/LatteReg.h"
+#include "Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h"
+#include "util/helpers/helpers.h"
 #include "config/ActiveSettings.h"
 
-static void rectsEmulationGS_outputSingleVertex(std::string& gsSrc, const LatteDecompilerShader* vertexShader, LatteShaderPSInputTable* psInputTable, sint32 vIdx, const LatteContextRegister& latteRegister)
+#include <openssl/sha.h>
+
+static bool g_compilePipelineThreadInit{false};
+static std::mutex g_compilePipelineMutex;
+static std::condition_variable g_compilePipelineCondVar;
+static std::queue<MetalPipelineCompiler*> g_compilePipelineRequests;
+
+static void compileThreadFunc(sint32 threadIndex)
 {
-	auto parameterMask = vertexShader->outputParameterMask;
-	for (uint32 i = 0; i < 32; i++)
+	SetThreadName("compilePl");
+
+	// one thread runs at normal priority while the others run at lower priority
+	if(threadIndex != 0)
+		; // TODO: set thread priority
+
+	while (true)
 	{
-		if ((parameterMask & (1 << i)) == 0)
-			continue;
-		sint32 vsSemanticId = psInputTable->getVertexShaderOutParamSemanticId(latteRegister.GetRawView(), i);
-		if (vsSemanticId < 0)
-			continue;
-		// make sure PS has matching input
-		if (!psInputTable->hasPSImportForSemanticId(vsSemanticId))
-			continue;
-		gsSrc.append(fmt::format("out.passParameterSem{} = objectPayload.vertexOut[{}].passParameterSem{};\r\n", vsSemanticId, vIdx, vsSemanticId));
-	}
-	gsSrc.append(fmt::format("out.position = objectPayload.vertexOut[{}].position;\r\n", vIdx));
-	gsSrc.append(fmt::format("mesh.set_vertex({}, out);\r\n", vIdx));
-}
+		std::unique_lock lock(g_compilePipelineMutex);
+		while (g_compilePipelineRequests.empty())
+			g_compilePipelineCondVar.wait(lock);
 
-static void rectsEmulationGS_outputGeneratedVertex(std::string& gsSrc, const LatteDecompilerShader* vertexShader, LatteShaderPSInputTable* psInputTable, const char* variant, const LatteContextRegister& latteRegister)
-{
-	auto parameterMask = vertexShader->outputParameterMask;
-	for (uint32 i = 0; i < 32; i++)
-	{
-		if ((parameterMask & (1 << i)) == 0)
-			continue;
-		sint32 vsSemanticId = psInputTable->getVertexShaderOutParamSemanticId(latteRegister.GetRawView(), i);
-		if (vsSemanticId < 0)
-			continue;
-		// make sure PS has matching input
-		if (!psInputTable->hasPSImportForSemanticId(vsSemanticId))
-			continue;
-		gsSrc.append(fmt::format("out.passParameterSem{} = gen4thVertex{}(objectPayload.vertexOut[0].passParameterSem{}, objectPayload.vertexOut[1].passParameterSem{}, objectPayload.vertexOut[2].passParameterSem{});\r\n", vsSemanticId, variant, vsSemanticId, vsSemanticId, vsSemanticId));
-	}
-	gsSrc.append(fmt::format("out.position = gen4thVertex{}(objectPayload.vertexOut[0].position, objectPayload.vertexOut[1].position, objectPayload.vertexOut[2].position);\r\n", variant));
-	gsSrc.append(fmt::format("mesh.set_vertex(3, out);\r\n"));
-}
+		MetalPipelineCompiler* request = g_compilePipelineRequests.front();
 
-static void rectsEmulationGS_outputVerticesCode(std::string& gsSrc, const LatteDecompilerShader* vertexShader, LatteShaderPSInputTable* psInputTable, sint32 p0, sint32 p1, sint32 p2, sint32 p3, const char* variant, const LatteContextRegister& latteRegister)
-{
-	sint32 pList[4] = { p0, p1, p2, p3 };
-	for (sint32 i = 0; i < 4; i++)
-	{
-		if (pList[i] == 3)
-			rectsEmulationGS_outputGeneratedVertex(gsSrc, vertexShader, psInputTable, variant, latteRegister);
-		else
-			rectsEmulationGS_outputSingleVertex(gsSrc, vertexShader, psInputTable, pList[i], latteRegister);
-	}
-	gsSrc.append(fmt::format("mesh.set_index(0, {});\r\n", pList[0]));
-	gsSrc.append(fmt::format("mesh.set_index(1, {});\r\n", pList[1]));
-	gsSrc.append(fmt::format("mesh.set_index(2, {});\r\n", pList[2]));
-	gsSrc.append(fmt::format("mesh.set_index(3, {});\r\n", pList[1]));
-	gsSrc.append(fmt::format("mesh.set_index(4, {});\r\n", pList[2]));
-	gsSrc.append(fmt::format("mesh.set_index(5, {});\r\n", pList[3]));
-}
+		g_compilePipelineRequests.pop();
 
-static RendererShaderMtl* rectsEmulationGS_generate(MetalRenderer* metalRenderer, const LatteDecompilerShader* vertexShader, const LatteContextRegister& latteRegister)
-{
-	std::string gsSrc;
-	gsSrc.append("#include <metal_stdlib>\r\n");
-	gsSrc.append("using namespace metal;\r\n");
+		lock.unlock();
 
-	LatteShaderPSInputTable* psInputTable = LatteSHRC_GetPSInputTable();
-
-	// inputs & outputs
-	std::string vertexOutDefinition = "struct VertexOut {\r\n";
-	vertexOutDefinition += "float4 position;\r\n";
-	std::string geometryOutDefinition = "struct GeometryOut {\r\n";
-	geometryOutDefinition += "float4 position [[position]];\r\n";
-	auto parameterMask = vertexShader->outputParameterMask;
-	for (sint32 f = 0; f < 2; f++)
-	{
-		for (uint32 i = 0; i < 32; i++)
-		{
-			if ((parameterMask & (1 << i)) == 0)
-				continue;
-			sint32 vsSemanticId = psInputTable->getVertexShaderOutParamSemanticId(latteRegister.GetRawView(), i);
-			if (vsSemanticId < 0)
-				continue;
-			auto psImport = psInputTable->getPSImportBySemanticId(vsSemanticId);
-			if (psImport == nullptr)
-				continue;
-
-			if (f == 0)
-			{
-				vertexOutDefinition += fmt::format("float4 passParameterSem{};\r\n", vsSemanticId);
-			}
-			else
-			{
-				geometryOutDefinition += fmt::format("float4 passParameterSem{}", vsSemanticId);
-
-    			geometryOutDefinition += fmt::format(" [[user(locn{})]]", psInputTable->getPSImportLocationBySemanticId(vsSemanticId));
-    			if (psImport->isFlat)
-    				geometryOutDefinition += " [[flat]]";
-    			if (psImport->isNoPerspective)
-    				geometryOutDefinition += " [[center_no_perspective]]";
-                geometryOutDefinition += ";\r\n";
-			}
-		}
-	}
-	vertexOutDefinition += "};\r\n";
-	geometryOutDefinition += "};\r\n";
-
-	gsSrc.append(vertexOutDefinition);
-	gsSrc.append(geometryOutDefinition);
-
-	gsSrc.append("struct ObjectPayload {\r\n");
-	gsSrc.append("VertexOut vertexOut[3];\r\n");
-	gsSrc.append("};\r\n");
-
-	// gen function
-	gsSrc.append("float4 gen4thVertexA(float4 a, float4 b, float4 c)\r\n");
-	gsSrc.append("{\r\n");
-	gsSrc.append("return b - (c - a);\r\n");
-	gsSrc.append("}\r\n");
-
-	gsSrc.append("float4 gen4thVertexB(float4 a, float4 b, float4 c)\r\n");
-	gsSrc.append("{\r\n");
-	gsSrc.append("return c - (b - a);\r\n");
-	gsSrc.append("}\r\n");
-
-	gsSrc.append("float4 gen4thVertexC(float4 a, float4 b, float4 c)\r\n");
-	gsSrc.append("{\r\n");
-	gsSrc.append("return c + (b - a);\r\n");
-	gsSrc.append("}\r\n");
-
-	// main
-	gsSrc.append("using MeshType = mesh<GeometryOut, void, 4, 2, topology::triangle>;\r\n");
-	gsSrc.append("[[mesh, max_total_threads_per_threadgroup(1)]]\r\n");
-	gsSrc.append("void main0(MeshType mesh, const object_data ObjectPayload& objectPayload [[payload]])\r\n");
-	gsSrc.append("{\r\n");
-	gsSrc.append("GeometryOut out;\r\n");
-
-	// there are two possible winding orders that need different triangle generation:
-	// 0 1
-	// 2 3
-	// and
-	// 0 1
-	// 3 2
-	// all others are just symmetries of these cases
-
-	// we can determine the case by comparing the distance 0<->1 and 0<->2
-
-	gsSrc.append("float dist0_1 = length(objectPayload.vertexOut[1].position.xy - objectPayload.vertexOut[0].position.xy);\r\n");
-	gsSrc.append("float dist0_2 = length(objectPayload.vertexOut[2].position.xy - objectPayload.vertexOut[0].position.xy);\r\n");
-	gsSrc.append("float dist1_2 = length(objectPayload.vertexOut[2].position.xy - objectPayload.vertexOut[1].position.xy);\r\n");
-
-	// emit vertices
-	gsSrc.append("if(dist0_1 > dist0_2 && dist0_1 > dist1_2)\r\n");
-	gsSrc.append("{\r\n");
-	// p0 to p1 is diagonal
-	rectsEmulationGS_outputVerticesCode(gsSrc, vertexShader, psInputTable, 2, 1, 0, 3, "A", latteRegister);
-	gsSrc.append("} else if ( dist0_2 > dist0_1 && dist0_2 > dist1_2 ) {\r\n");
-	// p0 to p2 is diagonal
-	rectsEmulationGS_outputVerticesCode(gsSrc, vertexShader, psInputTable, 1, 2, 0, 3, "B", latteRegister);
-	gsSrc.append("} else {\r\n");
-	// p1 to p2 is diagonal
-	rectsEmulationGS_outputVerticesCode(gsSrc, vertexShader, psInputTable, 0, 1, 2, 3, "C", latteRegister);
-	gsSrc.append("}\r\n");
-
-	gsSrc.append("mesh.set_primitive_count(2);\r\n");
-
-	gsSrc.append("}\r\n");
-
-	auto mtlShader = new RendererShaderMtl(metalRenderer, RendererShader::ShaderType::kGeometry, 0, 0, false, false, gsSrc);
-	mtlShader->PreponeCompilation(true);
-
-	return mtlShader;
-}
-
-#define INVALID_TITLE_ID 0xFFFFFFFFFFFFFFFF
-
-uint64 s_cacheTitleId = INVALID_TITLE_ID;
-
-extern std::atomic_int g_compiled_shaders_total;
-extern std::atomic_int g_compiled_shaders_async;
-
-template<typename T>
-void SetFragmentState(T* desc, CachedFBOMtl* lastUsedFBO, CachedFBOMtl* activeFBO, const LatteDecompilerShader* pixelShader, const LatteContextRegister& lcr)
-{
-	// Rasterization
-	bool rasterizationEnabled = !lcr.PA_CL_CLIP_CNTL.get_DX_RASTERIZATION_KILL();
-
-	// HACK
-	// TODO: include this in the hash?
-	if (!lcr.PA_CL_VTE_CNTL.get_VPORT_X_OFFSET_ENA())
-		rasterizationEnabled = true;
-
-	// Culling both front and back faces effectively disables rasterization
-	const auto& polygonControlReg = lcr.PA_SU_SC_MODE_CNTL;
-	uint32 cullFront = polygonControlReg.get_CULL_FRONT();
-	uint32 cullBack = polygonControlReg.get_CULL_BACK();
-	if (cullFront && cullBack)
-	    rasterizationEnabled = false;
-
-	auto pixelShaderMtl = static_cast<RendererShaderMtl*>(pixelShader->shader);
-
-	if (!rasterizationEnabled || !pixelShaderMtl)
-	{
-	    desc->setRasterizationEnabled(false);
-		return;
-	}
-
-    desc->setFragmentFunction(pixelShaderMtl->GetFunction());
-
-    // Color attachments
-	const Latte::LATTE_CB_COLOR_CONTROL& colorControlReg = lcr.CB_COLOR_CONTROL;
-	uint32 blendEnableMask = colorControlReg.get_BLEND_MASK();
-	uint32 renderTargetMask = lcr.CB_TARGET_MASK.get_MASK();
-	for (uint8 i = 0; i < LATTE_NUM_COLOR_TARGET; i++)
-	{
-	    const auto& colorBuffer = lastUsedFBO->colorBuffer[i];
-		auto texture = static_cast<LatteTextureViewMtl*>(colorBuffer.texture);
-		if (!texture)
-		{
-		    continue;
-		}
-		auto colorAttachment = desc->colorAttachments()->object(i);
-		colorAttachment->setPixelFormat(texture->GetRGBAView()->pixelFormat());
-
-		// Disable writes if not in the active FBO
-		if (!activeFBO->colorBuffer[i].texture)
-        {
-            colorAttachment->setWriteMask(MTL::ColorWriteMaskNone);
-            continue;
-        }
-
-		colorAttachment->setWriteMask(GetMtlColorWriteMask((renderTargetMask >> (i * 4)) & 0xF));
-
-		// Blending
-		bool blendEnabled = ((blendEnableMask & (1 << i))) != 0;
-		// Only float data type is blendable
-		if (blendEnabled && GetMtlPixelFormatInfo(texture->format, false).dataType == MetalDataType::FLOAT)
-		{
-       		colorAttachment->setBlendingEnabled(true);
-
-       		const auto& blendControlReg = lcr.CB_BLENDN_CONTROL[i];
-
-       		auto rgbBlendOp = GetMtlBlendOp(blendControlReg.get_COLOR_COMB_FCN());
-       		auto srcRgbBlendFactor = GetMtlBlendFactor(blendControlReg.get_COLOR_SRCBLEND());
-       		auto dstRgbBlendFactor = GetMtlBlendFactor(blendControlReg.get_COLOR_DSTBLEND());
-
-       		colorAttachment->setRgbBlendOperation(rgbBlendOp);
-       		colorAttachment->setSourceRGBBlendFactor(srcRgbBlendFactor);
-       		colorAttachment->setDestinationRGBBlendFactor(dstRgbBlendFactor);
-       		if (blendControlReg.get_SEPARATE_ALPHA_BLEND())
-       		{
-       			colorAttachment->setAlphaBlendOperation(GetMtlBlendOp(blendControlReg.get_ALPHA_COMB_FCN()));
-         		    colorAttachment->setSourceAlphaBlendFactor(GetMtlBlendFactor(blendControlReg.get_ALPHA_SRCBLEND()));
-         		    colorAttachment->setDestinationAlphaBlendFactor(GetMtlBlendFactor(blendControlReg.get_ALPHA_DSTBLEND()));
-       		}
-       		else
-       		{
-           		colorAttachment->setAlphaBlendOperation(rgbBlendOp);
-           		colorAttachment->setSourceAlphaBlendFactor(srcRgbBlendFactor);
-           		colorAttachment->setDestinationAlphaBlendFactor(dstRgbBlendFactor);
-       		}
-		}
-	}
-
-	// Depth stencil attachment
-	if (lastUsedFBO->depthBuffer.texture)
-	{
-	    auto texture = static_cast<LatteTextureViewMtl*>(lastUsedFBO->depthBuffer.texture);
-        desc->setDepthAttachmentPixelFormat(texture->GetRGBAView()->pixelFormat());
-        if (lastUsedFBO->depthBuffer.hasStencil)
-        {
-            desc->setStencilAttachmentPixelFormat(texture->GetRGBAView()->pixelFormat());
-        }
+		request->Compile(true, false, true);
+		delete request;
 	}
 }
 
-void MetalPipelineCache::ShaderCacheLoading_begin(uint64 cacheTitleId)
+static void initCompileThread()
 {
-    s_cacheTitleId = cacheTitleId;
+	// Derive the worker count from the number of physical cores:
+	// up to 2 cores -> 1 worker, otherwise 2 workers plus one for every extra core above 3
+	const uint32 physicalCores = GetPhysicalCoreCount();
+	uint32 workerCount = 1;
+	if (physicalCores > 2)
+		workerCount = 2 + (physicalCores - 3);
+
+	// cap at 8
+	if (workerCount > 8u)
+		workerCount = 8u;
+
+	// the worker loop never returns, so the threads are detached instead of joined
+	// each worker receives its index, which compileThreadFunc checks for the priority TODO
+	for (uint32 workerIndex = 0; workerIndex < workerCount; workerIndex++)
+		std::thread(compileThreadFunc, workerIndex).detach();
 }
 
-void MetalPipelineCache::ShaderCacheLoading_end()
+static void queuePipeline(MetalPipelineCompiler* v) // hands a compile request to the worker threads; the worker deletes it after compiling
 {
+	std::unique_lock lock(g_compilePipelineMutex);
+	g_compilePipelineRequests.push(v); // plain pointer copy; std::move on a raw pointer has no effect
+	lock.unlock(); // release before notifying so the woken worker does not immediately block on the mutex
+	g_compilePipelineCondVar.notify_one();
 }
 
-void MetalPipelineCache::ShaderCacheLoading_Close()
+// Heuristic: guess whether a pipeline is non-essential and may therefore be compiled asynchronously.
+// Non-essential means that skipping these drawcalls shouldn't lead to permanently corrupted graphics.
+bool IsAsyncPipelineAllowed(const MetalAttachmentsInfo& attachmentsInfo, Vector2i extend, uint32 indexCount)
 {
-    g_compiled_shaders_total = 0;
-    g_compiled_shaders_async = 0;
+	// Splatoon ink mechanics use 1600x1600 R8 and R8G8 framebuffers, this resolution is rare enough that we can just blacklist it globally
+	const bool isBlacklistedExtent = (extend.x == 1600) && (extend.y == 1600);
+	if (isBlacklistedExtent)
+		return false;
+
+	const bool hasDepthAttachment = (attachmentsInfo.depthFormat != Latte::E_GX2SURFFMT::INVALID_FORMAT);
+	if (hasDepthAttachment)
+		return true; // aggressive filter but seems to work well so far
+
+	// small index count (3,4,5,6) is often associated with full-viewport quads (which are considered essential due to often being used to generate persistent textures)
+	return indexCount > 6;
+}
+
+MetalPipelineCache* g_mtlPipelineCache = nullptr; // assigned by the MetalPipelineCache constructor
+
+MetalPipelineCache& MetalPipelineCache::GetInstance()
+{
+    return *g_mtlPipelineCache; // no null check: only valid once a MetalPipelineCache has been constructed
+}
+
+MetalPipelineCache::MetalPipelineCache(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer}
+{
+    g_mtlPipelineCache = this; // register this instance as the one returned by GetInstance()
 }
 
 MetalPipelineCache::~MetalPipelineCache()
 {
-    for (auto& pair : m_pipelineCache)
+    for (auto& [key, pipelineObj] : m_pipelineCache)
     {
-        pair.second->release();
+        pipelineObj->m_pipeline->release();
+        delete pipelineObj;
     }
-    m_pipelineCache.clear();
-
-    NS::Error* error = nullptr;
-    m_binaryArchive->serializeToURL(m_binaryArchiveURL, &error);
-    if (error)
-    {
-        cemuLog_log(LogType::Force, "error serializing binary archive: {}", error->localizedDescription()->utf8String());
-        error->release();
-    }
-    m_binaryArchive->release();
-
-    m_binaryArchiveURL->release();
 }
 
-MTL::RenderPipelineState* MetalPipelineCache::GetRenderPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* pixelShader, CachedFBOMtl* lastUsedFBO, CachedFBOMtl* activeFBO, const LatteContextRegister& lcr)
+PipelineObject* MetalPipelineCache::GetRenderPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, Vector2i extend, uint32 indexCount, const LatteContextRegister& lcr)
 {
-    uint64 stateHash = CalculateRenderPipelineHash(fetchShader, vertexShader, pixelShader, lastUsedFBO, lcr);
-    auto& pipeline = m_pipelineCache[stateHash];
-    if (pipeline)
-        return pipeline;
+    // Returns the pipeline object cached for this state; on a miss a new object is created, compiled (synchronously or queued for async compilation) and recorded for the on-disk cache.
+    uint64 hash = CalculatePipelineHash(fetchShader, vertexShader, geometryShader, pixelShader, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr);
+    PipelineObject*& pipelineObj = m_pipelineCache[hash]; // operator[] inserts a null entry on a miss; it is filled in below
+    if (pipelineObj) return pipelineObj;
 
-	auto vertexShaderMtl = static_cast<RendererShaderMtl*>(vertexShader->shader);
+    pipelineObj = new PipelineObject();
 
-	// Render pipeline state
-	MTL::RenderPipelineDescriptor* desc = MTL::RenderPipelineDescriptor::alloc()->init();
-	desc->setVertexFunction(vertexShaderMtl->GetFunction());
+    MetalPipelineCompiler* compiler = new MetalPipelineCompiler(m_mtlr, *pipelineObj);
+    compiler->InitFromState(fetchShader, vertexShader, geometryShader, pixelShader, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr);
 
-    // Vertex descriptor
-    if (!fetchShader->mtlFetchVertexManually)
-    {
-    	MTL::VertexDescriptor* vertexDescriptor = MTL::VertexDescriptor::alloc()->init();
-    	for (auto& bufferGroup : fetchShader->bufferGroups)
-    	{
-    		std::optional<LatteConst::VertexFetchType2> fetchType;
+    bool allowAsyncCompile = false; // async compilation requires both the user setting and IsAsyncPipelineAllowed()
+    if (GetConfig().async_compile)
+		allowAsyncCompile = IsAsyncPipelineAllowed(activeAttachmentsInfo, extend, indexCount);
 
-    		uint32 minBufferStride = 0;
-    		for (sint32 j = 0; j < bufferGroup.attribCount; ++j)
-    		{
-    			auto& attr = bufferGroup.attrib[j];
-
-    			uint32 semanticId = vertexShader->resourceMapping.attributeMapping[attr.semanticId];
-    			if (semanticId == (uint32)-1)
-    				continue; // attribute not used?
-
-    			auto attribute = vertexDescriptor->attributes()->object(semanticId);
-    			attribute->setOffset(attr.offset);
-    			attribute->setBufferIndex(GET_MTL_VERTEX_BUFFER_INDEX(attr.attributeBufferIndex));
-    			attribute->setFormat(GetMtlVertexFormat(attr.format));
-
-    			minBufferStride = std::max(minBufferStride, attr.offset + GetMtlVertexFormatSize(attr.format));
-
-    			if (fetchType.has_value())
-    				cemu_assert_debug(fetchType == attr.fetchType);
-    			else
-    				fetchType = attr.fetchType;
-
-    			if (attr.fetchType == LatteConst::INSTANCE_DATA)
-    			{
-    				cemu_assert_debug(attr.aluDivisor == 1); // other divisor not yet supported
-    			}
-    		}
-
-    		uint32 bufferIndex = bufferGroup.attributeBufferIndex;
-    		uint32 bufferBaseRegisterIndex = mmSQ_VTX_ATTRIBUTE_BLOCK_START + bufferIndex * 7;
-    		uint32 bufferStride = (lcr.GetRawView()[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF;
-
-    		auto layout = vertexDescriptor->layouts()->object(GET_MTL_VERTEX_BUFFER_INDEX(bufferIndex));
-    		if (bufferStride == 0)
-    		{
-    		    // Buffer stride cannot be zero, let's use the minimum stride
-    			bufferStride = minBufferStride;
-
-    			// Additionally, constant vertex function must be used
-    			layout->setStepFunction(MTL::VertexStepFunctionConstant);
-    			layout->setStepRate(0);
-    		}
-    		else
-    		{
-      		if (!fetchType.has_value() || fetchType == LatteConst::VertexFetchType2::VERTEX_DATA)
-     			layout->setStepFunction(MTL::VertexStepFunctionPerVertex);
-      		else if (fetchType == LatteConst::VertexFetchType2::INSTANCE_DATA)
-     			layout->setStepFunction(MTL::VertexStepFunctionPerInstance);
-      		else
-      		{
-      		    debug_printf("unimplemented vertex fetch type %u\n", (uint32)fetchType.value());
-     			cemu_assert(false);
-      		}
-    		}
-    		bufferStride = Align(bufferStride, 4);
-    		layout->setStride(bufferStride);
-    	}
-
-        // TODO: don't always set the vertex descriptor?
-    	desc->setVertexDescriptor(vertexDescriptor);
-        vertexDescriptor->release();
-    }
-
-	SetFragmentState(desc, lastUsedFBO, activeFBO, pixelShader, lcr);
-
-	TryLoadBinaryArchive();
-
-	// Load binary
-    if (m_binaryArchive)
-    {
-        NS::Object* binArchives[] = {m_binaryArchive};
-        auto binaryArchives = NS::Array::alloc()->init(binArchives, 1);
-        desc->setBinaryArchives(binaryArchives);
-        binaryArchives->release();
-    }
-
-    NS::Error* error = nullptr;
-#ifdef CEMU_DEBUG_ASSERT
-    desc->setLabel(GetLabel("Cached render pipeline state", desc));
-#endif
-	pipeline = m_mtlr->GetDevice()->newRenderPipelineState(desc, MTL::PipelineOptionFailOnBinaryArchiveMiss, nullptr, &error);
-
-	// Pipeline wasn't found in the binary archive, we need to compile it
-	if (error)
+	if (allowAsyncCompile)
 	{
-		desc->setBinaryArchives(nullptr);
+	    if (!g_compilePipelineThreadInit)
+		{
+			initCompileThread();
+			g_compilePipelineThreadInit = true;
+		}
 
-        error->release();
-        error = nullptr;
-#ifdef CEMU_DEBUG_ASSERT
-        desc->setLabel(GetLabel("New render pipeline state", desc));
-#endif
-	    pipeline = m_mtlr->GetDevice()->newRenderPipelineState(desc, &error);
-		if (error)
-		{
-		    cemuLog_log(LogType::Force, "error creating render pipeline state: {}", error->localizedDescription()->utf8String());
-			error->release();
-		}
-		else
-		{
-		    // Save binary
-			if (m_binaryArchive)
-			{
-                NS::Error* error = nullptr;
-                m_binaryArchive->addRenderPipelineFunctions(desc, &error);
-                if (error)
-                {
-                    cemuLog_log(LogType::Force, "error saving render pipeline functions: {}", error->localizedDescription()->utf8String());
-                    error->release();
-                }
-			}
-		}
+		queuePipeline(compiler); // the compiler is not deleted here; NOTE(review): presumably the compile thread frees it - confirm
+	}
+	else
+	{
+        const bool compiled = compiler->Compile(true, true, true); // force compile so the pipeline is ready; kept outside the assert so the call never depends on assert evaluation
+        cemu_assert_debug(compiled);
+        delete compiler;
 	}
-	desc->release();
 
-	return pipeline;
+	// Queue the current GPU state for the on-disk pipeline cache
+    AddCurrentStateToCache(hash, lastUsedAttachmentsInfo);
+
+    return pipelineObj;
 }
 
-MTL::RenderPipelineState* MetalPipelineCache::GetMeshPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, CachedFBOMtl* lastUsedFBO, CachedFBOMtl* activeFBO, const LatteContextRegister& lcr, Renderer::INDEX_TYPE hostIndexType)
-{
-    uint64 stateHash = CalculateRenderPipelineHash(fetchShader, vertexShader, pixelShader, lastUsedFBO, lcr);
-
-	stateHash += lcr.GetRawView()[mmVGT_PRIMITIVE_TYPE];
-	stateHash = std::rotl<uint64>(stateHash, 7);
-
-	stateHash += (uint8)hostIndexType;
-	stateHash = std::rotl<uint64>(stateHash, 7);
-
-    auto& pipeline = m_pipelineCache[stateHash];
-    if (pipeline)
-        return pipeline;
-
-	auto objectShaderMtl = static_cast<RendererShaderMtl*>(vertexShader->shader);
-	RendererShaderMtl* meshShaderMtl;
-	if (geometryShader)
-	{
-        meshShaderMtl = static_cast<RendererShaderMtl*>(geometryShader->shader);
-	}
-    else
-    {
-        // If there is no geometry shader, it means that we are emulating rects
-        meshShaderMtl = rectsEmulationGS_generate(m_mtlr, vertexShader, lcr);
-    }
-
-	// Render pipeline state
-	MTL::MeshRenderPipelineDescriptor* desc = MTL::MeshRenderPipelineDescriptor::alloc()->init();
-	desc->setObjectFunction(objectShaderMtl->GetFunction());
-	desc->setMeshFunction(meshShaderMtl->GetFunction());
-
-	SetFragmentState(desc, lastUsedFBO, activeFBO, pixelShader, lcr);
-
-	TryLoadBinaryArchive();
-
-	// Load binary
-    // TODO: no binary archives? :(
-
-    NS::Error* error = nullptr;
-#ifdef CEMU_DEBUG_ASSERT
-    desc->setLabel(GetLabel("Mesh pipeline state", desc));
-#endif
-	pipeline = m_mtlr->GetDevice()->newRenderPipelineState(desc, MTL::PipelineOptionNone, nullptr, &error);
-	desc->release();
-	if (error)
-	{
-    	cemuLog_log(LogType::Force, "error creating mesh render pipeline state: {}", error->localizedDescription()->utf8String());
-        error->release();
-	}
-
-	return pipeline;
-}
-
-uint64 MetalPipelineCache::CalculateRenderPipelineHash(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* lastUsedFBO, const LatteContextRegister& lcr)
+uint64 MetalPipelineCache::CalculatePipelineHash(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr)
 {
     // Hash
     uint64 stateHash = 0;
     for (int i = 0; i < Latte::GPU_LIMITS::NUM_COLOR_ATTACHMENTS; ++i)
 	{
-		auto textureView = static_cast<LatteTextureViewMtl*>(lastUsedFBO->colorBuffer[i].texture);
-		if (!textureView)
-		    continue;
+	    Latte::E_GX2SURFFMT format = lastUsedAttachmentsInfo.colorFormats[i];
+		if (format == Latte::E_GX2SURFFMT::INVALID_FORMAT)
+            continue;
 
-		stateHash += textureView->GetRGBAView()->pixelFormat() + i * 31;
+		stateHash += GetMtlPixelFormat(format, false) + i * 31;
 		stateHash = std::rotl<uint64>(stateHash, 7);
+		// Also encode the case where this color attachment is in the last used set but not in the active one
+		if (activeAttachmentsInfo.colorFormats[i] == Latte::E_GX2SURFFMT::INVALID_FORMAT)
+		{
+            stateHash += 1;
+		    stateHash = std::rotl<uint64>(stateHash, 1);
+		}
 	}
 
-	if (lastUsedFBO->depthBuffer.texture)
+	if (lastUsedAttachmentsInfo.depthFormat != Latte::E_GX2SURFFMT::INVALID_FORMAT)
 	{
-	    auto textureView = static_cast<LatteTextureViewMtl*>(lastUsedFBO->depthBuffer.texture);
-		stateHash += textureView->GetRGBAView()->pixelFormat();
+		stateHash += GetMtlPixelFormat(lastUsedAttachmentsInfo.depthFormat, true);
 		stateHash = std::rotl<uint64>(stateHash, 7);
+		// Same for a depth attachment that is missing from the active set
+		if (activeAttachmentsInfo.depthFormat == Latte::E_GX2SURFFMT::INVALID_FORMAT)
+		{
+            stateHash += 1;
+		    stateHash = std::rotl<uint64>(stateHash, 1);
+		}
 	}
 
 	for (auto& group : fetchShader->bufferGroups)
@@ -586,55 +237,388 @@ uint64 MetalPipelineCache::CalculateRenderPipelineHash(const LatteFetchShader* f
 		}
 	}
 
+	// Mesh pipeline: the primitive type is read from the lcr argument (not the global GPU state), because this function is also called with register state restored from the cache file
+	const LattePrimitiveMode primitiveMode = static_cast<LattePrimitiveMode>(lcr.GetRawView()[mmVGT_PRIMITIVE_TYPE]);
+    bool isPrimitiveRect = (primitiveMode == Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::RECTS);
+
+    bool usesGeometryShader = (geometryShader != nullptr || isPrimitiveRect);
+
+    if (usesGeometryShader)
+    {
+        stateHash += lcr.GetRawView()[mmVGT_PRIMITIVE_TYPE];
+        stateHash = std::rotl<uint64>(stateHash, 7);
+    }
+
 	return stateHash;
 }
 
-void MetalPipelineCache::TryLoadBinaryArchive()
+struct // progress bookkeeping for loading the pipeline cache file
 {
-    if (m_binaryArchive || s_cacheTitleId == INVALID_TITLE_ID)
-        return;
+	uint32 pipelineLoadIndex; // index of the next cache file entry that UpdateLoading() will read
+	uint32 pipelineMaxFileIndex; // taken from FileCache::GetMaximumFileIndex() in BeginLoading()
 
-    // GPU name
-    const char* deviceName1 = m_mtlr->GetDevice()->name()->utf8String();
-    std::string deviceName;
-    deviceName.assign(deviceName1);
+	std::atomic_uint32_t pipelinesQueued; // incremented for every entry pushed to the compilation queue
+	std::atomic_uint32_t pipelinesLoaded; // incremented by CompilerThread() after each processed entry
+} g_mtlCacheState;
 
-    // Replace spaces with underscores
-    for (auto& c : deviceName)
-    {
-        if (c == ' ')
-            c = '_';
-    }
+uint32 MetalPipelineCache::BeginLoading(uint64 cacheTitleId) // Starts the compiler threads and opens the title's pipeline cache file; returns its file count, or 0 if it could not be opened.
+{
+	std::error_code ec; // receives any directory creation error; it is not inspected afterwards
+	fs::create_directories(ActiveSettings::GetCachePath("shaderCache/transferable"), ec);
+	const auto pathCacheFile = ActiveSettings::GetCachePath("shaderCache/transferable/{:016x}_mtlpipeline.bin", cacheTitleId);
 
-    // OS version
-    auto osVersion = NS::ProcessInfo::processInfo()->operatingSystemVersion();
+	// init cache loader state
+	g_mtlCacheState.pipelineLoadIndex = 0;
+	g_mtlCacheState.pipelineMaxFileIndex = 0;
+	g_mtlCacheState.pipelinesLoaded = 0;
+	g_mtlCacheState.pipelinesQueued = 0;
 
-    // Precompiled binaries cannot be shared between different devices or OS versions
-    const std::string cacheFilename = fmt::format("{:016x}_mtl_pipelines.bin", s_cacheTitleId);
-	const fs::path cachePath = ActiveSettings::GetCachePath("shaderCache/precompiled/{}/{}-{}-{}/{}", deviceName, osVersion.majorVersion, osVersion.minorVersion, osVersion.patchVersion, cacheFilename);
+	// start async compilation threads
+	m_compilationCount.store(0);
+	m_compilationQueue.clear();
 
-	// Create the directory if it doesn't exist
-	std::filesystem::create_directories(cachePath.parent_path());
+	// get core count
+	uint32 cpuCoreCount = GetPhysicalCoreCount();
+	m_numCompilationThreads = std::clamp(cpuCoreCount, 1u, 8u); // between 1 and 8 compiler threads
+	// TODO: uncomment?
+	//if (VulkanRenderer::GetInstance()->GetDisableMultithreadedCompilation())
+	//	m_numCompilationThreads = 1;
 
-    m_binaryArchiveURL = NS::URL::fileURLWithPath(ToNSString((const char*)cachePath.generic_u8string().c_str()));
+	for (uint32 i = 0; i < m_numCompilationThreads; i++)
+	{
+		std::thread compileThread(&MetalPipelineCache::CompilerThread, this);
+		compileThread.detach(); // detached: the threads end themselves once EndLoading() zeroes m_numCompilationThreads
+	}
 
-    MTL::BinaryArchiveDescriptor* desc = MTL::BinaryArchiveDescriptor::alloc()->init();
-    desc->setUrl(m_binaryArchiveURL);
-
-    NS::Error* error = nullptr;
-    m_binaryArchive = m_mtlr->GetDevice()->newBinaryArchive(desc, &error);
-    if (error)
-    {
-        desc->setUrl(nullptr);
-
-        error->release();
-        error = nullptr;
-        m_binaryArchive = m_mtlr->GetDevice()->newBinaryArchive(desc, &error);
-        if (error)
-        {
-            cemuLog_log(LogType::Force, "failed to create binary archive: {}", error->localizedDescription()->utf8String());
-            error->release();
-        }
-    }
-    desc->release();
+	// open cache file or create it
+	cemu_assert_debug(s_cache == nullptr);
+	s_cache = FileCache::Open(pathCacheFile, true, LatteShaderCache_getPipelineCacheExtraVersion(cacheTitleId));
+	if (!s_cache)
+	{
+		cemuLog_log(LogType::Force, "Failed to open or create Metal pipeline cache file: {}", _pathToUtf8(pathCacheFile));
+		return 0; // note: the compiler threads started above keep running in this case
+	}
+	else
+	{
+		s_cache->UseCompression(false);
+		g_mtlCacheState.pipelineMaxFileIndex = s_cache->GetMaximumFileIndex();
+	}
+	return s_cache->GetFileCount();
+}
+
+bool MetalPipelineCache::UpdateLoading(uint32& pipelinesLoadedTotal, uint32& pipelinesMissingShaders)
+{
+	// Queues at most one cache entry per call; returns true while loading is still in progress, false once done
+	pipelinesLoadedTotal = g_mtlCacheState.pipelinesLoaded;
+	pipelinesMissingShaders = 0; // missing shaders are not counted here
+	if (!s_cache)
+		return false; // BeginLoading() could not open the cache file, so there is nothing to load
+
+	// walk the file indices until one entry could be queued
+	while (g_mtlCacheState.pipelineLoadIndex <= g_mtlCacheState.pipelineMaxFileIndex)
+	{
+		if (m_compilationQueue.size() >= 50) // queue up to 50 entries at a time
+		{
+			std::this_thread::sleep_for(std::chrono::milliseconds(10));
+			return true;
+		}
+		const uint32 entryIndex = g_mtlCacheState.pipelineLoadIndex++;
+		uint64 fileNameA, fileNameB;
+		std::vector<uint8> fileData;
+		if (!s_cache->GetFileByIndex(entryIndex, &fileNameA, &fileNameB, fileData))
+			continue; // no entry at this index, try the next one
+		g_mtlCacheState.pipelinesQueued++;
+		m_compilationQueue.push(std::move(fileData)); // queue for async compilation
+		return true;
+	}
+	// everything is queued; stay "busy" until the compiler threads have processed all entries
+	const bool stillCompiling = (g_mtlCacheState.pipelinesLoaded != g_mtlCacheState.pipelinesQueued);
+	if (stillCompiling)
+		std::this_thread::sleep_for(std::chrono::milliseconds(10));
+	return stillCompiling;
+}
+
+void MetalPipelineCache::EndLoading()
+{
+	// Zeroing the thread count is the shutdown signal that CompilerThread() checks in its loop condition
+	const uint32 threadsToStop = m_numCompilationThreads;
+	m_numCompilationThreads = 0;
+	// Push one empty workload per thread; each thread then re-checks the count and shuts down
+	uint32 remaining = threadsToStop;
+	while (remaining-- > 0)
+		m_compilationQueue.push({});
+	// The cache file is left open on purpose so that new pipelines can still be written to it
+}
+
+void MetalPipelineCache::Close()
+{
+    // Nothing to do when no cache file is open
+    if (!s_cache)
+        return;
+    delete s_cache;
+    s_cache = nullptr; // BeginLoading() expects the pointer to be null before it opens a cache
+}
+
+struct CachedPipeline // One pipeline cache entry: shader hashes, last used attachment formats and compacted GPU register state.
+{
+	struct ShaderHash // identifies a shader by its (baseHash, auxHash) pair
+	{
+		uint64 baseHash; // only meaningful once set() has been called
+		uint64 auxHash;
+		bool isPresent{}; // false until set() is called
+
+		void set(uint64 baseHash, uint64 auxHash)
+		{
+			this->baseHash = baseHash;
+			this->auxHash = auxHash;
+			this->isPresent = true;
+		}
+	};
+
+	ShaderHash vsHash; // includes fetch shader
+	ShaderHash gsHash;
+	ShaderHash psHash;
+
+	MetalAttachmentsInfo lastUsedAttachmentsInfo; // only colorFormats and depthFormat are written to the cache file
+
+	Latte::GPUCompactedRegisterState gpuState;
+};
+
+void MetalPipelineCache::LoadPipelineFromCache(std::span<uint8> fileData) // Rebuilds one pipeline from a serialized cache entry and stores it in m_pipelineCache; entries that cannot be restored are skipped.
+{
+	static FSpinlock s_spinlockSharedInternal;
+	// deserialize file
+	LatteContextRegister* lcr = new LatteContextRegister();
+	s_spinlockSharedInternal.lock();
+	CachedPipeline* cachedPipeline = new CachedPipeline();
+	s_spinlockSharedInternal.unlock();
+
+	// Frees the temporary state above; it has to run on every exit path, otherwise both objects leak
+	auto cleanup = [&]()
+	{
+		s_spinlockSharedInternal.lock();
+		delete lcr;
+		delete cachedPipeline;
+		s_spinlockSharedInternal.unlock();
+	};
+	MemStreamReader streamReader(fileData.data(), fileData.size());
+	if (!DeserializePipeline(streamReader, *cachedPipeline))
+	{
+		cleanup(); // failed to deserialize
+		return;
+	}
+	// restored register view from compacted state
+	Latte::LoadGPURegisterState(*lcr, cachedPipeline->gpuState);
+
+	LatteDecompilerShader* vertexShader = nullptr;
+	LatteDecompilerShader* geometryShader = nullptr;
+	LatteDecompilerShader* pixelShader = nullptr;
+	// find vertex shader
+	if (cachedPipeline->vsHash.isPresent)
+	{
+		vertexShader = LatteSHRC_FindVertexShader(cachedPipeline->vsHash.baseHash, cachedPipeline->vsHash.auxHash);
+		if (!vertexShader)
+		{
+			cemuLog_logDebug(LogType::Force, "Vertex shader not found in cache");
+			cleanup();
+			return;
+		}
+	}
+	// find geometry shader
+	if (cachedPipeline->gsHash.isPresent)
+	{
+		geometryShader = LatteSHRC_FindGeometryShader(cachedPipeline->gsHash.baseHash, cachedPipeline->gsHash.auxHash);
+		if (!geometryShader)
+		{
+			cemuLog_logDebug(LogType::Force, "Geometry shader not found in cache");
+			cleanup();
+			return;
+		}
+	}
+	// find pixel shader
+	if (cachedPipeline->psHash.isPresent)
+	{
+		pixelShader = LatteSHRC_FindPixelShader(cachedPipeline->psHash.baseHash, cachedPipeline->psHash.auxHash);
+		if (!pixelShader)
+		{
+			cemuLog_logDebug(LogType::Force, "Pixel shader not found in cache");
+			cleanup();
+			return;
+		}
+	}
+	// vertexShader is dereferenced below and a pixel shader is required, so entries lacking either one are rejected
+	if (!vertexShader || !pixelShader)
+	{
+		cemu_assert_debug(false);
+		cleanup();
+		return;
+	}
+
+	MetalAttachmentsInfo attachmentsInfo(*lcr, pixelShader);
+	PipelineObject* pipelineObject = new PipelineObject();
+	// compile (pp lives in its own scope so that it is destroyed early)
+	{
+		MetalPipelineCompiler pp(m_mtlr, *pipelineObject);
+		pp.InitFromState(vertexShader->compatibleFetchShader, vertexShader, geometryShader, pixelShader, cachedPipeline->lastUsedAttachmentsInfo, attachmentsInfo, *lcr);
+		pp.Compile(true, true, false);
+	}
+	// on success, cache the pipeline; on failure free the object instead of leaking it
+	if (pipelineObject->m_pipeline)
+	{
+		uint64 pipelineStateHash = CalculatePipelineHash(vertexShader->compatibleFetchShader, vertexShader, geometryShader, pixelShader, cachedPipeline->lastUsedAttachmentsInfo, attachmentsInfo, *lcr);
+		m_pipelineCacheLock.lock();
+		m_pipelineCache[pipelineStateHash] = pipelineObject;
+		m_pipelineCacheLock.unlock();
+	}
+	else
+		delete pipelineObject;
+	cleanup();
+}
+
+ConcurrentQueue<CachedPipeline*> g_mtlPipelineCachingQueue; // jobs pushed by AddCurrentStateToCache() and popped (then deleted) by WorkerThread()
+
+void MetalPipelineCache::AddCurrentStateToCache(uint64 pipelineStateHash, const MetalAttachmentsInfo& lastUsedAttachmentsInfo)
+{
+	// The writer thread is started lazily, the first time a pipeline has to be stored
+	if (m_pipelineCacheStoreThread == nullptr)
+	{
+		m_pipelineCacheStoreThread = new std::thread(&MetalPipelineCache::WorkerThread, this);
+		m_pipelineCacheStoreThread->detach();
+	}
+
+	// Snapshot what is stored for each cached pipeline:
+	// - the active shaders (referenced by hash)
+	// - the attachment formats passed in by the caller
+	// - an almost-complete register state of the GPU (minus some ALU uniform constants which aren't relevant)
+	CachedPipeline* cacheEntry = new CachedPipeline();
+	if (auto activeVS = LatteSHRC_GetActiveVertexShader(); activeVS)
+		cacheEntry->vsHash.set(activeVS->baseHash, activeVS->auxHash);
+	if (auto activeGS = LatteSHRC_GetActiveGeometryShader(); activeGS)
+		cacheEntry->gsHash.set(activeGS->baseHash, activeGS->auxHash);
+	if (auto activePS = LatteSHRC_GetActivePixelShader(); activePS)
+		cacheEntry->psHash.set(activePS->baseHash, activePS->auxHash);
+	cacheEntry->lastUsedAttachmentsInfo = lastUsedAttachmentsInfo;
+	Latte::StoreGPURegisterState(LatteGPUState.contextNew, cacheEntry->gpuState);
+
+	// queue the entry for the writer thread
+	g_mtlPipelineCachingQueue.push(cacheEntry);
+}
+
+bool MetalPipelineCache::SerializePipeline(MemStreamWriter& memWriter, CachedPipeline& cachedPipeline)
+{
+	// Stream layout: version, presence mask, hash pair of each present shader,
+	// color + depth formats of the last used attachments, compacted GPU register state
+	const CachedPipeline::ShaderHash* const shaderHashes[3] = { &cachedPipeline.vsHash, &cachedPipeline.gsHash, &cachedPipeline.psHash };
+	// version
+	memWriter.writeBE<uint8>(0x01);
+
+	// presence mask: bit 0 = vertex, bit 1 = geometry, bit 2 = pixel shader
+	uint8 presentMask = 0;
+	for (uint32 stage = 0; stage < 3; stage++)
+	{
+		if (shaderHashes[stage]->isPresent)
+			presentMask |= (uint8)(1u << stage);
+	}
+	memWriter.writeBE<uint8>(presentMask);
+
+	// hashes of the present shaders, in the same order as the mask bits
+	for (const CachedPipeline::ShaderHash* shaderHash : shaderHashes)
+	{
+		if (!shaderHash->isPresent)
+			continue;
+		memWriter.writeBE<uint64>(shaderHash->baseHash);
+		memWriter.writeBE<uint64>(shaderHash->auxHash);
+	}
+
+	// attachment formats, stored as 16 bit values
+	const MetalAttachmentsInfo& attachments = cachedPipeline.lastUsedAttachmentsInfo;
+	for (uint8 i = 0; i < LATTE_NUM_COLOR_TARGET; i++)
+		memWriter.writeBE<uint16>((uint16)attachments.colorFormats[i]);
+	memWriter.writeBE<uint16>((uint16)attachments.depthFormat);
+
+	Latte::SerializeRegisterState(cachedPipeline.gpuState, memWriter);
+	return true;
+}
+
+bool MetalPipelineCache::DeserializePipeline(MemStreamReader& memReader, CachedPipeline& cachedPipeline) // Reads one entry in the layout written by SerializePipeline(); returns false if it cannot be used.
+{
+	// version
+	if (memReader.readBE<uint8>() != 1)
+	{
+		cemuLog_log(LogType::Force, "Cached Metal pipeline corrupted or has unknown version");
+		return false;
+	}
+	// shader hashes (presence mask: bit 0 = vertex, bit 1 = geometry, bit 2 = pixel shader)
+	uint8 presentMask = memReader.readBE<uint8>();
+	if (presentMask & 1)
+	{
+		uint64 baseHash = memReader.readBE<uint64>();
+		uint64 auxHash = memReader.readBE<uint64>();
+		cachedPipeline.vsHash.set(baseHash, auxHash);
+	}
+	if (presentMask & 2)
+	{
+		uint64 baseHash = memReader.readBE<uint64>();
+		uint64 auxHash = memReader.readBE<uint64>();
+		cachedPipeline.gsHash.set(baseHash, auxHash);
+	}
+	if (presentMask & 4)
+	{
+		uint64 baseHash = memReader.readBE<uint64>();
+		uint64 auxHash = memReader.readBE<uint64>();
+		cachedPipeline.psHash.set(baseHash, auxHash);
+	}
+
+	for (uint8 i = 0; i < LATTE_NUM_COLOR_TARGET; i++)
+	    cachedPipeline.lastUsedAttachmentsInfo.colorFormats[i] = (Latte::E_GX2SURFFMT)memReader.readBE<uint16>();
+	cachedPipeline.lastUsedAttachmentsInfo.depthFormat = (Latte::E_GX2SURFFMT)memReader.readBE<uint16>();
+
+	// deserialize GPU state
+	if (!Latte::DeserializeRegisterState(cachedPipeline.gpuState, memReader))
+	{
+		return false;
+	}
+	if (memReader.hasError())
+		return false; // a read failed: reject the entry in all build types instead of only asserting in debug builds
+	return true;
+}
+
+int MetalPipelineCache::CompilerThread()
+{
+	SetThreadName("plCacheCompiler");
+	while (m_numCompilationThreads.load() > 0) // EndLoading() zeroes the count to request shutdown
+	{
+		std::vector<uint8> serializedPipeline = m_compilationQueue.pop();
+		if (serializedPipeline.empty())
+			continue; // empty workloads carry no pipeline; just re-check the loop condition
+		LoadPipelineFromCache(serializedPipeline);
+		g_mtlCacheState.pipelinesLoaded.fetch_add(1);
+	}
+	return 0;
+}
+
+void MetalPipelineCache::WorkerThread()
+{
+	SetThreadName("plCacheWriter");
+	// Endless consumer loop: takes queued pipeline descriptions and adds them to the cache file
+	for (;;)
+	{
+		CachedPipeline* job;
+		g_mtlPipelineCachingQueue.pop(job);
+		if (s_cache)
+		{
+			// serialize
+			MemStreamWriter memWriter(1024 * 4);
+			SerializePipeline(memWriter, *job);
+			auto blob = memWriter.getResult();
+			// the first 16 bytes of the SHA-256 of the serialized data form the two-part file name
+			uint8 digest[SHA256_DIGEST_LENGTH];
+			SHA256(blob.data(), blob.size(), digest);
+			uint64 nameA = *(uint64be*)(digest + 0);
+			uint64 nameB = *(uint64be*)(digest + 8);
+			s_cache->AddFileAsync({ nameA, nameB }, blob.data(), blob.size());
+		}
+		// while no cache file is open, jobs are dropped without being written
+		delete job;
+	}
 }
diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h
index 916a9072..270c2db7 100644
--- a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h
+++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h
@@ -1,24 +1,26 @@
 #pragma once
 
-#include <Metal/Metal.hpp>
-
-#include "HW/Latte/ISA/LatteReg.h"
-#include "HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h"
-#include "Cafe/HW/Latte/Renderer/Renderer.h"
+#include "Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h"
+#include "util/helpers/ConcurrentQueue.h"
+#include "util/helpers/fspinlock.h"
+#include "util/math/vector2.h"
 
 class MetalPipelineCache
 {
 public:
-    static void ShaderCacheLoading_begin(uint64 cacheTitleId);
-    static void ShaderCacheLoading_end();
-    static void ShaderCacheLoading_Close();
+	static MetalPipelineCache& GetInstance(); // returns the instance registered by the constructor
 
-    MetalPipelineCache(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer} {}
+    MetalPipelineCache(class MetalRenderer* metalRenderer);
     ~MetalPipelineCache();
 
-    MTL::RenderPipelineState* GetRenderPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* lastUsedFBO, class CachedFBOMtl* activeFBO, const LatteContextRegister& lcr);
+    PipelineObject* GetRenderPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const class MetalAttachmentsInfo& lastUsedAttachmentsInfo, const class MetalAttachmentsInfo& activeAttachmentsInfo, Vector2i extend, uint32 indexCount, const LatteContextRegister& lcr);
 
-    MTL::RenderPipelineState* GetMeshPipelineState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* lastUsedFBO, class CachedFBOMtl* activeFBO, const LatteContextRegister& lcr, Renderer::INDEX_TYPE hostIndexType);
+    // Cache loading
+	uint32 BeginLoading(uint64 cacheTitleId); // returns count of pipelines stored in cache
+	bool UpdateLoading(uint32& pipelinesLoadedTotal, uint32& pipelinesMissingShaders); // returns true while loading is still in progress
+	void EndLoading(); // stops the compiler threads, keeps the cache file open
+	void LoadPipelineFromCache(std::span<uint8> fileData);
+	void Close(); // called on title exit
 
     // Debug
     size_t GetPipelineCacheSize() const { return m_pipelineCache.size(); }
@@ -26,12 +28,25 @@ public:
 private:
     class MetalRenderer* m_mtlr;
 
-    std::map<uint64, MTL::RenderPipelineState*> m_pipelineCache;
+    std::map<uint64, PipelineObject*> m_pipelineCache;
+    FSpinlock m_pipelineCacheLock; // taken while LoadPipelineFromCache() inserts into m_pipelineCache
 
-    NS::URL* m_binaryArchiveURL;
-    MTL::BinaryArchive* m_binaryArchive;
+	std::thread* m_pipelineCacheStoreThread = nullptr; // created lazily; must start out null because it is tested before first use
 
-    uint64 CalculateRenderPipelineHash(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* pixelShader, class CachedFBOMtl* lastUsedFBO, const LatteContextRegister& lcr);
+	class FileCache* s_cache = nullptr; // open cache file, null while none is open
 
-    void TryLoadBinaryArchive();
+	std::atomic_uint32_t m_numCompilationThreads{ 0 };
+	ConcurrentQueue<std::vector<uint8>> m_compilationQueue;
+	std::atomic_uint32_t m_compilationCount{ 0 };
+
+    static uint64 CalculatePipelineHash(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const class MetalAttachmentsInfo& lastUsedAttachmentsInfo, const class MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr);
+
+    void AddCurrentStateToCache(uint64 pipelineStateHash, const class MetalAttachmentsInfo& lastUsedAttachmentsInfo);
+
+	// pipeline serialization for file
+	bool SerializePipeline(class MemStreamWriter& memWriter, struct CachedPipeline& cachedPipeline);
+	bool DeserializePipeline(class MemStreamReader& memReader, struct CachedPipeline& cachedPipeline);
+
+    int CompilerThread(); // loads queued cache entries until the thread count is zeroed
+	void WorkerThread(); // writes queued pipeline descriptions to the cache file
 };
diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp
new file mode 100644
index 00000000..9d74e2d9
--- /dev/null
+++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.cpp
@@ -0,0 +1,496 @@
+#include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h"
+#include "Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h"
+#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h"
+#include "Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.h"
+#include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h"
+#include "Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h"
+#include "Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h"
+
+#include "Cafe/HW/Latte/Core/FetchShader.h"
+#include "Cafe/HW/Latte/ISA/RegDefines.h"
+#include "Cafe/HW/Latte/Core/LatteConst.h"
+#include "Cafe/HW/Latte/Core/LatteShader.h"
+
+#include <chrono>
+
+extern std::atomic_int g_compiling_pipelines;
+extern std::atomic_int g_compiling_pipelines_async;
+extern std::atomic_uint64_t g_compiling_pipelines_syncTimeSum;
+
+// Appends MSL that copies payload vertex vIdx into "out" (its position plus every vertex shader output
+// that the pixel shader imports) and stores the result in mesh vertex slot vIdx.
+static void rectsEmulationGS_outputSingleVertex(std::string& gsSrc, const LatteDecompilerShader* vertexShader, LatteShaderPSInputTable& psInputTable, sint32 vIdx, const LatteContextRegister& latteRegister)
+{
+	const auto exportMask = vertexShader->outputParameterMask;
+	for (uint32 paramIdx = 0; paramIdx < 32; ++paramIdx)
+	{
+		if ((exportMask & (1 << paramIdx)) == 0)
+			continue;
+		const sint32 semanticId = psInputTable.getVertexShaderOutParamSemanticId(latteRegister.GetRawView(), paramIdx);
+		// skip parameters without a semantic id or without a matching pixel shader input
+		if (semanticId < 0 || !psInputTable.hasPSImportForSemanticId(semanticId))
+			continue;
+		gsSrc += fmt::format("out.passParameterSem{} = objectPayload.vertexOut[{}].passParameterSem{};\r\n", semanticId, vIdx, semanticId);
+	}
+	gsSrc += fmt::format("out.position = objectPayload.vertexOut[{}].position;\r\n", vIdx);
+	gsSrc += fmt::format("mesh.set_vertex({}, out);\r\n", vIdx);
+}
+
+// Appends MSL that synthesizes the fourth rectangle vertex: the position and each forwarded parameter are
+// computed by the emitted gen4thVertex<variant>() helper from payload vertices 0..2 and stored in slot 3.
+static void rectsEmulationGS_outputGeneratedVertex(std::string& gsSrc, const LatteDecompilerShader* vertexShader, LatteShaderPSInputTable& psInputTable, const char* variant, const LatteContextRegister& latteRegister)
+{
+	const auto exportMask = vertexShader->outputParameterMask;
+	for (uint32 paramIdx = 0; paramIdx < 32; ++paramIdx)
+	{
+		if ((exportMask & (1 << paramIdx)) == 0)
+			continue;
+		const sint32 semanticId = psInputTable.getVertexShaderOutParamSemanticId(latteRegister.GetRawView(), paramIdx);
+		// skip parameters without a semantic id or without a matching pixel shader input
+		if (semanticId < 0 || !psInputTable.hasPSImportForSemanticId(semanticId))
+			continue;
+		gsSrc += fmt::format("out.passParameterSem{} = gen4thVertex{}(objectPayload.vertexOut[0].passParameterSem{}, objectPayload.vertexOut[1].passParameterSem{}, objectPayload.vertexOut[2].passParameterSem{});\r\n", semanticId, variant, semanticId, semanticId, semanticId);
+	}
+	gsSrc += fmt::format("out.position = gen4thVertex{}(objectPayload.vertexOut[0].position, objectPayload.vertexOut[1].position, objectPayload.vertexOut[2].position);\r\n", variant);
+	gsSrc += "mesh.set_vertex(3, out);\r\n";
+}
+
+// Appends MSL for all four rectangle vertices (index 3 selects the generated one), followed by the six
+// indices of the two triangles (p0, p1, p2) and (p1, p2, p3).
+static void rectsEmulationGS_outputVerticesCode(std::string& gsSrc, const LatteDecompilerShader* vertexShader, LatteShaderPSInputTable& psInputTable, sint32 p0, sint32 p1, sint32 p2, sint32 p3, const char* variant, const LatteContextRegister& latteRegister)
+{
+	const sint32 corners[4] = { p0, p1, p2, p3 };
+	for (const sint32 corner : corners)
+	{
+		if (corner != 3)
+			rectsEmulationGS_outputSingleVertex(gsSrc, vertexShader, psInputTable, corner, latteRegister);
+		else
+			rectsEmulationGS_outputGeneratedVertex(gsSrc, vertexShader, psInputTable, variant, latteRegister);
+	}
+	// index slot -> position inside corners[]
+	const sint32 cornerForIndex[6] = { 0, 1, 2, 1, 2, 3 };
+	for (sint32 slot = 0; slot < 6; ++slot)
+		gsSrc += fmt::format("mesh.set_index({}, {});\r\n", slot, corners[cornerForIndex[slot]]);
+}
+// Generates the MSL source of the mesh shader used to emulate RECTS primitives (3 payload vertices -> 4-vertex quad) and wraps it in a new RendererShaderMtl.
+static RendererShaderMtl* rectsEmulationGS_generate(MetalRenderer* metalRenderer, const LatteDecompilerShader* vertexShader, const LatteContextRegister& latteRegister)
+{
+	std::string gsSrc;
+	gsSrc.append("#include <metal_stdlib>\r\n");
+	gsSrc.append("using namespace metal;\r\n");
+	// PS input table built from the given context registers; it decides which vertex shader outputs are forwarded
+	LatteShaderPSInputTable psInputTable;
+	LatteShader_CreatePSInputTable(&psInputTable, latteRegister.GetRawView());
+
+	// inputs & outputs: VertexOut mirrors the payload layout, GeometryOut is what the mesh shader emits
+	std::string vertexOutDefinition = "struct VertexOut {\r\n";
+	vertexOutDefinition += "float4 position;\r\n";
+	std::string geometryOutDefinition = "struct GeometryOut {\r\n";
+	geometryOutDefinition += "float4 position [[position]];\r\n";
+	auto parameterMask = vertexShader->outputParameterMask;
+	for (uint32 i = 0; i < 32; i++)
+	{
+		if ((parameterMask & (1 << i)) == 0)
+			continue;
+		sint32 vsSemanticId = psInputTable.getVertexShaderOutParamSemanticId(latteRegister.GetRawView(), i);
+		if (vsSemanticId < 0)
+			continue;
+		auto psImport = psInputTable.getPSImportBySemanticId(vsSemanticId);
+		if (psImport == nullptr)
+			continue;
+
+		// VertexOut
+		vertexOutDefinition += fmt::format("float4 passParameterSem{};\r\n", vsSemanticId);
+
+		// GeometryOut
+		geometryOutDefinition += fmt::format("float4 passParameterSem{}", vsSemanticId);
+		// attributes: user location of the PS import, then the interpolation qualifiers requested by that import
+        geometryOutDefinition += fmt::format(" [[user(locn{})]]", psInputTable.getPSImportLocationBySemanticId(vsSemanticId));
+        if (psImport->isFlat)
+            geometryOutDefinition += " [[flat]]";
+        if (psImport->isNoPerspective)
+			geometryOutDefinition += " [[center_no_perspective]]";
+        geometryOutDefinition += ";\r\n";
+	}
+	vertexOutDefinition += "};\r\n";
+	geometryOutDefinition += "};\r\n";
+
+	gsSrc.append(vertexOutDefinition);
+	gsSrc.append(geometryOutDefinition);
+
+	gsSrc.append("struct ObjectPayload {\r\n");
+	gsSrc.append("VertexOut vertexOut[3];\r\n");
+	gsSrc.append("};\r\n");
+
+	// gen functions: compute the missing 4th corner from the other three, one variant (A/B/C) per diagonal case below
+	gsSrc.append("float4 gen4thVertexA(float4 a, float4 b, float4 c)\r\n");
+	gsSrc.append("{\r\n");
+	gsSrc.append("return b - (c - a);\r\n");
+	gsSrc.append("}\r\n");
+
+	gsSrc.append("float4 gen4thVertexB(float4 a, float4 b, float4 c)\r\n");
+	gsSrc.append("{\r\n");
+	gsSrc.append("return c - (b - a);\r\n");
+	gsSrc.append("}\r\n");
+
+	gsSrc.append("float4 gen4thVertexC(float4 a, float4 b, float4 c)\r\n");
+	gsSrc.append("{\r\n");
+	gsSrc.append("return c + (b - a);\r\n");
+	gsSrc.append("}\r\n");
+
+	// main: mesh declared with up to 4 vertices and 2 triangle primitives, one thread per threadgroup
+	gsSrc.append("using MeshType = mesh<GeometryOut, void, 4, 2, topology::triangle>;\r\n");
+	gsSrc.append("[[mesh, max_total_threads_per_threadgroup(1)]]\r\n");
+	gsSrc.append("void main0(MeshType mesh, const object_data ObjectPayload& objectPayload [[payload]])\r\n");
+	gsSrc.append("{\r\n");
+	gsSrc.append("GeometryOut out;\r\n");
+
+	// there are two possible winding orders that need different triangle generation:
+	// 0 1
+	// 2 3
+	// and
+	// 0 1
+	// 3 2
+	// all others are just symmetries of these cases
+
+	// we determine the case by finding the longest of the three edges 0<->1, 0<->2 and 1<->2 (the diagonal)
+
+	gsSrc.append("float dist0_1 = length(objectPayload.vertexOut[1].position.xy - objectPayload.vertexOut[0].position.xy);\r\n");
+	gsSrc.append("float dist0_2 = length(objectPayload.vertexOut[2].position.xy - objectPayload.vertexOut[0].position.xy);\r\n");
+	gsSrc.append("float dist1_2 = length(objectPayload.vertexOut[2].position.xy - objectPayload.vertexOut[1].position.xy);\r\n");
+
+	// emit vertices
+	gsSrc.append("if(dist0_1 > dist0_2 && dist0_1 > dist1_2)\r\n");
+	gsSrc.append("{\r\n");
+	// p0 to p1 is diagonal
+	rectsEmulationGS_outputVerticesCode(gsSrc, vertexShader, psInputTable, 2, 1, 0, 3, "A", latteRegister);
+	gsSrc.append("} else if ( dist0_2 > dist0_1 && dist0_2 > dist1_2 ) {\r\n");
+	// p0 to p2 is diagonal
+	rectsEmulationGS_outputVerticesCode(gsSrc, vertexShader, psInputTable, 1, 2, 0, 3, "B", latteRegister);
+	gsSrc.append("} else {\r\n");
+	// p1 to p2 is diagonal
+	rectsEmulationGS_outputVerticesCode(gsSrc, vertexShader, psInputTable, 0, 1, 2, 3, "C", latteRegister);
+	gsSrc.append("}\r\n");
+
+	gsSrc.append("mesh.set_primitive_count(2);\r\n");
+
+	gsSrc.append("}\r\n");
+
+	auto mtlShader = new RendererShaderMtl(metalRenderer, RendererShader::ShaderType::kGeometry, 0, 0, false, false, gsSrc);
+	mtlShader->PreponeCompilation(true); // presumably compiles right away instead of deferring — confirm in RendererShaderMtl
+	// NOTE(review): the shader is heap-allocated and returned raw; no matching delete is visible in this file — confirm who owns it
+	return mtlShader;
+}
+
+#define INVALID_TITLE_ID 0xFFFFFFFFFFFFFFFF
+// NOTE(review): neither the macro nor this externally linked global is referenced anywhere else in this file — confirm they are needed here
+uint64 s_cacheTitleId = INVALID_TITLE_ID;
+
+extern std::atomic_int g_compiled_shaders_total;
+extern std::atomic_int g_compiled_shaders_async;
+
+// Configures the fragment-side state of a render or mesh render pipeline descriptor.
+//
+// desc                     descriptor to fill; called with MTL::RenderPipelineDescriptor and MTL::MeshRenderPipelineDescriptor
+// lastUsedAttachmentsInfo  supplies the color/depth formats written into the descriptor
+// activeAttachmentsInfo    color targets that are invalid here get their write mask set to none
+// rasterizationEnabled     when false, rasterization is disabled on the descriptor and nothing else is set
+// lcr                      context registers providing the blend setup and the render target write mask
+template<typename T>
+void SetFragmentState(T* desc, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, bool rasterizationEnabled, const LatteContextRegister& lcr)
+{
+	// TODO: check if the pixel shader is valid as well?
+	if (!rasterizationEnabled/* || !pixelShaderMtl*/)
+	{
+		desc->setRasterizationEnabled(false);
+		return;
+	}
+
+	// Color attachments
+	const uint32 blendMask = lcr.CB_COLOR_CONTROL.get_BLEND_MASK();
+	const uint32 targetWriteMask = lcr.CB_TARGET_MASK.get_MASK();
+	for (uint8 rt = 0; rt < LATTE_NUM_COLOR_TARGET; rt++)
+	{
+		const Latte::E_GX2SURFFMT lastFormat = lastUsedAttachmentsInfo.colorFormats[rt];
+		if (lastFormat == Latte::E_GX2SURFFMT::INVALID_FORMAT)
+			continue;
+
+		const MTL::PixelFormat mtlFormat = GetMtlPixelFormat(lastFormat, false);
+		auto attachment = desc->colorAttachments()->object(rt);
+		attachment->setPixelFormat(mtlFormat);
+
+		// Disable writes if not in the active FBO
+		if (activeAttachmentsInfo.colorFormats[rt] == Latte::E_GX2SURFFMT::INVALID_FORMAT)
+		{
+			attachment->setWriteMask(MTL::ColorWriteMaskNone);
+			continue;
+		}
+		attachment->setWriteMask(GetMtlColorWriteMask((targetWriteMask >> (rt * 4)) & 0xF));
+
+		// Blending: only when enabled for this target and the format's data type is float
+		const bool wantsBlend = (blendMask & (1 << rt)) != 0;
+		if (!wantsBlend || GetMtlPixelFormatInfo(lastFormat, false).dataType != MetalDataType::FLOAT)
+			continue;
+
+		attachment->setBlendingEnabled(true);
+		const auto& blendReg = lcr.CB_BLENDN_CONTROL[rt];
+
+		const auto colorOp = GetMtlBlendOp(blendReg.get_COLOR_COMB_FCN());
+		const auto colorSrc = GetMtlBlendFactor(blendReg.get_COLOR_SRCBLEND());
+		const auto colorDst = GetMtlBlendFactor(blendReg.get_COLOR_DSTBLEND());
+		attachment->setRgbBlendOperation(colorOp);
+		attachment->setSourceRGBBlendFactor(colorSrc);
+		attachment->setDestinationRGBBlendFactor(colorDst);
+
+		// Alpha either uses its own blend equation or mirrors the RGB one
+		const bool separateAlpha = blendReg.get_SEPARATE_ALPHA_BLEND();
+		const auto alphaOp = separateAlpha ? GetMtlBlendOp(blendReg.get_ALPHA_COMB_FCN()) : colorOp;
+		const auto alphaSrc = separateAlpha ? GetMtlBlendFactor(blendReg.get_ALPHA_SRCBLEND()) : colorSrc;
+		const auto alphaDst = separateAlpha ? GetMtlBlendFactor(blendReg.get_ALPHA_DSTBLEND()) : colorDst;
+		attachment->setAlphaBlendOperation(alphaOp);
+		attachment->setSourceAlphaBlendFactor(alphaSrc);
+		attachment->setDestinationAlphaBlendFactor(alphaDst);
+	}
+
+	// Depth stencil attachment
+	// (when a stencil is present it uses the same pixel format as the depth attachment)
+	if (lastUsedAttachmentsInfo.depthFormat == Latte::E_GX2SURFFMT::INVALID_FORMAT)
+		return;
+
+	const MTL::PixelFormat depthFormat = GetMtlPixelFormat(lastUsedAttachmentsInfo.depthFormat, true);
+	desc->setDepthAttachmentPixelFormat(depthFormat);
+	if (lastUsedAttachmentsInfo.hasStencil)
+		desc->setStencilAttachmentPixelFormat(depthFormat);
+}
+
+MetalPipelineCompiler::~MetalPipelineCompiler()
+{
+    /* NOTE(review): disabled code; it refers to m_pipelineCache, m_binaryArchive and m_binaryArchiveURL, none of which are declared in MetalPipelineCompiler — consider removing
+    for (auto& pair : m_pipelineCache)
+    {
+        pair.second->release();
+    }
+    m_pipelineCache.clear();
+
+    NS::Error* error = nullptr;
+    m_binaryArchive->serializeToURL(m_binaryArchiveURL, &error);
+    if (error)
+    {
+        cemuLog_log(LogType::Force, "error serializing binary archive: {}", error->localizedDescription()->utf8String());
+        error->release();
+    }
+    m_binaryArchive->release();
+
+    m_binaryArchiveURL->release();
+    */
+    m_pipelineDescriptor->release(); // drops the reference obtained via alloc()->init() in InitFromStateRender()/InitFromStateMesh()
+}
+
+// Captures everything Compile() needs from the given state: geometry/mesh path selection, whether
+// rasterization is enabled, the backend shader objects and the pipeline descriptor.
+void MetalPipelineCompiler::InitFromState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr)
+{
+	// RECTS primitives are drawn through a generated emulation mesh shader, so they also take the mesh path
+	const auto primitiveMode = static_cast<LattePrimitiveMode>(lcr.VGT_PRIMITIVE_TYPE.get_PRIMITIVE_MODE());
+	const bool emulateRects = (primitiveMode == Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::RECTS);
+	m_usesGeometryShader = (geometryShader != nullptr) || emulateRects;
+
+	// Rasterization
+	bool rasterize = !lcr.PA_CL_CLIP_CNTL.get_DX_RASTERIZATION_KILL();
+	// HACK
+	// TODO: include this in the hash?
+	if (!lcr.PA_CL_VTE_CNTL.get_VPORT_X_OFFSET_ENA())
+		rasterize = true;
+	// Culling both front and back faces effectively disables rasterization
+	const auto& polygonControl = lcr.PA_SU_SC_MODE_CNTL;
+	if (polygonControl.get_CULL_FRONT() && polygonControl.get_CULL_BACK())
+		rasterize = false;
+	m_rasterizationEnabled = rasterize;
+
+	// Shaders
+	m_vertexShaderMtl = static_cast<RendererShaderMtl*>(vertexShader->shader);
+	m_geometryShaderMtl = nullptr;
+	if (geometryShader)
+		m_geometryShaderMtl = static_cast<RendererShaderMtl*>(geometryShader->shader);
+	else if (emulateRects)
+		m_geometryShaderMtl = rectsEmulationGS_generate(m_mtlr, vertexShader, lcr);
+	m_pixelShaderMtl = static_cast<RendererShaderMtl*>(pixelShader->shader);
+
+	// Pipeline descriptor: regular render path unless a (real or emulation) geometry shader is in use
+	if (!m_usesGeometryShader)
+	{
+		InitFromStateRender(fetchShader, vertexShader, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr);
+		return;
+	}
+	InitFromStateMesh(fetchShader, lastUsedAttachmentsInfo, activeAttachmentsInfo, lcr);
+}
+
+// Creates the Metal render pipeline state from the descriptor prepared by InitFromState() and stores it
+// in the PipelineObject passed to the constructor.
+//
+// forceCompile    compile shader stages that are not compiled yet; otherwise return false if any is pending
+// isRenderThread  forwarded to PreponeCompilation(); also selects which overlay counter is updated
+// showInOverlay   whether the global pipeline compilation counters are updated
+// Returns false only when a stage is not compiled and forceCompile is false. Returns true otherwise,
+// even if pipeline creation reported an error (the error is only logged).
+bool MetalPipelineCompiler::Compile(bool forceCompile, bool isRenderThread, bool showInOverlay)
+{
+	// Every present shader stage must be compiled before its function can be attached
+	RendererShaderMtl* const stages[] = { m_vertexShaderMtl, m_geometryShaderMtl, m_pixelShaderMtl };
+	for (RendererShaderMtl* stage : stages)
+	{
+		if (!stage || stage->IsCompiled())
+			continue;
+		if (!forceCompile)
+			return false; // fail early if some shader stages are not compiled
+		stage->PreponeCompilation(isRenderThread);
+	}
+
+	// Compile
+	MTL::RenderPipelineState* pipeline = nullptr;
+	NS::Error* error = nullptr;
+
+	// Pipeline creation is timed; the duration feeds the overlay statistics below
+	auto start = std::chrono::high_resolution_clock::now();
+	if (m_usesGeometryShader)
+	{
+		auto desc = static_cast<MTL::MeshRenderPipelineDescriptor*>(m_pipelineDescriptor);
+
+		// Shaders: the vertex shader is set as the object function, the geometry shader as the mesh function
+		desc->setObjectFunction(m_vertexShaderMtl->GetFunction());
+		desc->setMeshFunction(m_geometryShaderMtl->GetFunction());
+		if (m_rasterizationEnabled)
+			desc->setFragmentFunction(m_pixelShaderMtl->GetFunction());
+
+#ifdef CEMU_DEBUG_ASSERT
+		desc->setLabel(GetLabel("Mesh render pipeline state", desc));
+#endif
+		pipeline = m_mtlr->GetDevice()->newRenderPipelineState(desc, MTL::PipelineOptionNone, nullptr, &error);
+	}
+	else
+	{
+		auto desc = static_cast<MTL::RenderPipelineDescriptor*>(m_pipelineDescriptor);
+
+		// Shaders
+		desc->setVertexFunction(m_vertexShaderMtl->GetFunction());
+		if (m_rasterizationEnabled)
+			desc->setFragmentFunction(m_pixelShaderMtl->GetFunction());
+
+#ifdef CEMU_DEBUG_ASSERT
+		desc->setLabel(GetLabel("Render pipeline state", desc));
+#endif
+		pipeline = m_mtlr->GetDevice()->newRenderPipelineState(desc, MTL::PipelineOptionNone, nullptr, &error);
+	}
+	auto end = std::chrono::high_resolution_clock::now();
+
+	auto creationDuration = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count();
+
+	if (error)
+	{
+		cemuLog_log(LogType::Force, "error creating render pipeline state: {}", error->localizedDescription()->utf8String());
+		// The error object is returned by reference and is not owned by this code, so it must not
+		// be released here (doing so would over-release it).
+	}
+
+	if (showInOverlay)
+	{
+		if (isRenderThread)
+			g_compiling_pipelines_syncTimeSum += creationDuration;
+		else
+			g_compiling_pipelines_async++;
+		g_compiling_pipelines++;
+	}
+
+	m_pipelineObj.m_pipeline = pipeline;
+
+	return true;
+}
+// Non-mesh path: builds the MTL::RenderPipelineDescriptor (vertex descriptor unless the fetch shader fetches vertices manually, then the fragment state).
+void MetalPipelineCompiler::InitFromStateRender(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr)
+{
+	// Render pipeline state
+	MTL::RenderPipelineDescriptor* desc = MTL::RenderPipelineDescriptor::alloc()->init();
+
+    // Vertex descriptor (only built when the fetch shader does not fetch vertices manually)
+    if (!fetchShader->mtlFetchVertexManually)
+    {
+    	MTL::VertexDescriptor* vertexDescriptor = MTL::VertexDescriptor::alloc()->init();
+    	for (auto& bufferGroup : fetchShader->bufferGroups)
+    	{
+    		std::optional<LatteConst::VertexFetchType2> fetchType;
+    		// smallest stride that still covers every used attribute of this buffer group
+    		uint32 minBufferStride = 0;
+    		for (sint32 j = 0; j < bufferGroup.attribCount; ++j)
+    		{
+    			auto& attr = bufferGroup.attrib[j];
+
+    			uint32 semanticId = vertexShader->resourceMapping.attributeMapping[attr.semanticId];
+    			if (semanticId == (uint32)-1)
+    				continue; // attribute not used?
+
+    			auto attribute = vertexDescriptor->attributes()->object(semanticId);
+    			attribute->setOffset(attr.offset);
+    			attribute->setBufferIndex(GET_MTL_VERTEX_BUFFER_INDEX(attr.attributeBufferIndex));
+    			attribute->setFormat(GetMtlVertexFormat(attr.format));
+
+    			minBufferStride = std::max(minBufferStride, attr.offset + GetMtlVertexFormatSize(attr.format));
+    			// all used attributes of a buffer group are expected to share one fetch type (asserted in debug builds)
+    			if (fetchType.has_value())
+    				cemu_assert_debug(fetchType == attr.fetchType);
+    			else
+    				fetchType = attr.fetchType;
+
+    			if (attr.fetchType == LatteConst::INSTANCE_DATA)
+    			{
+    				cemu_assert_debug(attr.aluDivisor == 1); // other divisor not yet supported
+    			}
+    		}
+
+    		uint32 bufferIndex = bufferGroup.attributeBufferIndex;
+    		uint32 bufferBaseRegisterIndex = mmSQ_VTX_ATTRIBUTE_BLOCK_START + bufferIndex * 7;
+    		uint32 bufferStride = (lcr.GetRawView()[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF; // 16 bits starting at bit 11 of the third register of this buffer's 7-register block
+
+    		auto layout = vertexDescriptor->layouts()->object(GET_MTL_VERTEX_BUFFER_INDEX(bufferIndex));
+    		if (bufferStride == 0)
+    		{
+    		    // Buffer stride cannot be zero, let's use the minimum stride
+    			bufferStride = minBufferStride;
+
+    			// Additionally, constant vertex function must be used
+    			layout->setStepFunction(MTL::VertexStepFunctionConstant);
+    			layout->setStepRate(0);
+    		}
+    		else
+    		{
+      		if (!fetchType.has_value() || fetchType == LatteConst::VertexFetchType2::VERTEX_DATA)
+     			layout->setStepFunction(MTL::VertexStepFunctionPerVertex);
+      		else if (fetchType == LatteConst::VertexFetchType2::INSTANCE_DATA)
+     			layout->setStepFunction(MTL::VertexStepFunctionPerInstance);
+      		else
+      		{
+      		    debug_printf("unimplemented vertex fetch type %u\n", (uint32)fetchType.value());
+     			cemu_assert(false);
+      		}
+    		}
+    		bufferStride = Align(bufferStride, 4); // NOTE(review): Align presumably rounds up to a multiple of 4 — confirm
+    		layout->setStride(bufferStride);
+    	}
+    	// the local reference is released right after the descriptor has been handed to desc
+    	desc->setVertexDescriptor(vertexDescriptor);
+        vertexDescriptor->release();
+    }
+
+	SetFragmentState(desc, lastUsedAttachmentsInfo, activeAttachmentsInfo, m_rasterizationEnabled, lcr);
+	// desc (from alloc()->init()) is kept in m_pipelineDescriptor, which the destructor releases
+	m_pipelineDescriptor = desc;
+}
+
+// Mesh path: creates the mesh render pipeline descriptor and applies the fragment state (fetchShader is not used here).
+void MetalPipelineCompiler::InitFromStateMesh(const LatteFetchShader* fetchShader, const MetalAttachmentsInfo& lastUsedAttachmentsInfo, const MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr)
+{
+	// Render pipeline state
+	auto* meshDesc = MTL::MeshRenderPipelineDescriptor::alloc()->init();
+	SetFragmentState(meshDesc, lastUsedAttachmentsInfo, activeAttachmentsInfo, m_rasterizationEnabled, lcr);
+	// the descriptor is kept (type-erased as NS::Object) in m_pipelineDescriptor
+	m_pipelineDescriptor = meshDesc;
+}
diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h
new file mode 100644
index 00000000..5006ed59
--- /dev/null
+++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalPipelineCompiler.h
@@ -0,0 +1,38 @@
+#pragma once
+
+#include "Cafe/HW/Latte/Renderer/Metal/MetalAttachmentsInfo.h"
+
+#include "Cafe/HW/Latte/ISA/LatteReg.h"
+#include "Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h"
+// Holder for a compiled render pipeline state; MetalPipelineCompiler::Compile() stores its result in m_pipeline.
+struct PipelineObject
+{
+    MTL::RenderPipelineState* m_pipeline = nullptr;
+};
+// Builds a single Metal render pipeline: InitFromState() prepares the descriptor, Compile() creates the pipeline state.
+class MetalPipelineCompiler
+{
+public:
+    MetalPipelineCompiler(class MetalRenderer* metalRenderer, PipelineObject& pipelineObj) : m_mtlr{metalRenderer}, m_pipelineObj{pipelineObj} {}
+    ~MetalPipelineCompiler();
+    // Captures shaders, rasterization state and the pipeline descriptor; must run before Compile()
+    void InitFromState(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const LatteDecompilerShader* geometryShader, const LatteDecompilerShader* pixelShader, const class MetalAttachmentsInfo& lastUsedAttachmentsInfo, const class MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr);
+    // Creates the pipeline state and stores it in the PipelineObject given to the constructor
+    bool Compile(bool forceCompile, bool isRenderThread, bool showInOverlay);
+
+private:
+    class MetalRenderer* m_mtlr;
+    PipelineObject& m_pipelineObj; // receives the compiled pipeline in Compile()
+    // Assigned by InitFromState(); default-initialized so no indeterminate value is read if it never ran
+    class RendererShaderMtl* m_vertexShaderMtl = nullptr;
+    class RendererShaderMtl* m_geometryShaderMtl = nullptr;
+    class RendererShaderMtl* m_pixelShaderMtl = nullptr;
+    bool m_usesGeometryShader = false;
+    bool m_rasterizationEnabled = false;
+    // Render or mesh render pipeline descriptor created by InitFromState(); released in the destructor
+    NS::Object* m_pipelineDescriptor = nullptr;
+
+    void InitFromStateRender(const LatteFetchShader* fetchShader, const LatteDecompilerShader* vertexShader, const class MetalAttachmentsInfo& lastUsedAttachmentsInfo, const class MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr);
+
+    void InitFromStateMesh(const LatteFetchShader* fetchShader, const class MetalAttachmentsInfo& lastUsedAttachmentsInfo, const class MetalAttachmentsInfo& activeAttachmentsInfo, const LatteContextRegister& lcr);
+};
diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp
index 7cd85857..dc4244ec 100644
--- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp
+++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp
@@ -23,6 +23,7 @@
 #include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h"
 #include "Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.h"
 #include "Cafe/HW/Latte/Renderer/Renderer.h"
+#include "HW/Latte/Renderer/Metal/MetalPipelineCompiler.h"
 #include "config/CemuConfig.h"
 
 #define IMGUI_IMPL_METAL_CPP
@@ -69,6 +70,7 @@ MetalRenderer::MetalRenderer()
     MTL::TextureDescriptor* textureDescriptor = MTL::TextureDescriptor::alloc()->init();
     textureDescriptor->setTextureType(MTL::TextureType1D);
     textureDescriptor->setWidth(1);
+    textureDescriptor->setUsage(MTL::TextureUsageShaderRead);
     m_nullTexture1D = m_device->newTexture(textureDescriptor);
 #ifdef CEMU_DEBUG_ASSERT
     m_nullTexture1D->setLabel(GetLabel("Null texture 1D", m_nullTexture1D));
@@ -76,6 +78,7 @@ MetalRenderer::MetalRenderer()
 
     textureDescriptor->setTextureType(MTL::TextureType2D);
     textureDescriptor->setHeight(1);
+    textureDescriptor->setUsage(MTL::TextureUsageShaderRead | MTL::TextureUsageRenderTarget);
     m_nullTexture2D = m_device->newTexture(textureDescriptor);
 #ifdef CEMU_DEBUG_ASSERT
     m_nullTexture2D->setLabel(GetLabel("Null texture 2D", m_nullTexture2D));
@@ -511,13 +514,13 @@ LatteCachedFBO* MetalRenderer::rendertarget_createCachedFBO(uint64 key)
 
 void MetalRenderer::rendertarget_deleteCachedFBO(LatteCachedFBO* cfbo)
 {
-	if (cfbo == (LatteCachedFBO*)m_state.m_activeFBO)
-	m_state.m_activeFBO = nullptr;
+	if (cfbo == (LatteCachedFBO*)m_state.m_activeFBO.m_fbo)
+	    m_state.m_activeFBO = {nullptr};
 }
 
 void MetalRenderer::rendertarget_bindFramebufferObject(LatteCachedFBO* cfbo)
 {
-	m_state.m_activeFBO = (CachedFBOMtl*)cfbo;
+	m_state.m_activeFBO = {(CachedFBOMtl*)cfbo, MetalAttachmentsInfo((CachedFBOMtl*)cfbo)};
 }
 
 void* MetalRenderer::texture_acquireTextureUploadBuffer(uint32 size)
@@ -943,15 +946,9 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32
 
     // Shaders
     LatteDecompilerShader* vertexShader = LatteSHRC_GetActiveVertexShader();
-    if (vertexShader && !vertexShader->shader->IsCompiled())
-        return;
     LatteDecompilerShader* geometryShader = LatteSHRC_GetActiveGeometryShader();
-    if (geometryShader && !geometryShader->shader->IsCompiled())
-        return;
     LatteDecompilerShader* pixelShader = LatteSHRC_GetActivePixelShader();
     const auto fetchShader = LatteSHRC_GetActiveFetchShader();
-    if (vertexShader && !pixelShader->shader->IsCompiled())
-        return;
 
     bool neverSkipAccurateBarrier = false;
 
@@ -1003,12 +1000,23 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32
 	// Render pass
 	auto renderCommandEncoder = GetRenderCommandEncoder();
 
+    // Render pipeline state
+    PipelineObject* pipelineObj = m_pipelineCache->GetRenderPipelineState(fetchShader, vertexShader, geometryShader, pixelShader, m_state.m_lastUsedFBO.m_attachmentsInfo, m_state.m_activeFBO.m_attachmentsInfo, m_state.m_activeFBO.m_fbo->m_size, count, LatteGPUState.contextNew);
+    if (!pipelineObj->m_pipeline)
+        return;
+
+    if (pipelineObj->m_pipeline != encoderState.m_renderPipelineState)
+   	{
+        renderCommandEncoder->setRenderPipelineState(pipelineObj->m_pipeline);
+  		encoderState.m_renderPipelineState = pipelineObj->m_pipeline;
+   	}
+
 	// Depth stencil state
 
 	// Disable depth write when there is no depth attachment
 	auto& depthControl = LatteGPUState.contextNew.DB_DEPTH_CONTROL;
 	bool depthWriteEnable = depthControl.get_Z_WRITE_ENABLE();
-	if (!m_state.m_activeFBO->depthBuffer.texture)
+	if (!m_state.m_activeFBO.m_fbo->depthBuffer.texture)
 	    depthControl.set_Z_WRITE_ENABLE(false);
 
 	MTL::DepthStencilState* depthStencilState = m_depthStencilCache->GetDepthStencilState(LatteGPUState.contextNew);
@@ -1221,22 +1229,6 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32
     //    renderCommandEncoder->memoryBarrier(barrierBuffers.data(), barrierBuffers.size(), MTL::RenderStageVertex, MTL::RenderStageVertex);
     //}
 
-	// Render pipeline state
-	MTL::RenderPipelineState* renderPipelineState;
-	if (usesGeometryShader)
-	    renderPipelineState = m_pipelineCache->GetMeshPipelineState(fetchShader, vertexShader, geometryShader, pixelShader, m_state.m_lastUsedFBO, m_state.m_activeFBO, LatteGPUState.contextNew, hostIndexType);
-	else
-        renderPipelineState = m_pipelineCache->GetRenderPipelineState(fetchShader, vertexShader, pixelShader, m_state.m_lastUsedFBO, m_state.m_activeFBO, LatteGPUState.contextNew);
-
-    if (!renderPipelineState)
-        return;
-
-	if (renderPipelineState != encoderState.m_renderPipelineState)
-   	{
-        renderCommandEncoder->setRenderPipelineState(renderPipelineState);
-  		encoderState.m_renderPipelineState = renderPipelineState;
-   	}
-
 	// Prepare streamout
 	m_state.m_streamoutState.verticesPerInstance = count;
 	LatteStreamout_PrepareDrawcall(count, instanceCount);
@@ -1529,12 +1521,12 @@ MTL::RenderCommandEncoder* MetalRenderer::GetRenderCommandEncoder(bool forceRecr
         {
             if (m_encoderType == MetalEncoderType::Render)
             {
-                bool needsNewRenderPass = (m_state.m_lastUsedFBO == nullptr);
+                bool needsNewRenderPass = (m_state.m_lastUsedFBO.m_fbo == nullptr);
                 if (!needsNewRenderPass)
                 {
                     for (uint8 i = 0; i < 8; i++)
                     {
-                        if (m_state.m_activeFBO->colorBuffer[i].texture && m_state.m_activeFBO->colorBuffer[i].texture != m_state.m_lastUsedFBO->colorBuffer[i].texture)
+                        if (m_state.m_activeFBO.m_fbo->colorBuffer[i].texture && m_state.m_activeFBO.m_fbo->colorBuffer[i].texture != m_state.m_lastUsedFBO.m_fbo->colorBuffer[i].texture)
                         {
                             needsNewRenderPass = true;
                             break;
@@ -1544,7 +1536,7 @@ MTL::RenderCommandEncoder* MetalRenderer::GetRenderCommandEncoder(bool forceRecr
 
                 if (!needsNewRenderPass)
                 {
-                    if (m_state.m_activeFBO->depthBuffer.texture && (m_state.m_activeFBO->depthBuffer.texture != m_state.m_lastUsedFBO->depthBuffer.texture || ( m_state.m_activeFBO->depthBuffer.hasStencil && !m_state.m_lastUsedFBO->depthBuffer.hasStencil)))
+                    if (m_state.m_activeFBO.m_fbo->depthBuffer.texture && (m_state.m_activeFBO.m_fbo->depthBuffer.texture != m_state.m_lastUsedFBO.m_fbo->depthBuffer.texture || ( m_state.m_activeFBO.m_fbo->depthBuffer.hasStencil && !m_state.m_lastUsedFBO.m_fbo->depthBuffer.hasStencil)))
                     {
                         needsNewRenderPass = true;
                     }
@@ -1562,7 +1554,7 @@ MTL::RenderCommandEncoder* MetalRenderer::GetRenderCommandEncoder(bool forceRecr
 
     auto commandBuffer = GetCommandBuffer();
 
-    auto renderCommandEncoder = commandBuffer->renderCommandEncoder(m_state.m_activeFBO->GetRenderPassDescriptor());
+    auto renderCommandEncoder = commandBuffer->renderCommandEncoder(m_state.m_activeFBO.m_fbo->GetRenderPassDescriptor());
 #ifdef CEMU_DEBUG_ASSERT
     renderCommandEncoder->setLabel(GetLabel("Render command encoder", renderCommandEncoder));
 #endif
@@ -1721,7 +1713,7 @@ bool MetalRenderer::CheckIfRenderPassNeedsFlush(LatteDecompilerShader* shader)
 		    // If the texture is also used in the current render pass, we need to end the render pass to "flush" the texture
 			for (uint8 i = 0; i < LATTE_NUM_COLOR_TARGET; i++)
 			{
-			    auto colorTarget = m_state.m_activeFBO->colorBuffer[i].texture;
+			    auto colorTarget = m_state.m_activeFBO.m_fbo->colorBuffer[i].texture;
 				if (colorTarget && colorTarget->baseTexture == baseTexture)
 				    return true;
 			}
diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h
index 526f33a5..93c9a56d 100644
--- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h
+++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h
@@ -5,6 +5,7 @@
 #include "Cafe/HW/Latte/Renderer/Metal/MetalLayerHandle.h"
 #include "Cafe/HW/Latte/Renderer/Metal/MetalPerformanceMonitor.h"
 #include "Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.h"
+#include "Cafe/HW/Latte/Renderer/Metal/MetalAttachmentsInfo.h"
 
 struct MetalBufferAllocation
 {
@@ -121,6 +122,12 @@ struct MetalStreamoutState
 	sint32 verticesPerInstance;
 };
 
+struct MetalActiveFBOState // an FBO pointer together with the attachment info captured for it
+{
+    class CachedFBOMtl* m_fbo = nullptr; // keep as first member: the struct is brace-initialized positionally in MetalRenderer.cpp
+    MetalAttachmentsInfo m_attachmentsInfo; // constructed from the FBO in rendertarget_bindFramebufferObject()
+};
+
 struct MetalState
 {
     MetalEncoderState m_encoderState{};
@@ -130,9 +137,9 @@ struct MetalState
     bool m_skipDrawSequence = false;
     bool m_isFirstDrawInRenderPass = true;
 
-    class CachedFBOMtl* m_activeFBO = nullptr;
-    // If the FBO changes, but it's the same FBO as the last one with some omitted attachments, this FBO doesn't change'
-    class CachedFBOMtl* m_lastUsedFBO = nullptr;
+    MetalActiveFBOState m_activeFBO;
+    // If the FBO changes, but it's the same FBO as the last one with some omitted attachments, this FBO doesn't change
+    MetalActiveFBOState m_lastUsedFBO;
 
     MetalBoundBuffer m_vertexBuffers[MAX_MTL_BUFFERS] = {{}};
     // TODO: find out what is the max number of bound textures on the Wii U