diff --git a/Source/Android/app/src/main/java/org/dolphinemu/dolphinemu/ui/settings/SettingsAdapter.java b/Source/Android/app/src/main/java/org/dolphinemu/dolphinemu/ui/settings/SettingsAdapter.java index 02d36db513..0887a7e68e 100644 --- a/Source/Android/app/src/main/java/org/dolphinemu/dolphinemu/ui/settings/SettingsAdapter.java +++ b/Source/Android/app/src/main/java/org/dolphinemu/dolphinemu/ui/settings/SettingsAdapter.java @@ -274,6 +274,10 @@ public final class SettingsAdapter extends RecyclerView.Adapter sl) { + int uberShaderModeValue = getUberShaderModeValue(); + Setting resolution = mSettings.get(SettingsFile.SETTINGS_GFX).get(SettingsFile.SECTION_GFX_SETTINGS).getSetting(SettingsFile.KEY_INTERNAL_RES); Setting fsaa = mSettings.get(SettingsFile.SETTINGS_GFX).get(SettingsFile.SECTION_GFX_SETTINGS).getSetting(SettingsFile.KEY_FSAA); Setting anisotropic = mSettings.get(SettingsFile.SETTINGS_GFX).get(SettingsFile.SECTION_GFX_ENHANCEMENTS).getSetting(SettingsFile.KEY_ANISOTROPY); @@ -283,6 +285,7 @@ public final class SettingsFragmentPresenter Setting perPixel = mSettings.get(SettingsFile.SETTINGS_GFX).get(SettingsFile.SECTION_GFX_SETTINGS).getSetting(SettingsFile.KEY_PER_PIXEL); Setting forceFilter = mSettings.get(SettingsFile.SETTINGS_GFX).get(SettingsFile.SECTION_GFX_ENHANCEMENTS).getSetting(SettingsFile.KEY_FORCE_FILTERING); Setting disableFog = mSettings.get(SettingsFile.SETTINGS_GFX).get(SettingsFile.SECTION_GFX_SETTINGS).getSetting(SettingsFile.KEY_DISABLE_FOG); + IntSetting uberShaderMode = new IntSetting(SettingsFile.KEY_UBERSHADER_MODE, SettingsFile.SECTION_GFX_SETTINGS, SettingsFile.SETTINGS_GFX, uberShaderModeValue); sl.add(new SingleChoiceSetting(SettingsFile.KEY_INTERNAL_RES, SettingsFile.SECTION_GFX_SETTINGS, SettingsFile.SETTINGS_GFX, R.string.internal_resolution, R.string.internal_resolution_descrip, R.array.internalResolutionEntries, R.array.internalResolutionValues, 0, resolution)); sl.add(new SingleChoiceSetting(SettingsFile.KEY_FSAA, 
SettingsFile.SECTION_GFX_SETTINGS, SettingsFile.SETTINGS_GFX, R.string.FSAA, R.string.FSAA_descrip, R.array.FSAAEntries, R.array.FSAAValues, 0, fsaa)); @@ -296,6 +299,7 @@ public final class SettingsFragmentPresenter sl.add(new CheckBoxSetting(SettingsFile.KEY_PER_PIXEL, SettingsFile.SECTION_GFX_SETTINGS, SettingsFile.SETTINGS_GFX, R.string.per_pixel_lighting, R.string.per_pixel_lighting_descrip, false, perPixel)); sl.add(new CheckBoxSetting(SettingsFile.KEY_FORCE_FILTERING, SettingsFile.SECTION_GFX_ENHANCEMENTS, SettingsFile.SETTINGS_GFX, R.string.force_texture_filtering, R.string.force_texture_filtering_descrip, false, forceFilter)); sl.add(new CheckBoxSetting(SettingsFile.KEY_DISABLE_FOG, SettingsFile.SECTION_GFX_SETTINGS, SettingsFile.SETTINGS_GFX, R.string.disable_fog, R.string.disable_fog_descrip, false, disableFog)); + sl.add(new SingleChoiceSetting(SettingsFile.KEY_UBERSHADER_MODE, SettingsFile.SECTION_GFX_SETTINGS, SettingsFile.SETTINGS_GFX, R.string.ubershader_mode, R.string.ubershader_mode_descrip, R.array.uberShaderModeEntries, R.array.uberShaderModeValues, 0, uberShaderMode)); /* Check if we support stereo @@ -903,6 +907,29 @@ public final class SettingsFragmentPresenter return xfbValue; } + /** + * Maps the two boolean GFX settings behind the ubershader option onto its single-choice value: 2 (Exclusive) when specialized shaders are disabled, 1 (Hybrid) when background shader compiling is enabled, otherwise 0 (Disabled). + */ + private int getUberShaderModeValue() + { + if (isGfxSettingEnabled(SettingsFile.KEY_DISABLE_SPECIALIZED_SHADERS)) + return 2; // Exclusive + + if (isGfxSettingEnabled(SettingsFile.KEY_BACKGROUND_SHADER_COMPILING)) + return 1; // Hybrid + + return 0; // Disabled + } + + /** + * Reads one boolean from the GFX settings section. A key that is absent or not held as a BooleanSetting counts as false, so one missing key no longer hides the value of the other. + */ + private boolean isGfxSettingEnabled(String key) + { + Setting setting = mSettings.get(SettingsFile.SETTINGS_GFX).get(SettingsFile.SECTION_GFX_SETTINGS).getSetting(key); + return setting instanceof BooleanSetting && ((BooleanSetting) setting).getValue(); + } + 
private int getExtensionValue(int wiimoteNumber) { int extensionValue; diff --git a/Source/Android/app/src/main/java/org/dolphinemu/dolphinemu/utils/SettingsFile.java b/Source/Android/app/src/main/java/org/dolphinemu/dolphinemu/utils/SettingsFile.java index e8d1cb1b8d..205fffce6c 100644 --- a/Source/Android/app/src/main/java/org/dolphinemu/dolphinemu/utils/SettingsFile.java +++ b/Source/Android/app/src/main/java/org/dolphinemu/dolphinemu/utils/SettingsFile.java @@ -79,6 +79,9 @@ public final class SettingsFile public static final String KEY_XFB_REAL = "UseRealXFB"; public static final String KEY_FAST_DEPTH = "FastDepthCalc"; public static final String KEY_ASPECT_RATIO = "AspectRatio"; + public static final String KEY_UBERSHADER_MODE = "UberShaderMode"; + public static final String KEY_DISABLE_SPECIALIZED_SHADERS = "DisableSpecializedShaders"; + public static final String KEY_BACKGROUND_SHADER_COMPILING = "BackgroundShaderCompiling"; public static final String KEY_GCPAD_TYPE = "SIDevice"; diff --git a/Source/Android/app/src/main/res/values/arrays.xml b/Source/Android/app/src/main/res/values/arrays.xml index 42b5409884..deeba91df3 100644 --- a/Source/Android/app/src/main/res/values/arrays.xml +++ b/Source/Android/app/src/main/res/values/arrays.xml @@ -89,6 +89,18 @@ 2 + + + Disabled + Hybrid + Exclusive + + + 0 + 1 + 2 + + 1x Native (640x528) diff --git a/Source/Android/app/src/main/res/values/strings.xml b/Source/Android/app/src/main/res/values/strings.xml index 292a5e7172..0996b2515d 100644 --- a/Source/Android/app/src/main/res/values/strings.xml +++ b/Source/Android/app/src/main/res/values/strings.xml @@ -179,6 +179,8 @@ Uses a less accurate algorithm to calculate depth values. Aspect Ratio Select what aspect ratio to use when rendering + Ubershader Mode + Specifies when to use Ubershaders. Disabled - Never, Hybrid - Use ubershaders while compiling specialized shaders. Exclusive - Use only ubershaders, largest performance impact. 
Yes diff --git a/Source/Core/Core/Config/GraphicsSettings.cpp b/Source/Core/Core/Config/GraphicsSettings.cpp index f467c8e911..e7c195a4cb 100644 --- a/Source/Core/Core/Config/GraphicsSettings.cpp +++ b/Source/Core/Core/Config/GraphicsSettings.cpp @@ -77,6 +77,20 @@ const ConfigInfo GFX_BACKEND_MULTITHREADING{ const ConfigInfo GFX_COMMAND_BUFFER_EXECUTE_INTERVAL{ {System::GFX, "Settings", "CommandBufferExecuteInterval"}, 100}; const ConfigInfo GFX_SHADER_CACHE{{System::GFX, "Settings", "ShaderCache"}, true}; +const ConfigInfo GFX_BACKGROUND_SHADER_COMPILING{ + {System::GFX, "Settings", "BackgroundShaderCompiling"}, false}; +const ConfigInfo GFX_DISABLE_SPECIALIZED_SHADERS{ + {System::GFX, "Settings", "DisableSpecializedShaders"}, false}; +const ConfigInfo GFX_PRECOMPILE_UBER_SHADERS{ + {System::GFX, "Settings", "PrecompileUberShaders"}, true}; +const ConfigInfo GFX_SHADER_COMPILER_THREADS{ + {System::GFX, "Settings", "ShaderCompilerThreads"}, 1}; +const ConfigInfo GFX_SHADER_PRECOMPILER_THREADS{ + {System::GFX, "Settings", "ShaderPrecompilerThreads"}, 1}; +const ConfigInfo GFX_FORCE_VERTEX_UBER_SHADERS{ + {System::GFX, "Settings", "ForceVertexUberShaders"}, false}; +const ConfigInfo GFX_FORCE_PIXEL_UBER_SHADERS{ + {System::GFX, "Settings", "ForcePixelUberShaders"}, false}; const ConfigInfo GFX_SW_ZCOMPLOC{{System::GFX, "Settings", "SWZComploc"}, true}; const ConfigInfo GFX_SW_ZFREEZE{{System::GFX, "Settings", "SWZFreeze"}, true}; diff --git a/Source/Core/Core/Config/GraphicsSettings.h b/Source/Core/Core/Config/GraphicsSettings.h index af2cf8ed1d..3ec39f300f 100644 --- a/Source/Core/Core/Config/GraphicsSettings.h +++ b/Source/Core/Core/Config/GraphicsSettings.h @@ -59,6 +59,13 @@ extern const ConfigInfo GFX_ENABLE_VALIDATION_LAYER; extern const ConfigInfo GFX_BACKEND_MULTITHREADING; extern const ConfigInfo GFX_COMMAND_BUFFER_EXECUTE_INTERVAL; extern const ConfigInfo GFX_SHADER_CACHE; +extern const ConfigInfo GFX_BACKGROUND_SHADER_COMPILING; +extern const ConfigInfo 
GFX_DISABLE_SPECIALIZED_SHADERS; +extern const ConfigInfo GFX_PRECOMPILE_UBER_SHADERS; +extern const ConfigInfo GFX_SHADER_COMPILER_THREADS; +extern const ConfigInfo GFX_SHADER_PRECOMPILER_THREADS; +extern const ConfigInfo GFX_FORCE_VERTEX_UBER_SHADERS; +extern const ConfigInfo GFX_FORCE_PIXEL_UBER_SHADERS; extern const ConfigInfo GFX_SW_ZCOMPLOC; extern const ConfigInfo GFX_SW_ZFREEZE; diff --git a/Source/Core/Core/ConfigLoaders/IsSettingSaveable.cpp b/Source/Core/Core/ConfigLoaders/IsSettingSaveable.cpp index 38ea90710f..74eb2cd85c 100644 --- a/Source/Core/Core/ConfigLoaders/IsSettingSaveable.cpp +++ b/Source/Core/Core/ConfigLoaders/IsSettingSaveable.cpp @@ -43,6 +43,11 @@ bool IsSettingSaveable(const Config::ConfigLocation& config_location) Config::GFX_DISABLE_FOG.location, Config::GFX_BORDERLESS_FULLSCREEN.location, Config::GFX_ENABLE_VALIDATION_LAYER.location, Config::GFX_BACKEND_MULTITHREADING.location, Config::GFX_COMMAND_BUFFER_EXECUTE_INTERVAL.location, Config::GFX_SHADER_CACHE.location, + Config::GFX_BACKGROUND_SHADER_COMPILING.location, + Config::GFX_DISABLE_SPECIALIZED_SHADERS.location, + Config::GFX_PRECOMPILE_UBER_SHADERS.location, Config::GFX_SHADER_COMPILER_THREADS.location, + Config::GFX_SHADER_PRECOMPILER_THREADS.location, + Config::GFX_FORCE_VERTEX_UBER_SHADERS.location, Config::GFX_FORCE_PIXEL_UBER_SHADERS.location, Config::GFX_SW_ZCOMPLOC.location, Config::GFX_SW_ZFREEZE.location, Config::GFX_SW_DUMP_OBJECTS.location, Config::GFX_SW_DUMP_TEV_STAGES.location, diff --git a/Source/Core/Core/Core.cpp b/Source/Core/Core/Core.cpp index 31cb2d8b4c..db77a1b2e4 100644 --- a/Source/Core/Core/Core.cpp +++ b/Source/Core/Core/Core.cpp @@ -341,6 +341,7 @@ static void CpuThread() { Common::SetCurrentThreadName("CPU-GPU thread"); g_video_backend->Video_Prepare(); + Host_Message(WM_USER_CREATE); } // This needs to be delayed until after the video backend is ready. 
@@ -409,6 +410,7 @@ static void FifoPlayerThread() else { g_video_backend->Video_Prepare(); + Host_Message(WM_USER_CREATE); Common::SetCurrentThreadName("FIFO-GPU thread"); } @@ -601,6 +603,7 @@ static void EmuThread(std::unique_ptr boot) Common::SetCurrentThreadName("Video thread"); g_video_backend->Video_Prepare(); + Host_Message(WM_USER_CREATE); // Spawn the CPU thread s_cpu_thread = std::thread(cpuThreadFunc); diff --git a/Source/Core/DolphinWX/VideoConfigDiag.cpp b/Source/Core/DolphinWX/VideoConfigDiag.cpp index 19af19461e..4ced1d69ae 100644 --- a/Source/Core/DolphinWX/VideoConfigDiag.cpp +++ b/Source/Core/DolphinWX/VideoConfigDiag.cpp @@ -308,6 +308,14 @@ static wxString gpu_texture_decoding_desc = wxTRANSLATE("Enables texture decoding using the GPU instead of the CPU. This may result in " "performance gains in some scenarios, or on systems where the CPU is the " "bottleneck.\n\nIf unsure, leave this unchecked."); +static wxString ubershader_desc = + wxTRANSLATE("Disabled: Ubershaders are never used. Stuttering will occur during shader " + "compilation, but GPU demands are low. Recommended for low-end hardware.\n\n" + "Hybrid: Ubershaders will be used to prevent stuttering during shader " + "compilation, but traditional shaders will be used when they will not cause " + "stuttering. Balances performance and smoothness.\n\n" + "Exclusive: Ubershaders will always be used. 
Only recommended for high-end " + "systems."); VideoConfigDiag::VideoConfigDiag(wxWindow* parent, const std::string& title) : wxDialog(parent, wxID_ANY, wxString::Format(_("Dolphin %s Graphics Configuration"), @@ -561,6 +569,29 @@ VideoConfigDiag::VideoConfigDiag(wxWindow* parent, const std::string& title) row += 1; } + // ubershaders + { + const std::array mode_choices = {{_("Disabled"), _("Hybrid"), _("Exclusive")}}; + + wxChoice* const choice_mode = + new wxChoice(page_enh, wxID_ANY, wxDefaultPosition, wxDefaultSize, + static_cast(mode_choices.size()), mode_choices.data()); + RegisterControl(choice_mode, wxGetTranslation(ubershader_desc)); + szr_enh->Add(new wxStaticText(page_enh, wxID_ANY, _("Ubershaders:")), wxGBPosition(row, 0), + wxDefaultSpan, wxALIGN_CENTER_VERTICAL); + szr_enh->Add(choice_mode, wxGBPosition(row, 1), span2, wxALIGN_CENTER_VERTICAL); + row += 1; + + // Determine ubershader mode + choice_mode->Bind(wxEVT_CHOICE, &VideoConfigDiag::OnUberShaderModeChanged, this); + if (Config::GetBase(Config::GFX_DISABLE_SPECIALIZED_SHADERS)) + choice_mode->SetSelection(2); + else if (Config::GetBase(Config::GFX_BACKGROUND_SHADER_COMPILING)) + choice_mode->SetSelection(1); + else + choice_mode->SetSelection(0); + } + // postproc shader if (vconfig.backend_info.bSupportsPostProcessing) { @@ -1326,3 +1357,13 @@ void VideoConfigDiag::OnAAChanged(wxCommandEvent& ev) Config::SetBaseOrCurrent(Config::GFX_MSAA, vconfig.backend_info.AAModes[mode]); } + +void VideoConfigDiag::OnUberShaderModeChanged(wxCommandEvent& ev) +{ + // 0: No ubershaders + // 1: Hybrid ubershaders + // 2: Only ubershaders + int mode = ev.GetInt(); + Config::SetBaseOrCurrent(Config::GFX_BACKGROUND_SHADER_COMPILING, mode == 1); + Config::SetBaseOrCurrent(Config::GFX_DISABLE_SPECIALIZED_SHADERS, mode == 2); +} diff --git a/Source/Core/DolphinWX/VideoConfigDiag.h b/Source/Core/DolphinWX/VideoConfigDiag.h index 431b6cdae9..1730b19090 100644 --- a/Source/Core/DolphinWX/VideoConfigDiag.h +++ 
b/Source/Core/DolphinWX/VideoConfigDiag.h @@ -140,6 +140,7 @@ protected: void PopulatePostProcessingShaders(); void PopulateAAList(); void OnAAChanged(wxCommandEvent& ev); + void OnUberShaderModeChanged(wxCommandEvent& ev); wxChoice* choice_backend; wxChoice* choice_adapter; diff --git a/Source/Core/VideoBackends/D3D/D3DBase.cpp b/Source/Core/VideoBackends/D3D/D3DBase.cpp index 60475988ce..cab8c628e6 100644 --- a/Source/Core/VideoBackends/D3D/D3DBase.cpp +++ b/Source/Core/VideoBackends/D3D/D3DBase.cpp @@ -185,10 +185,9 @@ std::vector EnumAAModes(IDXGIAdapter* adapter) ID3D11Device* _device; ID3D11DeviceContext* _context; D3D_FEATURE_LEVEL feat_level; - HRESULT hr = PD3D11CreateDevice(adapter, D3D_DRIVER_TYPE_UNKNOWN, nullptr, - D3D11_CREATE_DEVICE_SINGLETHREADED, supported_feature_levels, - NUM_SUPPORTED_FEATURE_LEVELS, D3D11_SDK_VERSION, &_device, - &feat_level, &_context); + HRESULT hr = PD3D11CreateDevice(adapter, D3D_DRIVER_TYPE_UNKNOWN, nullptr, 0, + supported_feature_levels, NUM_SUPPORTED_FEATURE_LEVELS, + D3D11_SDK_VERSION, &_device, &feat_level, &_context); if (FAILED(hr) || feat_level == D3D_FEATURE_LEVEL_10_0) { DXGI_SAMPLE_DESC desc; @@ -221,9 +220,9 @@ std::vector EnumAAModes(IDXGIAdapter* adapter) D3D_FEATURE_LEVEL GetFeatureLevel(IDXGIAdapter* adapter) { D3D_FEATURE_LEVEL feat_level = D3D_FEATURE_LEVEL_9_1; - PD3D11CreateDevice(adapter, D3D_DRIVER_TYPE_UNKNOWN, nullptr, D3D11_CREATE_DEVICE_SINGLETHREADED, - supported_feature_levels, NUM_SUPPORTED_FEATURE_LEVELS, D3D11_SDK_VERSION, - nullptr, &feat_level, nullptr); + PD3D11CreateDevice(adapter, D3D_DRIVER_TYPE_UNKNOWN, nullptr, 0, supported_feature_levels, + NUM_SUPPORTED_FEATURE_LEVELS, D3D11_SDK_VERSION, nullptr, &feat_level, + nullptr); return feat_level; } @@ -311,8 +310,7 @@ HRESULT Create(HWND wnd) // Creating debug devices can sometimes fail if the user doesn't have the correct // version of the DirectX SDK. If it does, simply fallback to a non-debug device. 
{ - hr = PD3D11CreateDevice(adapter, D3D_DRIVER_TYPE_UNKNOWN, nullptr, - D3D11_CREATE_DEVICE_SINGLETHREADED | D3D11_CREATE_DEVICE_DEBUG, + hr = PD3D11CreateDevice(adapter, D3D_DRIVER_TYPE_UNKNOWN, nullptr, D3D11_CREATE_DEVICE_DEBUG, supported_feature_levels, NUM_SUPPORTED_FEATURE_LEVELS, D3D11_SDK_VERSION, &device, &featlevel, &context); @@ -339,8 +337,7 @@ HRESULT Create(HWND wnd) if (FAILED(hr)) #endif { - hr = PD3D11CreateDevice(adapter, D3D_DRIVER_TYPE_UNKNOWN, nullptr, - D3D11_CREATE_DEVICE_SINGLETHREADED, supported_feature_levels, + hr = PD3D11CreateDevice(adapter, D3D_DRIVER_TYPE_UNKNOWN, nullptr, 0, supported_feature_levels, NUM_SUPPORTED_FEATURE_LEVELS, D3D11_SDK_VERSION, &device, &featlevel, &context); } diff --git a/Source/Core/VideoBackends/D3D/D3DShader.cpp b/Source/Core/VideoBackends/D3D/D3DShader.cpp index cd9622b4a3..6afb1eb1cc 100644 --- a/Source/Core/VideoBackends/D3D/D3DShader.cpp +++ b/Source/Core/VideoBackends/D3D/D3DShader.cpp @@ -18,7 +18,7 @@ namespace DX11 namespace D3D { // bytecode->shader -ID3D11VertexShader* CreateVertexShaderFromByteCode(const void* bytecode, unsigned int len) +ID3D11VertexShader* CreateVertexShaderFromByteCode(const void* bytecode, size_t len) { ID3D11VertexShader* v_shader; HRESULT hr = D3D::device->CreateVertexShader(bytecode, len, nullptr, &v_shader); @@ -73,7 +73,7 @@ bool CompileVertexShader(const std::string& code, D3DBlob** blob) } // bytecode->shader -ID3D11GeometryShader* CreateGeometryShaderFromByteCode(const void* bytecode, unsigned int len) +ID3D11GeometryShader* CreateGeometryShaderFromByteCode(const void* bytecode, size_t len) { ID3D11GeometryShader* g_shader; HRESULT hr = D3D::device->CreateGeometryShader(bytecode, len, nullptr, &g_shader); @@ -131,7 +131,7 @@ bool CompileGeometryShader(const std::string& code, D3DBlob** blob, } // bytecode->shader -ID3D11PixelShader* CreatePixelShaderFromByteCode(const void* bytecode, unsigned int len) +ID3D11PixelShader* CreatePixelShaderFromByteCode(const void* 
bytecode, size_t len) { ID3D11PixelShader* p_shader; HRESULT hr = D3D::device->CreatePixelShader(bytecode, len, nullptr, &p_shader); diff --git a/Source/Core/VideoBackends/D3D/D3DShader.h b/Source/Core/VideoBackends/D3D/D3DShader.h index 0b417d9885..751c854015 100644 --- a/Source/Core/VideoBackends/D3D/D3DShader.h +++ b/Source/Core/VideoBackends/D3D/D3DShader.h @@ -16,9 +16,9 @@ namespace DX11 { namespace D3D { -ID3D11VertexShader* CreateVertexShaderFromByteCode(const void* bytecode, unsigned int len); -ID3D11GeometryShader* CreateGeometryShaderFromByteCode(const void* bytecode, unsigned int len); -ID3D11PixelShader* CreatePixelShaderFromByteCode(const void* bytecode, unsigned int len); +ID3D11VertexShader* CreateVertexShaderFromByteCode(const void* bytecode, size_t len); +ID3D11GeometryShader* CreateGeometryShaderFromByteCode(const void* bytecode, size_t len); +ID3D11PixelShader* CreatePixelShaderFromByteCode(const void* bytecode, size_t len); // The returned bytecode buffers should be Release()d. bool CompileVertexShader(const std::string& code, D3DBlob** blob); diff --git a/Source/Core/VideoBackends/D3D/D3DState.cpp b/Source/Core/VideoBackends/D3D/D3DState.cpp index 09d8dec3e2..f12359e978 100644 --- a/Source/Core/VideoBackends/D3D/D3DState.cpp +++ b/Source/Core/VideoBackends/D3D/D3DState.cpp @@ -136,7 +136,7 @@ void StateManager::Apply() m_current.pixelConstants[1] != m_pending.pixelConstants[1]) { D3D::context->PSSetConstantBuffers(0, m_pending.pixelConstants[1] ? 
2 : 1, - m_pending.pixelConstants); + m_pending.pixelConstants.data()); m_current.pixelConstants[0] = m_pending.pixelConstants[0]; m_current.pixelConstants[1] = m_pending.pixelConstants[1]; } diff --git a/Source/Core/VideoBackends/D3D/D3DState.h b/Source/Core/VideoBackends/D3D/D3DState.h index bf86cf025a..f09bb22bb2 100644 --- a/Source/Core/VideoBackends/D3D/D3DState.h +++ b/Source/Core/VideoBackends/D3D/D3DState.h @@ -4,6 +4,7 @@ #pragma once +#include #include #include #include @@ -269,9 +270,9 @@ private: struct Resources { - ID3D11ShaderResourceView* textures[8]; - ID3D11SamplerState* samplers[8]; - ID3D11Buffer* pixelConstants[2]; + std::array textures; + std::array samplers; + std::array pixelConstants; ID3D11Buffer* vertexConstants; ID3D11Buffer* geometryConstants; ID3D11Buffer* vertexBuffer; diff --git a/Source/Core/VideoBackends/D3D/GeometryShaderCache.cpp b/Source/Core/VideoBackends/D3D/GeometryShaderCache.cpp index f6fd869b8c..f6d0599ac3 100644 --- a/Source/Core/VideoBackends/D3D/GeometryShaderCache.cpp +++ b/Source/Core/VideoBackends/D3D/GeometryShaderCache.cpp @@ -13,6 +13,7 @@ #include "VideoBackends/D3D/D3DBase.h" #include "VideoBackends/D3D/D3DShader.h" +#include "VideoBackends/D3D/D3DState.h" #include "VideoBackends/D3D/FramebufferManager.h" #include "VideoBackends/D3D/GeometryShaderCache.h" @@ -159,6 +160,9 @@ void GeometryShaderCache::Init() if (g_ActiveConfig.bShaderCache) LoadShaderCache(); + + if (g_ActiveConfig.CanPrecompileUberShaders()) + PrecompileShaders(); } void GeometryShaderCache::LoadShaderCache() @@ -175,6 +179,9 @@ void GeometryShaderCache::Reload() if (g_ActiveConfig.bShaderCache) LoadShaderCache(); + + if (g_ActiveConfig.CanPrecompileUberShaders()) + PrecompileShaders(); } // ONLY to be used during shutdown. 
@@ -203,78 +210,78 @@ void GeometryShaderCache::Shutdown() bool GeometryShaderCache::SetShader(u32 primitive_type) { GeometryShaderUid uid = GetGeometryShaderUid(primitive_type); - - // Check if the shader is already set - if (last_entry) + if (last_entry && uid == last_uid) { - if (uid == last_uid) - { - GFX_DEBUGGER_PAUSE_AT(NEXT_PIXEL_SHADER_CHANGE, true); - return true; - } + GFX_DEBUGGER_PAUSE_AT(NEXT_PIXEL_SHADER_CHANGE, true); + D3D::stateman->SetGeometryShader(last_entry->shader); + return true; } - last_uid = uid; - // Check if the shader is a pass-through shader if (uid.GetUidData()->IsPassthrough()) { // Return the default pass-through shader + last_uid = uid; last_entry = &pass_entry; + D3D::stateman->SetGeometryShader(last_entry->shader); return true; } // Check if the shader is already in the cache - GSCache::iterator iter; - iter = GeometryShaders.find(uid); + auto iter = GeometryShaders.find(uid); if (iter != GeometryShaders.end()) { const GSCacheEntry& entry = iter->second; + last_uid = uid; last_entry = &entry; - + D3D::stateman->SetGeometryShader(last_entry->shader); return (entry.shader != nullptr); } // Need to compile a new shader + if (CompileShader(uid)) + return SetShader(primitive_type); + else + return false; +} + +// Compiles the geometry shader for uid, adds it to the in-memory cache and, on success, +// appends its bytecode to the disk cache. Returns false if compiling or creating it fails. +bool GeometryShaderCache::CompileShader(const GeometryShaderUid& uid) +{ + D3DBlob* bytecode = nullptr; ShaderCode code = GenerateGeometryShaderCode(APIType::D3D, ShaderHostConfig::GetCurrent(), uid.GetUidData()); - - D3DBlob* pbytecode; - if (!D3D::CompileGeometryShader(code.GetBuffer(), &pbytecode)) + if (!D3D::CompileGeometryShader(code.GetBuffer(), &bytecode) || + !InsertByteCode(uid, bytecode->Data(), bytecode->Size())) { - GFX_DEBUGGER_PAUSE_AT(NEXT_ERROR, true); + SAFE_RELEASE(bytecode); return false; } // Insert the bytecode into the caches - g_gs_disk_cache.Append(uid, pbytecode->Data(), pbytecode->Size()); - - bool success = InsertByteCode(uid, pbytecode->Data(), pbytecode->Size()); - pbytecode->Release(); - - return success; 
-} - -bool GeometryShaderCache::InsertByteCode(const GeometryShaderUid& uid, const void* bytecode, - unsigned int bytecodelen) -{ - ID3D11GeometryShader* shader = D3D::CreateGeometryShaderFromByteCode(bytecode, bytecodelen); - if (shader == nullptr) - return false; - - // TODO: Somehow make the debug name a bit more specific - D3D::SetDebugObjectName((ID3D11DeviceChild*)shader, "a pixel shader of GeometryShaderCache"); - - // Make an entry in the table - GSCacheEntry newentry; - newentry.shader = shader; - GeometryShaders[uid] = newentry; - last_entry = &GeometryShaders[uid]; - - if (!shader) - return false; - + g_gs_disk_cache.Append(uid, bytecode->Data(), bytecode->Size()); + // InsertByteCode keeps only the created shader, so release the blob as the replaced code did. + bytecode->Release(); return true; } +bool GeometryShaderCache::InsertByteCode(const GeometryShaderUid& uid, const u8* bytecode, + size_t len) +{ + GSCacheEntry& newentry = GeometryShaders[uid]; + newentry.shader = bytecode ? D3D::CreateGeometryShaderFromByteCode(bytecode, len) : nullptr; + return newentry.shader != nullptr; +} + +void GeometryShaderCache::PrecompileShaders() +{ + EnumerateGeometryShaderUids([](const GeometryShaderUid& uid) { + if (GeometryShaders.find(uid) != GeometryShaders.end()) + return; + + CompileShader(uid); + }); +} + } // DX11 diff --git a/Source/Core/VideoBackends/D3D/GeometryShaderCache.h b/Source/Core/VideoBackends/D3D/GeometryShaderCache.h index e6eca3b2e0..6bebf0a2a4 100644 --- a/Source/Core/VideoBackends/D3D/GeometryShaderCache.h +++ b/Source/Core/VideoBackends/D3D/GeometryShaderCache.h @@ -18,14 +18,14 @@ public: static void Reload(); static void Clear(); static void Shutdown(); - static bool SetShader(u32 primitive_type); // TODO: Should be renamed to LoadShader - static bool InsertByteCode(const GeometryShaderUid& uid, const void* bytecode, - unsigned int bytecodelen); + static bool SetShader(u32 primitive_type); + static bool CompileShader(const GeometryShaderUid& uid); + static bool InsertByteCode(const GeometryShaderUid& uid, const u8* bytecode, size_t len); + static void 
PrecompileShaders(); static ID3D11GeometryShader* GetClearGeometryShader(); static ID3D11GeometryShader* GetCopyGeometryShader(); - static ID3D11GeometryShader* GetActiveShader() { return last_entry->shader; } static ID3D11Buffer*& GetConstantBuffer(); private: diff --git a/Source/Core/VideoBackends/D3D/NativeVertexFormat.cpp b/Source/Core/VideoBackends/D3D/NativeVertexFormat.cpp index 01658b66e6..fe17144bc0 100644 --- a/Source/Core/VideoBackends/D3D/NativeVertexFormat.cpp +++ b/Source/Core/VideoBackends/D3D/NativeVertexFormat.cpp @@ -13,20 +13,6 @@ namespace DX11 { -class D3DVertexFormat : public NativeVertexFormat -{ -public: - D3DVertexFormat(const PortableVertexDeclaration& vtx_decl); - ~D3DVertexFormat() { SAFE_RELEASE(m_layout); } - void SetupVertexPointers() override; - -private: - std::array m_elems{}; - UINT m_num_elems = 0; - - ID3D11InputLayout* m_layout = nullptr; -}; - std::unique_ptr VertexManager::CreateNativeVertexFormat(const PortableVertexDeclaration& vtx_decl) { @@ -66,7 +52,6 @@ D3DVertexFormat::D3DVertexFormat(const PortableVertexDeclaration& _vtx_decl) this->vtx_decl = _vtx_decl; const AttributeFormat* format = &_vtx_decl.position; - if (format->enable) { m_elems[m_num_elems].SemanticName = "POSITION"; @@ -129,15 +114,18 @@ D3DVertexFormat::D3DVertexFormat(const PortableVertexDeclaration& _vtx_decl) } } -void D3DVertexFormat::SetupVertexPointers() +D3DVertexFormat::~D3DVertexFormat() +{ + SAFE_RELEASE(m_layout); +} + +void D3DVertexFormat::SetInputLayout(D3DBlob* vs_bytecode) { if (!m_layout) { // CreateInputLayout requires a shader input, but it only looks at the // signature of the shader, so we don't need to recompute it if the shader // changes. 
- D3DBlob* vs_bytecode = DX11::VertexShaderCache::GetActiveShaderBytecode(); - HRESULT hr = DX11::D3D::device->CreateInputLayout( m_elems.data(), m_num_elems, vs_bytecode->Data(), vs_bytecode->Size(), &m_layout); if (FAILED(hr)) diff --git a/Source/Core/VideoBackends/D3D/PixelShaderCache.cpp b/Source/Core/VideoBackends/D3D/PixelShaderCache.cpp index 6bb02b890a..bb1aa04911 100644 --- a/Source/Core/VideoBackends/D3D/PixelShaderCache.cpp +++ b/Source/Core/VideoBackends/D3D/PixelShaderCache.cpp @@ -8,12 +8,15 @@ #include "Common/CommonTypes.h" #include "Common/FileUtil.h" #include "Common/LinearDiskCache.h" +#include "Common/MsgHandler.h" #include "Common/StringUtil.h" #include "Core/ConfigManager.h" +#include "Core/Host.h" #include "VideoBackends/D3D/D3DBase.h" #include "VideoBackends/D3D/D3DShader.h" +#include "VideoBackends/D3D/D3DState.h" #include "VideoBackends/D3D/PixelShaderCache.h" #include "VideoCommon/Debugger.h" @@ -25,10 +28,15 @@ namespace DX11 { PixelShaderCache::PSCache PixelShaderCache::PixelShaders; +PixelShaderCache::UberPSCache PixelShaderCache::UberPixelShaders; const PixelShaderCache::PSCacheEntry* PixelShaderCache::last_entry; +const PixelShaderCache::PSCacheEntry* PixelShaderCache::last_uber_entry; PixelShaderUid PixelShaderCache::last_uid; +UberShader::PixelShaderUid PixelShaderCache::last_uber_uid; LinearDiskCache g_ps_disk_cache; +LinearDiskCache g_uber_ps_disk_cache; +extern std::unique_ptr g_async_compiler; ID3D11PixelShader* s_ColorMatrixProgram[2] = {nullptr}; ID3D11PixelShader* s_ColorCopyProgram[2] = {nullptr}; @@ -429,10 +437,8 @@ ID3D11PixelShader* PixelShaderCache::GetDepthResolveProgram() return s_DepthResolveProgram; } -ID3D11Buffer*& PixelShaderCache::GetConstantBuffer() +static void UpdateConstantBuffers() { - // TODO: divide the global variables of the generated shaders into about 5 constant buffers to - // speed this up if (PixelShaderManager::dirty) { D3D11_MAPPED_SUBRESOURCE map; @@ -443,14 +449,20 @@ ID3D11Buffer*& 
PixelShaderCache::GetConstantBuffer() ADDSTAT(stats.thisFrame.bytesUniformStreamed, sizeof(PixelShaderConstants)); } +} + +ID3D11Buffer* PixelShaderCache::GetConstantBuffer() +{ + UpdateConstantBuffers(); return pscbuf; } // this class will load the precompiled shaders into our cache -class PixelShaderCacheInserter : public LinearDiskCacheReader +template +class PixelShaderCacheInserter : public LinearDiskCacheReader { public: - void Read(const PixelShaderUid& key, const u8* value, u32 value_size) + void Read(const UidType& key, const u8* value, u32 value_size) { PixelShaderCache::InsertByteCode(key, value, value_size); } @@ -499,22 +511,34 @@ void PixelShaderCache::Init() if (g_ActiveConfig.bShaderCache) LoadShaderCache(); + + if (g_ActiveConfig.CanPrecompileUberShaders()) + QueueUberShaderCompiles(); } void PixelShaderCache::LoadShaderCache() { - PixelShaderCacheInserter inserter; + PixelShaderCacheInserter inserter; g_ps_disk_cache.OpenAndRead(GetDiskShaderCacheFileName(APIType::D3D, "PS", true, true), inserter); + + PixelShaderCacheInserter uber_inserter; + g_uber_ps_disk_cache.OpenAndRead(GetDiskShaderCacheFileName(APIType::D3D, "UberPS", false, true), + uber_inserter); } void PixelShaderCache::Reload() { g_ps_disk_cache.Sync(); g_ps_disk_cache.Close(); + g_uber_ps_disk_cache.Sync(); + g_uber_ps_disk_cache.Close(); Clear(); if (g_ActiveConfig.bShaderCache) LoadShaderCache(); + + if (g_ActiveConfig.CanPrecompileUberShaders()) + QueueUberShaderCompiles(); } // ONLY to be used during shutdown. 
@@ -522,10 +546,15 @@ void PixelShaderCache::Clear() { for (auto& iter : PixelShaders) iter.second.Destroy(); + for (auto& iter : UberPixelShaders) + iter.second.Destroy(); PixelShaders.clear(); + UberPixelShaders.clear(); last_entry = nullptr; + last_uber_entry = nullptr; last_uid = {}; + last_uber_uid = {}; } // Used in Swap() when AA mode has changed @@ -558,82 +587,249 @@ void PixelShaderCache::Shutdown() Clear(); g_ps_disk_cache.Sync(); g_ps_disk_cache.Close(); + g_uber_ps_disk_cache.Sync(); + g_uber_ps_disk_cache.Close(); } bool PixelShaderCache::SetShader() { - PixelShaderUid uid = GetPixelShaderUid(); + if (g_ActiveConfig.bDisableSpecializedShaders || g_ActiveConfig.bForcePixelUberShaders) + return SetUberShader(); - // Check if the shader is already set - if (last_entry) + PixelShaderUid uid = GetPixelShaderUid(); + if (last_entry && uid == last_uid) { - if (uid == last_uid) - { - GFX_DEBUGGER_PAUSE_AT(NEXT_PIXEL_SHADER_CHANGE, true); - return (last_entry->shader != nullptr); - } + if (last_entry->pending) + return SetUberShader(); + + if (!last_entry->shader) + return false; + + D3D::stateman->SetPixelShader(last_entry->shader); + return true; } - last_uid = uid; - // Check if the shader is already in the cache - PSCache::iterator iter; - iter = PixelShaders.find(uid); + auto iter = PixelShaders.find(uid); if (iter != PixelShaders.end()) { const PSCacheEntry& entry = iter->second; + if (entry.pending) + return SetUberShader(); + + last_uid = uid; last_entry = &entry; GFX_DEBUGGER_PAUSE_AT(NEXT_PIXEL_SHADER_CHANGE, true); - return (entry.shader != nullptr); + if (!last_entry->shader) + return false; + + D3D::stateman->SetPixelShader(last_entry->shader); + return true; + } + + // Background compiling? 
+ if (g_ActiveConfig.CanBackgroundCompileShaders()) + { + // Create a pending entry + PSCacheEntry entry; + entry.pending = true; + PixelShaders[uid] = entry; + + // Queue normal shader compiling and use ubershader + g_async_compiler->QueueWorkItem( + g_async_compiler->CreateWorkItem(uid)); + return SetUberShader(); } // Need to compile a new shader + D3DBlob* bytecode = nullptr; ShaderCode code = GeneratePixelShaderCode(APIType::D3D, ShaderHostConfig::GetCurrent(), uid.GetUidData()); - - D3DBlob* pbytecode; - if (!D3D::CompilePixelShader(code.GetBuffer(), &pbytecode)) + D3D::CompilePixelShader(code.GetBuffer(), &bytecode); + if (!InsertByteCode(uid, bytecode->Data(), bytecode->Size())) { - GFX_DEBUGGER_PAUSE_AT(NEXT_ERROR, true); + SAFE_RELEASE(bytecode); return false; } - // Insert the bytecode into the caches - g_ps_disk_cache.Append(uid, pbytecode->Data(), pbytecode->Size()); - - bool success = InsertByteCode(uid, pbytecode->Data(), pbytecode->Size()); - pbytecode->Release(); - - GFX_DEBUGGER_PAUSE_AT(NEXT_PIXEL_SHADER_CHANGE, true); - return success; + g_ps_disk_cache.Append(uid, bytecode->Data(), bytecode->Size()); + return SetShader(); } -bool PixelShaderCache::InsertByteCode(const PixelShaderUid& uid, const void* bytecode, - unsigned int bytecodelen) +bool PixelShaderCache::SetUberShader() { - ID3D11PixelShader* shader = D3D::CreatePixelShaderFromByteCode(bytecode, bytecodelen); - if (shader == nullptr) - return false; + UberShader::PixelShaderUid uid = UberShader::GetPixelShaderUid(); - // TODO: Somehow make the debug name a bit more specific - D3D::SetDebugObjectName((ID3D11DeviceChild*)shader, "a pixel shader of PixelShaderCache"); - - // Make an entry in the table - PSCacheEntry newentry; - newentry.shader = shader; - PixelShaders[uid] = newentry; - last_entry = &PixelShaders[uid]; - - if (!shader) + if (last_uber_entry && last_uber_uid == uid) { - // INCSTAT(stats.numPixelShadersFailed); + if (!last_uber_entry->shader) + return false; + + 
D3D::stateman->SetPixelShader(last_uber_entry->shader); + return true; + } + + auto iter = UberPixelShaders.find(uid); + if (iter != UberPixelShaders.end()) + { + const PSCacheEntry& entry = iter->second; + last_uber_uid = uid; + last_uber_entry = &entry; + + GFX_DEBUGGER_PAUSE_AT(NEXT_PIXEL_SHADER_CHANGE, true); + if (!last_uber_entry->shader) + return false; + + D3D::stateman->SetPixelShader(last_uber_entry->shader); + return true; + } + + D3DBlob* bytecode = nullptr; + ShaderCode code = + UberShader::GenPixelShader(APIType::D3D, ShaderHostConfig::GetCurrent(), uid.GetUidData()); + D3D::CompilePixelShader(code.GetBuffer(), &bytecode); + if (!InsertByteCode(uid, bytecode->Data(), bytecode->Size())) + { + SAFE_RELEASE(bytecode); + return false; + } + + // Lookup map again. + g_uber_ps_disk_cache.Append(uid, bytecode->Data(), bytecode->Size()); + bytecode->Release(); + return SetUberShader(); +} + +bool PixelShaderCache::InsertByteCode(const PixelShaderUid& uid, const u8* data, size_t len) +{ + ID3D11PixelShader* shader = data ? D3D::CreatePixelShaderFromByteCode(data, len) : nullptr; + if (!InsertShader(uid, shader)) + { + SAFE_RELEASE(shader); return false; } - INCSTAT(stats.numPixelShadersCreated); - SETSTAT(stats.numPixelShadersAlive, PixelShaders.size()); return true; } +bool PixelShaderCache::InsertByteCode(const UberShader::PixelShaderUid& uid, const u8* data, + size_t len) +{ + ID3D11PixelShader* shader = data ? 
D3D::CreatePixelShaderFromByteCode(data, len) : nullptr; + if (!InsertShader(uid, shader)) + { + SAFE_RELEASE(shader); + return false; + } + + return true; +} + +bool PixelShaderCache::InsertShader(const PixelShaderUid& uid, ID3D11PixelShader* shader) +{ + auto iter = PixelShaders.find(uid); + if (iter != PixelShaders.end() && !iter->second.pending) + return false; + + PSCacheEntry& newentry = PixelShaders[uid]; + newentry.pending = false; + newentry.shader = shader; + + INCSTAT(stats.numPixelShadersCreated); + SETSTAT(stats.numPixelShadersAlive, PixelShaders.size()); + return (shader != nullptr); +} + +bool PixelShaderCache::InsertShader(const UberShader::PixelShaderUid& uid, + ID3D11PixelShader* shader) +{ + auto iter = UberPixelShaders.find(uid); + if (iter != UberPixelShaders.end() && !iter->second.pending) + return false; + + PSCacheEntry& newentry = UberPixelShaders[uid]; + newentry.pending = false; + newentry.shader = shader; + return (shader != nullptr); +} + +void PixelShaderCache::QueueUberShaderCompiles() +{ + UberShader::EnumeratePixelShaderUids([&](const UberShader::PixelShaderUid& uid) { + if (UberPixelShaders.find(uid) != UberPixelShaders.end()) + return; + + g_async_compiler->QueueWorkItem( + g_async_compiler->CreateWorkItem(uid)); + }); + + g_async_compiler->WaitUntilCompletion([](size_t completed, size_t total) { + Host_UpdateProgressDialog(GetStringT("Compiling shaders...").c_str(), + static_cast(completed), static_cast(total)); + }); + g_async_compiler->RetrieveWorkItems(); + Host_UpdateProgressDialog("", -1, -1); +} + +PixelShaderCache::PixelShaderCompilerWorkItem::PixelShaderCompilerWorkItem( + const PixelShaderUid& uid) +{ + std::memcpy(&m_uid, &uid, sizeof(uid)); +} + +PixelShaderCache::PixelShaderCompilerWorkItem::~PixelShaderCompilerWorkItem() +{ + SAFE_RELEASE(m_bytecode); +} + +bool PixelShaderCache::PixelShaderCompilerWorkItem::Compile() +{ + ShaderCode code = + GeneratePixelShaderCode(APIType::D3D, ShaderHostConfig::GetCurrent(), 
m_uid.GetUidData()); + + if (D3D::CompilePixelShader(code.GetBuffer(), &m_bytecode)) + m_shader = D3D::CreatePixelShaderFromByteCode(m_bytecode); + + return true; +} + +void PixelShaderCache::PixelShaderCompilerWorkItem::Retrieve() +{ + if (InsertShader(m_uid, m_shader)) + g_ps_disk_cache.Append(m_uid, m_bytecode->Data(), m_bytecode->Size()); + else + SAFE_RELEASE(m_shader); +} + +PixelShaderCache::UberPixelShaderCompilerWorkItem::UberPixelShaderCompilerWorkItem( + const UberShader::PixelShaderUid& uid) +{ + std::memcpy(&m_uid, &uid, sizeof(uid)); +} + +PixelShaderCache::UberPixelShaderCompilerWorkItem::~UberPixelShaderCompilerWorkItem() +{ + SAFE_RELEASE(m_bytecode); +} + +bool PixelShaderCache::UberPixelShaderCompilerWorkItem::Compile() +{ + ShaderCode code = + UberShader::GenPixelShader(APIType::D3D, ShaderHostConfig::GetCurrent(), m_uid.GetUidData()); + + if (D3D::CompilePixelShader(code.GetBuffer(), &m_bytecode)) + m_shader = D3D::CreatePixelShaderFromByteCode(m_bytecode); + + return true; +} + +void PixelShaderCache::UberPixelShaderCompilerWorkItem::Retrieve() +{ + if (InsertShader(m_uid, m_shader)) + g_uber_ps_disk_cache.Append(m_uid, m_bytecode->Data(), m_bytecode->Size()); + else + SAFE_RELEASE(m_shader); +} + } // DX11 diff --git a/Source/Core/VideoBackends/D3D/PixelShaderCache.h b/Source/Core/VideoBackends/D3D/PixelShaderCache.h index fb1b79ad6c..fcdd55e67b 100644 --- a/Source/Core/VideoBackends/D3D/PixelShaderCache.h +++ b/Source/Core/VideoBackends/D3D/PixelShaderCache.h @@ -7,10 +7,14 @@ #include #include +#include "VideoCommon/AsyncShaderCompiler.h" #include "VideoCommon/PixelShaderGen.h" +#include "VideoCommon/UberShaderPixel.h" namespace DX11 { +class D3DBlob; + class PixelShaderCache { public: @@ -18,12 +22,15 @@ public: static void Reload(); static void Clear(); static void Shutdown(); - static bool SetShader(); // TODO: Should be renamed to LoadShader - static bool InsertByteCode(const PixelShaderUid& uid, const void* bytecode, - unsigned int 
bytecodelen); + static bool SetShader(); + static bool SetUberShader(); + static bool InsertByteCode(const PixelShaderUid& uid, const u8* data, size_t len); + static bool InsertByteCode(const UberShader::PixelShaderUid& uid, const u8* data, size_t len); + static bool InsertShader(const PixelShaderUid& uid, ID3D11PixelShader* shader); + static bool InsertShader(const UberShader::PixelShaderUid& uid, ID3D11PixelShader* shader); + static void QueueUberShaderCompiles(); - static ID3D11PixelShader* GetActiveShader() { return last_entry->shader; } - static ID3D11Buffer*& GetConstantBuffer(); + static ID3D11Buffer* GetConstantBuffer(); static ID3D11PixelShader* GetColorMatrixProgram(bool multisampled); static ID3D11PixelShader* GetColorCopyProgram(bool multisampled); @@ -40,18 +47,53 @@ private: struct PSCacheEntry { ID3D11PixelShader* shader; + bool pending; - PSCacheEntry() : shader(nullptr) {} + PSCacheEntry() : shader(nullptr), pending(false) {} void Destroy() { SAFE_RELEASE(shader); } }; + class PixelShaderCompilerWorkItem : public VideoCommon::AsyncShaderCompiler::WorkItem + { + public: + PixelShaderCompilerWorkItem(const PixelShaderUid& uid); + ~PixelShaderCompilerWorkItem() override; + + bool Compile() override; + void Retrieve() override; + + private: + PixelShaderUid m_uid; + ID3D11PixelShader* m_shader = nullptr; + D3DBlob* m_bytecode = nullptr; + }; + + class UberPixelShaderCompilerWorkItem : public VideoCommon::AsyncShaderCompiler::WorkItem + { + public: + UberPixelShaderCompilerWorkItem(const UberShader::PixelShaderUid& uid); + ~UberPixelShaderCompilerWorkItem() override; + + bool Compile() override; + void Retrieve() override; + + private: + UberShader::PixelShaderUid m_uid; + ID3D11PixelShader* m_shader = nullptr; + D3DBlob* m_bytecode = nullptr; + }; + typedef std::map PSCache; + typedef std::map UberPSCache; static void LoadShaderCache(); static PSCache PixelShaders; + static UberPSCache UberPixelShaders; static const PSCacheEntry* last_entry; + static 
const PSCacheEntry* last_uber_entry; static PixelShaderUid last_uid; + static UberShader::PixelShaderUid last_uber_uid; }; } // namespace DX11 diff --git a/Source/Core/VideoBackends/D3D/Render.cpp b/Source/Core/VideoBackends/D3D/Render.cpp index d3a5b78cc3..157c2218d7 100644 --- a/Source/Core/VideoBackends/D3D/Render.cpp +++ b/Source/Core/VideoBackends/D3D/Render.cpp @@ -837,6 +837,7 @@ void Renderer::SwapImpl(u32 xfbAddr, u32 fbWidth, u32 fbStride, u32 fbHeight, // Enable configuration changes UpdateActiveConfig(); g_texture_cache->OnConfigChanged(g_ActiveConfig); + VertexShaderCache::RetreiveAsyncShaders(); SetWindowSize(fbStride, fbHeight); @@ -958,10 +959,6 @@ void Renderer::ApplyState() g_ActiveConfig.bEnablePixelLighting ? vertexConstants : nullptr); D3D::stateman->SetVertexConstants(vertexConstants); D3D::stateman->SetGeometryConstants(GeometryShaderCache::GetConstantBuffer()); - - D3D::stateman->SetPixelShader(PixelShaderCache::GetActiveShader()); - D3D::stateman->SetVertexShader(VertexShaderCache::GetActiveShader()); - D3D::stateman->SetGeometryShader(GeometryShaderCache::GetActiveShader()); } void Renderer::RestoreState() diff --git a/Source/Core/VideoBackends/D3D/VertexManager.cpp b/Source/Core/VideoBackends/D3D/VertexManager.cpp index 7bb41a0f42..db27b937df 100644 --- a/Source/Core/VideoBackends/D3D/VertexManager.cpp +++ b/Source/Core/VideoBackends/D3D/VertexManager.cpp @@ -159,7 +159,9 @@ void VertexManager::vFlush() return; } - if (!VertexShaderCache::SetShader()) + D3DVertexFormat* vertex_format = + static_cast(VertexLoaderManager::GetCurrentVertexFormat()); + if (!VertexShaderCache::SetShader(vertex_format)) { GFX_DEBUGGER_PAUSE_LOG_AT(NEXT_ERROR, true, { printf("Fail to set pixel shader\n"); }); return; @@ -182,7 +184,6 @@ void VertexManager::vFlush() PrepareDrawBuffers(stride); - VertexLoaderManager::GetCurrentVertexFormat()->SetupVertexPointers(); g_renderer->ApplyState(); Draw(stride); diff --git a/Source/Core/VideoBackends/D3D/VertexManager.h 
b/Source/Core/VideoBackends/D3D/VertexManager.h index 3bb444a3b4..7d4d8b66c2 100644 --- a/Source/Core/VideoBackends/D3D/VertexManager.h +++ b/Source/Core/VideoBackends/D3D/VertexManager.h @@ -4,13 +4,30 @@ #pragma once +#include #include +#include "VideoCommon/NativeVertexFormat.h" #include "VideoCommon/VertexManagerBase.h" struct ID3D11Buffer; namespace DX11 { +class D3DBlob; +class D3DVertexFormat : public NativeVertexFormat +{ +public: + D3DVertexFormat(const PortableVertexDeclaration& vtx_decl); + ~D3DVertexFormat(); + void SetInputLayout(D3DBlob* vs_bytecode); + +private: + std::array m_elems{}; + UINT m_num_elems = 0; + + ID3D11InputLayout* m_layout = nullptr; +}; + class VertexManager : public VertexManagerBase { public: diff --git a/Source/Core/VideoBackends/D3D/VertexShaderCache.cpp b/Source/Core/VideoBackends/D3D/VertexShaderCache.cpp index 023caedba0..0c56deeb4a 100644 --- a/Source/Core/VideoBackends/D3D/VertexShaderCache.cpp +++ b/Source/Core/VideoBackends/D3D/VertexShaderCache.cpp @@ -8,23 +8,32 @@ #include "Common/CommonTypes.h" #include "Common/FileUtil.h" #include "Common/LinearDiskCache.h" +#include "Common/MsgHandler.h" #include "Common/StringUtil.h" #include "Core/ConfigManager.h" +#include "Core/Host.h" #include "VideoBackends/D3D/D3DShader.h" +#include "VideoBackends/D3D/D3DState.h" +#include "VideoBackends/D3D/VertexManager.h" #include "VideoBackends/D3D/VertexShaderCache.h" #include "VideoCommon/Debugger.h" #include "VideoCommon/Statistics.h" +#include "VideoCommon/UberShaderVertex.h" +#include "VideoCommon/VertexLoaderManager.h" #include "VideoCommon/VertexShaderGen.h" #include "VideoCommon/VertexShaderManager.h" namespace DX11 { VertexShaderCache::VSCache VertexShaderCache::vshaders; +VertexShaderCache::UberVSCache VertexShaderCache::ubervshaders; const VertexShaderCache::VSCacheEntry* VertexShaderCache::last_entry; +const VertexShaderCache::VSCacheEntry* VertexShaderCache::last_uber_entry; VertexShaderUid VertexShaderCache::last_uid; 
+UberShader::VertexShaderUid VertexShaderCache::last_uber_uid; static ID3D11VertexShader* SimpleVertexShader = nullptr; static ID3D11VertexShader* ClearVertexShader = nullptr; @@ -32,6 +41,8 @@ static ID3D11InputLayout* SimpleLayout = nullptr; static ID3D11InputLayout* ClearLayout = nullptr; LinearDiskCache g_vs_disk_cache; +LinearDiskCache g_uber_vs_disk_cache; +std::unique_ptr g_async_compiler; ID3D11VertexShader* VertexShaderCache::GetSimpleVertexShader() { @@ -70,10 +81,11 @@ ID3D11Buffer*& VertexShaderCache::GetConstantBuffer() } // this class will load the precompiled shaders into our cache -class VertexShaderCacheInserter : public LinearDiskCacheReader +template +class VertexShaderCacheInserter : public LinearDiskCacheReader { public: - void Read(const VertexShaderUid& key, const u8* value, u32 value_size) + void Read(const UidType& key, const u8* value, u32 value_size) { D3DBlob* blob = new D3DBlob(value_size, value); VertexShaderCache::InsertByteCode(key, blob); @@ -160,36 +172,66 @@ void VertexShaderCache::Init() if (g_ActiveConfig.bShaderCache) LoadShaderCache(); + + g_async_compiler = std::make_unique(); + g_async_compiler->ResizeWorkerThreads(g_ActiveConfig.CanPrecompileUberShaders() ? 
+ g_ActiveConfig.GetShaderPrecompilerThreads() : + g_ActiveConfig.GetShaderCompilerThreads()); + + if (g_ActiveConfig.CanPrecompileUberShaders()) + QueueUberShaderCompiles(); } void VertexShaderCache::LoadShaderCache() { - VertexShaderCacheInserter inserter; + VertexShaderCacheInserter inserter; g_vs_disk_cache.OpenAndRead(GetDiskShaderCacheFileName(APIType::D3D, "VS", true, true), inserter); + + VertexShaderCacheInserter uber_inserter; + g_uber_vs_disk_cache.OpenAndRead(GetDiskShaderCacheFileName(APIType::D3D, "UberVS", false, true), + uber_inserter); } void VertexShaderCache::Reload() { + g_async_compiler->WaitUntilCompletion(); + g_async_compiler->RetrieveWorkItems(); + g_vs_disk_cache.Sync(); g_vs_disk_cache.Close(); + g_uber_vs_disk_cache.Sync(); + g_uber_vs_disk_cache.Close(); Clear(); if (g_ActiveConfig.bShaderCache) LoadShaderCache(); + + if (g_ActiveConfig.CanPrecompileUberShaders()) + QueueUberShaderCompiles(); } void VertexShaderCache::Clear() { for (auto& iter : vshaders) iter.second.Destroy(); + for (auto& iter : ubervshaders) + iter.second.Destroy(); vshaders.clear(); + ubervshaders.clear(); - last_entry = nullptr; last_uid = {}; + last_uber_uid = {}; + last_entry = nullptr; + last_uber_entry = nullptr; + last_uid = {}; + last_uber_uid = {}; } void VertexShaderCache::Shutdown() { + g_async_compiler->StopWorkerThreads(); + g_async_compiler->RetrieveWorkItems(); + SAFE_RELEASE(vscbuf); SAFE_RELEASE(SimpleVertexShader); @@ -201,74 +243,267 @@ void VertexShaderCache::Shutdown() Clear(); g_vs_disk_cache.Sync(); g_vs_disk_cache.Close(); + g_uber_vs_disk_cache.Sync(); + g_uber_vs_disk_cache.Close(); } -bool VertexShaderCache::SetShader() +bool VertexShaderCache::SetShader(D3DVertexFormat* vertex_format) { - VertexShaderUid uid = GetVertexShaderUid(); + if (g_ActiveConfig.bDisableSpecializedShaders || g_ActiveConfig.bForceVertexUberShaders) + return SetUberShader(vertex_format); - if (last_entry) + VertexShaderUid uid = GetVertexShaderUid(); + if (last_entry 
&& uid == last_uid) { - if (uid == last_uid) - { - GFX_DEBUGGER_PAUSE_AT(NEXT_VERTEX_SHADER_CHANGE, true); - return (last_entry->shader != nullptr); - } + if (last_entry->pending) + return SetUberShader(vertex_format); + + if (!last_entry->shader) + return false; + + vertex_format->SetInputLayout(last_entry->bytecode); + D3D::stateman->SetVertexShader(last_entry->shader); + return true; } - last_uid = uid; - - VSCache::iterator iter = vshaders.find(uid); + auto iter = vshaders.find(uid); if (iter != vshaders.end()) { const VSCacheEntry& entry = iter->second; + if (entry.pending) + return SetUberShader(vertex_format); + + last_uid = uid; last_entry = &entry; GFX_DEBUGGER_PAUSE_AT(NEXT_VERTEX_SHADER_CHANGE, true); - return (entry.shader != nullptr); + if (!last_entry->shader) + return false; + + vertex_format->SetInputLayout(last_entry->bytecode); + D3D::stateman->SetVertexShader(last_entry->shader); + return true; } + // Background compiling? + if (g_ActiveConfig.CanBackgroundCompileShaders()) + { + // Create a pending entry + VSCacheEntry entry; + entry.pending = true; + vshaders[uid] = entry; + + // Queue normal shader compiling and use ubershader + g_async_compiler->QueueWorkItem( + g_async_compiler->CreateWorkItem(uid)); + return SetUberShader(vertex_format); + } + + // Need to compile a new shader + D3DBlob* bytecode = nullptr; ShaderCode code = GenerateVertexShaderCode(APIType::D3D, ShaderHostConfig::GetCurrent(), uid.GetUidData()); - - D3DBlob* pbytecode = nullptr; - D3D::CompileVertexShader(code.GetBuffer(), &pbytecode); - - if (pbytecode == nullptr) + D3D::CompileVertexShader(code.GetBuffer(), &bytecode); + if (!InsertByteCode(uid, bytecode)) { - GFX_DEBUGGER_PAUSE_AT(NEXT_ERROR, true); + SAFE_RELEASE(bytecode); return false; } - g_vs_disk_cache.Append(uid, pbytecode->Data(), pbytecode->Size()); - bool success = InsertByteCode(uid, pbytecode); - pbytecode->Release(); - - GFX_DEBUGGER_PAUSE_AT(NEXT_VERTEX_SHADER_CHANGE, true); - return success; + 
g_vs_disk_cache.Append(uid, bytecode->Data(), bytecode->Size()); + bytecode->Release(); + return SetShader(vertex_format); } -bool VertexShaderCache::InsertByteCode(const VertexShaderUid& uid, D3DBlob* bcodeblob) +bool VertexShaderCache::SetUberShader(D3DVertexFormat* vertex_format) { - ID3D11VertexShader* shader = D3D::CreateVertexShaderFromByteCode(bcodeblob); - if (shader == nullptr) + D3DVertexFormat* uber_vertex_format = static_cast( + VertexLoaderManager::GetUberVertexFormat(vertex_format->GetVertexDeclaration())); + UberShader::VertexShaderUid uid = UberShader::GetVertexShaderUid(); + if (last_uber_entry && last_uber_uid == uid) + { + if (!last_uber_entry->shader) + return false; + + uber_vertex_format->SetInputLayout(last_uber_entry->bytecode); + D3D::stateman->SetVertexShader(last_uber_entry->shader); + return true; + } + + auto iter = ubervshaders.find(uid); + if (iter != ubervshaders.end()) + { + const VSCacheEntry& entry = iter->second; + last_uber_uid = uid; + last_uber_entry = &entry; + + GFX_DEBUGGER_PAUSE_AT(NEXT_VERTEX_SHADER_CHANGE, true); + if (!last_uber_entry->shader) + return false; + + uber_vertex_format->SetInputLayout(last_uber_entry->bytecode); + D3D::stateman->SetVertexShader(last_uber_entry->shader); + return true; + } + + // Need to compile a new shader + D3DBlob* bytecode = nullptr; + ShaderCode code = + UberShader::GenVertexShader(APIType::D3D, ShaderHostConfig::GetCurrent(), uid.GetUidData()); + D3D::CompileVertexShader(code.GetBuffer(), &bytecode); + if (!InsertByteCode(uid, bytecode)) + { + SAFE_RELEASE(bytecode); + return false; + } + + g_uber_vs_disk_cache.Append(uid, bytecode->Data(), bytecode->Size()); + bytecode->Release(); + return SetUberShader(vertex_format); +} + +bool VertexShaderCache::InsertByteCode(const VertexShaderUid& uid, D3DBlob* blob) +{ + ID3D11VertexShader* shader = + blob ? 
D3D::CreateVertexShaderFromByteCode(blob->Data(), blob->Size()) : nullptr; + bool result = InsertShader(uid, shader, blob); + SAFE_RELEASE(shader); + return result; +} + +bool VertexShaderCache::InsertByteCode(const UberShader::VertexShaderUid& uid, D3DBlob* blob) +{ + ID3D11VertexShader* shader = + blob ? D3D::CreateVertexShaderFromByteCode(blob->Data(), blob->Size()) : nullptr; + bool result = InsertShader(uid, shader, blob); + SAFE_RELEASE(shader); + return result; +} + +bool VertexShaderCache::InsertShader(const VertexShaderUid& uid, ID3D11VertexShader* shader, + D3DBlob* blob) +{ + auto iter = vshaders.find(uid); + if (iter != vshaders.end() && !iter->second.pending) return false; - // TODO: Somehow make the debug name a bit more specific - D3D::SetDebugObjectName((ID3D11DeviceChild*)shader, "a vertex shader of VertexShaderCache"); + VSCacheEntry& newentry = vshaders[uid]; + newentry.pending = false; + if (!shader || !blob) + return false; - // Make an entry in the table - VSCacheEntry entry; - entry.shader = shader; - entry.SetByteCode(bcodeblob); + shader->AddRef(); + newentry.SetByteCode(blob); + newentry.shader = shader; - vshaders[uid] = entry; - last_entry = &vshaders[uid]; + INCSTAT(stats.numPixelShadersCreated); + SETSTAT(stats.numPixelShadersAlive, static_cast(vshaders.size())); + return true; +} - INCSTAT(stats.numVertexShadersCreated); - SETSTAT(stats.numVertexShadersAlive, (int)vshaders.size()); +bool VertexShaderCache::InsertShader(const UberShader::VertexShaderUid& uid, + ID3D11VertexShader* shader, D3DBlob* blob) +{ + auto iter = ubervshaders.find(uid); + if (iter != ubervshaders.end() && !iter->second.pending) + return false; + + VSCacheEntry& newentry = ubervshaders[uid]; + newentry.pending = false; + if (!shader || !blob) + return false; + + shader->AddRef(); + newentry.SetByteCode(blob); + newentry.shader = shader; + return true; +} + +void VertexShaderCache::RetreiveAsyncShaders() +{ + g_async_compiler->RetrieveWorkItems(); +} + +void 
VertexShaderCache::QueueUberShaderCompiles() +{ + UberShader::EnumerateVertexShaderUids([&](const UberShader::VertexShaderUid& uid) { + if (ubervshaders.find(uid) != ubervshaders.end()) + return; + + g_async_compiler->QueueWorkItem( + g_async_compiler->CreateWorkItem(uid)); + }); +} + +void VertexShaderCache::WaitForBackgroundCompilesToComplete() +{ + g_async_compiler->WaitUntilCompletion([](size_t completed, size_t total) { + Host_UpdateProgressDialog(GetStringT("Compiling shaders...").c_str(), + static_cast(completed), static_cast(total)); + }); + g_async_compiler->RetrieveWorkItems(); + Host_UpdateProgressDialog("", -1, -1); + + // Switch from precompile -> runtime compiler threads. + g_async_compiler->ResizeWorkerThreads(g_ActiveConfig.GetShaderCompilerThreads()); +} + +VertexShaderCache::VertexShaderCompilerWorkItem::VertexShaderCompilerWorkItem( + const VertexShaderUid& uid) +{ + std::memcpy(&m_uid, &uid, sizeof(uid)); +} + +VertexShaderCache::VertexShaderCompilerWorkItem::~VertexShaderCompilerWorkItem() +{ + SAFE_RELEASE(m_bytecode); + SAFE_RELEASE(m_vs); +} + +bool VertexShaderCache::VertexShaderCompilerWorkItem::Compile() +{ + ShaderCode code = + GenerateVertexShaderCode(APIType::D3D, ShaderHostConfig::GetCurrent(), m_uid.GetUidData()); + + if (D3D::CompileVertexShader(code.GetBuffer(), &m_bytecode)) + m_vs = D3D::CreateVertexShaderFromByteCode(m_bytecode); return true; } +void VertexShaderCache::VertexShaderCompilerWorkItem::Retrieve() +{ + if (InsertShader(m_uid, m_vs, m_bytecode)) + g_vs_disk_cache.Append(m_uid, m_bytecode->Data(), m_bytecode->Size()); +} + +VertexShaderCache::UberVertexShaderCompilerWorkItem::UberVertexShaderCompilerWorkItem( + const UberShader::VertexShaderUid& uid) +{ + std::memcpy(&m_uid, &uid, sizeof(uid)); +} + +VertexShaderCache::UberVertexShaderCompilerWorkItem::~UberVertexShaderCompilerWorkItem() +{ + SAFE_RELEASE(m_bytecode); + SAFE_RELEASE(m_vs); +} + +bool VertexShaderCache::UberVertexShaderCompilerWorkItem::Compile() +{ + 
ShaderCode code = + UberShader::GenVertexShader(APIType::D3D, ShaderHostConfig::GetCurrent(), m_uid.GetUidData()); + + if (D3D::CompileVertexShader(code.GetBuffer(), &m_bytecode)) + m_vs = D3D::CreateVertexShaderFromByteCode(m_bytecode); + + return true; +} + +void VertexShaderCache::UberVertexShaderCompilerWorkItem::Retrieve() +{ + if (InsertShader(m_uid, m_vs, m_bytecode)) + g_uber_vs_disk_cache.Append(m_uid, m_bytecode->Data(), m_bytecode->Size()); +} + } // namespace DX11 diff --git a/Source/Core/VideoBackends/D3D/VertexShaderCache.h b/Source/Core/VideoBackends/D3D/VertexShaderCache.h index 514e807192..dd3b07afa0 100644 --- a/Source/Core/VideoBackends/D3D/VertexShaderCache.h +++ b/Source/Core/VideoBackends/D3D/VertexShaderCache.h @@ -9,10 +9,14 @@ #include "VideoBackends/D3D/D3DBase.h" #include "VideoBackends/D3D/D3DBlob.h" +#include "VideoCommon/AsyncShaderCompiler.h" +#include "VideoCommon/UberShaderVertex.h" #include "VideoCommon/VertexShaderGen.h" namespace DX11 { +class D3DVertexFormat; + class VertexShaderCache { public: @@ -20,10 +24,12 @@ public: static void Reload(); static void Clear(); static void Shutdown(); - static bool SetShader(); // TODO: Should be renamed to LoadShader + static bool SetShader(D3DVertexFormat* vertex_format); + static bool SetUberShader(D3DVertexFormat* vertex_format); + static void RetreiveAsyncShaders(); + static void QueueUberShaderCompiles(); + static void WaitForBackgroundCompilesToComplete(); - static ID3D11VertexShader* GetActiveShader() { return last_entry->shader; } - static D3DBlob* GetActiveShaderBytecode() { return last_entry->bytecode; } static ID3D11Buffer*& GetConstantBuffer(); static ID3D11VertexShader* GetSimpleVertexShader(); @@ -31,15 +37,20 @@ public: static ID3D11InputLayout* GetSimpleInputLayout(); static ID3D11InputLayout* GetClearInputLayout(); - static bool InsertByteCode(const VertexShaderUid& uid, D3DBlob* bcodeblob); + static bool InsertByteCode(const VertexShaderUid& uid, D3DBlob* blob); + static 
bool InsertByteCode(const UberShader::VertexShaderUid& uid, D3DBlob* blob); + static bool InsertShader(const VertexShaderUid& uid, ID3D11VertexShader* shader, D3DBlob* blob); + static bool InsertShader(const UberShader::VertexShaderUid& uid, ID3D11VertexShader* shader, + D3DBlob* blob); private: struct VSCacheEntry { ID3D11VertexShader* shader; D3DBlob* bytecode; // needed to initialize the input layout + bool pending; - VSCacheEntry() : shader(nullptr), bytecode(nullptr) {} + VSCacheEntry() : shader(nullptr), bytecode(nullptr), pending(false) {} void SetByteCode(D3DBlob* blob) { SAFE_RELEASE(bytecode); @@ -52,13 +63,49 @@ private: SAFE_RELEASE(bytecode); } }; + + class VertexShaderCompilerWorkItem : public VideoCommon::AsyncShaderCompiler::WorkItem + { + public: + VertexShaderCompilerWorkItem(const VertexShaderUid& uid); + ~VertexShaderCompilerWorkItem() override; + + bool Compile() override; + void Retrieve() override; + + private: + VertexShaderUid m_uid; + D3DBlob* m_bytecode = nullptr; + ID3D11VertexShader* m_vs = nullptr; + }; + + class UberVertexShaderCompilerWorkItem : public VideoCommon::AsyncShaderCompiler::WorkItem + { + public: + UberVertexShaderCompilerWorkItem(const UberShader::VertexShaderUid& uid); + ~UberVertexShaderCompilerWorkItem() override; + + bool Compile() override; + void Retrieve() override; + + private: + UberShader::VertexShaderUid m_uid; + D3DBlob* m_bytecode = nullptr; + ID3D11VertexShader* m_vs = nullptr; + }; + typedef std::map VSCache; + typedef std::map UberVSCache; static void LoadShaderCache(); + static void SetInputLayout(); static VSCache vshaders; + static UberVSCache ubervshaders; static const VSCacheEntry* last_entry; + static const VSCacheEntry* last_uber_entry; static VertexShaderUid last_uid; + static UberShader::VertexShaderUid last_uber_uid; }; } // namespace DX11 diff --git a/Source/Core/VideoBackends/D3D/main.cpp b/Source/Core/VideoBackends/D3D/main.cpp index 5cc05f2af9..ffa15a601c 100644 --- 
a/Source/Core/VideoBackends/D3D/main.cpp +++ b/Source/Core/VideoBackends/D3D/main.cpp @@ -78,6 +78,8 @@ void VideoBackend::InitBackendInfo() g_Config.backend_info.bSupportsInternalResolutionFrameDumps = false; g_Config.backend_info.bSupportsGPUTextureDecoding = false; g_Config.backend_info.bSupportsST3CTextures = false; + g_Config.backend_info.bSupportsBitfield = false; + g_Config.backend_info.bSupportsDynamicSamplerIndexing = false; IDXGIFactory* factory; IDXGIAdapter* ad; @@ -159,6 +161,7 @@ void VideoBackend::Video_Prepare() VertexShaderCache::Init(); PixelShaderCache::Init(); GeometryShaderCache::Init(); + VertexShaderCache::WaitForBackgroundCompilesToComplete(); D3D::InitUtils(); BBox::Init(); } diff --git a/Source/Core/VideoBackends/Null/VertexManager.cpp b/Source/Core/VideoBackends/Null/VertexManager.cpp index be2a8f0af5..60eaf83aa3 100644 --- a/Source/Core/VideoBackends/Null/VertexManager.cpp +++ b/Source/Core/VideoBackends/Null/VertexManager.cpp @@ -16,7 +16,6 @@ class NullNativeVertexFormat : public NativeVertexFormat { public: NullNativeVertexFormat() {} - void SetupVertexPointers() override {} }; std::unique_ptr diff --git a/Source/Core/VideoBackends/OGL/NativeVertexFormat.cpp b/Source/Core/VideoBackends/OGL/NativeVertexFormat.cpp index 69a6229741..816cdf3754 100644 --- a/Source/Core/VideoBackends/OGL/NativeVertexFormat.cpp +++ b/Source/Core/VideoBackends/OGL/NativeVertexFormat.cpp @@ -57,6 +57,7 @@ GLVertexFormat::GLVertexFormat(const PortableVertexDeclaration& _vtx_decl) glGenVertexArrays(1, &VAO); glBindVertexArray(VAO); + ProgramShaderCache::BindVertexFormat(this); // the element buffer is bound directly to the vao, so we must it set for every vao glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, vm->m_index_buffers); @@ -74,16 +75,10 @@ GLVertexFormat::GLVertexFormat(const PortableVertexDeclaration& _vtx_decl) SetPointer(SHADER_TEXTURE0_ATTRIB + i, vertex_stride, _vtx_decl.texcoords[i]); SetPointer(SHADER_POSMTX_ATTRIB, vertex_stride, _vtx_decl.posmtx); - - 
vm->m_last_vao = VAO; } GLVertexFormat::~GLVertexFormat() { glDeleteVertexArrays(1, &VAO); } - -void GLVertexFormat::SetupVertexPointers() -{ -} } diff --git a/Source/Core/VideoBackends/OGL/ProgramShaderCache.cpp b/Source/Core/VideoBackends/OGL/ProgramShaderCache.cpp index 67e335f515..57d9705a7c 100644 --- a/Source/Core/VideoBackends/OGL/ProgramShaderCache.cpp +++ b/Source/Core/VideoBackends/OGL/ProgramShaderCache.cpp @@ -4,45 +4,62 @@ #include "VideoBackends/OGL/ProgramShaderCache.h" +#include #include #include #include "Common/Align.h" #include "Common/CommonTypes.h" #include "Common/FileUtil.h" +#include "Common/GL/GLInterfaceBase.h" #include "Common/Logging/Log.h" #include "Common/MsgHandler.h" #include "Common/StringUtil.h" +#include "Common/Timer.h" #include "Core/ConfigManager.h" +#include "Core/Host.h" #include "VideoBackends/OGL/Render.h" #include "VideoBackends/OGL/StreamBuffer.h" +#include "VideoBackends/OGL/VertexManager.h" +#include "VideoCommon/AsyncShaderCompiler.h" #include "VideoCommon/Debugger.h" +#include "VideoCommon/DriverDetails.h" #include "VideoCommon/GeometryShaderManager.h" #include "VideoCommon/ImageWrite.h" #include "VideoCommon/PixelShaderManager.h" #include "VideoCommon/Statistics.h" +#include "VideoCommon/UberShaderPixel.h" +#include "VideoCommon/UberShaderVertex.h" +#include "VideoCommon/VertexLoaderManager.h" #include "VideoCommon/VertexShaderManager.h" #include "VideoCommon/VideoCommon.h" namespace OGL { -static const u32 UBO_LENGTH = 32 * 1024 * 1024; +static constexpr u32 UBO_LENGTH = 32 * 1024 * 1024; +static constexpr u32 INVALID_VAO = std::numeric_limits::max(); +std::unique_ptr + ProgramShaderCache::s_async_compiler; u32 ProgramShaderCache::s_ubo_buffer_size; s32 ProgramShaderCache::s_ubo_align; +u32 ProgramShaderCache::s_last_VAO = INVALID_VAO; static std::unique_ptr s_buffer; static int num_failures = 0; -static LinearDiskCache g_program_disk_cache; +static LinearDiskCache s_program_disk_cache; +static LinearDiskCache 
s_uber_program_disk_cache; static GLuint CurrentProgram = 0; ProgramShaderCache::PCache ProgramShaderCache::pshaders; +ProgramShaderCache::UberPCache ProgramShaderCache::ubershaders; ProgramShaderCache::PCacheEntry* ProgramShaderCache::last_entry; +ProgramShaderCache::PCacheEntry* ProgramShaderCache::last_uber_entry; SHADERUID ProgramShaderCache::last_uid; - +UBERSHADERUID ProgramShaderCache::last_uber_uid; static std::string s_glsl_header = ""; static std::string GetGLSLVersionString() @@ -85,6 +102,7 @@ void SHADER::SetProgramVariables() GLint PSBlock_id = glGetUniformBlockIndex(glprogid, "PSBlock"); GLint VSBlock_id = glGetUniformBlockIndex(glprogid, "VSBlock"); GLint GSBlock_id = glGetUniformBlockIndex(glprogid, "GSBlock"); + GLint UBERBlock_id = glGetUniformBlockIndex(glprogid, "UBERBlock"); if (PSBlock_id != -1) glUniformBlockBinding(glprogid, PSBlock_id, 1); @@ -92,6 +110,8 @@ void SHADER::SetProgramVariables() glUniformBlockBinding(glprogid, VSBlock_id, 2); if (GSBlock_id != -1) glUniformBlockBinding(glprogid, GSBlock_id, 3); + if (UBERBlock_id != -1) + glUniformBlockBinding(glprogid, UBERBlock_id, 4); // Bind Texture Samplers for (int a = 0; a <= 9; ++a) @@ -123,8 +143,8 @@ void SHADER::SetProgramBindings(bool is_compute) glBindAttribLocation(glprogid, SHADER_POSMTX_ATTRIB, "posmtx"); - glBindAttribLocation(glprogid, SHADER_COLOR0_ATTRIB, "color0"); - glBindAttribLocation(glprogid, SHADER_COLOR1_ATTRIB, "color1"); + glBindAttribLocation(glprogid, SHADER_COLOR0_ATTRIB, "rawcolor0"); + glBindAttribLocation(glprogid, SHADER_COLOR1_ATTRIB, "rawcolor1"); glBindAttribLocation(glprogid, SHADER_NORM0_ATTRIB, "rawnorm0"); glBindAttribLocation(glprogid, SHADER_NORM1_ATTRIB, "rawnorm1"); @@ -133,7 +153,7 @@ void SHADER::SetProgramBindings(bool is_compute) for (int i = 0; i < 8; i++) { - std::string attrib_name = StringFromFormat("tex%d", i); + std::string attrib_name = StringFromFormat("rawtex%d", i); glBindAttribLocation(glprogid, SHADER_TEXTURE0_ATTRIB + i, 
attrib_name.c_str()); } } @@ -148,6 +168,25 @@ void SHADER::Bind() const } } +void SHADER::DestroyShaders() +{ + if (vsid) + { + glDeleteShader(vsid); + vsid = 0; + } + if (gsid) + { + glDeleteShader(gsid); + gsid = 0; + } + if (psid) + { + glDeleteShader(psid); + psid = 0; + } +} + void ProgramShaderCache::UploadConstants() { if (PixelShaderManager::dirty || VertexShaderManager::dirty || GeometryShaderManager::dirty) @@ -182,68 +221,136 @@ void ProgramShaderCache::UploadConstants() } } -SHADER* ProgramShaderCache::SetShader(u32 primitive_type) +SHADER* ProgramShaderCache::SetShader(u32 primitive_type, const GLVertexFormat* vertex_format) { + if (g_ActiveConfig.bDisableSpecializedShaders) + return SetUberShader(primitive_type, vertex_format); + SHADERUID uid; - GetShaderId(&uid, primitive_type); + std::memset(&uid, 0, sizeof(uid)); + uid.puid = GetPixelShaderUid(); + uid.vuid = GetVertexShaderUid(); + uid.guid = GetGeometryShaderUid(primitive_type); // Check if the shader is already set - if (last_entry) + if (last_entry && uid == last_uid) { - if (uid == last_uid) - { - GFX_DEBUGGER_PAUSE_AT(NEXT_PIXEL_SHADER_CHANGE, true); - last_entry->shader.Bind(); - return &last_entry->shader; - } + last_entry->shader.Bind(); + BindVertexFormat(vertex_format); + return &last_entry->shader; } - last_uid = uid; - // Check if shader is already in cache - PCache::iterator iter = pshaders.find(uid); + auto iter = pshaders.find(uid); if (iter != pshaders.end()) { PCacheEntry* entry = &iter->second; - last_entry = entry; + if (entry->pending) + return SetUberShader(primitive_type, vertex_format); - GFX_DEBUGGER_PAUSE_AT(NEXT_PIXEL_SHADER_CHANGE, true); + last_uid = uid; + last_entry = entry; + BindVertexFormat(vertex_format); last_entry->shader.Bind(); return &last_entry->shader; } - // Make an entry in the table + // Compile the new shader program. 
PCacheEntry& newentry = pshaders[uid]; - last_entry = &newentry; - newentry.in_cache = 0; + newentry.in_cache = false; + newentry.pending = false; + // Can we background compile this shader? Requires background shader compiling to be enabled, + // and all ubershaders to have been successfully compiled. + if (g_ActiveConfig.CanBackgroundCompileShaders() && !ubershaders.empty() && s_async_compiler) + { + newentry.pending = true; + s_async_compiler->QueueWorkItem(s_async_compiler->CreateWorkItem(uid)); + return SetUberShader(primitive_type, vertex_format); + } + + // Synchronous shader compiling. ShaderHostConfig host_config = ShaderHostConfig::GetCurrent(); - ShaderCode vcode = GenerateVertexShaderCode(APIType::OpenGL, host_config, uid.vuid.GetUidData()); - ShaderCode pcode = GeneratePixelShaderCode(APIType::OpenGL, host_config, uid.puid.GetUidData()); + ShaderCode vcode; + if (!g_ActiveConfig.bForceVertexUberShaders) + vcode = GenerateVertexShaderCode(APIType::OpenGL, host_config, uid.vuid.GetUidData()); + else + vcode = UberShader::GenVertexShader(APIType::OpenGL, host_config, + UberShader::GetVertexShaderUid().GetUidData()); + ShaderCode pcode; + if (!g_ActiveConfig.bForcePixelUberShaders) + pcode = GeneratePixelShaderCode(APIType::OpenGL, host_config, uid.puid.GetUidData()); + else + pcode = UberShader::GenPixelShader(APIType::OpenGL, host_config, + UberShader::GetPixelShaderUid().GetUidData()); + ShaderCode gcode; if (g_ActiveConfig.backend_info.bSupportsGeometryShaders && !uid.guid.GetUidData()->IsPassthrough()) gcode = GenerateGeometryShaderCode(APIType::OpenGL, host_config, uid.guid.GetUidData()); -#if defined(_DEBUG) || defined(DEBUGFAST) - if (g_ActiveConfig.iLog & CONF_SAVESHADERS) + if (!CompileShader(newentry.shader, vcode.GetBuffer(), pcode.GetBuffer(), gcode.GetBuffer())) + return nullptr; + + INCSTAT(stats.numPixelShadersCreated); + SETSTAT(stats.numPixelShadersAlive, pshaders.size()); + + last_uid = uid; + last_entry = &newentry; + 
BindVertexFormat(vertex_format); + last_entry->shader.Bind(); + return &last_entry->shader; +} + +SHADER* ProgramShaderCache::SetUberShader(u32 primitive_type, const GLVertexFormat* vertex_format) +{ + UBERSHADERUID uid; + std::memset(&uid, 0, sizeof(uid)); + uid.puid = UberShader::GetPixelShaderUid(); + uid.vuid = UberShader::GetVertexShaderUid(); + uid.guid = GetGeometryShaderUid(primitive_type); + + // We need to use the ubershader vertex format with all attributes enabled. + // Otherwise, the NV driver can generate variants for the vertex shaders. + const GLVertexFormat* uber_vertex_format = static_cast( + VertexLoaderManager::GetUberVertexFormat(vertex_format->GetVertexDeclaration())); + + // Check if the shader is already set + if (last_uber_entry && last_uber_uid == uid) { - static int counter = 0; - std::string filename = - StringFromFormat("%svs_%04i.txt", File::GetUserPath(D_DUMP_IDX).c_str(), counter++); - SaveData(filename, vcode.GetBuffer()); - - filename = StringFromFormat("%sps_%04i.txt", File::GetUserPath(D_DUMP_IDX).c_str(), counter++); - SaveData(filename, pcode.GetBuffer()); - - if (!gcode.GetBuffer().empty()) - { - filename = - StringFromFormat("%sgs_%04i.txt", File::GetUserPath(D_DUMP_IDX).c_str(), counter++); - SaveData(filename, gcode.GetBuffer()); - } + BindVertexFormat(uber_vertex_format); + last_uber_entry->shader.Bind(); + return &last_uber_entry->shader; + } + + // Check if shader is already in cache + auto iter = ubershaders.find(uid); + if (iter != ubershaders.end()) + { + PCacheEntry* entry = &iter->second; + last_uber_uid = uid; + last_uber_entry = entry; + BindVertexFormat(uber_vertex_format); + last_uber_entry->shader.Bind(); + return &last_uber_entry->shader; + } + + // Make an entry in the table + PCacheEntry& newentry = ubershaders[uid]; + newentry.in_cache = false; + newentry.pending = false; + + ShaderHostConfig host_config = ShaderHostConfig::GetCurrent(); + ShaderCode vcode = + UberShader::GenVertexShader(APIType::OpenGL, 
host_config, uid.vuid.GetUidData()); + ShaderCode pcode = + UberShader::GenPixelShader(APIType::OpenGL, host_config, uid.puid.GetUidData()); + ShaderCode gcode; + if (g_ActiveConfig.backend_info.bSupportsGeometryShaders && + !uid.guid.GetUidData()->IsPassthrough()) + { + gcode = GenerateGeometryShaderCode(APIType::OpenGL, host_config, uid.guid.GetUidData()); } -#endif if (!CompileShader(newentry.shader, vcode.GetBuffer(), pcode.GetBuffer(), gcode.GetBuffer())) { @@ -251,93 +358,77 @@ SHADER* ProgramShaderCache::SetShader(u32 primitive_type) return nullptr; } - INCSTAT(stats.numPixelShadersCreated); - SETSTAT(stats.numPixelShadersAlive, pshaders.size()); - GFX_DEBUGGER_PAUSE_AT(NEXT_PIXEL_SHADER_CHANGE, true); - - last_entry->shader.Bind(); - return &last_entry->shader; + last_uber_uid = uid; + last_uber_entry = &newentry; + BindVertexFormat(uber_vertex_format); + last_uber_entry->shader.Bind(); + return &last_uber_entry->shader; } bool ProgramShaderCache::CompileShader(SHADER& shader, const std::string& vcode, const std::string& pcode, const std::string& gcode) { - GLuint vsid = CompileSingleShader(GL_VERTEX_SHADER, vcode); - GLuint psid = CompileSingleShader(GL_FRAGMENT_SHADER, pcode); +#if defined(_DEBUG) || defined(DEBUGFAST) + if (g_ActiveConfig.iLog & CONF_SAVESHADERS) + { + static int counter = 0; + std::string filename = + StringFromFormat("%svs_%04i.txt", File::GetUserPath(D_DUMP_IDX).c_str(), counter++); + SaveData(filename, vcode.c_str()); + + filename = StringFromFormat("%sps_%04i.txt", File::GetUserPath(D_DUMP_IDX).c_str(), counter++); + SaveData(filename, pcode.c_str()); + + if (!gcode.empty()) + { + filename = + StringFromFormat("%sgs_%04i.txt", File::GetUserPath(D_DUMP_IDX).c_str(), counter++); + SaveData(filename, gcode.c_str()); + } + } +#endif + + shader.vsid = CompileSingleShader(GL_VERTEX_SHADER, vcode); + shader.psid = CompileSingleShader(GL_FRAGMENT_SHADER, pcode); // Optional geometry shader - GLuint gsid = 0; + shader.gsid = 0; if 
(!gcode.empty()) - gsid = CompileSingleShader(GL_GEOMETRY_SHADER, gcode); + shader.gsid = CompileSingleShader(GL_GEOMETRY_SHADER, gcode); - if (!vsid || !psid || (!gcode.empty() && !gsid)) + if (!shader.vsid || !shader.psid || (!gcode.empty() && !shader.gsid)) { - glDeleteShader(vsid); - glDeleteShader(psid); - glDeleteShader(gsid); + shader.Destroy(); return false; } - GLuint pid = shader.glprogid = glCreateProgram(); + // Create and link the program. + shader.glprogid = glCreateProgram(); - glAttachShader(pid, vsid); - glAttachShader(pid, psid); - if (gsid) - glAttachShader(pid, gsid); + glAttachShader(shader.glprogid, shader.vsid); + glAttachShader(shader.glprogid, shader.psid); + if (shader.gsid) + glAttachShader(shader.glprogid, shader.gsid); if (g_ogl_config.bSupportsGLSLCache) - glProgramParameteri(pid, GL_PROGRAM_BINARY_RETRIEVABLE_HINT, GL_TRUE); + glProgramParameteri(shader.glprogid, GL_PROGRAM_BINARY_RETRIEVABLE_HINT, GL_TRUE); shader.SetProgramBindings(false); - glLinkProgram(pid); + glLinkProgram(shader.glprogid); - // original shaders aren't needed any more - glDeleteShader(vsid); - glDeleteShader(psid); - glDeleteShader(gsid); - - GLint linkStatus; - glGetProgramiv(pid, GL_LINK_STATUS, &linkStatus); - GLsizei length = 0; - glGetProgramiv(pid, GL_INFO_LOG_LENGTH, &length); - if (linkStatus != GL_TRUE || (length > 1 && DEBUG_GLSL)) + if (!CheckProgramLinkResult(shader.glprogid, vcode, pcode, gcode)) { - std::string info_log; - info_log.resize(length); - glGetProgramInfoLog(pid, length, &length, &info_log[0]); - ERROR_LOG(VIDEO, "Program info log:\n%s", info_log.c_str()); - - std::string filename = - StringFromFormat("%sbad_p_%d.txt", File::GetUserPath(D_DUMP_IDX).c_str(), num_failures++); - std::ofstream file; - File::OpenFStream(file, filename, std::ios_base::out); - file << s_glsl_header << vcode << s_glsl_header << pcode; - if (!gcode.empty()) - file << s_glsl_header << gcode; - file << info_log; - file.close(); - - if (linkStatus != GL_TRUE) - { - 
PanicAlert("Failed to link shaders: %s\n" - "Debug info (%s, %s, %s):\n%s", - filename.c_str(), g_ogl_config.gl_vendor, g_ogl_config.gl_renderer, - g_ogl_config.gl_version, info_log.c_str()); - } - } - if (linkStatus != GL_TRUE) - { - // Compile failed - ERROR_LOG(VIDEO, "Program linking failed; see info log"); - // Don't try to use this shader - glDeleteProgram(pid); + shader.Destroy(); return false; } + // For drivers that don't support binding layout, we need to bind it here. shader.SetProgramVariables(); + // Original shaders aren't needed any more. + shader.DestroyShaders(); return true; } @@ -352,63 +443,30 @@ bool ProgramShaderCache::CompileComputeShader(SHADER& shader, const std::string& header = "#extension GL_ARB_compute_shader : enable\n"; } - GLuint shader_id = CompileSingleShader(GL_COMPUTE_SHADER, header + code); + std::string full_code = header + code; + GLuint shader_id = CompileSingleShader(GL_COMPUTE_SHADER, full_code); if (!shader_id) return false; - GLuint pid = shader.glprogid = glCreateProgram(); - glAttachShader(pid, shader_id); - if (g_ogl_config.bSupportsGLSLCache) - glProgramParameteri(pid, GL_PROGRAM_BINARY_RETRIEVABLE_HINT, GL_TRUE); - + shader.glprogid = glCreateProgram(); + glAttachShader(shader.glprogid, shader_id); shader.SetProgramBindings(true); - - glLinkProgram(pid); + glLinkProgram(shader.glprogid); // original shaders aren't needed any more glDeleteShader(shader_id); - GLint linkStatus; - glGetProgramiv(pid, GL_LINK_STATUS, &linkStatus); - GLsizei length = 0; - glGetProgramiv(pid, GL_INFO_LOG_LENGTH, &length); - if (linkStatus != GL_TRUE || (length > 1 && DEBUG_GLSL)) + if (!CheckProgramLinkResult(shader.glprogid, full_code, "", "")) { - std::string info_log; - info_log.resize(length); - glGetProgramInfoLog(pid, length, &length, &info_log[0]); - ERROR_LOG(VIDEO, "Program info log:\n%s", info_log.c_str()); - - std::string filename = - StringFromFormat("%sbad_p_%d.txt", File::GetUserPath(D_DUMP_IDX).c_str(), num_failures++); - 
std::ofstream file; - File::OpenFStream(file, filename, std::ios_base::out); - file << s_glsl_header << code; - file << info_log; - file.close(); - - if (linkStatus != GL_TRUE) - { - PanicAlert("Failed to link shaders: %s\n" - "Debug info (%s, %s, %s):\n%s", - filename.c_str(), g_ogl_config.gl_vendor, g_ogl_config.gl_renderer, - g_ogl_config.gl_version, info_log.c_str()); - } - } - if (linkStatus != GL_TRUE) - { - // Compile failed - ERROR_LOG(VIDEO, "Program linking failed; see info log"); - - // Don't try to use this shader - glDeleteProgram(pid); + shader.Destroy(); return false; } + shader.SetProgramVariables(); return true; } -GLuint ProgramShaderCache::CompileSingleShader(GLuint type, const std::string& code) +GLuint ProgramShaderCache::CompileSingleShader(GLenum type, const std::string& code) { GLuint result = glCreateShader(type); @@ -416,16 +474,29 @@ GLuint ProgramShaderCache::CompileSingleShader(GLuint type, const std::string& c glShaderSource(result, 2, src, nullptr); glCompileShader(result); + + if (!CheckShaderCompileResult(result, type, code)) + { + // Don't try to use this shader + glDeleteShader(result); + return 0; + } + + return result; +} + +bool ProgramShaderCache::CheckShaderCompileResult(GLuint id, GLenum type, const std::string& code) +{ GLint compileStatus; - glGetShaderiv(result, GL_COMPILE_STATUS, &compileStatus); + glGetShaderiv(id, GL_COMPILE_STATUS, &compileStatus); GLsizei length = 0; - glGetShaderiv(result, GL_INFO_LOG_LENGTH, &length); + glGetShaderiv(id, GL_INFO_LOG_LENGTH, &length); if (compileStatus != GL_TRUE || (length > 1 && DEBUG_GLSL)) { std::string info_log; info_log.resize(length); - glGetShaderInfoLog(result, length, &length, &info_log[0]); + glGetShaderInfoLog(id, length, &length, &info_log[0]); const char* prefix = ""; switch (type) @@ -465,20 +536,48 @@ GLuint ProgramShaderCache::CompileSingleShader(GLuint type, const std::string& c { // Compile failed ERROR_LOG(VIDEO, "Shader compilation failed; see info log"); - - // 
Don't try to use this shader - glDeleteShader(result); - return 0; + return false; } - return result; + return true; } -void ProgramShaderCache::GetShaderId(SHADERUID* uid, u32 primitive_type) +bool ProgramShaderCache::CheckProgramLinkResult(GLuint id, const std::string& vcode, + const std::string& pcode, const std::string& gcode) { - uid->puid = GetPixelShaderUid(); - uid->vuid = GetVertexShaderUid(); - uid->guid = GetGeometryShaderUid(primitive_type); + GLint linkStatus; + glGetProgramiv(id, GL_LINK_STATUS, &linkStatus); + GLsizei length = 0; + glGetProgramiv(id, GL_INFO_LOG_LENGTH, &length); + if (linkStatus != GL_TRUE || (length > 1 && DEBUG_GLSL)) + { + std::string info_log; + info_log.resize(length); + glGetProgramInfoLog(id, length, &length, &info_log[0]); + ERROR_LOG(VIDEO, "Program info log:\n%s", info_log.c_str()); + + std::string filename = + StringFromFormat("%sbad_p_%d.txt", File::GetUserPath(D_DUMP_IDX).c_str(), num_failures++); + std::ofstream file; + File::OpenFStream(file, filename, std::ios_base::out); + file << s_glsl_header << vcode << s_glsl_header << pcode; + if (!gcode.empty()) + file << s_glsl_header << gcode; + file << info_log; + file.close(); + + if (linkStatus != GL_TRUE) + { + PanicAlert("Failed to link shaders: %s\n" + "Debug info (%s, %s, %s):\n%s", + filename.c_str(), g_ogl_config.gl_vendor, g_ogl_config.gl_renderer, + g_ogl_config.gl_version, info_log.c_str()); + + return false; + } + } + + return true; } ProgramShaderCache::PCacheEntry ProgramShaderCache::GetShaderProgram() @@ -503,6 +602,14 @@ void ProgramShaderCache::Init() // Then once more to get bytes s_buffer = StreamBuffer::Create(GL_UNIFORM_BUFFER, UBO_LENGTH); + // The GPU shader code appears to be context-specific on Mesa/i965. + // This means that if we compiled the ubershaders asynchronously, they will be recompiled + // on the main thread the first time they are used, causing stutter. 
Nouveau has been + // reported to crash if draw calls are invoked on the shared context threads. For now, + // disable asynchronous compilation on Mesa. + if (!DriverDetails::HasBug(DriverDetails::BUG_SHARED_CONTEXT_SHADER_COMPILATION)) + s_async_compiler = std::make_unique(); + // Read our shader cache, only if supported and enabled if (g_ogl_config.bSupportsGLSLCache && g_ActiveConfig.bShaderCache) LoadProgramBinaries(); @@ -511,36 +618,138 @@ void ProgramShaderCache::Init() CurrentProgram = 0; last_entry = nullptr; + last_uber_entry = nullptr; + + if (g_ActiveConfig.CanPrecompileUberShaders()) + { + if (s_async_compiler) + s_async_compiler->ResizeWorkerThreads(g_ActiveConfig.GetShaderPrecompilerThreads()); + PrecompileUberShaders(); + } + + if (s_async_compiler) + { + // No point using the async compiler without workers. + s_async_compiler->ResizeWorkerThreads(g_ActiveConfig.GetShaderCompilerThreads()); + if (!s_async_compiler->HasWorkerThreads()) + s_async_compiler.reset(); + } +} + +void ProgramShaderCache::RetrieveAsyncShaders() +{ + if (s_async_compiler) + s_async_compiler->RetrieveWorkItems(); } void ProgramShaderCache::Reload() { + if (s_async_compiler) + { + s_async_compiler->WaitUntilCompletion(); + s_async_compiler->RetrieveWorkItems(); + } + const bool use_cache = g_ogl_config.bSupportsGLSLCache && g_ActiveConfig.bShaderCache; if (use_cache) SaveProgramBinaries(); - g_program_disk_cache.Close(); + s_program_disk_cache.Close(); + s_uber_program_disk_cache.Close(); DestroyShaders(); if (use_cache) LoadProgramBinaries(); + if (g_ActiveConfig.CanPrecompileUberShaders()) + PrecompileUberShaders(); + + InvalidateVertexFormat(); CurrentProgram = 0; last_entry = nullptr; + last_uber_entry = nullptr; last_uid = {}; + last_uber_uid = {}; } void ProgramShaderCache::Shutdown() { + if (s_async_compiler) + { + s_async_compiler->WaitUntilCompletion(); + s_async_compiler->StopWorkerThreads(); + s_async_compiler->RetrieveWorkItems(); + s_async_compiler.reset(); + } + 
// store all shaders in cache on disk if (g_ogl_config.bSupportsGLSLCache && g_ActiveConfig.bShaderCache) SaveProgramBinaries(); - g_program_disk_cache.Close(); + s_program_disk_cache.Close(); + s_uber_program_disk_cache.Close(); + InvalidateVertexFormat(); DestroyShaders(); s_buffer.reset(); } +void ProgramShaderCache::BindVertexFormat(const GLVertexFormat* vertex_format) +{ + u32 new_VAO = vertex_format ? vertex_format->VAO : 0; + if (s_last_VAO == new_VAO) + return; + + glBindVertexArray(new_VAO); + s_last_VAO = new_VAO; +} + +void ProgramShaderCache::InvalidateVertexFormat() +{ + s_last_VAO = INVALID_VAO; +} + +void ProgramShaderCache::BindLastVertexFormat() +{ + if (s_last_VAO != INVALID_VAO) + glBindVertexArray(s_last_VAO); + else + glBindVertexArray(0); +} + +GLuint ProgramShaderCache::CreateProgramFromBinary(const u8* value, u32 value_size) +{ + const u8* binary = value + sizeof(GLenum); + GLint binary_size = value_size - sizeof(GLenum); + GLenum prog_format; + std::memcpy(&prog_format, value, sizeof(GLenum)); + + GLuint progid = glCreateProgram(); + glProgramBinary(progid, prog_format, binary, binary_size); + + GLint success; + glGetProgramiv(progid, GL_LINK_STATUS, &success); + if (!success) + { + glDeleteProgram(progid); + return 0; + } + + return progid; +} + +bool ProgramShaderCache::CreateCacheEntryFromBinary(PCacheEntry* entry, const u8* value, + u32 value_size) +{ + entry->in_cache = true; + entry->pending = false; + entry->shader.glprogid = CreateProgramFromBinary(value, value_size); + if (entry->shader.glprogid == 0) + return false; + + entry->shader.SetProgramVariables(); + return true; +} + void ProgramShaderCache::LoadProgramBinaries() { GLint Supported; @@ -553,49 +762,73 @@ void ProgramShaderCache::LoadProgramBinaries() } else { + // Load game-specific shaders. 
std::string cache_filename = GetDiskShaderCacheFileName(APIType::OpenGL, "ProgramBinaries", true, true); - ProgramShaderCacheInserter inserter; - g_program_disk_cache.OpenAndRead(cache_filename, inserter); + ProgramShaderCacheInserter inserter(pshaders); + s_program_disk_cache.OpenAndRead(cache_filename, inserter); + + // Load global ubershaders. + cache_filename = + GetDiskShaderCacheFileName(APIType::OpenGL, "UberProgramBinaries", false, true); + ProgramShaderCacheInserter uber_inserter(ubershaders); + s_uber_program_disk_cache.OpenAndRead(cache_filename, uber_inserter); } SETSTAT(stats.numPixelShadersAlive, pshaders.size()); } -void ProgramShaderCache::SaveProgramBinaries() +static bool GetProgramBinary(const ProgramShaderCache::PCacheEntry& entry, std::vector& data) { - for (auto& entry : pshaders) + // Clear any prior error code + glGetError(); + + GLint link_status = GL_FALSE, delete_status = GL_TRUE, binary_size = 0; + glGetProgramiv(entry.shader.glprogid, GL_LINK_STATUS, &link_status); + glGetProgramiv(entry.shader.glprogid, GL_DELETE_STATUS, &delete_status); + glGetProgramiv(entry.shader.glprogid, GL_PROGRAM_BINARY_LENGTH, &binary_size); + if (glGetError() != GL_NO_ERROR || link_status == GL_FALSE || delete_status == GL_TRUE || + binary_size == 0) { - // Clear any prior error code - glGetError(); - - if (entry.second.in_cache) - { - continue; - } - - GLint link_status = GL_FALSE, delete_status = GL_TRUE, binary_size = 0; - glGetProgramiv(entry.second.shader.glprogid, GL_LINK_STATUS, &link_status); - glGetProgramiv(entry.second.shader.glprogid, GL_DELETE_STATUS, &delete_status); - glGetProgramiv(entry.second.shader.glprogid, GL_PROGRAM_BINARY_LENGTH, &binary_size); - if (glGetError() != GL_NO_ERROR || link_status == GL_FALSE || delete_status == GL_TRUE || - !binary_size) - { - continue; - } - - std::vector data(binary_size + sizeof(GLenum)); - u8* binary = &data[sizeof(GLenum)]; - GLenum* prog_format = (GLenum*)&data[0]; - 
glGetProgramBinary(entry.second.shader.glprogid, binary_size, nullptr, prog_format, binary); - if (glGetError() != GL_NO_ERROR) - { - continue; - } - - g_program_disk_cache.Append(entry.first, &data[0], binary_size + sizeof(GLenum)); + return false; } - g_program_disk_cache.Sync(); + data.resize(binary_size + sizeof(GLenum)); + + GLsizei length = binary_size; + GLenum prog_format; + glGetProgramBinary(entry.shader.glprogid, binary_size, &length, &prog_format, + &data[sizeof(GLenum)]); + if (glGetError() != GL_NO_ERROR) + return false; + + std::memcpy(&data[0], &prog_format, sizeof(prog_format)); + return true; +} + +template +static void SaveProgramBinaryMap(CacheMapType& program_map, DiskCacheType& disk_cache) +{ + std::vector binary_data; + for (auto& entry : program_map) + { + if (entry.second.in_cache || entry.second.pending) + continue; + + // Entry is now in cache (even if it fails, we don't want to try to save it again). + entry.second.in_cache = true; + if (!GetProgramBinary(entry.second, binary_data)) + continue; + + disk_cache.Append(entry.first, &binary_data[0], static_cast(binary_data.size())); + } + + disk_cache.Sync(); +} + +void ProgramShaderCache::SaveProgramBinaries() +{ + SaveProgramBinaryMap(pshaders, s_program_disk_cache); + SaveProgramBinaryMap(ubershaders, s_uber_program_disk_cache); } void ProgramShaderCache::DestroyShaders() @@ -603,10 +836,12 @@ void ProgramShaderCache::DestroyShaders() glUseProgram(0); for (auto& entry : pshaders) - { entry.second.Destroy(); - } pshaders.clear(); + + for (auto& entry : ubershaders) + entry.second.Destroy(); + ubershaders.clear(); } void ProgramShaderCache::CreateHeader() @@ -757,30 +992,307 @@ void ProgramShaderCache::CreateHeader() v >= GLSLES_310 ? 
"precision highp image2DArray;" : ""); } -void ProgramShaderCache::ProgramShaderCacheInserter::Read(const SHADERUID& key, const u8* value, - u32 value_size) +void ProgramShaderCache::PrecompileUberShaders() { - const u8* binary = value + sizeof(GLenum); - GLenum* prog_format = (GLenum*)value; - GLint binary_size = value_size - sizeof(GLenum); + bool success = true; - PCacheEntry entry; - entry.in_cache = 1; - entry.shader.glprogid = glCreateProgram(); - glProgramBinary(entry.shader.glprogid, *prog_format, binary, binary_size); + UberShader::EnumerateVertexShaderUids([&](const UberShader::VertexShaderUid& vuid) { + UberShader::EnumeratePixelShaderUids([&](const UberShader::PixelShaderUid& puid) { + // UIDs must have compatible texgens, a mismatching combination will never be queried. + if (vuid.GetUidData()->num_texgens != puid.GetUidData()->num_texgens) + return; - GLint success; - glGetProgramiv(entry.shader.glprogid, GL_LINK_STATUS, &success); + EnumerateGeometryShaderUids([&](const GeometryShaderUid& guid) { + if (guid.GetUidData()->numTexGens != vuid.GetUidData()->num_texgens) + return; - if (success) + UBERSHADERUID uid; + std::memcpy(&uid.vuid, &vuid, sizeof(uid.vuid)); + std::memcpy(&uid.puid, &puid, sizeof(uid.puid)); + std::memcpy(&uid.guid, &guid, sizeof(uid.guid)); + + // The ubershader may already exist if shader caching is enabled. + if (!success || ubershaders.find(uid) != ubershaders.end()) + return; + + PCacheEntry& entry = ubershaders[uid]; + entry.in_cache = false; + entry.pending = false; + + // Multi-context path? 
+ if (s_async_compiler) + { + entry.pending = true; + s_async_compiler->QueueWorkItem( + s_async_compiler->CreateWorkItem(uid)); + return; + } + + ShaderHostConfig host_config = ShaderHostConfig::GetCurrent(); + ShaderCode vcode = + UberShader::GenVertexShader(APIType::OpenGL, host_config, uid.vuid.GetUidData()); + ShaderCode pcode = + UberShader::GenPixelShader(APIType::OpenGL, host_config, uid.puid.GetUidData()); + ShaderCode gcode; + if (g_ActiveConfig.backend_info.bSupportsGeometryShaders && + !uid.guid.GetUidData()->IsPassthrough()) + { + GenerateGeometryShaderCode(APIType::OpenGL, host_config, uid.guid.GetUidData()); + } + + // Always background compile, even when it's not supported. + // This way hopefully the driver can still compile the shaders in parallel. + if (!CompileShader(entry.shader, vcode.GetBuffer(), pcode.GetBuffer(), gcode.GetBuffer())) + { + // Stop compiling shaders if any of them fail, no point continuing. + success = false; + return; + } + }); + }); + }); + + if (s_async_compiler) { - pshaders[key] = entry; - entry.shader.SetProgramVariables(); + s_async_compiler->WaitUntilCompletion([](size_t completed, size_t total) { + Host_UpdateProgressDialog(GetStringT("Compiling shaders...").c_str(), + static_cast(completed), static_cast(total)); + }); + s_async_compiler->RetrieveWorkItems(); + Host_UpdateProgressDialog("", -1, -1); } - else + + if (!success) { - glDeleteProgram(entry.shader.glprogid); + PanicAlert("One or more ubershaders failed to compile. 
Disabling ubershaders."); + for (auto& it : ubershaders) + it.second.Destroy(); + ubershaders.clear(); } } +bool ProgramShaderCache::SharedContextAsyncShaderCompiler::WorkerThreadInitMainThread(void** param) +{ + SharedContextData* ctx_data = new SharedContextData(); + ctx_data->context = GLInterface->CreateSharedContext(); + if (!ctx_data->context) + { + PanicAlert("Failed to create shared context for shader compiling."); + delete ctx_data; + return false; + } + + *param = ctx_data; + return true; +} + +bool ProgramShaderCache::SharedContextAsyncShaderCompiler::WorkerThreadInitWorkerThread(void* param) +{ + SharedContextData* ctx_data = reinterpret_cast(param); + if (!ctx_data->context->MakeCurrent()) + { + PanicAlert("Failed to make shared context current."); + ctx_data->context->Shutdown(); + delete ctx_data; + return false; + } + + CreatePrerenderArrays(ctx_data); + return true; +} + +void ProgramShaderCache::SharedContextAsyncShaderCompiler::WorkerThreadExit(void* param) +{ + SharedContextData* ctx_data = reinterpret_cast(param); + DestroyPrerenderArrays(ctx_data); + ctx_data->context->Shutdown(); + delete ctx_data; +} + +ProgramShaderCache::ShaderCompileWorkItem::ShaderCompileWorkItem(const SHADERUID& uid) +{ + std::memcpy(&m_uid, &uid, sizeof(m_uid)); +} + +bool ProgramShaderCache::ShaderCompileWorkItem::Compile() +{ + ShaderHostConfig host_config = ShaderHostConfig::GetCurrent(); + ShaderCode vcode = + GenerateVertexShaderCode(APIType::OpenGL, host_config, m_uid.vuid.GetUidData()); + ShaderCode pcode = GeneratePixelShaderCode(APIType::OpenGL, host_config, m_uid.puid.GetUidData()); + ShaderCode gcode; + if (g_ActiveConfig.backend_info.bSupportsGeometryShaders && + !m_uid.guid.GetUidData()->IsPassthrough()) + gcode = GenerateGeometryShaderCode(APIType::OpenGL, host_config, m_uid.guid.GetUidData()); + + CompileShader(m_program, vcode.GetBuffer(), pcode.GetBuffer(), gcode.GetBuffer()); + DrawPrerenderArray(m_program, m_uid.guid.GetUidData()->primitive_type); + 
return true; +} + +void ProgramShaderCache::ShaderCompileWorkItem::Retrieve() +{ + auto iter = pshaders.find(m_uid); + if (iter != pshaders.end() && !iter->second.pending) + { + // Main thread already compiled this shader. + m_program.Destroy(); + return; + } + + PCacheEntry& entry = pshaders[m_uid]; + entry.shader = m_program; + entry.in_cache = false; + entry.pending = false; +} + +ProgramShaderCache::UberShaderCompileWorkItem::UberShaderCompileWorkItem(const UBERSHADERUID& uid) +{ + std::memcpy(&m_uid, &uid, sizeof(m_uid)); +} + +bool ProgramShaderCache::UberShaderCompileWorkItem::Compile() +{ + ShaderHostConfig host_config = ShaderHostConfig::GetCurrent(); + ShaderCode vcode = + UberShader::GenVertexShader(APIType::OpenGL, host_config, m_uid.vuid.GetUidData()); + ShaderCode pcode = + UberShader::GenPixelShader(APIType::OpenGL, host_config, m_uid.puid.GetUidData()); + ShaderCode gcode; + if (g_ActiveConfig.backend_info.bSupportsGeometryShaders && + !m_uid.guid.GetUidData()->IsPassthrough()) + gcode = GenerateGeometryShaderCode(APIType::OpenGL, host_config, m_uid.guid.GetUidData()); + + CompileShader(m_program, vcode.GetBuffer(), pcode.GetBuffer(), gcode.GetBuffer()); + DrawPrerenderArray(m_program, m_uid.guid.GetUidData()->primitive_type); + return true; +} + +void ProgramShaderCache::UberShaderCompileWorkItem::Retrieve() +{ + auto iter = ubershaders.find(m_uid); + if (iter != ubershaders.end() && !iter->second.pending) + { + // Main thread already compiled this shader. + m_program.Destroy(); + return; + } + + PCacheEntry& entry = ubershaders[m_uid]; + entry.shader = m_program; + entry.in_cache = false; + entry.pending = false; +} + +void ProgramShaderCache::CreatePrerenderArrays(SharedContextData* data) +{ + // Create VAO for the prerender vertices. + // We don't use the normal VAO map, since we need to change the VBO pointer. + glGenVertexArrays(1, &data->prerender_VAO); + glBindVertexArray(data->prerender_VAO); + + // Create and populate the prerender VBO. 
We need enough space to draw 3 triangles. + static constexpr float vbo_data[] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + constexpr u32 vbo_stride = sizeof(float) * 3; + glGenBuffers(1, &data->prerender_VBO); + glBindBuffer(GL_ARRAY_BUFFER, data->prerender_VBO); + glBufferData(GL_ARRAY_BUFFER, sizeof(vbo_data), vbo_data, GL_STATIC_DRAW); + + // We only need a position in our prerender vertex. + glEnableVertexAttribArray(SHADER_POSITION_ATTRIB); + glVertexAttribPointer(SHADER_POSITION_ATTRIB, 3, GL_FLOAT, GL_FALSE, vbo_stride, nullptr); + + // The other attributes have to be active to avoid variant generation. + glEnableVertexAttribArray(SHADER_POSMTX_ATTRIB); + glVertexAttribIPointer(SHADER_POSMTX_ATTRIB, 1, GL_UNSIGNED_BYTE, vbo_stride, nullptr); + for (u32 i = 0; i < 3; i++) + { + glEnableVertexAttribArray(SHADER_NORM0_ATTRIB + i); + glVertexAttribPointer(SHADER_NORM0_ATTRIB + i, 3, GL_FLOAT, GL_FALSE, vbo_stride, nullptr); + } + for (u32 i = 0; i < 2; i++) + { + glEnableVertexAttribArray(SHADER_COLOR0_ATTRIB + i); + glVertexAttribPointer(SHADER_COLOR0_ATTRIB + i, 4, GL_UNSIGNED_BYTE, GL_TRUE, vbo_stride, + nullptr); + } + for (u32 i = 0; i < 8; i++) + { + glEnableVertexAttribArray(SHADER_TEXTURE0_ATTRIB + i); + glVertexAttribPointer(SHADER_TEXTURE0_ATTRIB + i, 3, GL_FLOAT, GL_FALSE, vbo_stride, nullptr); + } + + // We need an index buffer to set up the same drawing state on Mesa. + static constexpr u16 ibo_data[] = {0, 1, 2}; + glGenBuffers(1, &data->prerender_IBO); + glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, data->prerender_IBO); + glBufferData(GL_ELEMENT_ARRAY_BUFFER, sizeof(ibo_data), ibo_data, GL_STATIC_DRAW); + + // Mesa also requires the primitive restart state matches? 
+ if (g_ActiveConfig.backend_info.bSupportsPrimitiveRestart) + { + if (GLInterface->GetMode() == GLInterfaceMode::MODE_OPENGLES3) + { + glEnable(GL_PRIMITIVE_RESTART_FIXED_INDEX); + } + else + { + if (GLExtensions::Version() >= 310) + { + glEnable(GL_PRIMITIVE_RESTART); + glPrimitiveRestartIndex(65535); + } + else + { + glEnableClientState(GL_PRIMITIVE_RESTART_NV); + glPrimitiveRestartIndexNV(65535); + } + } + } +} + +void ProgramShaderCache::DestroyPrerenderArrays(SharedContextData* data) +{ + if (data->prerender_VAO) + { + glDeleteVertexArrays(1, &data->prerender_VAO); + data->prerender_VAO = 0; + } + if (data->prerender_VBO) + { + glDeleteBuffers(1, &data->prerender_VBO); + data->prerender_VBO = 0; + } + if (data->prerender_IBO) + { + glDeleteBuffers(1, &data->prerender_IBO); + data->prerender_IBO = 0; + } +} + +void ProgramShaderCache::DrawPrerenderArray(const SHADER& shader, u32 primitive_type) +{ + // This is called on a worker thread, so we don't want to use the normal binding process. + glUseProgram(shader.glprogid); + + // The number of primitives drawn depends on the type. + switch (primitive_type) + { + case PRIMITIVE_POINTS: + glDrawElements(GL_POINTS, 1, GL_UNSIGNED_SHORT, nullptr); + break; + case PRIMITIVE_LINES: + glDrawElements(GL_LINES, 2, GL_UNSIGNED_SHORT, nullptr); + break; + case PRIMITIVE_TRIANGLES: + glDrawElements(GL_TRIANGLES, 3, GL_UNSIGNED_SHORT, nullptr); + break; + } + + // Has to be finished by the time the main thread picks it up. 
+ GLsync sync = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); + glClientWaitSync(sync, GL_SYNC_FLUSH_COMMANDS_BIT, GL_TIMEOUT_IGNORED); + glDeleteSync(sync); +} + } // namespace OGL diff --git a/Source/Core/VideoBackends/OGL/ProgramShaderCache.h b/Source/Core/VideoBackends/OGL/ProgramShaderCache.h index 7b29c24314..61f2f68fdc 100644 --- a/Source/Core/VideoBackends/OGL/ProgramShaderCache.h +++ b/Source/Core/VideoBackends/OGL/ProgramShaderCache.h @@ -4,17 +4,25 @@ #pragma once +#include #include #include "Common/GL/GLUtil.h" #include "Common/LinearDiskCache.h" +#include "VideoCommon/AsyncShaderCompiler.h" #include "VideoCommon/GeometryShaderGen.h" #include "VideoCommon/PixelShaderGen.h" +#include "VideoCommon/UberShaderPixel.h" +#include "VideoCommon/UberShaderVertex.h" #include "VideoCommon/VertexShaderGen.h" +class cInterfaceBase; + namespace OGL { +class GLVertexFormat; + class SHADERUID { public: @@ -24,30 +32,53 @@ public: bool operator<(const SHADERUID& r) const { - return std::tie(puid, vuid, guid) < std::tie(r.puid, r.vuid, r.guid); + return std::tie(vuid, puid, guid) < std::tie(r.vuid, r.puid, r.guid); } bool operator==(const SHADERUID& r) const { - return std::tie(puid, vuid, guid) == std::tie(r.puid, r.vuid, r.guid); + return std::tie(vuid, puid, guid) == std::tie(r.vuid, r.puid, r.guid); + } +}; +class UBERSHADERUID +{ +public: + UberShader::VertexShaderUid vuid; + UberShader::PixelShaderUid puid; + GeometryShaderUid guid; + + bool operator<(const UBERSHADERUID& r) const + { + return std::tie(vuid, puid, guid) < std::tie(r.vuid, r.puid, r.guid); + } + + bool operator==(const UBERSHADERUID& r) const + { + return std::tie(vuid, puid, guid) == std::tie(r.vuid, r.puid, r.guid); } }; struct SHADER { - SHADER() : glprogid(0) {} void Destroy() { - glDeleteProgram(glprogid); - glprogid = 0; + DestroyShaders(); + if (glprogid) + { + glDeleteProgram(glprogid); + glprogid = 0; + } } - GLuint glprogid; // OpenGL program id - std::string strvprog, strpprog, strgprog; 
+ GLuint vsid = 0; + GLuint gsid = 0; + GLuint psid = 0; + GLuint glprogid = 0; void SetProgramVariables(); void SetProgramBindings(bool is_compute); void Bind() const; + void DestroyShaders(); }; class ProgramShaderCache @@ -57,43 +88,126 @@ public: { SHADER shader; bool in_cache; + bool pending; void Destroy() { shader.Destroy(); } }; static PCacheEntry GetShaderProgram(); - static SHADER* SetShader(u32 primitive_type); - static void GetShaderId(SHADERUID* uid, u32 primitive_type); + static SHADER* SetShader(u32 primitive_type, const GLVertexFormat* vertex_format); + static SHADER* SetUberShader(u32 primitive_type, const GLVertexFormat* vertex_format); + static void BindVertexFormat(const GLVertexFormat* vertex_format); + static void InvalidateVertexFormat(); + static void BindLastVertexFormat(); static bool CompileShader(SHADER& shader, const std::string& vcode, const std::string& pcode, const std::string& gcode = ""); static bool CompileComputeShader(SHADER& shader, const std::string& code); - static GLuint CompileSingleShader(GLuint type, const std::string& code); + static GLuint CompileSingleShader(GLenum type, const std::string& code); + static bool CheckShaderCompileResult(GLuint id, GLenum type, const std::string& code); + static bool CheckProgramLinkResult(GLuint id, const std::string& vcode, const std::string& pcode, + const std::string& gcode); static void UploadConstants(); static void Init(); static void Reload(); static void Shutdown(); static void CreateHeader(); + static void RetrieveAsyncShaders(); + static void PrecompileUberShaders(); private: - class ProgramShaderCacheInserter : public LinearDiskCacheReader + template + class ProgramShaderCacheInserter : public LinearDiskCacheReader { public: - void Read(const SHADERUID& key, const u8* value, u32 value_size) override; + ProgramShaderCacheInserter(std::map& shader_map) + : m_shader_map(shader_map) + { + } + + void Read(const UIDType& key, const u8* value, u32 value_size) override + { + if 
(m_shader_map.find(key) != m_shader_map.end()) + return; + + PCacheEntry& entry = m_shader_map[key]; + if (!CreateCacheEntryFromBinary(&entry, value, value_size)) + { + m_shader_map.erase(key); + return; + } + } + + private: + std::map& m_shader_map; }; + class SharedContextAsyncShaderCompiler : public VideoCommon::AsyncShaderCompiler + { + protected: + virtual bool WorkerThreadInitMainThread(void** param) override; + virtual bool WorkerThreadInitWorkerThread(void* param) override; + virtual void WorkerThreadExit(void* param) override; + }; + + struct SharedContextData + { + std::unique_ptr context; + GLuint prerender_VBO; + GLuint prerender_VAO; + GLuint prerender_IBO; + }; + + class ShaderCompileWorkItem : public VideoCommon::AsyncShaderCompiler::WorkItem + { + public: + ShaderCompileWorkItem(const SHADERUID& uid); + + bool Compile() override; + void Retrieve() override; + + private: + SHADERUID m_uid; + SHADER m_program; + }; + + class UberShaderCompileWorkItem : public VideoCommon::AsyncShaderCompiler::WorkItem + { + public: + UberShaderCompileWorkItem(const UBERSHADERUID& uid); + + bool Compile() override; + void Retrieve() override; + + private: + UBERSHADERUID m_uid; + SHADER m_program; + }; + + typedef std::map PCache; + typedef std::map UberPCache; + + static GLuint CreateProgramFromBinary(const u8* value, u32 value_size); + static bool CreateCacheEntryFromBinary(PCacheEntry* entry, const u8* value, u32 value_size); static void LoadProgramBinaries(); static void SaveProgramBinaries(); static void DestroyShaders(); + static void CreatePrerenderArrays(SharedContextData* data); + static void DestroyPrerenderArrays(SharedContextData* data); + static void DrawPrerenderArray(const SHADER& shader, u32 primitive_type); - typedef std::map PCache; static PCache pshaders; + static UberPCache ubershaders; static PCacheEntry* last_entry; + static PCacheEntry* last_uber_entry; static SHADERUID last_uid; + static UBERSHADERUID last_uber_uid; + static std::unique_ptr 
s_async_compiler; static u32 s_ubo_buffer_size; static s32 s_ubo_align; + static u32 s_last_VAO; }; } // namespace OGL diff --git a/Source/Core/VideoBackends/OGL/RasterFont.cpp b/Source/Core/VideoBackends/OGL/RasterFont.cpp index 61cae75e11..9ab8647ce9 100644 --- a/Source/Core/VideoBackends/OGL/RasterFont.cpp +++ b/Source/Core/VideoBackends/OGL/RasterFont.cpp @@ -119,11 +119,11 @@ static const u8 rasters[CHARACTER_COUNT][CHARACTER_HEIGHT] = { static const char* s_vertexShaderSrc = "uniform vec2 charSize;\n" "uniform vec2 offset;" "in vec2 rawpos;\n" - "in vec2 tex0;\n" + "in vec2 rawtex0;\n" "out vec2 uv0;\n" "void main(void) {\n" " gl_Position = vec4(rawpos + offset,0,1);\n" - " uv0 = tex0 * charSize;\n" + " uv0 = rawtex0 * charSize;\n" "}\n"; static const char* s_fragmentShaderSrc = "SAMPLER_BINDING(8) uniform sampler2D samp8;\n" diff --git a/Source/Core/VideoBackends/OGL/Render.cpp b/Source/Core/VideoBackends/OGL/Render.cpp index 4392da994c..b8a14869d0 100644 --- a/Source/Core/VideoBackends/OGL/Render.cpp +++ b/Source/Core/VideoBackends/OGL/Render.cpp @@ -447,6 +447,12 @@ Renderer::Renderer() // Clip distance support is useless without a method to clamp the depth range g_Config.backend_info.bSupportsDepthClamp = GLExtensions::Supports("GL_ARB_depth_clamp"); + // Desktop OpenGL supports bitfield manulipation and dynamic sampler indexing if it supports + // shader5. 
OpenGL ES 3.1 supports it implicitly without an extension + g_Config.backend_info.bSupportsBitfield = GLExtensions::Supports("GL_ARB_gpu_shader5"); + g_Config.backend_info.bSupportsDynamicSamplerIndexing = + GLExtensions::Supports("GL_ARB_gpu_shader5"); + g_ogl_config.bSupportsGLSLCache = GLExtensions::Supports("GL_ARB_get_program_binary"); g_ogl_config.bSupportsGLPinnedMemory = GLExtensions::Supports("GL_AMD_pinned_memory"); g_ogl_config.bSupportsGLSync = GLExtensions::Supports("GL_ARB_sync"); @@ -515,6 +521,8 @@ Renderer::Renderer() g_ogl_config.bSupportsMSAA = true; g_ogl_config.bSupportsTextureStorage = true; g_ogl_config.bSupports2DTextureStorageMultisample = true; + g_Config.backend_info.bSupportsBitfield = true; + g_Config.backend_info.bSupportsDynamicSamplerIndexing = g_ogl_config.bSupportsAEP; if (g_ActiveConfig.iStereoMode > 0 && g_ActiveConfig.iMultisamples > 1 && !g_ogl_config.bSupports3DTextureStorageMultisample) { @@ -542,6 +550,8 @@ Renderer::Renderer() g_ogl_config.bSupportsTextureStorage = true; g_ogl_config.bSupports2DTextureStorageMultisample = true; g_ogl_config.bSupports3DTextureStorageMultisample = true; + g_Config.backend_info.bSupportsBitfield = true; + g_Config.backend_info.bSupportsDynamicSamplerIndexing = true; } } else @@ -1462,6 +1472,7 @@ void Renderer::SwapImpl(u32 xfbAddr, u32 fbWidth, u32 fbStride, u32 fbHeight, // Clean out old stuff from caches. It's not worth it to clean out the shader caches. g_texture_cache->Cleanup(frameCount); + ProgramShaderCache::RetrieveAsyncShaders(); // Render to the framebuffer. 
FramebufferManager::SetFramebuffer(0); @@ -1758,10 +1769,9 @@ void Renderer::RestoreAPIState() SetBlendMode(true); SetViewport(); + ProgramShaderCache::BindLastVertexFormat(); const VertexManager* const vm = static_cast(g_vertex_manager.get()); glBindBuffer(GL_ARRAY_BUFFER, vm->m_vertex_buffers); - if (vm->m_last_vao) - glBindVertexArray(vm->m_last_vao); OGLTexture::SetStage(); } diff --git a/Source/Core/VideoBackends/OGL/Render.h b/Source/Core/VideoBackends/OGL/Render.h index 8ec6a21e0d..e8df3d4301 100644 --- a/Source/Core/VideoBackends/OGL/Render.h +++ b/Source/Core/VideoBackends/OGL/Render.h @@ -58,6 +58,7 @@ struct VideoConfig bool bSupportsConservativeDepth; bool bSupportsImageLoadStore; bool bSupportsAniso; + bool bSupportsBitfield; const char* gl_vendor; const char* gl_renderer; diff --git a/Source/Core/VideoBackends/OGL/VertexManager.cpp b/Source/Core/VideoBackends/OGL/VertexManager.cpp index eb7ce04c92..9f8d87eceb 100644 --- a/Source/Core/VideoBackends/OGL/VertexManager.cpp +++ b/Source/Core/VideoBackends/OGL/VertexManager.cpp @@ -53,8 +53,6 @@ void VertexManager::CreateDeviceObjects() s_indexBuffer = StreamBuffer::Create(GL_ELEMENT_ARRAY_BUFFER, MAX_IBUFFER_SIZE); m_index_buffers = s_indexBuffer->m_buffer; - - m_last_vao = 0; } void VertexManager::DestroyDeviceObjects() @@ -142,22 +140,13 @@ void VertexManager::vFlush() GLVertexFormat* nativeVertexFmt = (GLVertexFormat*)VertexLoaderManager::GetCurrentVertexFormat(); u32 stride = nativeVertexFmt->GetVertexStride(); - if (m_last_vao != nativeVertexFmt->VAO) - { - glBindVertexArray(nativeVertexFmt->VAO); - m_last_vao = nativeVertexFmt->VAO; - } + ProgramShaderCache::SetShader(m_current_primitive_type, nativeVertexFmt); PrepareDrawBuffers(stride); - ProgramShaderCache::SetShader(m_current_primitive_type); - // upload global constants ProgramShaderCache::UploadConstants(); - // setup the pointers - nativeVertexFmt->SetupVertexPointers(); - if (::BoundingBox::active && 
!g_Config.BBoxUseFragmentShaderImplementation()) { glEnable(GL_STENCIL_TEST); @@ -171,24 +160,6 @@ void VertexManager::vFlush() glDisable(GL_STENCIL_TEST); } -#if defined(_DEBUG) || defined(DEBUGFAST) - if (g_ActiveConfig.iLog & CONF_SAVESHADERS) - { - // save the shaders - ProgramShaderCache::PCacheEntry prog = ProgramShaderCache::GetShaderProgram(); - std::string filename = StringFromFormat( - "%sps%.3d.txt", File::GetUserPath(D_DUMPFRAMES_IDX).c_str(), g_ActiveConfig.iSaveTargetId); - std::ofstream fps; - File::OpenFStream(fps, filename, std::ios_base::out); - fps << prog.shader.strpprog; - - filename = StringFromFormat("%svs%.3d.txt", File::GetUserPath(D_DUMPFRAMES_IDX).c_str(), - g_ActiveConfig.iSaveTargetId); - std::ofstream fvs; - File::OpenFStream(fvs, filename, std::ios_base::out); - fvs << prog.shader.strvprog; - } -#endif g_Config.iSaveTargetId++; ClearEFBCache(); } diff --git a/Source/Core/VideoBackends/OGL/VertexManager.h b/Source/Core/VideoBackends/OGL/VertexManager.h index 7ba18b6ecc..6a4599d48e 100644 --- a/Source/Core/VideoBackends/OGL/VertexManager.h +++ b/Source/Core/VideoBackends/OGL/VertexManager.h @@ -20,8 +20,6 @@ public: GLVertexFormat(const PortableVertexDeclaration& vtx_decl); ~GLVertexFormat(); - void SetupVertexPointers() override; - GLuint VAO; }; @@ -42,7 +40,6 @@ public: // NativeVertexFormat use this GLuint m_vertex_buffers; GLuint m_index_buffers; - GLuint m_last_vao; protected: void ResetBuffer(u32 stride) override; diff --git a/Source/Core/VideoBackends/Software/SWVertexLoader.cpp b/Source/Core/VideoBackends/Software/SWVertexLoader.cpp index d81cb4da4f..3976eb7a2c 100644 --- a/Source/Core/VideoBackends/Software/SWVertexLoader.cpp +++ b/Source/Core/VideoBackends/Software/SWVertexLoader.cpp @@ -30,7 +30,6 @@ class NullNativeVertexFormat : public NativeVertexFormat { public: NullNativeVertexFormat(const PortableVertexDeclaration& _vtx_decl) { vtx_decl = _vtx_decl; } - void SetupVertexPointers() override {} }; std::unique_ptr diff 
--git a/Source/Core/VideoBackends/Software/Tev.cpp b/Source/Core/VideoBackends/Software/Tev.cpp index 5cd4491cc9..261b2535d1 100644 --- a/Source/Core/VideoBackends/Software/Tev.cpp +++ b/Source/Core/VideoBackends/Software/Tev.cpp @@ -769,7 +769,7 @@ void Tev::Draw() // - scaling of the "k" coefficient isn't clear either. // First, calculate the offset from the viewport center (normalized to 0..1) - float offset = (Position[0] - (static_cast(bpmem.fogRange.Base.Center) - 342)) / + float offset = (Position[0] - (static_cast(bpmem.fogRange.Base.Center.Value()) - 342)) / static_cast(xfmem.viewport.wd); // Based on that, choose the index such that points which are far away from the z-axis use the diff --git a/Source/Core/VideoBackends/Software/TransformUnit.cpp b/Source/Core/VideoBackends/Software/TransformUnit.cpp index 3babc89fc4..8fff7ded27 100644 --- a/Source/Core/VideoBackends/Software/TransformUnit.cpp +++ b/Source/Core/VideoBackends/Software/TransformUnit.cpp @@ -443,7 +443,7 @@ void TransformTexCoord(const InputVertexData* src, OutputVertexData* dst, bool s dst->texCoords[coordNum].z = 1.0f; break; default: - ERROR_LOG(VIDEO, "Bad tex gen type %i", texinfo.texgentype); + ERROR_LOG(VIDEO, "Bad tex gen type %i", texinfo.texgentype.Value()); } } diff --git a/Source/Core/VideoBackends/Vulkan/ObjectCache.cpp b/Source/Core/VideoBackends/Vulkan/ObjectCache.cpp index 4e856cb0d5..9eb7c0efd9 100644 --- a/Source/Core/VideoBackends/Vulkan/ObjectCache.cpp +++ b/Source/Core/VideoBackends/Vulkan/ObjectCache.cpp @@ -16,6 +16,7 @@ #include "Core/ConfigManager.h" +#include "VideoBackends/Vulkan/CommandBufferManager.h" #include "VideoBackends/Vulkan/ShaderCompiler.h" #include "VideoBackends/Vulkan/StreamBuffer.h" #include "VideoBackends/Vulkan/Util.h" @@ -59,6 +60,19 @@ bool ObjectCache::Initialize() if (!m_utility_shader_vertex_buffer || !m_utility_shader_uniform_buffer) return false; + m_dummy_texture = Texture2D::Create(1, 1, 1, 1, VK_FORMAT_R8G8B8A8_UNORM, 
VK_SAMPLE_COUNT_1_BIT, + VK_IMAGE_VIEW_TYPE_2D_ARRAY, VK_IMAGE_TILING_LINEAR, + VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT); + m_dummy_texture->TransitionToLayout(g_command_buffer_mgr->GetCurrentInitCommandBuffer(), + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL); + VkClearColorValue clear_color = {}; + VkImageSubresourceRange clear_range = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1}; + vkCmdClearColorImage(g_command_buffer_mgr->GetCurrentInitCommandBuffer(), + m_dummy_texture->GetImage(), VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + &clear_color, 1, &clear_range); + m_dummy_texture->TransitionToLayout(g_command_buffer_mgr->GetCurrentInitCommandBuffer(), + VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); + return true; } @@ -99,17 +113,9 @@ bool ObjectCache::CreateDescriptorSetLayouts() {UBO_DESCRIPTOR_SET_BINDING_GS, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC, 1, VK_SHADER_STAGE_GEOMETRY_BIT}}; - // Annoying these have to be split, apparently we can't partially update an array without the - // validation layers throwing a warning. 
static const VkDescriptorSetLayoutBinding sampler_set_bindings[] = { - {0, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, 1, VK_SHADER_STAGE_FRAGMENT_BIT}, - {1, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, 1, VK_SHADER_STAGE_FRAGMENT_BIT}, - {2, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, 1, VK_SHADER_STAGE_FRAGMENT_BIT}, - {3, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, 1, VK_SHADER_STAGE_FRAGMENT_BIT}, - {4, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, 1, VK_SHADER_STAGE_FRAGMENT_BIT}, - {5, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, 1, VK_SHADER_STAGE_FRAGMENT_BIT}, - {6, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, 1, VK_SHADER_STAGE_FRAGMENT_BIT}, - {7, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, 1, VK_SHADER_STAGE_FRAGMENT_BIT}}; + {0, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, static_cast(NUM_PIXEL_SHADER_SAMPLERS), + VK_SHADER_STAGE_FRAGMENT_BIT}}; static const VkDescriptorSetLayoutBinding ssbo_set_bindings[] = { {0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_FRAGMENT_BIT}}; diff --git a/Source/Core/VideoBackends/Vulkan/ObjectCache.h b/Source/Core/VideoBackends/Vulkan/ObjectCache.h index dd7b1ed739..bfcec09166 100644 --- a/Source/Core/VideoBackends/Vulkan/ObjectCache.h +++ b/Source/Core/VideoBackends/Vulkan/ObjectCache.h @@ -15,6 +15,7 @@ #include "Common/LinearDiskCache.h" #include "VideoBackends/Vulkan/Constants.h" +#include "VideoBackends/Vulkan/Texture2D.h" #include "VideoCommon/GeometryShaderGen.h" #include "VideoCommon/PixelShaderGen.h" @@ -62,6 +63,9 @@ public: VkSampler GetLinearSampler() const { return m_linear_sampler; } VkSampler GetSampler(const SamplerState& info); + // Dummy image for samplers that are unbound + Texture2D* GetDummyImage() const { return m_dummy_texture.get(); } + VkImageView GetDummyImageView() const { return m_dummy_texture->GetView(); } // Perform at startup, create descriptor layouts, compiles all static shaders. 
bool Initialize(); @@ -89,6 +93,9 @@ private: VkSampler m_linear_sampler = VK_NULL_HANDLE; std::map m_sampler_cache; + + // Dummy image for samplers that are unbound + std::unique_ptr m_dummy_texture; }; extern std::unique_ptr g_object_cache; diff --git a/Source/Core/VideoBackends/Vulkan/PostProcessing.cpp b/Source/Core/VideoBackends/Vulkan/PostProcessing.cpp index 991117567c..f2a8b3e0d0 100644 --- a/Source/Core/VideoBackends/Vulkan/PostProcessing.cpp +++ b/Source/Core/VideoBackends/Vulkan/PostProcessing.cpp @@ -149,7 +149,7 @@ static const std::string DEFAULT_FRAGMENT_SHADER_SOURCE = R"( static const std::string POSTPROCESSING_SHADER_HEADER = R"( SAMPLER_BINDING(0) uniform sampler2DArray samp0; - SAMPLER_BINDING(1) uniform sampler2D samp1; + SAMPLER_BINDING(1) uniform sampler2DArray samp1; layout(location = 0) in float3 uv0; layout(location = 1) in float4 col0; @@ -176,7 +176,7 @@ static const std::string POSTPROCESSING_SHADER_HEADER = R"( float4 SampleFontLocation(float2 location) { - return texture(samp1, location); + return texture(samp1, float3(location, 0.0)); } float2 GetResolution() diff --git a/Source/Core/VideoBackends/Vulkan/RasterFont.cpp b/Source/Core/VideoBackends/Vulkan/RasterFont.cpp index def32e5050..97ab4dd6d8 100644 --- a/Source/Core/VideoBackends/Vulkan/RasterFont.cpp +++ b/Source/Core/VideoBackends/Vulkan/RasterFont.cpp @@ -150,7 +150,7 @@ layout(std140, push_constant) uniform PCBlock { vec4 color; } PC; -layout(set = 1, binding = 0) uniform sampler2D samp0; +layout(set = 1, binding = 0) uniform sampler2DArray samp0; layout(location = 0) in vec2 uv0; @@ -158,7 +158,7 @@ layout(location = 0) out vec4 ocol0; void main() { - ocol0 = texture(samp0, uv0) * PC.color; + ocol0 = texture(samp0, float3(uv0, 0.0)) * PC.color; } )"; @@ -209,7 +209,7 @@ bool RasterFont::CreateTexture() // create the actual texture object m_texture = Texture2D::Create(CHARACTER_WIDTH * CHARACTER_COUNT, CHARACTER_HEIGHT, 1, 1, VK_FORMAT_R8G8B8A8_UNORM, VK_SAMPLE_COUNT_1_BIT, 
- VK_IMAGE_VIEW_TYPE_2D, VK_IMAGE_TILING_OPTIMAL, + VK_IMAGE_VIEW_TYPE_2D_ARRAY, VK_IMAGE_TILING_OPTIMAL, VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_SAMPLED_BIT); if (!m_texture) return false; diff --git a/Source/Core/VideoBackends/Vulkan/Renderer.cpp b/Source/Core/VideoBackends/Vulkan/Renderer.cpp index 4e72d94a61..682fb8885c 100644 --- a/Source/Core/VideoBackends/Vulkan/Renderer.cpp +++ b/Source/Core/VideoBackends/Vulkan/Renderer.cpp @@ -113,9 +113,6 @@ bool Renderer::Initialize() m_bounding_box->GetGPUBufferSize()); } - // Ensure all pipelines previously used by the game have been created. - StateTracker::GetInstance()->ReloadPipelineUIDCache(); - // Initialize post processing. m_post_processor = std::make_unique(); if (!static_cast(m_post_processor.get()) @@ -589,6 +586,9 @@ void Renderer::SwapImpl(u32 xfb_addr, u32 fb_width, u32 fb_stride, u32 fb_height // Clean up stale textures. TextureCache::GetInstance()->Cleanup(frameCount); + + // Pull in now-ready async shaders. + g_shader_cache->RetrieveAsyncShaders(); } void Renderer::TransitionBuffersForSwap(const TargetRectangle& scaled_rect, @@ -1132,6 +1132,8 @@ void Renderer::CheckForConfigChanges() bool old_force_filtering = g_ActiveConfig.bForceFiltering; bool old_use_xfb = g_ActiveConfig.bUseXFB; bool old_use_realxfb = g_ActiveConfig.bUseRealXFB; + bool old_vertex_ubershaders = g_ActiveConfig.bForceVertexUberShaders; + bool old_pixel_ubershaders = g_ActiveConfig.bForcePixelUberShaders; // Copy g_Config to g_ActiveConfig. 
// NOTE: This can potentially race with the UI thread, however if it does, the changes will be @@ -1145,6 +1147,8 @@ void Renderer::CheckForConfigChanges() bool aspect_changed = old_aspect_ratio != g_ActiveConfig.iAspectRatio; bool use_xfb_changed = old_use_xfb != g_ActiveConfig.bUseXFB; bool use_realxfb_changed = old_use_realxfb != g_ActiveConfig.bUseRealXFB; + bool ubershaders_changed = old_vertex_ubershaders != g_ActiveConfig.bForceVertexUberShaders || + old_pixel_ubershaders != g_ActiveConfig.bForcePixelUberShaders; // Update texture cache settings with any changed options. TextureCache::GetInstance()->OnConfigChanged(g_ActiveConfig); @@ -1190,6 +1194,10 @@ void Renderer::CheckForConfigChanges() if (anisotropy_changed || force_texture_filtering_changed) ResetSamplerStates(); + // Clear UID state if ubershaders are toggled. + if (ubershaders_changed) + StateTracker::GetInstance()->ClearShaders(); + // Check for a changed post-processing shader and recompile if needed. static_cast(m_post_processor.get())->UpdateConfig(); } diff --git a/Source/Core/VideoBackends/Vulkan/ShaderCache.cpp b/Source/Core/VideoBackends/Vulkan/ShaderCache.cpp index 79a77458b6..7d5c3e2379 100644 --- a/Source/Core/VideoBackends/Vulkan/ShaderCache.cpp +++ b/Source/Core/VideoBackends/Vulkan/ShaderCache.cpp @@ -15,13 +15,20 @@ #include "Common/MsgHandler.h" #include "Core/ConfigManager.h" +#include "Core/Host.h" +#include "VideoBackends/Vulkan/FramebufferManager.h" #include "VideoBackends/Vulkan/ShaderCompiler.h" #include "VideoBackends/Vulkan/StreamBuffer.h" #include "VideoBackends/Vulkan/Util.h" #include "VideoBackends/Vulkan/VertexFormat.h" #include "VideoBackends/Vulkan/VulkanContext.h" +#include "VideoCommon/AsyncShaderCompiler.h" +#include "VideoCommon/GeometryShaderGen.h" #include "VideoCommon/Statistics.h" +#include "VideoCommon/UberShaderPixel.h" +#include "VideoCommon/UberShaderVertex.h" +#include "VideoCommon/VertexLoaderManager.h" namespace Vulkan { @@ -55,9 +62,22 @@ bool 
ShaderCache::Initialize() if (!CompileSharedShaders()) return false; + m_async_shader_compiler = std::make_unique(); + m_async_shader_compiler->ResizeWorkerThreads(g_ActiveConfig.CanPrecompileUberShaders() ? + g_ActiveConfig.GetShaderPrecompilerThreads() : + g_ActiveConfig.GetShaderCompilerThreads()); return true; } +void ShaderCache::Shutdown() +{ + if (m_async_shader_compiler) + { + m_async_shader_compiler->StopWorkerThreads(); + m_async_shader_compiler->RetrieveWorkItems(); + } +} + static bool IsStripPrimitiveTopology(VkPrimitiveTopology topology) { return topology == VK_PRIMITIVE_TOPOLOGY_LINE_STRIP || @@ -365,13 +385,34 @@ std::pair ShaderCache::GetPipelineWithCacheResult(const Pipeli { auto iter = m_pipeline_objects.find(info); if (iter != m_pipeline_objects.end()) - return {iter->second, true}; + { + // If it's background compiling, ignore it, and recompile it synchronously. + if (!iter->second.second) + return std::make_pair(iter->second.first, true); + else + m_pipeline_objects.erase(iter); + } VkPipeline pipeline = CreatePipeline(info); - m_pipeline_objects.emplace(info, pipeline); + m_pipeline_objects.emplace(info, std::make_pair(pipeline, false)); + _assert_(pipeline != VK_NULL_HANDLE); return {pipeline, false}; } +std::pair, bool> +ShaderCache::GetPipelineWithCacheResultAsync(const PipelineInfo& info) +{ + auto iter = m_pipeline_objects.find(info); + if (iter != m_pipeline_objects.end()) + return std::make_pair(iter->second, true); + + // Kick a job off. 
+ m_async_shader_compiler->QueueWorkItem( + m_async_shader_compiler->CreateWorkItem(info)); + m_pipeline_objects.emplace(info, std::make_pair(static_cast(VK_NULL_HANDLE), true)); + return std::make_pair(std::make_pair(static_cast(VK_NULL_HANDLE), true), false); +} + VkPipeline ShaderCache::CreateComputePipeline(const ComputePipelineInfo& info) { VkComputePipelineCreateInfo pipeline_info = {VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, @@ -409,10 +450,11 @@ VkPipeline ShaderCache::GetComputePipeline(const ComputePipelineInfo& info) void ShaderCache::ClearPipelineCache() { + // TODO: Stop any async compiling happening. for (const auto& it : m_pipeline_objects) { - if (it.second != VK_NULL_HANDLE) - vkDestroyPipeline(g_vulkan_context->GetDevice(), it.second, nullptr); + if (it.second.first != VK_NULL_HANDLE) + vkDestroyPipeline(g_vulkan_context->GetDevice(), it.second.first, nullptr); } m_pipeline_objects.clear(); @@ -620,7 +662,10 @@ void ShaderCache::SavePipelineCache() template struct ShaderCacheReader : public LinearDiskCacheReader { - ShaderCacheReader(std::map& shader_map) : m_shader_map(shader_map) {} + ShaderCacheReader(std::map>& shader_map) + : m_shader_map(shader_map) + { + } void Read(const Uid& key, const u32* value, u32 value_size) override { // We don't insert null modules into the shader map since creation could succeed later on. 
@@ -630,10 +675,10 @@ struct ShaderCacheReader : public LinearDiskCacheReader if (module == VK_NULL_HANDLE) return; - m_shader_map.emplace(key, module); + m_shader_map.emplace(key, std::make_pair(module, false)); } - std::map& m_shader_map; + std::map>& m_shader_map; }; void ShaderCache::LoadShaderCaches() @@ -653,6 +698,13 @@ void ShaderCache::LoadShaderCaches() gs_reader); } + ShaderCacheReader uber_vs_reader(m_uber_vs_cache.shader_map); + m_uber_vs_cache.disk_cache.OpenAndRead( + GetDiskShaderCacheFileName(APIType::Vulkan, "UberVS", false, true), uber_vs_reader); + ShaderCacheReader uber_ps_reader(m_uber_ps_cache.shader_map); + m_uber_ps_cache.disk_cache.OpenAndRead( + GetDiskShaderCacheFileName(APIType::Vulkan, "UberPS", false, true), uber_ps_reader); + SETSTAT(stats.numPixelShadersCreated, static_cast(m_ps_cache.shader_map.size())); SETSTAT(stats.numPixelShadersAlive, static_cast(m_ps_cache.shader_map.size())); SETSTAT(stats.numVertexShadersCreated, static_cast(m_vs_cache.shader_map.size())); @@ -666,8 +718,8 @@ static void DestroyShaderCache(T& cache) cache.disk_cache.Close(); for (const auto& it : cache.shader_map) { - if (it.second != VK_NULL_HANDLE) - vkDestroyShaderModule(g_vulkan_context->GetDevice(), it.second, nullptr); + if (it.second.first != VK_NULL_HANDLE) + vkDestroyShaderModule(g_vulkan_context->GetDevice(), it.second.first, nullptr); } cache.shader_map.clear(); } @@ -680,6 +732,9 @@ void ShaderCache::DestroyShaderCaches() if (g_vulkan_context->SupportsGeometryShaders()) DestroyShaderCache(m_gs_cache); + DestroyShaderCache(m_uber_vs_cache); + DestroyShaderCache(m_uber_ps_cache); + SETSTAT(stats.numPixelShadersCreated, 0); SETSTAT(stats.numPixelShadersAlive, 0); SETSTAT(stats.numVertexShadersCreated, 0); @@ -690,7 +745,13 @@ VkShaderModule ShaderCache::GetVertexShaderForUid(const VertexShaderUid& uid) { auto it = m_vs_cache.shader_map.find(uid); if (it != m_vs_cache.shader_map.end()) - return it->second; + { + // If it's pending, compile it 
synchronously. + if (!it->second.second) + return it->second.first; + else + m_vs_cache.shader_map.erase(it); + } // Not in the cache, so compile the shader. ShaderCompiler::SPIRVCodeVector spv; @@ -712,7 +773,7 @@ VkShaderModule ShaderCache::GetVertexShaderForUid(const VertexShaderUid& uid) } // We still insert null entries to prevent further compilation attempts. - m_vs_cache.shader_map.emplace(uid, module); + m_vs_cache.shader_map.emplace(uid, std::make_pair(module, false)); return module; } @@ -721,7 +782,13 @@ VkShaderModule ShaderCache::GetGeometryShaderForUid(const GeometryShaderUid& uid _assert_(g_vulkan_context->SupportsGeometryShaders()); auto it = m_gs_cache.shader_map.find(uid); if (it != m_gs_cache.shader_map.end()) - return it->second; + { + // If it's pending, compile it synchronously. + if (!it->second.second) + return it->second.first; + else + m_gs_cache.shader_map.erase(it); + } // Not in the cache, so compile the shader. ShaderCompiler::SPIRVCodeVector spv; @@ -739,7 +806,7 @@ VkShaderModule ShaderCache::GetGeometryShaderForUid(const GeometryShaderUid& uid } // We still insert null entries to prevent further compilation attempts. - m_gs_cache.shader_map.emplace(uid, module); + m_gs_cache.shader_map.emplace(uid, std::make_pair(module, false)); return module; } @@ -747,7 +814,13 @@ VkShaderModule ShaderCache::GetPixelShaderForUid(const PixelShaderUid& uid) { auto it = m_ps_cache.shader_map.find(uid); if (it != m_ps_cache.shader_map.end()) - return it->second; + { + // If it's pending, compile it synchronously. + if (!it->second.second) + return it->second.first; + else + m_ps_cache.shader_map.erase(it); + } // Not in the cache, so compile the shader. ShaderCompiler::SPIRVCodeVector spv; @@ -769,7 +842,79 @@ VkShaderModule ShaderCache::GetPixelShaderForUid(const PixelShaderUid& uid) } // We still insert null entries to prevent further compilation attempts. 
- m_ps_cache.shader_map.emplace(uid, module); + m_ps_cache.shader_map.emplace(uid, std::make_pair(module, false)); + return module; +} + +VkShaderModule ShaderCache::GetVertexUberShaderForUid(const UberShader::VertexShaderUid& uid) +{ + auto it = m_uber_vs_cache.shader_map.find(uid); + if (it != m_uber_vs_cache.shader_map.end()) + { + // If it's pending, compile it synchronously. + if (!it->second.second) + return it->second.first; + else + m_uber_vs_cache.shader_map.erase(it); + } + + // Not in the cache, so compile the shader. + ShaderCompiler::SPIRVCodeVector spv; + VkShaderModule module = VK_NULL_HANDLE; + ShaderCode source_code = UberShader::GenVertexShader( + APIType::Vulkan, ShaderHostConfig::GetCurrent(), uid.GetUidData()); + if (ShaderCompiler::CompileVertexShader(&spv, source_code.GetBuffer().c_str(), + source_code.GetBuffer().length())) + { + module = Util::CreateShaderModule(spv.data(), spv.size()); + + // Append to shader cache if it created successfully. + if (module != VK_NULL_HANDLE) + { + m_uber_vs_cache.disk_cache.Append(uid, spv.data(), static_cast(spv.size())); + INCSTAT(stats.numVertexShadersCreated); + INCSTAT(stats.numVertexShadersAlive); + } + } + + // We still insert null entries to prevent further compilation attempts. + m_uber_vs_cache.shader_map.emplace(uid, std::make_pair(module, false)); + return module; +} + +VkShaderModule ShaderCache::GetPixelUberShaderForUid(const UberShader::PixelShaderUid& uid) +{ + auto it = m_uber_ps_cache.shader_map.find(uid); + if (it != m_uber_ps_cache.shader_map.end()) + { + // If it's pending, compile it synchronously. + if (!it->second.second) + return it->second.first; + else + m_uber_ps_cache.shader_map.erase(it); + } + + // Not in the cache, so compile the shader. 
+ ShaderCompiler::SPIRVCodeVector spv; + VkShaderModule module = VK_NULL_HANDLE; + ShaderCode source_code = + UberShader::GenPixelShader(APIType::Vulkan, ShaderHostConfig::GetCurrent(), uid.GetUidData()); + if (ShaderCompiler::CompileFragmentShader(&spv, source_code.GetBuffer().c_str(), + source_code.GetBuffer().length())) + { + module = Util::CreateShaderModule(spv.data(), spv.size()); + + // Append to shader cache if it created successfully. + if (module != VK_NULL_HANDLE) + { + m_uber_ps_cache.disk_cache.Append(uid, spv.data(), static_cast(spv.size())); + INCSTAT(stats.numPixelShadersCreated); + INCSTAT(stats.numPixelShadersAlive); + } + } + + // We still insert null entries to prevent further compilation attempts. + m_uber_ps_cache.shader_map.emplace(uid, std::make_pair(module, false)); return module; } @@ -782,6 +927,9 @@ void ShaderCache::RecompileSharedShaders() void ShaderCache::ReloadShaderAndPipelineCaches() { + m_async_shader_compiler->WaitUntilCompletion(); + m_async_shader_compiler->RetrieveWorkItems(); + SavePipelineCache(); DestroyShaderCaches(); DestroyPipelineCache(); @@ -795,6 +943,9 @@ void ShaderCache::ReloadShaderAndPipelineCaches() { CreatePipelineCache(); } + + if (g_ActiveConfig.CanPrecompileUberShaders()) + PrecompileUberShaders(); } std::string ShaderCache::GetUtilityShaderHeader() const @@ -1026,4 +1177,214 @@ void ShaderCache::DestroySharedShaders() DestroyShader(m_screen_quad_geometry_shader); DestroyShader(m_passthrough_geometry_shader); } + +void ShaderCache::CreateDummyPipeline(const UberShader::VertexShaderUid& vuid, + const GeometryShaderUid& guid, + const UberShader::PixelShaderUid& puid) +{ + PortableVertexDeclaration vertex_decl; + std::memset(&vertex_decl, 0, sizeof(vertex_decl)); + + PipelineInfo pinfo; + pinfo.vertex_format = + static_cast(VertexLoaderManager::GetUberVertexFormat(vertex_decl)); + pinfo.pipeline_layout = g_object_cache->GetPipelineLayout( + g_ActiveConfig.bBBoxEnable && 
g_ActiveConfig.BBoxUseFragmentShaderImplementation() ? + PIPELINE_LAYOUT_BBOX : + PIPELINE_LAYOUT_STANDARD); + pinfo.vs = GetVertexUberShaderForUid(vuid); + pinfo.gs = (!guid.GetUidData()->IsPassthrough() && g_vulkan_context->SupportsGeometryShaders()) ? + GetGeometryShaderForUid(guid) : + VK_NULL_HANDLE; + pinfo.ps = GetPixelUberShaderForUid(puid); + pinfo.render_pass = FramebufferManager::GetInstance()->GetEFBLoadRenderPass(); + pinfo.rasterization_state.bits = Util::GetNoCullRasterizationState().bits; + pinfo.depth_stencil_state.bits = Util::GetNoDepthTestingDepthStencilState().bits; + pinfo.blend_state.hex = Util::GetNoBlendingBlendState().hex; + switch (guid.GetUidData()->primitive_type) + { + case PRIMITIVE_POINTS: + pinfo.primitive_topology = VK_PRIMITIVE_TOPOLOGY_POINT_LIST; + break; + case PRIMITIVE_LINES: + pinfo.primitive_topology = VK_PRIMITIVE_TOPOLOGY_LINE_LIST; + break; + case PRIMITIVE_TRIANGLES: + pinfo.primitive_topology = g_ActiveConfig.backend_info.bSupportsPrimitiveRestart ? + VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP : + VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST; + break; + } + GetPipelineWithCacheResultAsync(pinfo); +} + +void ShaderCache::PrecompileUberShaders() +{ + UberShader::EnumerateVertexShaderUids([&](const UberShader::VertexShaderUid& vuid) { + UberShader::EnumeratePixelShaderUids([&](const UberShader::PixelShaderUid& puid) { + // UIDs must have compatible texgens, a mismatching combination will never be queried. + if (vuid.GetUidData()->num_texgens != puid.GetUidData()->num_texgens) + return; + + EnumerateGeometryShaderUids([&](const GeometryShaderUid& guid) { + if (guid.GetUidData()->numTexGens != vuid.GetUidData()->num_texgens) + return; + + CreateDummyPipeline(vuid, guid, puid); + }); + }); + }); + + WaitForBackgroundCompilesToComplete(); + + // Switch to the runtime/background thread config. 
+ m_async_shader_compiler->ResizeWorkerThreads(g_ActiveConfig.GetShaderCompilerThreads()); +} + +void ShaderCache::WaitForBackgroundCompilesToComplete() +{ + m_async_shader_compiler->WaitUntilCompletion([](size_t completed, size_t total) { + Host_UpdateProgressDialog(GetStringT("Compiling shaders...").c_str(), + static_cast(completed), static_cast(total)); + }); + m_async_shader_compiler->RetrieveWorkItems(); + Host_UpdateProgressDialog("", -1, -1); +} + +void ShaderCache::RetrieveAsyncShaders() +{ + m_async_shader_compiler->RetrieveWorkItems(); +} + +std::pair ShaderCache::GetVertexShaderForUidAsync(const VertexShaderUid& uid) +{ + auto it = m_vs_cache.shader_map.find(uid); + if (it != m_vs_cache.shader_map.end()) + return it->second; + + // Kick a compile job off. + m_async_shader_compiler->QueueWorkItem( + m_async_shader_compiler->CreateWorkItem(uid)); + m_vs_cache.shader_map.emplace(uid, + std::make_pair(static_cast(VK_NULL_HANDLE), true)); + return std::make_pair(VK_NULL_HANDLE, true); +} + +std::pair ShaderCache::GetPixelShaderForUidAsync(const PixelShaderUid& uid) +{ + auto it = m_ps_cache.shader_map.find(uid); + if (it != m_ps_cache.shader_map.end()) + return it->second; + + // Kick a compile job off. 
+ m_async_shader_compiler->QueueWorkItem( + m_async_shader_compiler->CreateWorkItem(uid)); + m_ps_cache.shader_map.emplace(uid, + std::make_pair(static_cast(VK_NULL_HANDLE), true)); + return std::make_pair(VK_NULL_HANDLE, true); +} + +bool ShaderCache::VertexShaderCompilerWorkItem::Compile() +{ + ShaderCode code = + GenerateVertexShaderCode(APIType::Vulkan, ShaderHostConfig::GetCurrent(), m_uid.GetUidData()); + if (!ShaderCompiler::CompileVertexShader(&m_spirv, code.GetBuffer().c_str(), + code.GetBuffer().length())) + return true; + + m_module = Util::CreateShaderModule(m_spirv.data(), m_spirv.size()); + return true; +} + +void ShaderCache::VertexShaderCompilerWorkItem::Retrieve() +{ + auto it = g_shader_cache->m_vs_cache.shader_map.find(m_uid); + if (it == g_shader_cache->m_vs_cache.shader_map.end()) + { + g_shader_cache->m_vs_cache.shader_map.emplace(m_uid, std::make_pair(m_module, false)); + g_shader_cache->m_vs_cache.disk_cache.Append(m_uid, m_spirv.data(), + static_cast(m_spirv.size())); + return; + } + + // The main thread may have also compiled this shader. + if (!it->second.second) + { + if (m_module != VK_NULL_HANDLE) + vkDestroyShaderModule(g_vulkan_context->GetDevice(), m_module, nullptr); + return; + } + + // No longer pending. 
+ it->second.first = m_module; + it->second.second = false; + g_shader_cache->m_vs_cache.disk_cache.Append(m_uid, m_spirv.data(), + static_cast(m_spirv.size())); +} + +bool ShaderCache::PixelShaderCompilerWorkItem::Compile() +{ + ShaderCode code = + GeneratePixelShaderCode(APIType::Vulkan, ShaderHostConfig::GetCurrent(), m_uid.GetUidData()); + if (!ShaderCompiler::CompileFragmentShader(&m_spirv, code.GetBuffer().c_str(), + code.GetBuffer().length())) + return true; + + m_module = Util::CreateShaderModule(m_spirv.data(), m_spirv.size()); + return true; +} + +void ShaderCache::PixelShaderCompilerWorkItem::Retrieve() +{ + auto it = g_shader_cache->m_ps_cache.shader_map.find(m_uid); + if (it == g_shader_cache->m_ps_cache.shader_map.end()) + { + g_shader_cache->m_ps_cache.shader_map.emplace(m_uid, std::make_pair(m_module, false)); + g_shader_cache->m_ps_cache.disk_cache.Append(m_uid, m_spirv.data(), + static_cast(m_spirv.size())); + return; + } + + // The main thread may have also compiled this shader. + if (!it->second.second) + { + if (m_module != VK_NULL_HANDLE) + vkDestroyShaderModule(g_vulkan_context->GetDevice(), m_module, nullptr); + return; + } + + // No longer pending. + it->second.first = m_module; + it->second.second = false; + g_shader_cache->m_ps_cache.disk_cache.Append(m_uid, m_spirv.data(), + static_cast(m_spirv.size())); +} + +bool ShaderCache::PipelineCompilerWorkItem::Compile() +{ + m_pipeline = g_shader_cache->CreatePipeline(m_info); + return true; +} + +void ShaderCache::PipelineCompilerWorkItem::Retrieve() +{ + auto it = g_shader_cache->m_pipeline_objects.find(m_info); + if (it == g_shader_cache->m_pipeline_objects.end()) + { + g_shader_cache->m_pipeline_objects.emplace(m_info, std::make_pair(m_pipeline, false)); + return; + } + + // The main thread may have also compiled this shader. 
+ if (!it->second.second) + { + if (m_pipeline != VK_NULL_HANDLE) + vkDestroyPipeline(g_vulkan_context->GetDevice(), m_pipeline, nullptr); + return; + } + + // No longer pending. + it->second.first = m_pipeline; + it->second.second = false; +} } diff --git a/Source/Core/VideoBackends/Vulkan/ShaderCache.h b/Source/Core/VideoBackends/Vulkan/ShaderCache.h index 7a83472e38..c31ad08616 100644 --- a/Source/Core/VideoBackends/Vulkan/ShaderCache.h +++ b/Source/Core/VideoBackends/Vulkan/ShaderCache.h @@ -10,16 +10,21 @@ #include #include #include +#include #include "Common/CommonTypes.h" #include "Common/LinearDiskCache.h" #include "VideoBackends/Vulkan/Constants.h" #include "VideoBackends/Vulkan/ObjectCache.h" +#include "VideoBackends/Vulkan/ShaderCompiler.h" +#include "VideoCommon/AsyncShaderCompiler.h" #include "VideoCommon/GeometryShaderGen.h" #include "VideoCommon/PixelShaderGen.h" #include "VideoCommon/RenderState.h" +#include "VideoCommon/UberShaderPixel.h" +#include "VideoCommon/UberShaderVertex.h" #include "VideoCommon/VertexShaderGen.h" namespace Vulkan @@ -92,8 +97,17 @@ public: VkShaderModule GetGeometryShaderForUid(const GeometryShaderUid& uid); VkShaderModule GetPixelShaderForUid(const PixelShaderUid& uid); + // Ubershader caches + VkShaderModule GetVertexUberShaderForUid(const UberShader::VertexShaderUid& uid); + VkShaderModule GetPixelUberShaderForUid(const UberShader::PixelShaderUid& uid); + + // Accesses ShaderGen shader caches asynchronously + std::pair GetVertexShaderForUidAsync(const VertexShaderUid& uid); + std::pair GetPixelShaderForUidAsync(const PixelShaderUid& uid); + // Perform at startup, create descriptor layouts, compiles all static shaders. bool Initialize(); + void Shutdown(); // Creates a pipeline for the specified description. The resulting pipeline, if successful // is not stored anywhere, this is left up to the caller. 
@@ -106,6 +120,8 @@ public: // resulted in a pipeline being created, the second field of the return value will be false, // otherwise for a cache hit it will be true. std::pair GetPipelineWithCacheResult(const PipelineInfo& info); + std::pair, bool> + GetPipelineWithCacheResultAsync(const PipelineInfo& info); // Creates a compute pipeline, and does not track the handle. VkPipeline CreateComputePipeline(const ComputePipelineInfo& info); @@ -134,6 +150,10 @@ public: VkShaderModule GetPassthroughVertexShader() const { return m_passthrough_vertex_shader; } VkShaderModule GetScreenQuadGeometryShader() const { return m_screen_quad_geometry_shader; } VkShaderModule GetPassthroughGeometryShader() const { return m_passthrough_geometry_shader; } + void PrecompileUberShaders(); + void WaitForBackgroundCompilesToComplete(); + void RetrieveAsyncShaders(); + private: bool CreatePipelineCache(); bool LoadPipelineCache(); @@ -144,17 +164,26 @@ private: bool CompileSharedShaders(); void DestroySharedShaders(); + // We generate a dummy pipeline with some defaults in the blend/depth states, + // that way the driver is forced to compile something (looking at you, NVIDIA). + // It can then hopefully re-use part of this pipeline for others in the future. 
+ void CreateDummyPipeline(const UberShader::VertexShaderUid& vuid, const GeometryShaderUid& guid, + const UberShader::PixelShaderUid& puid); + template struct ShaderModuleCache { - std::map shader_map; + std::map> shader_map; LinearDiskCache disk_cache; }; ShaderModuleCache m_vs_cache; ShaderModuleCache m_gs_cache; ShaderModuleCache m_ps_cache; + ShaderModuleCache m_uber_vs_cache; + ShaderModuleCache m_uber_ps_cache; - std::unordered_map m_pipeline_objects; + std::unordered_map, PipelineInfoHash> + m_pipeline_objects; std::unordered_map m_compute_pipeline_objects; VkPipelineCache m_pipeline_cache = VK_NULL_HANDLE; @@ -165,6 +194,45 @@ private: VkShaderModule m_passthrough_vertex_shader = VK_NULL_HANDLE; VkShaderModule m_screen_quad_geometry_shader = VK_NULL_HANDLE; VkShaderModule m_passthrough_geometry_shader = VK_NULL_HANDLE; + + std::unique_ptr m_async_shader_compiler; + + // TODO: Use templates to reduce the number of these classes. + class VertexShaderCompilerWorkItem : public VideoCommon::AsyncShaderCompiler::WorkItem + { + public: + VertexShaderCompilerWorkItem(const VertexShaderUid& uid) : m_uid(uid) {} + bool Compile() override; + void Retrieve() override; + + private: + VertexShaderUid m_uid; + ShaderCompiler::SPIRVCodeVector m_spirv; + VkShaderModule m_module = VK_NULL_HANDLE; + }; + class PixelShaderCompilerWorkItem : public VideoCommon::AsyncShaderCompiler::WorkItem + { + public: + PixelShaderCompilerWorkItem(const PixelShaderUid& uid) : m_uid(uid) {} + bool Compile() override; + void Retrieve() override; + + private: + PixelShaderUid m_uid; + ShaderCompiler::SPIRVCodeVector m_spirv; + VkShaderModule m_module = VK_NULL_HANDLE; + }; + class PipelineCompilerWorkItem : public VideoCommon::AsyncShaderCompiler::WorkItem + { + public: + PipelineCompilerWorkItem(const PipelineInfo& info) : m_info(info) {} + bool Compile() override; + void Retrieve() override; + + private: + PipelineInfo m_info; + VkPipeline m_pipeline; + }; }; extern std::unique_ptr 
g_shader_cache; diff --git a/Source/Core/VideoBackends/Vulkan/StateTracker.cpp b/Source/Core/VideoBackends/Vulkan/StateTracker.cpp index a03437a697..2ab6af50ff 100644 --- a/Source/Core/VideoBackends/Vulkan/StateTracker.cpp +++ b/Source/Core/VideoBackends/Vulkan/StateTracker.cpp @@ -22,6 +22,7 @@ #include "VideoCommon/GeometryShaderManager.h" #include "VideoCommon/PixelShaderManager.h" #include "VideoCommon/Statistics.h" +#include "VideoCommon/VertexLoaderManager.h" #include "VideoCommon/VertexShaderManager.h" #include "VideoCommon/VideoConfig.h" @@ -77,12 +78,13 @@ bool StateTracker::Initialize() m_pipeline_state.pipeline_layout = g_object_cache->GetPipelineLayout(PIPELINE_LAYOUT_STANDARD); m_num_active_descriptor_sets = NUM_GX_DRAW_DESCRIPTOR_SETS; m_bbox_enabled = false; + ClearShaders(); // Initialize all samplers to point by default for (size_t i = 0; i < NUM_PIXEL_SHADER_SAMPLERS; i++) { m_bindings.ps_samplers[i].imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; - m_bindings.ps_samplers[i].imageView = VK_NULL_HANDLE; + m_bindings.ps_samplers[i].imageView = g_object_cache->GetDummyImageView(); m_bindings.ps_samplers[i].sampler = g_object_cache->GetPointSampler(); } @@ -154,6 +156,10 @@ void StateTracker::ReloadPipelineUIDCache() PipelineInserter inserter(this); m_uid_cache.OpenAndRead(filename, inserter); } + + // If we were using background compilation, ensure everything is ready before continuing. + if (g_ActiveConfig.bBackgroundShaderCompiling) + g_shader_cache->WaitForBackgroundCompilesToComplete(); } void StateTracker::AppendToPipelineUIDCache(const PipelineInfo& info) @@ -178,7 +184,8 @@ bool StateTracker::PrecachePipelineUID(const SerializedPipelineUID& uid) // Need to create the vertex declaration first, rather than deferring to when a game creates a // vertex loader that uses this format, since we need it to create a pipeline. 
- pinfo.vertex_format = VertexFormat::GetOrCreateMatchingFormat(uid.vertex_decl); + pinfo.vertex_format = + static_cast(VertexLoaderManager::GetOrCreateMatchingFormat(uid.vertex_decl)); pinfo.pipeline_layout = uid.ps_uid.GetUidData()->bounding_box ? g_object_cache->GetPipelineLayout(PIPELINE_LAYOUT_BBOX) : g_object_cache->GetPipelineLayout(PIPELINE_LAYOUT_STANDARD); @@ -209,11 +216,19 @@ bool StateTracker::PrecachePipelineUID(const SerializedPipelineUID& uid) pinfo.blend_state.hex = uid.blend_state_bits; pinfo.primitive_topology = uid.primitive_topology; - VkPipeline pipeline = g_shader_cache->GetPipeline(pinfo); - if (pipeline == VK_NULL_HANDLE) + if (g_ActiveConfig.bBackgroundShaderCompiling) { - WARN_LOG(VIDEO, "Failed to get pipeline from cached UID."); - return false; + // Use async for multithreaded compilation. + g_shader_cache->GetPipelineWithCacheResultAsync(pinfo); + } + else + { + VkPipeline pipeline = g_shader_cache->GetPipeline(pinfo); + if (pipeline == VK_NULL_HANDLE) + { + WARN_LOG(VIDEO, "Failed to get pipeline from cached UID."); + return false; + } } // We don't need to do anything with this pipeline, just make sure it exists. 
@@ -267,11 +282,11 @@ void StateTracker::SetFramebuffer(VkFramebuffer framebuffer, const VkRect2D& ren void StateTracker::SetVertexFormat(const VertexFormat* vertex_format) { - if (m_pipeline_state.vertex_format == vertex_format) + if (m_vertex_format == vertex_format) return; - m_pipeline_state.vertex_format = vertex_format; - m_dirty_flags |= DIRTY_FLAG_PIPELINE; + m_vertex_format = vertex_format; + UpdatePipelineVertexFormat(); } void StateTracker::SetPrimitiveTopology(VkPrimitiveTopology primitive_topology) @@ -323,14 +338,87 @@ bool StateTracker::CheckForShaderChanges(u32 gx_primitive_type) { VertexShaderUid vs_uid = GetVertexShaderUid(); PixelShaderUid ps_uid = GetPixelShaderUid(); - bool changed = false; - if (vs_uid != m_vs_uid) + bool use_ubershaders = g_ActiveConfig.bDisableSpecializedShaders; + if (g_ActiveConfig.CanBackgroundCompileShaders() && !g_ActiveConfig.bDisableSpecializedShaders) { - m_pipeline_state.vs = g_shader_cache->GetVertexShaderForUid(vs_uid); - m_vs_uid = vs_uid; - changed = true; + // Look up both VS and PS, and check if we can compile it asynchronously. + auto vs = g_shader_cache->GetVertexShaderForUidAsync(vs_uid); + auto ps = g_shader_cache->GetPixelShaderForUidAsync(ps_uid); + if (vs.second || ps.second) + { + // One of the shaders is still pending. Use the ubershader for both. + use_ubershaders = true; + } + else + { + // Use the standard shaders for both. + if (m_pipeline_state.vs != vs.first) + { + m_pipeline_state.vs = vs.first; + m_vs_uid = vs_uid; + changed = true; + } + if (m_pipeline_state.ps != ps.first) + { + m_pipeline_state.ps = ps.first; + m_ps_uid = ps_uid; + changed = true; + } + } + } + else + { + // Normal shader path. No ubershaders. 
+ if (vs_uid != m_vs_uid) + { + m_vs_uid = vs_uid; + m_pipeline_state.vs = g_shader_cache->GetVertexShaderForUid(vs_uid); + changed = true; + } + if (ps_uid != m_ps_uid) + { + m_ps_uid = ps_uid; + m_pipeline_state.ps = g_shader_cache->GetPixelShaderForUid(ps_uid); + changed = true; + } + } + + // Ubershader fallback? + bool uber_vertex_shader = use_ubershaders || g_ActiveConfig.bForceVertexUberShaders; + bool uber_pixel_shader = use_ubershaders || g_ActiveConfig.bForcePixelUberShaders; + bool using_ubershaders = uber_vertex_shader || uber_pixel_shader; + + // Switching to/from ubershaders? Have to adjust the vertex format and pipeline layout. + if (using_ubershaders != m_using_ubershaders) + { + m_using_ubershaders = using_ubershaders; + UpdatePipelineLayout(); + UpdatePipelineVertexFormat(); + } + + if (uber_vertex_shader) + { + UberShader::VertexShaderUid uber_vs_uid = UberShader::GetVertexShaderUid(); + VkShaderModule vs = g_shader_cache->GetVertexUberShaderForUid(uber_vs_uid); + if (vs != m_pipeline_state.vs) + { + m_uber_vs_uid = uber_vs_uid; + m_pipeline_state.vs = vs; + changed = true; + } + } + if (uber_pixel_shader) + { + UberShader::PixelShaderUid uber_ps_uid = UberShader::GetPixelShaderUid(); + VkShaderModule ps = g_shader_cache->GetPixelUberShaderForUid(uber_ps_uid); + if (ps != m_pipeline_state.ps) + { + m_uber_ps_uid = uber_ps_uid; + m_pipeline_state.ps = ps; + changed = true; + } } if (g_vulkan_context->SupportsGeometryShaders()) @@ -338,29 +426,39 @@ bool StateTracker::CheckForShaderChanges(u32 gx_primitive_type) GeometryShaderUid gs_uid = GetGeometryShaderUid(gx_primitive_type); if (gs_uid != m_gs_uid) { + m_gs_uid = gs_uid; if (gs_uid.GetUidData()->IsPassthrough()) m_pipeline_state.gs = VK_NULL_HANDLE; else m_pipeline_state.gs = g_shader_cache->GetGeometryShaderForUid(gs_uid); - m_gs_uid = gs_uid; changed = true; } } - if (ps_uid != m_ps_uid) - { - m_pipeline_state.ps = g_shader_cache->GetPixelShaderForUid(ps_uid); - m_ps_uid = ps_uid; - changed = 
true; - } - if (changed) m_dirty_flags |= DIRTY_FLAG_PIPELINE; return changed; } +void StateTracker::ClearShaders() +{ + // Set the UIDs to something that will never match, so on the first access they are checked. + std::memset(&m_vs_uid, 0xFF, sizeof(m_vs_uid)); + std::memset(&m_gs_uid, 0xFF, sizeof(m_gs_uid)); + std::memset(&m_ps_uid, 0xFF, sizeof(m_ps_uid)); + std::memset(&m_uber_vs_uid, 0xFF, sizeof(m_uber_vs_uid)); + std::memset(&m_uber_ps_uid, 0xFF, sizeof(m_uber_ps_uid)); + + m_pipeline_state.vs = VK_NULL_HANDLE; + m_pipeline_state.gs = VK_NULL_HANDLE; + m_pipeline_state.ps = VK_NULL_HANDLE; + m_pipeline_state.vertex_format = nullptr; + + m_dirty_flags |= DIRTY_FLAG_PIPELINE; +} + void StateTracker::UpdateVertexShaderConstants() { if (!VertexShaderManager::dirty || !ReserveConstantStorage()) @@ -557,24 +655,8 @@ void StateTracker::SetBBoxEnable(bool enable) if (m_bbox_enabled == enable) return; - // Change the number of active descriptor sets, as well as the pipeline layout - if (enable) - { - m_pipeline_state.pipeline_layout = g_object_cache->GetPipelineLayout(PIPELINE_LAYOUT_BBOX); - m_num_active_descriptor_sets = NUM_GX_DRAW_WITH_BBOX_DESCRIPTOR_SETS; - - // The bbox buffer never changes, so we defer descriptor updates until it is enabled. - if (m_descriptor_sets[DESCRIPTOR_SET_BIND_POINT_STORAGE_OR_TEXEL_BUFFER] == VK_NULL_HANDLE) - m_dirty_flags |= DIRTY_FLAG_PS_SSBO; - } - else - { - m_pipeline_state.pipeline_layout = g_object_cache->GetPipelineLayout(PIPELINE_LAYOUT_STANDARD); - m_num_active_descriptor_sets = NUM_GX_DRAW_DESCRIPTOR_SETS; - } - - m_dirty_flags |= DIRTY_FLAG_PIPELINE | DIRTY_FLAG_DESCRIPTOR_SET_BINDING; m_bbox_enabled = enable; + UpdatePipelineLayout(); } void StateTracker::SetBBoxBuffer(VkBuffer buffer, VkDeviceSize offset, VkDeviceSize range) @@ -590,7 +672,7 @@ void StateTracker::SetBBoxBuffer(VkBuffer buffer, VkDeviceSize offset, VkDeviceS m_bindings.ps_ssbo.range = range; // Defer descriptor update until bbox is actually enabled. 
- if (m_bbox_enabled) + if (IsSSBODescriptorRequired()) m_dirty_flags |= DIRTY_FLAG_PS_SSBO; } @@ -599,7 +681,7 @@ void StateTracker::UnbindTexture(VkImageView view) for (VkDescriptorImageInfo& it : m_bindings.ps_samplers) { if (it.imageView == view) - it.imageView = VK_NULL_HANDLE; + it.imageView = g_object_cache->GetDummyImageView(); } } @@ -609,7 +691,7 @@ void StateTracker::InvalidateDescriptorSets() m_dirty_flags |= DIRTY_FLAG_ALL_DESCRIPTOR_SETS; // Defer SSBO descriptor update until bbox is actually enabled. - if (!m_bbox_enabled) + if (!IsSSBODescriptorRequired()) m_dirty_flags &= ~DIRTY_FLAG_PS_SSBO; } @@ -886,15 +968,49 @@ void StateTracker::EndClearRenderPass() EndRenderPass(); } -VkPipeline StateTracker::GetPipelineAndCacheUID(const PipelineInfo& info) +VkPipeline StateTracker::GetPipelineAndCacheUID() { - auto result = g_shader_cache->GetPipelineWithCacheResult(info); + // We can't cache ubershader uids, only normal shader uids. + if (g_ActiveConfig.CanBackgroundCompileShaders() && !m_using_ubershaders) + { + // Append to UID cache if it is a new pipeline. + auto result = g_shader_cache->GetPipelineWithCacheResultAsync(m_pipeline_state); + if (!result.second && g_ActiveConfig.bShaderCache) + AppendToPipelineUIDCache(m_pipeline_state); - // Add to the UID cache if it is a new pipeline. - if (!result.second && g_ActiveConfig.bShaderCache) - AppendToPipelineUIDCache(info); + // Still waiting for the pipeline to compile? + if (!result.first.second) + return result.first.first; - return result.first; + // Use ubershader instead. 
+ m_using_ubershaders = true; + UpdatePipelineLayout(); + UpdatePipelineVertexFormat(); + + PipelineInfo uber_info = m_pipeline_state; + UberShader::VertexShaderUid uber_vuid = UberShader::GetVertexShaderUid(); + UberShader::PixelShaderUid uber_puid = UberShader::GetPixelShaderUid(); + uber_info.vs = g_shader_cache->GetVertexUberShaderForUid(uber_vuid); + uber_info.ps = g_shader_cache->GetPixelUberShaderForUid(uber_puid); + + auto uber_result = g_shader_cache->GetPipelineWithCacheResult(uber_info); + return uber_result.first; + } + else + { + // Add to the UID cache if it is a new pipeline. + auto result = g_shader_cache->GetPipelineWithCacheResult(m_pipeline_state); + if (!result.second && !m_using_ubershaders && g_ActiveConfig.bShaderCache) + AppendToPipelineUIDCache(m_pipeline_state); + + return result.first; + } +} + +bool StateTracker::IsSSBODescriptorRequired() const +{ + return m_bbox_enabled || (m_using_ubershaders && g_ActiveConfig.bBBoxEnable && + g_ActiveConfig.BBoxUseFragmentShaderImplementation()); } bool StateTracker::UpdatePipeline() @@ -904,16 +1020,56 @@ bool StateTracker::UpdatePipeline() return false; // Grab a new pipeline object, this can fail. - m_pipeline_object = GetPipelineAndCacheUID(m_pipeline_state); + m_pipeline_object = GetPipelineAndCacheUID(); m_dirty_flags |= DIRTY_FLAG_PIPELINE_BINDING; return m_pipeline_object != VK_NULL_HANDLE; } +void StateTracker::UpdatePipelineLayout() +{ + const bool use_bbox_pipeline_layout = IsSSBODescriptorRequired(); + VkPipelineLayout pipeline_layout = + use_bbox_pipeline_layout ? 
g_object_cache->GetPipelineLayout(PIPELINE_LAYOUT_BBOX) : + g_object_cache->GetPipelineLayout(PIPELINE_LAYOUT_STANDARD); + if (m_pipeline_state.pipeline_layout == pipeline_layout) + return; + + // Change the number of active descriptor sets, as well as the pipeline layout + m_pipeline_state.pipeline_layout = pipeline_layout; + if (use_bbox_pipeline_layout) + { + m_num_active_descriptor_sets = NUM_GX_DRAW_WITH_BBOX_DESCRIPTOR_SETS; + + // The bbox buffer never changes, so we defer descriptor updates until it is enabled. + if (m_descriptor_sets[DESCRIPTOR_SET_BIND_POINT_STORAGE_OR_TEXEL_BUFFER] == VK_NULL_HANDLE) + m_dirty_flags |= DIRTY_FLAG_PS_SSBO; + } + else + { + m_num_active_descriptor_sets = NUM_GX_DRAW_DESCRIPTOR_SETS; + } + + m_dirty_flags |= DIRTY_FLAG_PIPELINE | DIRTY_FLAG_DESCRIPTOR_SET_BINDING; +} + +void StateTracker::UpdatePipelineVertexFormat() +{ + const NativeVertexFormat* vertex_format = + m_using_ubershaders ? + VertexLoaderManager::GetUberVertexFormat(m_vertex_format->GetVertexDeclaration()) : + m_vertex_format; + if (m_pipeline_state.vertex_format == vertex_format) + return; + + m_pipeline_state.vertex_format = static_cast(vertex_format); + m_dirty_flags |= DIRTY_FLAG_PIPELINE; +} + bool StateTracker::UpdateDescriptorSet() { const size_t MAX_DESCRIPTOR_WRITES = NUM_UBO_DESCRIPTOR_SET_BINDINGS + // UBO - NUM_PIXEL_SHADER_SAMPLERS + // Samplers + 1 + // Samplers 1; // SSBO std::array writes; u32 num_writes = 0; @@ -954,30 +1110,22 @@ bool StateTracker::UpdateDescriptorSet() if (set == VK_NULL_HANDLE) return false; - for (size_t i = 0; i < NUM_PIXEL_SHADER_SAMPLERS; i++) - { - const VkDescriptorImageInfo& info = m_bindings.ps_samplers[i]; - if (info.imageView != VK_NULL_HANDLE && info.sampler != VK_NULL_HANDLE) - { - writes[num_writes++] = {VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, - nullptr, - set, - static_cast(i), - 0, - 1, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - &info, - nullptr, - nullptr}; - } - } + writes[num_writes++] = 
{VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + nullptr, + set, + 0, + 0, + static_cast(NUM_PIXEL_SHADER_SAMPLERS), + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + m_bindings.ps_samplers.data(), + nullptr, + nullptr}; m_descriptor_sets[DESCRIPTOR_SET_BIND_POINT_PIXEL_SHADER_SAMPLERS] = set; m_dirty_flags |= DIRTY_FLAG_DESCRIPTOR_SET_BINDING; } - if (m_bbox_enabled && - (m_dirty_flags & DIRTY_FLAG_PS_SSBO || + if ((m_dirty_flags & DIRTY_FLAG_PS_SSBO || m_descriptor_sets[DESCRIPTOR_SET_BIND_POINT_STORAGE_OR_TEXEL_BUFFER] == VK_NULL_HANDLE)) { VkDescriptorSetLayout layout = diff --git a/Source/Core/VideoBackends/Vulkan/StateTracker.h b/Source/Core/VideoBackends/Vulkan/StateTracker.h index 03d7464cee..73da3ad646 100644 --- a/Source/Core/VideoBackends/Vulkan/StateTracker.h +++ b/Source/Core/VideoBackends/Vulkan/StateTracker.h @@ -16,6 +16,8 @@ #include "VideoCommon/NativeVertexFormat.h" #include "VideoCommon/PixelShaderGen.h" #include "VideoCommon/RenderBase.h" +#include "VideoCommon/UberShaderPixel.h" +#include "VideoCommon/UberShaderVertex.h" #include "VideoCommon/VertexShaderGen.h" namespace Vulkan @@ -60,6 +62,7 @@ public: void SetBlendState(const BlendingState& state); bool CheckForShaderChanges(u32 gx_primitive_type); + void ClearShaders(); void UpdateVertexShaderConstants(); void UpdateGeometryShaderConstants(); @@ -159,8 +162,8 @@ private: DIRTY_FLAG_DESCRIPTOR_SET_BINDING = (1 << 11), DIRTY_FLAG_PIPELINE_BINDING = (1 << 12), - DIRTY_FLAG_ALL_DESCRIPTOR_SETS = - DIRTY_FLAG_VS_UBO | DIRTY_FLAG_GS_UBO | DIRTY_FLAG_PS_SAMPLERS | DIRTY_FLAG_PS_SSBO + DIRTY_FLAG_ALL_DESCRIPTOR_SETS = DIRTY_FLAG_VS_UBO | DIRTY_FLAG_GS_UBO | DIRTY_FLAG_PS_UBO | + DIRTY_FLAG_PS_SAMPLERS | DIRTY_FLAG_PS_SSBO }; bool Initialize(); @@ -178,9 +181,15 @@ private: // Obtains a Vulkan pipeline object for the specified pipeline configuration. // Also adds this pipeline configuration to the UID cache if it is not present already. 
- VkPipeline GetPipelineAndCacheUID(const PipelineInfo& info); + VkPipeline GetPipelineAndCacheUID(); + + // Are bounding box ubershaders enabled? If so, we need to ensure the SSBO is set up, + // since the bbox writes are determined by a uniform. + bool IsSSBODescriptorRequired() const; bool UpdatePipeline(); + void UpdatePipelineLayout(); + void UpdatePipelineVertexFormat(); bool UpdateDescriptorSet(); // Allocates storage in the uniform buffer of the specified size. If this storage cannot be @@ -203,10 +212,14 @@ private: VertexShaderUid m_vs_uid = {}; GeometryShaderUid m_gs_uid = {}; PixelShaderUid m_ps_uid = {}; + UberShader::VertexShaderUid m_uber_vs_uid = {}; + UberShader::PixelShaderUid m_uber_ps_uid = {}; + bool m_using_ubershaders = false; // pipeline state PipelineInfo m_pipeline_state = {}; VkPipeline m_pipeline_object = VK_NULL_HANDLE; + const VertexFormat* m_vertex_format = nullptr; // shader bindings std::array m_descriptor_sets = {}; diff --git a/Source/Core/VideoBackends/Vulkan/Util.cpp b/Source/Core/VideoBackends/Vulkan/Util.cpp index f1f4f42b1b..fc5b2ec8dc 100644 --- a/Source/Core/VideoBackends/Vulkan/Util.cpp +++ b/Source/Core/VideoBackends/Vulkan/Util.cpp @@ -575,8 +575,7 @@ void UtilityShaderDraw::BindDescriptors() { // TODO: This method is a mess, clean it up std::array bind_descriptor_sets = {}; - std::array - set_writes = {}; + std::array set_writes = {}; uint32_t num_set_writes = 0; VkDescriptorBufferInfo dummy_uniform_buffer = { @@ -633,29 +632,32 @@ void UtilityShaderDraw::BindDescriptors() // Check if we have any at all, skip the binding process entirely if we don't if (first_active_sampler != NUM_PIXEL_SHADER_SAMPLERS) { + // We need to fill it with non-empty images. 
+ for (size_t i = 0; i < NUM_PIXEL_SHADER_SAMPLERS; i++) + { + if (m_ps_samplers[i].imageView == VK_NULL_HANDLE) + { + m_ps_samplers[i].imageView = g_object_cache->GetDummyImageView(); + m_ps_samplers[i].sampler = g_object_cache->GetPointSampler(); + } + } + // Allocate a new descriptor set VkDescriptorSet set = g_command_buffer_mgr->AllocateDescriptorSet( g_object_cache->GetDescriptorSetLayout(DESCRIPTOR_SET_LAYOUT_PIXEL_SHADER_SAMPLERS)); if (set == VK_NULL_HANDLE) PanicAlert("Failed to allocate descriptor set for utility draw"); - for (size_t i = 0; i < NUM_PIXEL_SHADER_SAMPLERS; i++) - { - const VkDescriptorImageInfo& info = m_ps_samplers[i]; - if (info.imageView != VK_NULL_HANDLE && info.sampler != VK_NULL_HANDLE) - { - set_writes[num_set_writes++] = {VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, - nullptr, - set, - static_cast(i), - 0, - 1, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - &info, - nullptr, - nullptr}; - } - } + set_writes[num_set_writes++] = {VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + nullptr, + set, + 0, + 0, + static_cast(NUM_PIXEL_SHADER_SAMPLERS), + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + m_ps_samplers.data(), + nullptr, + nullptr}; bind_descriptor_sets[DESCRIPTOR_SET_BIND_POINT_PIXEL_SHADER_SAMPLERS] = set; } diff --git a/Source/Core/VideoBackends/Vulkan/VertexFormat.cpp b/Source/Core/VideoBackends/Vulkan/VertexFormat.cpp index 1d35177252..c28110c844 100644 --- a/Source/Core/VideoBackends/Vulkan/VertexFormat.cpp +++ b/Source/Core/VideoBackends/Vulkan/VertexFormat.cpp @@ -53,17 +53,9 @@ VertexFormat::VertexFormat(const PortableVertexDeclaration& in_vtx_decl) SetupInputState(); } -VertexFormat* VertexFormat::GetOrCreateMatchingFormat(const PortableVertexDeclaration& decl) +const VkPipelineVertexInputStateCreateInfo& VertexFormat::GetVertexInputStateInfo() const { - auto vertex_format_map = VertexLoaderManager::GetNativeVertexFormatMap(); - auto iter = vertex_format_map->find(decl); - if (iter == vertex_format_map->end()) - { - auto ipair = 
vertex_format_map->emplace(decl, std::make_unique(decl)); - iter = ipair.first; - } - - return static_cast(iter->second.get()); + return m_input_state_info; } void VertexFormat::MapAttributes() @@ -136,9 +128,4 @@ void VertexFormat::AddAttribute(uint32_t location, uint32_t binding, VkFormat fo m_attribute_descriptions[m_num_attributes].offset = offset; m_num_attributes++; } - -void VertexFormat::SetupVertexPointers() -{ -} - } // namespace Vulkan diff --git a/Source/Core/VideoBackends/Vulkan/VertexFormat.h b/Source/Core/VideoBackends/Vulkan/VertexFormat.h index ef2d31d748..9b5810ced7 100644 --- a/Source/Core/VideoBackends/Vulkan/VertexFormat.h +++ b/Source/Core/VideoBackends/Vulkan/VertexFormat.h @@ -16,24 +16,13 @@ class VertexFormat : public ::NativeVertexFormat public: VertexFormat(const PortableVertexDeclaration& in_vtx_decl); - // Creates or obtains a pointer to a VertexFormat representing decl. - // If this results in a VertexFormat being created, if the game later uses a matching vertex - // declaration, the one that was previously created will be used. - static VertexFormat* GetOrCreateMatchingFormat(const PortableVertexDeclaration& decl); - // Passed to pipeline state creation - const VkPipelineVertexInputStateCreateInfo& GetVertexInputStateInfo() const - { - return m_input_state_info; - } + const VkPipelineVertexInputStateCreateInfo& GetVertexInputStateInfo() const; // Converting PortableVertexDeclaration -> Vulkan types void MapAttributes(); void SetupInputState(); - // Not used in the Vulkan backend. 
- void SetupVertexPointers() override; - private: void AddAttribute(uint32_t location, uint32_t binding, VkFormat format, uint32_t offset); diff --git a/Source/Core/VideoBackends/Vulkan/VulkanContext.cpp b/Source/Core/VideoBackends/Vulkan/VulkanContext.cpp index 8ab3245455..af9e249833 100644 --- a/Source/Core/VideoBackends/Vulkan/VulkanContext.cpp +++ b/Source/Core/VideoBackends/Vulkan/VulkanContext.cpp @@ -236,6 +236,8 @@ void VulkanContext::PopulateBackendInfo(VideoConfig* config) config->backend_info.bSupportsMultithreading = true; // Assumed support. config->backend_info.bSupportsComputeShaders = true; // Assumed support. config->backend_info.bSupportsGPUTextureDecoding = true; // Assumed support. + config->backend_info.bSupportsBitfield = true; // Assumed support. + config->backend_info.bSupportsDynamicSamplerIndexing = true; // Assumed support. config->backend_info.bSupportsInternalResolutionFrameDumps = true; // Assumed support. config->backend_info.bSupportsPostProcessing = true; // Assumed support. config->backend_info.bSupportsDualSourceBlend = false; // Dependent on features. diff --git a/Source/Core/VideoBackends/Vulkan/main.cpp b/Source/Core/VideoBackends/Vulkan/main.cpp index 4354a7a58b..f507d6cf05 100644 --- a/Source/Core/VideoBackends/Vulkan/main.cpp +++ b/Source/Core/VideoBackends/Vulkan/main.cpp @@ -253,6 +253,7 @@ bool VideoBackend::Initialize(void* window_handle) g_renderer.reset(); StateTracker::DestroyInstance(); g_framebuffer_manager.reset(); + g_shader_cache->Shutdown(); g_shader_cache.reset(); g_object_cache.reset(); g_command_buffer_mgr.reset(); @@ -262,6 +263,14 @@ bool VideoBackend::Initialize(void* window_handle) return false; } + // Ensure all pipelines previously used by the game have been created. + StateTracker::GetInstance()->ReloadPipelineUIDCache(); + + // Lastly, precompile ubershaders, if requested. + // This has to be done after the texture cache and shader cache are initialized. 
+ if (g_ActiveConfig.CanPrecompileUberShaders()) + g_shader_cache->PrecompileUberShaders(); + return true; } @@ -293,6 +302,7 @@ void VideoBackend::Shutdown() void VideoBackend::Video_Cleanup() { g_command_buffer_mgr->WaitForGPUIdle(); + g_shader_cache->Shutdown(); // Save all cached pipelines out to disk for next time. if (g_ActiveConfig.bShaderCache) diff --git a/Source/Core/VideoCommon/AsyncShaderCompiler.cpp b/Source/Core/VideoCommon/AsyncShaderCompiler.cpp new file mode 100644 index 0000000000..59ef9762f1 --- /dev/null +++ b/Source/Core/VideoCommon/AsyncShaderCompiler.cpp @@ -0,0 +1,233 @@ +// Copyright 2017 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license.txt file included. + +#include "VideoCommon/AsyncShaderCompiler.h" +#include +#include "Common/Assert.h" +#include "Common/Logging/Log.h" + +namespace VideoCommon +{ +AsyncShaderCompiler::AsyncShaderCompiler() +{ +} + +AsyncShaderCompiler::~AsyncShaderCompiler() +{ + // Pending work can be left at shutdown. + // The work item classes are expected to clean up after themselves. + _assert_(!HasWorkerThreads()); + _assert_(m_completed_work.empty()); +} + +void AsyncShaderCompiler::QueueWorkItem(WorkItemPtr item) +{ + // If no worker threads are available, compile synchronously. 
+ if (!HasWorkerThreads()) + { + item->Compile(); + m_completed_work.push_back(std::move(item)); + } + else + { + std::lock_guard guard(m_pending_work_lock); + m_pending_work.push_back(std::move(item)); + m_worker_thread_wake.notify_one(); + } +} + +void AsyncShaderCompiler::RetrieveWorkItems() +{ + std::deque completed_work; + { + std::lock_guard guard(m_completed_work_lock); + m_completed_work.swap(completed_work); + } + + while (!completed_work.empty()) + { + completed_work.front()->Retrieve(); + completed_work.pop_front(); + } +} + +bool AsyncShaderCompiler::HasPendingWork() +{ + std::lock_guard guard(m_pending_work_lock); + return !m_pending_work.empty() || m_busy_workers.load() != 0; +} + +void AsyncShaderCompiler::WaitUntilCompletion() +{ + while (HasPendingWork()) + std::this_thread::sleep_for(std::chrono::milliseconds(1)); +} + +void AsyncShaderCompiler::WaitUntilCompletion( + const std::function& progress_callback) +{ + if (!HasPendingWork()) + return; + + // Wait a second before opening a progress dialog. + // This way, if the operation completes quickly, we don't annoy the user. + constexpr u32 CHECK_INTERVAL_MS = 50; + constexpr auto CHECK_INTERVAL = std::chrono::milliseconds(CHECK_INTERVAL_MS); + for (u32 i = 0; i < (1000 / CHECK_INTERVAL_MS); i++) + { + std::this_thread::sleep_for(std::chrono::milliseconds(CHECK_INTERVAL)); + if (!HasPendingWork()) + return; + } + + // Grab the number of pending items. We use this to work out how many are left. + size_t total_items = 0; + { + // Safe to hold both locks here, since nowhere else does. + std::lock_guard pending_guard(m_pending_work_lock); + std::lock_guard completed_guard(m_completed_work_lock); + total_items = m_completed_work.size() + m_pending_work.size() + m_busy_workers.load() + 1; + } + + // Update progress while the compiles complete. 
+ for (;;) + { + size_t remaining_items; + { + std::lock_guard pending_guard(m_pending_work_lock); + if (m_pending_work.empty() && !m_busy_workers.load()) + break; + remaining_items = m_pending_work.size(); + } + + progress_callback(total_items - remaining_items, total_items); + std::this_thread::sleep_for(CHECK_INTERVAL); + } +} + +bool AsyncShaderCompiler::StartWorkerThreads(u32 num_worker_threads) +{ + if (num_worker_threads == 0) + return true; + + for (u32 i = 0; i < num_worker_threads; i++) + { + void* thread_param = nullptr; + if (!WorkerThreadInitMainThread(&thread_param)) + { + WARN_LOG(VIDEO, "Failed to initialize shader compiler worker thread."); + break; + } + + m_worker_thread_start_result.store(false); + + std::thread thr(&AsyncShaderCompiler::WorkerThreadEntryPoint, this, thread_param); + m_init_event.Wait(); + + if (!m_worker_thread_start_result.load()) + { + WARN_LOG(VIDEO, "Failed to start shader compiler worker thread."); + thr.join(); + break; + } + + m_worker_threads.push_back(std::move(thr)); + } + + return HasWorkerThreads(); +} + +bool AsyncShaderCompiler::ResizeWorkerThreads(u32 num_worker_threads) +{ + if (m_worker_threads.size() == num_worker_threads) + return true; + + StopWorkerThreads(); + return StartWorkerThreads(num_worker_threads); +} + +bool AsyncShaderCompiler::HasWorkerThreads() const +{ + return !m_worker_threads.empty(); +} + +void AsyncShaderCompiler::StopWorkerThreads() +{ + if (!HasWorkerThreads()) + return; + + // Signal worker threads to stop, and wake all of them. + { + std::lock_guard guard(m_pending_work_lock); + m_exit_flag.Set(); + m_worker_thread_wake.notify_all(); + } + + // Wait for worker threads to exit. 
+ for (std::thread& thr : m_worker_threads) + thr.join(); + m_worker_threads.clear(); + m_exit_flag.Clear(); +} + +bool AsyncShaderCompiler::WorkerThreadInitMainThread(void** param) +{ + return true; +} + +bool AsyncShaderCompiler::WorkerThreadInitWorkerThread(void* param) +{ + return true; +} + +void AsyncShaderCompiler::WorkerThreadExit(void* param) +{ +} + +void AsyncShaderCompiler::WorkerThreadEntryPoint(void* param) +{ + // Initialize worker thread with backend-specific method. + if (!WorkerThreadInitWorkerThread(param)) + { + WARN_LOG(VIDEO, "Failed to initialize shader compiler worker."); + m_worker_thread_start_result.store(false); + m_init_event.Set(); + return; + } + + m_worker_thread_start_result.store(true); + m_init_event.Set(); + + WorkerThreadRun(); + + WorkerThreadExit(param); +} + +void AsyncShaderCompiler::WorkerThreadRun() +{ + std::unique_lock pending_lock(m_pending_work_lock); + while (!m_exit_flag.IsSet()) + { + m_worker_thread_wake.wait(pending_lock); + + while (!m_pending_work.empty() && !m_exit_flag.IsSet()) + { + m_busy_workers++; + WorkItemPtr item(std::move(m_pending_work.front())); + m_pending_work.pop_front(); + pending_lock.unlock(); + + if (item->Compile()) + { + std::lock_guard completed_guard(m_completed_work_lock); + m_completed_work.push_back(std::move(item)); + } + + pending_lock.lock(); + m_busy_workers--; + } + } +} + +} // namespace VideoCommon diff --git a/Source/Core/VideoCommon/AsyncShaderCompiler.h b/Source/Core/VideoCommon/AsyncShaderCompiler.h new file mode 100644 index 0000000000..fb117dab28 --- /dev/null +++ b/Source/Core/VideoCommon/AsyncShaderCompiler.h @@ -0,0 +1,84 @@ +// Copyright 2017 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license.txt file included. 
+ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "Common/CommonTypes.h" +#include "Common/Event.h" +#include "Common/Flag.h" + +namespace VideoCommon +{ +class AsyncShaderCompiler +{ +public: + class WorkItem + { + public: + virtual ~WorkItem() = default; + virtual bool Compile() = 0; + virtual void Retrieve() = 0; + }; + + using WorkItemPtr = std::unique_ptr; + + AsyncShaderCompiler(); + virtual ~AsyncShaderCompiler(); + + template + static WorkItemPtr CreateWorkItem(Params... params) + { + return std::unique_ptr(new T(params...)); + } + + void QueueWorkItem(WorkItemPtr item); + void RetrieveWorkItems(); + bool HasPendingWork(); + + // Simpler version without progress updates. + void WaitUntilCompletion(); + + // Calls progress_callback periodically, with completed_items, and total_items. + void WaitUntilCompletion(const std::function& progress_callback); + + // Needed because of calling virtual methods in shutdown procedure. 
+ bool StartWorkerThreads(u32 num_worker_threads); + bool ResizeWorkerThreads(u32 num_worker_threads); + bool HasWorkerThreads() const; + void StopWorkerThreads(); + +protected: + virtual bool WorkerThreadInitMainThread(void** param); + virtual bool WorkerThreadInitWorkerThread(void* param); + virtual void WorkerThreadExit(void* param); + +private: + void WorkerThreadEntryPoint(void* param); + void WorkerThreadRun(); + + Common::Flag m_exit_flag; + Common::Event m_init_event; + + std::vector m_worker_threads; + std::atomic_bool m_worker_thread_start_result{false}; + + std::deque m_pending_work; + std::mutex m_pending_work_lock; + std::condition_variable m_worker_thread_wake; + std::atomic_size_t m_busy_workers{0}; + + std::deque m_completed_work; + std::mutex m_completed_work_lock; +}; + +} // namespace VideoCommon diff --git a/Source/Core/VideoCommon/BPMemory.cpp b/Source/Core/VideoCommon/BPMemory.cpp index 67eeb994b1..26891ba941 100644 --- a/Source/Core/VideoCommon/BPMemory.cpp +++ b/Source/Core/VideoCommon/BPMemory.cpp @@ -24,8 +24,7 @@ float FogParam0::GetA() const float FogParam3::GetC() const { // scale mantissa from 11 to 23 bits - const u32 integral = (static_cast(c_sign) << 31) | (static_cast(c_exp) << 23) | - (static_cast(c_mant) << 12); + const u32 integral = (c_sign.Value() << 31) | (c_exp.Value() << 23) | (c_mant.Value() << 12); float real; std::memcpy(&real, &integral, sizeof(u32)); diff --git a/Source/Core/VideoCommon/BPMemory.h b/Source/Core/VideoCommon/BPMemory.h index 366dc191a5..ceb1fd3899 100644 --- a/Source/Core/VideoCommon/BPMemory.h +++ b/Source/Core/VideoCommon/BPMemory.h @@ -301,40 +301,37 @@ struct TevStageCombiner { union ColorCombiner { - struct // abc=8bit,d=10bit - { - u32 d : 4; // TEVSELCC_X - u32 c : 4; // TEVSELCC_X - u32 b : 4; // TEVSELCC_X - u32 a : 4; // TEVSELCC_X + // abc=8bit,d=10bit + BitField<0, 4, u32> d; // TEVSELCC_X + BitField<4, 4, u32> c; // TEVSELCC_X + BitField<8, 4, u32> b; // TEVSELCC_X + BitField<12, 4, u32> a; 
// TEVSELCC_X - u32 bias : 2; - u32 op : 1; - u32 clamp : 1; + BitField<16, 2, u32> bias; + BitField<18, 1, u32> op; + BitField<19, 1, u32> clamp; + + BitField<20, 2, u32> shift; + BitField<22, 2, u32> dest; // 1,2,3 - u32 shift : 2; - u32 dest : 2; // 1,2,3 - }; u32 hex; }; union AlphaCombiner { - struct - { - u32 rswap : 2; - u32 tswap : 2; - u32 d : 3; // TEVSELCA_ - u32 c : 3; // TEVSELCA_ - u32 b : 3; // TEVSELCA_ - u32 a : 3; // TEVSELCA_ + BitField<0, 2, u32> rswap; + BitField<2, 2, u32> tswap; + BitField<4, 3, u32> d; // TEVSELCA_ + BitField<7, 3, u32> c; // TEVSELCA_ + BitField<10, 3, u32> b; // TEVSELCA_ + BitField<13, 3, u32> a; // TEVSELCA_ - u32 bias : 2; // GXTevBias - u32 op : 1; - u32 clamp : 1; + BitField<16, 2, u32> bias; // GXTevBias + BitField<18, 1, u32> op; + BitField<19, 1, u32> clamp; + + BitField<20, 2, u32> shift; + BitField<22, 2, u32> dest; // 1,2,3 - u32 shift : 2; - u32 dest : 2; // 1,2,3 - }; u32 hex; }; @@ -353,21 +350,18 @@ struct TevStageCombiner union TevStageIndirect { - struct - { - u32 bt : 2; // Indirect tex stage ID - u32 fmt : 2; // Format: ITF_X - u32 bias : 3; // ITB_X - u32 bs : 2; // ITBA_X, indicates which coordinate will become the 'bump alpha' - u32 mid : 4; // Matrix ID to multiply offsets with - u32 sw : 3; // ITW_X, wrapping factor for S of regular coord - u32 tw : 3; // ITW_X, wrapping factor for T of regular coord - u32 lb_utclod : 1; // Use modified or unmodified texture coordinates for LOD computation - u32 fb_addprev : 1; // 1 if the texture coordinate results from the previous TEV stage should - // be added - u32 pad0 : 3; - u32 rid : 8; - }; + BitField<0, 2, u32> bt; // Indirect tex stage ID + BitField<2, 2, u32> fmt; // Format: ITF_X + BitField<4, 3, u32> bias; // ITB_X + BitField<7, 2, u32> bs; // ITBA_X, indicates which coordinate will become the 'bump alpha' + BitField<9, 4, u32> mid; // Matrix ID to multiply offsets with + BitField<13, 3, u32> sw; // ITW_X, wrapping factor for S of regular coord + 
BitField<16, 3, u32> tw; // ITW_X, wrapping factor for T of regular coord + BitField<19, 1, u32> lb_utclod; // Use modified or unmodified texture + // coordinates for LOD computation + BitField<20, 1, u32> fb_addprev; // 1 if the texture coordinate results from the previous TEV + // stage should be added + struct { u32 hex : 21; @@ -381,28 +375,23 @@ union TevStageIndirect union TwoTevStageOrders { - struct - { - u32 texmap0 : 3; // Indirect tex stage texmap - u32 texcoord0 : 3; - u32 enable0 : 1; // 1 if should read from texture - u32 colorchan0 : 3; // RAS1_CC_X + BitField<0, 3, u32> texmap0; // Indirect tex stage texmap + BitField<3, 3, u32> texcoord0; + BitField<6, 1, u32> enable0; // 1 if should read from texture + BitField<7, 3, u32> colorchan0; // RAS1_CC_X - u32 pad0 : 2; + BitField<12, 3, u32> texmap1; + BitField<15, 3, u32> texcoord1; + BitField<18, 1, u32> enable1; // 1 if should read from texture + BitField<19, 3, u32> colorchan1; // RAS1_CC_X - u32 texmap1 : 3; - u32 texcoord1 : 3; - u32 enable1 : 1; // 1 if should read from texture - u32 colorchan1 : 3; // RAS1_CC_X + BitField<24, 8, u32> rid; - u32 pad1 : 2; - u32 rid : 8; - }; u32 hex; - int getTexMap(int i) const { return i ? texmap1 : texmap0; } - int getTexCoord(int i) const { return i ? texcoord1 : texcoord0; } - int getEnable(int i) const { return i ? enable1 : enable0; } - int getColorChan(int i) const { return i ? colorchan1 : colorchan0; } + u32 getTexMap(int i) const { return i ? texmap1.Value() : texmap0.Value(); } + u32 getTexCoord(int i) const { return i ? texcoord1.Value() : texcoord0.Value(); } + u32 getEnable(int i) const { return i ? enable1.Value() : enable0.Value(); } + u32 getColorChan(int i) const { return i ? 
colorchan1.Value() : colorchan0.Value(); } }; union TEXSCALE @@ -527,20 +516,14 @@ union TexTLUT union ZTex1 { - struct - { - u32 bias : 24; - }; + BitField<0, 24, u32> bias; u32 hex; }; union ZTex2 { - struct - { - u32 type : 2; // TEV_Z_TYPE_X - u32 op : 2; // GXZTexOp - }; + BitField<0, 2, u32> type; // TEV_Z_TYPE_X + BitField<2, 2, u32> op; // GXZTexOp u32 hex; }; @@ -681,14 +664,12 @@ union FogParam0 union FogParam3 { - struct - { - u32 c_mant : 11; - u32 c_exp : 8; - u32 c_sign : 1; - u32 proj : 1; // 0 - perspective, 1 - orthographic - u32 fsel : 3; // 0 - off, 2 - linear, 4 - exp, 5 - exp2, 6 - backward exp, 7 - backward exp2 - }; + BitField<0, 11, u32> c_mant; + BitField<11, 8, u32> c_exp; + BitField<19, 1, u32> c_sign; + BitField<20, 1, u32> proj; // 0 - perspective, 1 - orthographic + BitField<21, 3, u32> fsel; // 0 - off, 2 - linear, 4 - exp, 5 - exp2, 6 - + // backward exp, 7 - backward exp2 // amount to subtract from eyespacez after range adjustment float GetC() const; @@ -698,15 +679,12 @@ union FogParam3 union FogRangeKElement { - struct - { - u32 HI : 12; - u32 LO : 12; - u32 regid : 8; - }; + BitField<0, 12, u32> HI; + BitField<12, 12, u32> LO; + BitField<24, 8, u32> regid; // TODO: Which scaling coefficient should we use here? This is just a guess! - float GetValue(int i) const { return (i ? HI : LO) / 256.f; } + float GetValue(int i) const { return (i ? 
HI.Value() : LO.Value()) / 256.f; } u32 HEX; }; @@ -714,13 +692,9 @@ struct FogRangeParams { union RangeBase { - struct - { - u32 Center : 10; // viewport center + 342 - u32 Enabled : 1; - u32 unused : 13; - u32 regid : 8; - }; + BitField<0, 10, u32> Center; // viewport center + 342 + BitField<10, 1, u32> Enabled; + BitField<24, 8, u32> regid; u32 hex; }; RangeBase Base; @@ -736,12 +710,9 @@ struct FogParams union FogColor { - struct - { - u32 b : 8; - u32 g : 8; - u32 r : 8; - }; + BitField<0, 8, u32> b; + BitField<8, 8, u32> g; + BitField<16, 8, u32> r; u32 hex; }; @@ -771,11 +742,8 @@ union ZMode union ConstantAlpha { - struct - { - u32 alpha : 8; - u32 enable : 1; - }; + BitField<0, 8, u32> alpha; + BitField<8, 1, u32> enable; u32 hex; }; @@ -881,19 +849,16 @@ union TevReg union TevKSel { - struct - { - u32 swap1 : 2; - u32 swap2 : 2; - u32 kcsel0 : 5; - u32 kasel0 : 5; - u32 kcsel1 : 5; - u32 kasel1 : 5; - }; + BitField<0, 2, u32> swap1; + BitField<2, 2, u32> swap2; + BitField<4, 5, u32> kcsel0; + BitField<9, 5, u32> kasel0; + BitField<14, 5, u32> kcsel1; + BitField<19, 5, u32> kasel1; u32 hex; - int getKC(int i) const { return i ? kcsel1 : kcsel0; } - int getKA(int i) const { return i ? kasel1 : kasel0; } + u32 getKC(int i) const { return i ? kcsel1.Value() : kcsel0.Value(); } + u32 getKA(int i) const { return i ? kasel1.Value() : kasel0.Value(); } }; union AlphaTest diff --git a/Source/Core/VideoCommon/BPStructs.cpp b/Source/Core/VideoCommon/BPStructs.cpp index 204c637b94..c686d4ff01 100644 --- a/Source/Core/VideoCommon/BPStructs.cpp +++ b/Source/Core/VideoCommon/BPStructs.cpp @@ -93,6 +93,9 @@ static void BPWritten(const BPCmd& bp) (u32)bpmem.genMode.cullmode, (u32)bpmem.genMode.numindstages, (u32)bpmem.genMode.zfreeze); + if (bp.changes) + PixelShaderManager::SetGenModeChanged(); + // Only call SetGenerationMode when cull mode changes. 
if (bp.changes & 0xC000) SetGenerationMode(); @@ -155,12 +158,20 @@ static void BPWritten(const BPCmd& bp) // Set Color Mask if (bp.changes & 0x18) // colorupdate | alphaupdate SetColorMask(); + + // Dither + if (bp.changes & 0x04) + PixelShaderManager::SetBlendModeChanged(); } return; case BPMEM_CONSTANTALPHA: // Set Destination Alpha - PRIM_LOG("constalpha: alp=%d, en=%d", bpmem.dstalpha.alpha, bpmem.dstalpha.enable); - if (bp.changes & 0xFF) - PixelShaderManager::SetDestAlpha(); + PRIM_LOG("constalpha: alp=%d, en=%d", bpmem.dstalpha.alpha.Value(), + bpmem.dstalpha.enable.Value()); + if (bp.changes) + { + PixelShaderManager::SetAlpha(); + PixelShaderManager::SetDestAlphaChanged(); + } if (bp.changes & 0x100) SetBlendMode(); return; @@ -237,6 +248,7 @@ static void BPWritten(const BPCmd& bp) // the number of lines copied is determined by the y scale * source efb height BoundingBox::active = false; + PixelShaderManager::SetBoundingBoxActive(false); float yScale; if (PE_copy.scale_invert) @@ -317,12 +329,13 @@ static void BPWritten(const BPCmd& bp) PixelShaderManager::SetAlpha(); if (bp.changes) { + PixelShaderManager::SetAlphaTestChanged(); g_renderer->SetColorMask(); SetBlendMode(); } return; case BPMEM_BIAS: // BIAS - PRIM_LOG("ztex bias=0x%x", bpmem.ztex1.bias); + PRIM_LOG("ztex bias=0x%x", bpmem.ztex1.bias.Value()); if (bp.changes) PixelShaderManager::SetZTextureBias(); return; @@ -331,7 +344,7 @@ static void BPWritten(const BPCmd& bp) if (bp.changes & 3) PixelShaderManager::SetZTextureTypeChanged(); if (bp.changes & 12) - VertexShaderManager::SetViewportChanged(); + PixelShaderManager::SetZTextureOpChanged(); #if defined(_DEBUG) || defined(DEBUGFAST) const char* pzop[] = {"DISABLE", "ADD", "REPLACE", "?"}; const char* pztype[] = {"Z8", "Z16", "Z24", "?"}; @@ -389,6 +402,7 @@ static void BPWritten(const BPCmd& bp) { u8 offset = bp.address & 2; BoundingBox::active = true; + PixelShaderManager::SetBoundingBoxActive(true); if 
(g_ActiveConfig.backend_info.bSupportsBBox && g_ActiveConfig.bBBoxEnable) { @@ -425,6 +439,11 @@ static void BPWritten(const BPCmd& bp) * 3 BC0 - Ind. Tex Stage 0 NTexCoord * 0 BI0 - Ind. Tex Stage 0 NTexMap */ case BPMEM_IREF: + { + if (bp.changes) + PixelShaderManager::SetTevIndirectChanged(); + return; + } case BPMEM_TEV_KSEL: // Texture Environment Swap Mode Table 0 case BPMEM_TEV_KSEL + 1: // Texture Environment Swap Mode Table 1 @@ -434,6 +453,8 @@ static void BPWritten(const BPCmd& bp) case BPMEM_TEV_KSEL + 5: // Texture Environment Swap Mode Table 5 case BPMEM_TEV_KSEL + 6: // Texture Environment Swap Mode Table 6 case BPMEM_TEV_KSEL + 7: // Texture Environment Swap Mode Table 7 + PixelShaderManager::SetTevKSel(bp.address - BPMEM_TEV_KSEL, bp.newvalue); + return; /* This Register can be used to limit to which bits of BP registers is * actually written to. The mask is only valid for the next BP write, @@ -566,6 +587,7 @@ static void BPWritten(const BPCmd& bp) // ------------------------- case BPMEM_TREF: case BPMEM_TREF + 4: + PixelShaderManager::SetTevOrder(bp.address - BPMEM_TREF, bp.newvalue); return; // ---------------------- // Set wrap size @@ -629,15 +651,18 @@ static void BPWritten(const BPCmd& bp) // -------------- // Indirect Tev // -------------- - case BPMEM_IND_CMD: // Indirect 0-15 + case BPMEM_IND_CMD: + PixelShaderManager::SetTevIndirectChanged(); return; // -------------------------------------------------- // Set Color/Alpha of a Tev // BPMEM_TEV_COLOR_ENV - Dest, Shift, Clamp, Sub, Bias, Sel A, Sel B, Sel C, Sel D // BPMEM_TEV_ALPHA_ENV - Dest, Shift, Clamp, Sub, Bias, Sel A, Sel B, Sel C, Sel D, T Swap, R Swap // -------------------------------------------------- - case BPMEM_TEV_COLOR_ENV: // Texture Environment Color/Alpha 0-7 - case BPMEM_TEV_COLOR_ENV + 16: // Texture Environment Color/Alpha 8-15 + case BPMEM_TEV_COLOR_ENV: // Texture Environment 1 + case BPMEM_TEV_COLOR_ENV + 16: + PixelShaderManager::SetTevCombiner((bp.address - 
BPMEM_TEV_COLOR_ENV) >> 1, + (bp.address - BPMEM_TEV_COLOR_ENV) & 1, bp.newvalue); return; default: break; @@ -1281,7 +1306,7 @@ void GetBPRegInfo(const u8* data, std::string* name, std::string* desc) "Tex sel: %d\n", (data[0] - BPMEM_TEV_ALPHA_ENV) / 2, tevin[ac.a], tevin[ac.b], tevin[ac.c], tevin[ac.d], tevbias[ac.bias], tevop[ac.op], no_yes[ac.clamp], - tevscale[ac.shift], tevout[ac.dest], ac.rswap, ac.tswap); + tevscale[ac.shift], tevout[ac.dest], ac.rswap.Value(), ac.tswap.Value()); break; } diff --git a/Source/Core/VideoCommon/CMakeLists.txt b/Source/Core/VideoCommon/CMakeLists.txt index 0724212bdc..4df4a01550 100644 --- a/Source/Core/VideoCommon/CMakeLists.txt +++ b/Source/Core/VideoCommon/CMakeLists.txt @@ -1,6 +1,7 @@ set(SRCS AbstractTexture.cpp AsyncRequests.cpp + AsyncShaderCompiler.cpp BoundingBox.cpp BPFunctions.cpp BPMemory.cpp @@ -31,6 +32,9 @@ set(SRCS RenderState.cpp ShaderGenCommon.cpp Statistics.cpp + UberShaderCommon.cpp + UberShaderPixel.cpp + UberShaderVertex.cpp TextureCacheBase.cpp TextureConfig.cpp TextureConversionShader.cpp diff --git a/Source/Core/VideoCommon/ConstantManager.h b/Source/Core/VideoCommon/ConstantManager.h index c3b7004e69..0630d015db 100644 --- a/Source/Core/VideoCommon/ConstantManager.h +++ b/Source/Core/VideoCommon/ConstantManager.h @@ -24,11 +24,31 @@ struct PixelShaderConstants int4 fogi; float4 fogf[2]; float4 zslope; - float4 efbscale; + float efbscale[2]; + + // Constants from here onwards are only used in ubershaders. 
+ u32 genmode; // .z + u32 alphaTest; // .w + u32 fogParam3; // .x + u32 fogRangeBase; // .y + u32 dstalpha; // .z + u32 ztex_op; // .w + u32 early_ztest; // .x (bool) + u32 rgba6_format; // .y (bool) + u32 dither; // .z (bool) + u32 bounding_box; // .w (bool) + uint4 pack1[16]; // .xy - combiners, .z - tevind, .w - iref + uint4 pack2[8]; // .x - tevorder, .y - tevksel + int4 konst[32]; // .rgba }; struct VertexShaderConstants { + u32 components; // .x + u32 xfmem_dualTexInfo; // .y + u32 xfmem_numColorChans; // .z + u32 pad1; // .w + float4 posnormalmatrix[6]; float4 projection[4]; int4 materials[4]; @@ -45,7 +65,10 @@ struct VertexShaderConstants float4 normalmatrices[32]; float4 posttransformmatrices[64]; float4 pixelcentercorrection; - float4 viewport; + float viewport[2]; // .xy + float pad2[2]; // .zw + + uint4 xfmem_pack1[8]; // .x - texMtxInfo, .y - postMtxInfo, [0..1].z = color, [0..1].w = alpha }; struct GeometryShaderConstants diff --git a/Source/Core/VideoCommon/DriverDetails.cpp b/Source/Core/VideoCommon/DriverDetails.cpp index e81bdfb2af..fb12fc04b8 100644 --- a/Source/Core/VideoCommon/DriverDetails.cpp +++ b/Source/Core/VideoCommon/DriverDetails.cpp @@ -98,7 +98,10 @@ static BugInfo m_known_bugs[] = { BUG_BROKEN_BITWISE_OP_NEGATION, -1.0, -1.0, true}, {API_VULKAN, OS_ALL, VENDOR_ATI, DRIVER_ATI, Family::UNKNOWN, BUG_PRIMITIVE_RESTART, -1.0, -1.0, true}, -}; + {API_OPENGL, OS_LINUX, VENDOR_MESA, DRIVER_I965, Family::UNKNOWN, + BUG_SHARED_CONTEXT_SHADER_COMPILATION, -1.0, -1.0, true}, + {API_OPENGL, OS_LINUX, VENDOR_MESA, DRIVER_NOUVEAU, Family::UNKNOWN, + BUG_SHARED_CONTEXT_SHADER_COMPILATION, -1.0, -1.0, true}}; static std::map m_bugs; diff --git a/Source/Core/VideoCommon/DriverDetails.h b/Source/Core/VideoCommon/DriverDetails.h index 009ef1fdd5..d256b51918 100644 --- a/Source/Core/VideoCommon/DriverDetails.h +++ b/Source/Core/VideoCommon/DriverDetails.h @@ -247,6 +247,12 @@ enum Bug // fail compilation with no useful diagnostic log. 
This can be worked around by storing // the negated value to a temporary variable then using that in the bitwise op. BUG_BROKEN_BITWISE_OP_NEGATION, + + // Bug: Shaders are recompiled on the main thread after being previously compiled on + // a worker thread on the Mesa i965 and nouveau drivers. + // Started Version: -1 + // Ended Version: -1 + BUG_SHARED_CONTEXT_SHADER_COMPILATION, }; // Initializes our internal vendor, device family, and driver version diff --git a/Source/Core/VideoCommon/GeometryShaderGen.cpp b/Source/Core/VideoCommon/GeometryShaderGen.cpp index bb201ddb0f..4b06c0f7f3 100644 --- a/Source/Core/VideoCommon/GeometryShaderGen.cpp +++ b/Source/Core/VideoCommon/GeometryShaderGen.cpp @@ -364,3 +364,23 @@ static void EndPrimitive(ShaderCode& out, const ShaderHostConfig& host_config, else out.Write("\toutput.RestartStrip();\n"); } + +void EnumerateGeometryShaderUids(const std::function& callback) +{ + GeometryShaderUid uid; + std::memset(&uid, 0, sizeof(uid)); + + static constexpr std::array primitive_lut = { + {PRIMITIVE_TRIANGLES, PRIMITIVE_LINES, PRIMITIVE_POINTS}}; + for (u32 primitive : primitive_lut) + { + auto* guid = uid.GetUidData(); + guid->primitive_type = primitive; + + for (u32 texgens = 0; texgens <= 8; texgens++) + { + guid->numTexGens = texgens; + callback(uid); + } + } +} diff --git a/Source/Core/VideoCommon/GeometryShaderGen.h b/Source/Core/VideoCommon/GeometryShaderGen.h index 58ab01f757..f138207e14 100644 --- a/Source/Core/VideoCommon/GeometryShaderGen.h +++ b/Source/Core/VideoCommon/GeometryShaderGen.h @@ -4,6 +4,7 @@ #pragma once +#include #include "Common/CommonTypes.h" #include "VideoCommon/ShaderGenCommon.h" #include "VideoCommon/VertexManagerBase.h" @@ -28,3 +29,4 @@ typedef ShaderUid GeometryShaderUid; ShaderCode GenerateGeometryShaderCode(APIType ApiType, const ShaderHostConfig& host_config, const geometry_shader_uid_data* uid_data); GeometryShaderUid GetGeometryShaderUid(u32 primitive_type); +void EnumerateGeometryShaderUids(const std::function& 
callback); diff --git a/Source/Core/VideoCommon/MainBase.cpp b/Source/Core/VideoCommon/MainBase.cpp index 2817c983a3..5f5ce785ef 100644 --- a/Source/Core/VideoCommon/MainBase.cpp +++ b/Source/Core/VideoCommon/MainBase.cpp @@ -194,9 +194,6 @@ void VideoBackendBase::InitializeShared() g_Config.UpdateProjectionHack(); g_Config.VerifyValidity(); UpdateActiveConfig(); - - // Notify the core that the video backend is ready - Host_Message(WM_USER_CREATE); } void VideoBackendBase::ShutdownShared() diff --git a/Source/Core/VideoCommon/NativeVertexFormat.h b/Source/Core/VideoCommon/NativeVertexFormat.h index 55f8a57483..f03de05cc6 100644 --- a/Source/Core/VideoCommon/NativeVertexFormat.h +++ b/Source/Core/VideoCommon/NativeVertexFormat.h @@ -106,8 +106,6 @@ class NativeVertexFormat : NonCopyable { public: virtual ~NativeVertexFormat() {} - virtual void SetupVertexPointers() = 0; - u32 GetVertexStride() const { return vtx_decl.stride; } const PortableVertexDeclaration& GetVertexDeclaration() const { return vtx_decl; } protected: diff --git a/Source/Core/VideoCommon/PixelEngine.cpp b/Source/Core/VideoCommon/PixelEngine.cpp index 9c4282170b..02cda26917 100644 --- a/Source/Core/VideoCommon/PixelEngine.cpp +++ b/Source/Core/VideoCommon/PixelEngine.cpp @@ -18,6 +18,7 @@ #include "VideoCommon/CommandProcessor.h" #include "VideoCommon/Fifo.h" #include "VideoCommon/PixelEngine.h" +#include "VideoCommon/PixelShaderManager.h" namespace PixelEngine { @@ -231,6 +232,7 @@ void RegisterMMIO(MMIO::Mapping* mmio, u32 base) { mmio->Register(base | (PE_BBOX_LEFT + 2 * i), MMIO::ComplexRead([i](u32) { BoundingBox::active = false; + PixelShaderManager::SetBoundingBoxActive(false); return g_video_backend->Video_GetBoundingBox(i); }), MMIO::InvalidWrite()); diff --git a/Source/Core/VideoCommon/PixelShaderGen.cpp b/Source/Core/VideoCommon/PixelShaderGen.cpp index fe4ef11bb0..308807e7c8 100644 --- a/Source/Core/VideoCommon/PixelShaderGen.cpp +++ b/Source/Core/VideoCommon/PixelShaderGen.cpp @@ -179,7 
+179,7 @@ PixelShaderUid GetPixelShaderUid() u32 numStages = uid_data->genMode_numtevstages + 1; const bool forced_early_z = - g_ActiveConfig.backend_info.bSupportsEarlyZ && bpmem.UseEarlyDepthTest() && + bpmem.UseEarlyDepthTest() && (g_ActiveConfig.bFastDepthCalc || bpmem.alpha_test.TestResult() == AlphaTest::UNDETERMINED) // We can't allow early_ztest for zfreeze because depth is overridden per-pixel. // This means it's impossible for zcomploc to be emulated on a zfrozen polygon. @@ -192,18 +192,6 @@ PixelShaderUid GetPixelShaderUid() uid_data->per_pixel_depth = per_pixel_depth; uid_data->forced_early_z = forced_early_z; - if (!uid_data->forced_early_z && bpmem.UseEarlyDepthTest() && - (!g_ActiveConfig.bFastDepthCalc || bpmem.alpha_test.TestResult() == AlphaTest::UNDETERMINED)) - { - static bool warn_once = true; - if (warn_once) - WARN_LOG(VIDEO, "Early z test enabled but not possible to emulate with current " - "configuration. Make sure to enable fast depth calculations. If this message " - "still shows up your hardware isn't able to emulate the feature properly (a " - "GPU with D3D 11.0 / OGL 4.2 support is required)."); - warn_once = false; - } - if (g_ActiveConfig.bEnablePixelLighting) { // The lighting shader only needs the two color bits of the 23bit component bit array. 
@@ -333,6 +321,110 @@ PixelShaderUid GetPixelShaderUid() return out; } +void WritePixelShaderCommonHeader(ShaderCode& out, APIType ApiType, u32 num_texgens, + bool per_pixel_lighting, bool bounding_box) +{ + // dot product for integer vectors + out.Write("int idot(int3 x, int3 y)\n" + "{\n" + "\tint3 tmp = x * y;\n" + "\treturn tmp.x + tmp.y + tmp.z;\n" + "}\n"); + + out.Write("int idot(int4 x, int4 y)\n" + "{\n" + "\tint4 tmp = x * y;\n" + "\treturn tmp.x + tmp.y + tmp.z + tmp.w;\n" + "}\n\n"); + + // rounding + casting to integer at once in a single function + out.Write("int iround(float x) { return int (round(x)); }\n" + "int2 iround(float2 x) { return int2(round(x)); }\n" + "int3 iround(float3 x) { return int3(round(x)); }\n" + "int4 iround(float4 x) { return int4(round(x)); }\n\n"); + + if (ApiType == APIType::OpenGL || ApiType == APIType::Vulkan) + { + out.Write("SAMPLER_BINDING(0) uniform sampler2DArray samp[8];\n"); + } + else // D3D + { + // Declare samplers + out.Write("SamplerState samp[8] : register(s0);\n"); + out.Write("\n"); + out.Write("Texture2DArray Tex[8] : register(t0);\n"); + } + out.Write("\n"); + + if (ApiType == APIType::OpenGL || ApiType == APIType::Vulkan) + out.Write("UBO_BINDING(std140, 1) uniform PSBlock {\n"); + else + out.Write("cbuffer PSBlock : register(b0) {\n"); + + out.Write("\tint4 " I_COLORS "[4];\n" + "\tint4 " I_KCOLORS "[4];\n" + "\tint4 " I_ALPHA ";\n" + "\tfloat4 " I_TEXDIMS "[8];\n" + "\tint4 " I_ZBIAS "[2];\n" + "\tint4 " I_INDTEXSCALE "[2];\n" + "\tint4 " I_INDTEXMTX "[6];\n" + "\tint4 " I_FOGCOLOR ";\n" + "\tint4 " I_FOGI ";\n" + "\tfloat4 " I_FOGF "[2];\n" + "\tfloat4 " I_ZSLOPE ";\n" + "\tfloat2 " I_EFBSCALE ";\n" + "\tuint bpmem_genmode;\n" + "\tuint bpmem_alphaTest;\n" + "\tuint bpmem_fogParam3;\n" + "\tuint bpmem_fogRangeBase;\n" + "\tuint bpmem_dstalpha;\n" + "\tuint bpmem_ztex_op;\n" + "\tbool bpmem_early_ztest;\n" + "\tbool bpmem_rgba6_format;\n" + "\tbool bpmem_dither;\n" + "\tbool bpmem_bounding_box;\n" + 
"\tuint4 bpmem_pack1[16];\n" // .xy - combiners, .z - tevind + "\tuint4 bpmem_pack2[8];\n" // .x - tevorder, .y - tevksel + "\tint4 konstLookup[32];\n" + "};\n\n"); + out.Write("#define bpmem_combiners(i) (bpmem_pack1[(i)].xy)\n" + "#define bpmem_tevind(i) (bpmem_pack1[(i)].z)\n" + "#define bpmem_iref(i) (bpmem_pack1[(i)].w)\n" + "#define bpmem_tevorder(i) (bpmem_pack2[(i)].x)\n" + "#define bpmem_tevksel(i) (bpmem_pack2[(i)].y)\n\n"); + + if (per_pixel_lighting) + { + out.Write("%s", s_lighting_struct); + + if (ApiType == APIType::OpenGL || ApiType == APIType::Vulkan) + out.Write("UBO_BINDING(std140, 2) uniform VSBlock {\n"); + else + out.Write("cbuffer VSBlock : register(b1) {\n"); + + out.Write(s_shader_uniforms); + out.Write("};\n"); + } + + if (bounding_box) + { + if (ApiType == APIType::OpenGL || ApiType == APIType::Vulkan) + { + out.Write("SSBO_BINDING(0) buffer BBox {\n" + "\tint4 bbox_data;\n" + "};\n"); + } + else + { + out.Write("globallycoherent RWBuffer bbox_data : register(u2);\n"); + } + } + + out.Write("struct VS_OUTPUT {\n"); + GenerateVSOutputMembers(out, ApiType, num_texgens, per_pixel_lighting, ""); + out.Write("};\n"); +} + static void WriteStage(ShaderCode& out, const pixel_shader_uid_data* uid_data, int n, APIType ApiType, bool stereo); static void WriteTevRegular(ShaderCode& out, const char* components, int bias, int op, int clamp, @@ -360,100 +452,11 @@ ShaderCode GeneratePixelShaderCode(APIType ApiType, const ShaderHostConfig& host out.Write("//%i TEV stages, %i texgens, %i IND stages\n", numStages, uid_data->genMode_numtexgens, uid_data->genMode_numindstages); - // dot product for integer vectors - out.Write("int idot(int3 x, int3 y)\n" - "{\n" - "\tint3 tmp = x * y;\n" - "\treturn tmp.x + tmp.y + tmp.z;\n" - "}\n"); + // Stuff that is shared between ubershaders and pixelgen. 
+ WritePixelShaderCommonHeader(out, ApiType, uid_data->genMode_numtexgens, per_pixel_lighting, + uid_data->bounding_box); - out.Write("int idot(int4 x, int4 y)\n" - "{\n" - "\tint4 tmp = x * y;\n" - "\treturn tmp.x + tmp.y + tmp.z + tmp.w;\n" - "}\n\n"); - - // rounding + casting to integer at once in a single function - out.Write("int iround(float x) { return int (round(x)); }\n" - "int2 iround(float2 x) { return int2(round(x)); }\n" - "int3 iround(float3 x) { return int3(round(x)); }\n" - "int4 iround(float4 x) { return int4(round(x)); }\n\n"); - - if (ApiType == APIType::OpenGL) - { - out.Write("SAMPLER_BINDING(0) uniform sampler2DArray samp[8];\n"); - } - else if (ApiType == APIType::Vulkan) - { - out.Write("SAMPLER_BINDING(0) uniform sampler2DArray samp0;\n"); - out.Write("SAMPLER_BINDING(1) uniform sampler2DArray samp1;\n"); - out.Write("SAMPLER_BINDING(2) uniform sampler2DArray samp2;\n"); - out.Write("SAMPLER_BINDING(3) uniform sampler2DArray samp3;\n"); - out.Write("SAMPLER_BINDING(4) uniform sampler2DArray samp4;\n"); - out.Write("SAMPLER_BINDING(5) uniform sampler2DArray samp5;\n"); - out.Write("SAMPLER_BINDING(6) uniform sampler2DArray samp6;\n"); - out.Write("SAMPLER_BINDING(7) uniform sampler2DArray samp7;\n"); - } - else // D3D - { - // Declare samplers - out.Write("SamplerState samp[8] : register(s0);\n"); - out.Write("\n"); - out.Write("Texture2DArray Tex[8] : register(t0);\n"); - } - out.Write("\n"); - - if (ApiType == APIType::OpenGL || ApiType == APIType::Vulkan) - out.Write("UBO_BINDING(std140, 1) uniform PSBlock {\n"); - else - out.Write("cbuffer PSBlock : register(b0) {\n"); - - out.Write("\tint4 " I_COLORS "[4];\n" - "\tint4 " I_KCOLORS "[4];\n" - "\tint4 " I_ALPHA ";\n" - "\tfloat4 " I_TEXDIMS "[8];\n" - "\tint4 " I_ZBIAS "[2];\n" - "\tint4 " I_INDTEXSCALE "[2];\n" - "\tint4 " I_INDTEXMTX "[6];\n" - "\tint4 " I_FOGCOLOR ";\n" - "\tint4 " I_FOGI ";\n" - "\tfloat4 " I_FOGF "[2];\n" - "\tfloat4 " I_ZSLOPE ";\n" - "\tfloat4 " I_EFBSCALE ";\n" - 
"};\n"); - - if (per_pixel_lighting) - { - out.Write("%s", s_lighting_struct); - - if (ApiType == APIType::OpenGL || ApiType == APIType::Vulkan) - out.Write("UBO_BINDING(std140, 2) uniform VSBlock {\n"); - else - out.Write("cbuffer VSBlock : register(b1) {\n"); - - out.Write(s_shader_uniforms); - out.Write("};\n"); - } - - if (uid_data->bounding_box) - { - if (ApiType == APIType::OpenGL || ApiType == APIType::Vulkan) - { - out.Write("SSBO_BINDING(0) buffer BBox {\n" - "\tint4 bbox_data;\n" - "};\n"); - } - else - { - out.Write("globallycoherent RWBuffer bbox_data : register(u2);\n"); - } - } - - out.Write("struct VS_OUTPUT {\n"); - GenerateVSOutputMembers(out, ApiType, uid_data->genMode_numtexgens, per_pixel_lighting, ""); - out.Write("};\n"); - - if (uid_data->forced_early_z) + if (uid_data->forced_early_z && g_ActiveConfig.backend_info.bSupportsEarlyZ) { // Zcomploc (aka early_ztest) is a way to control whether depth test is done before // or after texturing and alpha test. PC graphics APIs used to provide no way to emulate @@ -549,7 +552,7 @@ ShaderCode GeneratePixelShaderCode(APIType ApiType, const ShaderHostConfig& host // Let's set up attributes for (unsigned int i = 0; i < uid_data->genMode_numtexgens; ++i) { - out.Write("%s in float3 uv%d;\n", GetInterpolationQualifier(msaa, ssaa), i); + out.Write("%s in float3 tex%d;\n", GetInterpolationQualifier(msaa, ssaa), i); } out.Write("%s in float4 clipPos;\n", GetInterpolationQualifier(msaa, ssaa)); if (per_pixel_lighting) @@ -560,13 +563,6 @@ ShaderCode GeneratePixelShaderCode(APIType ApiType, const ShaderHostConfig& host } out.Write("void main()\n{\n"); - - if (host_config.backend_geometry_shaders || ApiType == APIType::Vulkan) - { - for (unsigned int i = 0; i < uid_data->genMode_numtexgens; ++i) - out.Write("\tfloat3 uv%d = tex%d;\n", i, i); - } - out.Write("\tfloat4 rawpos = gl_FragCoord;\n"); } else // D3D @@ -582,7 +578,8 @@ ShaderCode GeneratePixelShaderCode(APIType ApiType, const ShaderHostConfig& host // 
compute window position if needed because binding semantic WPOS is not widely supported for (unsigned int i = 0; i < uid_data->genMode_numtexgens; ++i) - out.Write(",\n in %s float3 uv%d : TEXCOORD%d", GetInterpolationQualifier(msaa, ssaa), i, i); + out.Write(",\n in %s float3 tex%d : TEXCOORD%d", GetInterpolationQualifier(msaa, ssaa), i, + i); out.Write(",\n in %s float4 clipPos : TEXCOORD%d", GetInterpolationQualifier(msaa, ssaa), uid_data->genMode_numtexgens); if (per_pixel_lighting) @@ -645,7 +642,7 @@ ShaderCode GeneratePixelShaderCode(APIType ApiType, const ShaderHostConfig& host for (unsigned int i = 0; i < uid_data->genMode_numtexgens; ++i) { out.Write("\tint2 fixpoint_uv%d = int2(", i); - out.Write("(uv%d.z == 0.0 ? uv%d.xy : uv%d.xy / uv%d.z)", i, i, i, i); + out.Write("(tex%d.z == 0.0 ? tex%d.xy : tex%d.xy / tex%d.z)", i, i, i, i); out.Write(" * " I_TEXDIMS "[%d].zw);\n", i); // TODO: S24 overflows here? } @@ -824,7 +821,7 @@ static void WriteStage(ShaderCode& out, const pixel_shader_uid_data* uid_data, i const char* tevIndAlphaSel[] = {"", "x", "y", "z"}; const char* tevIndAlphaMask[] = {"248", "224", "240", "248"}; // 0b11111000, 0b11100000, 0b11110000, 0b11111000 - out.Write("alphabump = iindtex%d.%s & %s;\n", tevind.bt, tevIndAlphaSel[tevind.bs], + out.Write("alphabump = iindtex%d.%s & %s;\n", tevind.bt.Value(), tevIndAlphaSel[tevind.bs], tevIndAlphaMask[tevind.fmt]); } else @@ -836,7 +833,8 @@ static void WriteStage(ShaderCode& out, const pixel_shader_uid_data* uid_data, i { // format const char* tevIndFmtMask[] = {"255", "31", "15", "7"}; - out.Write("\tint3 iindtevcrd%d = iindtex%d & %s;\n", n, tevind.bt, tevIndFmtMask[tevind.fmt]); + out.Write("\tint3 iindtevcrd%d = iindtex%d & %s;\n", n, tevind.bt.Value(), + tevIndFmtMask[tevind.fmt]); // bias - TODO: Check if this needs to be this complicated.. 
const char* tevIndBiasField[] = {"", "x", "y", "xy", @@ -1166,11 +1164,6 @@ static void SampleTexture(ShaderCode& out, const char* texcoords, const char* te "[%d].xy, %s))).%s;\n", texmap, texmap, texcoords, texmap, stereo ? "layer" : "0.0", texswap); } - else if (ApiType == APIType::Vulkan) - { - out.Write("iround(255.0 * texture(samp%d, float3(%s.xy * " I_TEXDIMS "[%d].xy, %s))).%s;\n", - texmap, texcoords, texmap, stereo ? "layer" : "0.0", texswap); - } else { out.Write("iround(255.0 * texture(samp[%d], float3(%s.xy * " I_TEXDIMS "[%d].xy, %s))).%s;\n", diff --git a/Source/Core/VideoCommon/PixelShaderGen.h b/Source/Core/VideoCommon/PixelShaderGen.h index 23de4635e1..ee422bee8d 100644 --- a/Source/Core/VideoCommon/PixelShaderGen.h +++ b/Source/Core/VideoCommon/PixelShaderGen.h @@ -159,4 +159,7 @@ typedef ShaderUid PixelShaderUid; ShaderCode GeneratePixelShaderCode(APIType ApiType, const ShaderHostConfig& host_config, const pixel_shader_uid_data* uid_data); +void WritePixelShaderCommonHeader(ShaderCode& out, APIType ApiType, u32 num_texgens, + bool per_pixel_lighting, bool bounding_box); +ShaderCode GeneratePixelShaderCode(APIType ApiType, const pixel_shader_uid_data* uid_data); PixelShaderUid GetPixelShaderUid(); diff --git a/Source/Core/VideoCommon/PixelShaderManager.cpp b/Source/Core/VideoCommon/PixelShaderManager.cpp index c5a23a22c7..98fba2f08f 100644 --- a/Source/Core/VideoCommon/PixelShaderManager.cpp +++ b/Source/Core/VideoCommon/PixelShaderManager.cpp @@ -15,6 +15,8 @@ bool PixelShaderManager::s_bFogRangeAdjustChanged; bool PixelShaderManager::s_bViewPortChanged; +bool PixelShaderManager::s_bIndirectDirty; +bool PixelShaderManager::s_bDestAlphaDirty; PixelShaderConstants PixelShaderManager::constants; bool PixelShaderManager::dirty; @@ -40,6 +42,38 @@ void PixelShaderManager::Init() SetTexCoordChanged(6); SetTexCoordChanged(7); + // fixed Konstants + for (int component = 0; component < 4; component++) + { + constants.konst[0][component] = 255; // 1 + 
constants.konst[1][component] = 223; // 7/8 + constants.konst[2][component] = 191; // 3/4 + constants.konst[3][component] = 159; // 5/8 + constants.konst[4][component] = 128; // 1/2 + constants.konst[5][component] = 96; // 3/8 + constants.konst[6][component] = 64; // 1/4 + constants.konst[7][component] = 32; // 1/8 + + // Invalid Konstants (reads as zero on hardware) + constants.konst[8][component] = 0; + constants.konst[9][component] = 0; + constants.konst[10][component] = 0; + constants.konst[11][component] = 0; + + // Annoyingly, alpha reads zero values for the .rgb colors (offically + // defined as invalid) + // If it wasn't for this, we could just use one of the first 3 colunms + // instead of + // wasting an entire 4th column just for alpha. + if (component == 3) + { + constants.konst[12][component] = 0; + constants.konst[13][component] = 0; + constants.konst[14][component] = 0; + constants.konst[15][component] = 0; + } + } + dirty = true; } @@ -99,6 +133,59 @@ void PixelShaderManager::SetConstants() dirty = true; s_bViewPortChanged = false; } + + if (s_bIndirectDirty) + { + for (int i = 0; i < 4; i++) + constants.pack1[i][3] = 0; + + for (u32 i = 0; i < (bpmem.genMode.numtevstages + 1); ++i) + { + u32 stage = bpmem.tevind[i].bt; + if (stage < bpmem.genMode.numindstages) + { + // We set some extra bits so the ubershader can quickly check if these + // features are in use. + if (bpmem.tevind[i].IsActive()) + constants.pack1[stage][3] = + bpmem.tevindref.getTexCoord(stage) | bpmem.tevindref.getTexMap(stage) << 8 | 1 << 16; + // Note: a tevind of zero just happens to be a passthrough, so no need + // to set an extra bit. + constants.pack1[i][2] = + bpmem.tevind[i].hex; // TODO: This match shadergen, but videosw will + // always wrap. + + // The ubershader uses tevind != 0 as a condition whether to calculate texcoords, + // even when texture is disabled, instead of the stage < bpmem.genMode.numindstages. 
+ // We set an unused bit here to indicate that the stage is active, even if it + // is just a pass-through. + constants.pack1[i][2] |= 0x80000000; + } + else + { + constants.pack1[i][2] = 0; + } + } + + dirty = true; + s_bIndirectDirty = false; + } + + if (s_bDestAlphaDirty) + { + // Destination alpha is only enabled if alpha writes are enabled. Force entire uniform to zero + // when disabled. + u32 dstalpha = bpmem.blendmode.alphaupdate && bpmem.dstalpha.enable && + bpmem.zcontrol.pixel_format == PEControl::RGBA6_Z24 ? + bpmem.dstalpha.hex : + 0; + + if (constants.dstalpha != dstalpha) + { + constants.dstalpha = dstalpha; + dirty = true; + } + } } void PixelShaderManager::SetTevColor(int index, int component, s32 value) @@ -116,20 +203,78 @@ void PixelShaderManager::SetTevKonstColor(int index, int component, s32 value) c[component] = value; dirty = true; + // Konst for ubershaders. We build the whole array on cpu so the gpu can do a single indirect + // access. + if (component != 3) // Alpha doesn't included in the .rgb konsts + constants.konst[index + 12][component] = value; + + // .rrrr .gggg .bbbb .aaaa konsts + constants.konst[index + 16 + component * 4][0] = value; + constants.konst[index + 16 + component * 4][1] = value; + constants.konst[index + 16 + component * 4][2] = value; + constants.konst[index + 16 + component * 4][3] = value; + PRIM_LOG("tev konst color%d: %d %d %d %d", index, c[0], c[1], c[2], c[3]); } +void PixelShaderManager::SetTevOrder(int index, u32 order) +{ + if (constants.pack2[index][0] != order) + { + constants.pack2[index][0] = order; + dirty = true; + } +} + +void PixelShaderManager::SetTevKSel(int index, u32 ksel) +{ + if (constants.pack2[index][1] != ksel) + { + constants.pack2[index][1] = ksel; + dirty = true; + } +} + +void PixelShaderManager::SetTevCombiner(int index, int alpha, u32 combiner) +{ + if (constants.pack1[index][alpha] != combiner) + { + constants.pack1[index][alpha] = combiner; + dirty = true; + } +} + +void 
PixelShaderManager::SetTevIndirectChanged() +{ + s_bIndirectDirty = true; +} + void PixelShaderManager::SetAlpha() { constants.alpha[0] = bpmem.alpha_test.ref0; constants.alpha[1] = bpmem.alpha_test.ref1; + constants.alpha[3] = static_cast(bpmem.dstalpha.alpha); dirty = true; } -void PixelShaderManager::SetDestAlpha() +void PixelShaderManager::SetAlphaTestChanged() { - constants.alpha[3] = bpmem.dstalpha.alpha; - dirty = true; + // Force alphaTest Uniform to zero if it will always pass. + // (set an extra bit to distinguish from "never && never") + // TODO: we could optimize this further and check the actual constants, + // i.e. "a <= 0" and "a >= 255" will always pass. + u32 alpha_test = + bpmem.alpha_test.TestResult() != AlphaTest::PASS ? bpmem.alpha_test.hex | 1 << 31 : 0; + if (constants.alphaTest != alpha_test) + { + constants.alphaTest = alpha_test; + dirty = true; + } +} + +void PixelShaderManager::SetDestAlphaChanged() +{ + s_bDestAlphaDirty = true; } void PixelShaderManager::SetTexDims(int texmapid, u32 width, u32 height) @@ -235,6 +380,12 @@ void PixelShaderManager::SetZTextureTypeChanged() dirty = true; } +void PixelShaderManager::SetZTextureOpChanged() +{ + constants.ztex_op = bpmem.ztex2.op; + dirty = true; +} + void PixelShaderManager::SetTexCoordChanged(u8 texmapid) { TCoordInfo& tc = bpmem.texcoords[texmapid]; @@ -262,6 +413,7 @@ void PixelShaderManager::SetFogParamChanged() constants.fogi[1] = bpmem.fog.b_magnitude; constants.fogf[1][2] = bpmem.fog.c_proj_fsel.GetC(); constants.fogi[3] = bpmem.fog.b_shift; + constants.fogParam3 = bpmem.fog.c_proj_fsel.hex; } else { @@ -269,6 +421,7 @@ void PixelShaderManager::SetFogParamChanged() constants.fogi[1] = 1; constants.fogf[1][2] = 0.f; constants.fogi[3] = 1; + constants.fogParam3 = 0; } dirty = true; } @@ -279,12 +432,68 @@ void PixelShaderManager::SetFogRangeAdjustChanged() return; s_bFogRangeAdjustChanged = true; + + if (constants.fogRangeBase != bpmem.fogRange.Base.hex) + { + constants.fogRangeBase = 
bpmem.fogRange.Base.hex; + dirty = true; + } +} + +void PixelShaderManager::SetGenModeChanged() +{ + constants.genmode = bpmem.genMode.hex; + s_bIndirectDirty = true; + dirty = true; +} + +void PixelShaderManager::SetZControlChanged() +{ + u32 early_ztest = bpmem.zcontrol.early_ztest ? 1 : 0; + u32 rgba6_format = + (bpmem.zcontrol.pixel_format == PEControl::RGBA6_Z24 && !g_ActiveConfig.bForceTrueColor) ? 1 : + 0; + u32 dither = rgba6_format && bpmem.blendmode.dither; + if (constants.early_ztest != early_ztest || constants.rgba6_format != rgba6_format || + constants.dither != dither) + { + constants.early_ztest = early_ztest; + constants.rgba6_format = rgba6_format; + constants.dither = dither; + dirty = true; + } + s_bDestAlphaDirty = true; +} + +void PixelShaderManager::SetBlendModeChanged() +{ + u32 dither = constants.rgba6_format && bpmem.blendmode.dither; + if (constants.dither != dither) + { + constants.dither = dither; + dirty = true; + } + s_bDestAlphaDirty = true; +} + +void PixelShaderManager::SetBoundingBoxActive(bool active) +{ + const bool enable = + active && g_ActiveConfig.bBBoxEnable && g_ActiveConfig.BBoxUseFragmentShaderImplementation(); + + if (enable == (constants.bounding_box != 0)) + return; + + constants.bounding_box = active; + dirty = true; } void PixelShaderManager::DoState(PointerWrap& p) { p.Do(s_bFogRangeAdjustChanged); p.Do(s_bViewPortChanged); + p.Do(s_bIndirectDirty); + p.Do(s_bDestAlphaDirty); p.Do(constants); diff --git a/Source/Core/VideoCommon/PixelShaderManager.h b/Source/Core/VideoCommon/PixelShaderManager.h index c7d6e3b9ee..dcb244bccb 100644 --- a/Source/Core/VideoCommon/PixelShaderManager.h +++ b/Source/Core/VideoCommon/PixelShaderManager.h @@ -24,24 +24,36 @@ public: // so make sure to call them after memory is committed static void SetTevColor(int index, int component, s32 value); static void SetTevKonstColor(int index, int component, s32 value); + static void SetTevOrder(int index, u32 order); + static void SetTevKSel(int 
index, u32 ksel); + static void SetTevCombiner(int index, int alpha, u32 combiner); static void SetAlpha(); - static void SetDestAlpha(); + static void SetAlphaTestChanged(); + static void SetDestAlphaChanged(); static void SetTexDims(int texmapid, u32 width, u32 height); static void SetZTextureBias(); static void SetViewportChanged(); static void SetEfbScaleChanged(float scalex, float scaley); static void SetZSlope(float dfdx, float dfdy, float f0); static void SetIndMatrixChanged(int matrixidx); + static void SetTevIndirectChanged(); static void SetZTextureTypeChanged(); + static void SetZTextureOpChanged(); static void SetIndTexScaleChanged(bool high); static void SetTexCoordChanged(u8 texmapid); static void SetFogColorChanged(); static void SetFogParamChanged(); static void SetFogRangeAdjustChanged(); + static void SetGenModeChanged(); + static void SetZControlChanged(); + static void SetBlendModeChanged(); + static void SetBoundingBoxActive(bool active); static PixelShaderConstants constants; static bool dirty; static bool s_bFogRangeAdjustChanged; static bool s_bViewPortChanged; + static bool s_bIndirectDirty; + static bool s_bDestAlphaDirty; }; diff --git a/Source/Core/VideoCommon/ShaderGenCommon.cpp b/Source/Core/VideoCommon/ShaderGenCommon.cpp index 58085fdafa..94819f3e34 100644 --- a/Source/Core/VideoCommon/ShaderGenCommon.cpp +++ b/Source/Core/VideoCommon/ShaderGenCommon.cpp @@ -29,6 +29,9 @@ ShaderHostConfig ShaderHostConfig::GetCurrent() bits.backend_atomics = g_ActiveConfig.backend_info.bSupportsFragmentStoresAndAtomics; bits.backend_depth_clamp = g_ActiveConfig.backend_info.bSupportsDepthClamp; bits.backend_reversed_depth_range = g_ActiveConfig.backend_info.bSupportsReversedDepthRange; + bits.backend_bitfield = g_ActiveConfig.backend_info.bSupportsBitfield; + bits.backend_dynamic_sampler_indexing = + g_ActiveConfig.backend_info.bSupportsDynamicSamplerIndexing; return bits; } @@ -65,7 +68,7 @@ std::string GetDiskShaderCacheFileName(APIType api_type, 
const char* type, bool if (include_host_config) { - // We're using 18 bits, so 5 hex characters. + // We're using 20 bits, so 5 hex characters. ShaderHostConfig host_config = ShaderHostConfig::GetCurrent(); filename += StringFromFormat("-%05X", host_config.bits); } diff --git a/Source/Core/VideoCommon/ShaderGenCommon.h b/Source/Core/VideoCommon/ShaderGenCommon.h index 5750e58c3e..3e3ef2eb13 100644 --- a/Source/Core/VideoCommon/ShaderGenCommon.h +++ b/Source/Core/VideoCommon/ShaderGenCommon.h @@ -176,7 +176,9 @@ union ShaderHostConfig u32 backend_atomics : 1; u32 backend_depth_clamp : 1; u32 backend_reversed_depth_range : 1; - u32 pad : 14; + u32 backend_bitfield : 1; + u32 backend_dynamic_sampler_indexing : 1; + u32 pad : 12; }; static ShaderHostConfig GetCurrent(); @@ -316,7 +318,10 @@ inline const char* GetInterpolationQualifier(bool msaa, bool ssaa, #define I_LINEPTPARAMS "clinept" #define I_TEXOFFSET "ctexoffset" -static const char s_shader_uniforms[] = "\tfloat4 " I_POSNORMALMATRIX "[6];\n" +static const char s_shader_uniforms[] = "\tuint components;\n" + "\tuint xfmem_dualTexInfo;\n" + "\tuint xfmem_numColorChans;\n" + "\tfloat4 " I_POSNORMALMATRIX "[6];\n" "\tfloat4 " I_PROJECTION "[4];\n" "\tint4 " I_MATERIALS "[4];\n" "\tLight " I_LIGHTS "[8];\n" @@ -325,4 +330,9 @@ static const char s_shader_uniforms[] = "\tfloat4 " I_POSNORMALMATRIX "[6];\n" "\tfloat4 " I_NORMALMATRICES "[32];\n" "\tfloat4 " I_POSTTRANSFORMMATRICES "[64];\n" "\tfloat4 " I_PIXELCENTERCORRECTION ";\n" - "\tfloat2 " I_VIEWPORT_SIZE ";\n"; + "\tfloat2 " I_VIEWPORT_SIZE ";\n" + "\tuint4 xfmem_pack1[8];\n" + "\t#define xfmem_texMtxInfo(i) (xfmem_pack1[(i)].x)\n" + "\t#define xfmem_postMtxInfo(i) (xfmem_pack1[(i)].y)\n" + "\t#define xfmem_color(i) (xfmem_pack1[(i)].z)\n" + "\t#define xfmem_alpha(i) (xfmem_pack1[(i)].w)\n"; diff --git a/Source/Core/VideoCommon/UberShaderCommon.cpp b/Source/Core/VideoCommon/UberShaderCommon.cpp new file mode 100644 index 0000000000..58e33e5942 --- /dev/null +++ 
b/Source/Core/VideoCommon/UberShaderCommon.cpp @@ -0,0 +1,203 @@ +// Copyright 2017 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license.txt file included. + +#include "VideoCommon/UberShaderCommon.h" +#include "VideoCommon/NativeVertexFormat.h" +#include "VideoCommon/VideoConfig.h" +#include "VideoCommon/XFMemory.h" + +namespace UberShader +{ +void WriteUberShaderCommonHeader(ShaderCode& out, APIType api_type, + const ShaderHostConfig& host_config) +{ + // ============================================== + // BitfieldExtract for APIs which don't have it + // ============================================== + if (!host_config.backend_bitfield) + { + out.Write("uint bitfieldExtract(uint val, int off, int size) {\n" + " // This built-in function is only support in OpenGL 4.0+ and ES 3.1+\n" + " // Microsoft's HLSL compiler automatically optimises this to a bitfield extract " + "instruction.\n" + " uint mask = uint((1 << size) - 1);\n" + " return uint(val >> off) & mask;\n" + "}\n\n"); + } +} + +void WriteLightingFunction(ShaderCode& out) +{ + // ============================================== + // Lighting channel calculation helper + // ============================================== + out.Write("int4 CalculateLighting(uint index, uint attnfunc, uint diffusefunc, float3 pos, " + "float3 normal) {\n" + " float3 ldir, h, cosAttn, distAttn;\n" + " float dist, dist2, attn;\n" + "\n" + " switch (attnfunc) {\n"); + out.Write(" case %uu: // LIGNTATTN_NONE\n", LIGHTATTN_NONE); + out.Write(" case %uu: // LIGHTATTN_DIR\n", LIGHTATTN_DIR); + out.Write(" ldir = normalize(" I_LIGHTS "[index].pos.xyz - pos.xyz);\n" + " attn = 1.0;\n" + " if (length(ldir) == 0.0)\n" + " ldir = normal;\n" + " break;\n\n"); + out.Write(" case %uu: // LIGHTATTN_SPEC\n", LIGHTATTN_SPEC); + out.Write(" ldir = normalize(" I_LIGHTS "[index].pos.xyz - pos.xyz);\n" + " attn = (dot(normal, ldir) >= 0.0) ? 
max(0.0, dot(normal, " I_LIGHTS + "[index].dir.xyz)) : 0.0;\n" + " cosAttn = " I_LIGHTS "[index].cosatt.xyz;\n"); + out.Write(" if (diffusefunc == %uu) // LIGHTDIF_NONE\n", LIGHTDIF_NONE); + out.Write(" distAttn = " I_LIGHTS "[index].distatt.xyz;\n" + " else\n" + " distAttn = normalize(" I_LIGHTS "[index].distatt.xyz);\n" + " attn = max(0.0, dot(cosAttn, float3(1.0, attn, attn*attn))) / dot(distAttn, " + "float3(1.0, attn, attn*attn));\n" + " break;\n\n"); + out.Write(" case %uu: // LIGHTATTN_SPOT\n", LIGHTATTN_SPOT); + out.Write(" ldir = " I_LIGHTS "[index].pos.xyz - pos.xyz;\n" + " dist2 = dot(ldir, ldir);\n" + " dist = sqrt(dist2);\n" + " ldir = ldir / dist;\n" + " attn = max(0.0, dot(ldir, " I_LIGHTS "[index].dir.xyz));\n" + " attn = max(0.0, " I_LIGHTS "[index].cosatt.x + " I_LIGHTS + "[index].cosatt.y * attn + " I_LIGHTS "[index].cosatt.z * attn * attn) / dot(" I_LIGHTS + "[index].distatt.xyz, float3(1.0, dist, dist2));\n" + " break;\n\n"); + out.Write(" default:\n" + " attn = 1.0;\n" + " ldir = normal;\n" + " break;\n" + " }\n" + "\n" + " switch (diffusefunc) {\n"); + out.Write(" case %uu: // LIGHTDIF_NONE\n", LIGHTDIF_NONE); + out.Write(" return int4(round(attn * float4(" I_LIGHTS "[index].color)));\n\n"); + out.Write(" case %uu: // LIGHTDIF_SIGN\n", LIGHTDIF_SIGN); + out.Write(" return int4(round(attn * dot(ldir, normal) * float4(" I_LIGHTS + "[index].color)));\n\n"); + out.Write(" case %uu: // LIGHTDIF_CLAMP\n", LIGHTDIF_CLAMP); + out.Write(" return int4(round(attn * max(0.0, dot(ldir, normal)) * float4(" I_LIGHTS + "[index].color)));\n\n"); + out.Write(" default:\n" + " return int4(0, 0, 0, 0);\n" + " }\n" + "}\n\n"); +} + +void WriteVertexLighting(ShaderCode& out, APIType api_type, const char* world_pos_var, + const char* normal_var, const char* in_color_0_var, + const char* in_color_1_var, const char* out_color_0_var, + const char* out_color_1_var) +{ + out.Write("// Lighting\n"); + out.Write("%sfor (uint chan = 0u; chan < xfmem_numColorChans; chan++) 
{\n", + api_type == APIType::D3D ? "[loop] " : ""); + out.Write(" uint colorreg = xfmem_color(chan);\n" + " uint alphareg = xfmem_alpha(chan);\n" + " int4 mat = " I_MATERIALS "[chan + 2u]; \n" + " int4 lacc = int4(255, 255, 255, 255);\n" + "\n"); + + out.Write(" if (%s != 0u) {\n", BitfieldExtract("colorreg", LitChannel().matsource).c_str()); + out.Write(" if ((components & (%uu << chan)) != 0u) // VB_HAS_COL0\n", VB_HAS_COL0); + out.Write(" mat.xyz = int3(round(((chan == 0u) ? %s.xyz : %s.xyz) * 255.0));\n", + in_color_0_var, in_color_1_var); + out.Write(" else if ((components & %uu) != 0u) // VB_HAS_COLO0\n", VB_HAS_COL0); + out.Write(" mat.xyz = int3(round(%s.xyz * 255.0));\n", in_color_0_var); + out.Write(" else\n" + " mat.xyz = int3(255, 255, 255);\n" + " }\n" + "\n"); + + out.Write(" if (%s != 0u) {\n", BitfieldExtract("alphareg", LitChannel().matsource).c_str()); + out.Write(" if ((components & (%uu << chan)) != 0u) // VB_HAS_COL0\n", VB_HAS_COL0); + out.Write(" mat.w = int(round(((chan == 0u) ? %s.w : %s.w) * 255.0));\n", in_color_0_var, + in_color_1_var); + out.Write(" else if ((components & %uu) != 0u) // VB_HAS_COLO0\n", VB_HAS_COL0); + out.Write(" mat.w = int(round(%s.w * 255.0));\n", in_color_0_var); + out.Write(" else\n" + " mat.w = 255;\n" + " } else {\n" + " mat.w = " I_MATERIALS " [chan + 2u].w;\n" + " }\n" + "\n"); + + out.Write(" if (%s != 0u) {\n", + BitfieldExtract("colorreg", LitChannel().enablelighting).c_str()); + out.Write(" if (%s != 0u) {\n", BitfieldExtract("colorreg", LitChannel().ambsource).c_str()); + out.Write(" if ((components & (%uu << chan)) != 0u) // VB_HAS_COL0\n", VB_HAS_COL0); + out.Write(" lacc.xyz = int3(round(((chan == 0u) ? 
%s.xyz : %s.xyz) * 255.0));\n", + in_color_0_var, in_color_1_var); + out.Write(" else if ((components & %uu) != 0u) // VB_HAS_COLO0\n", VB_HAS_COL0); + out.Write(" lacc.xyz = int3(round(%s.xyz * 255.0));\n", in_color_0_var); + out.Write(" else\n" + " lacc.xyz = int3(255, 255, 255);\n" + " } else {\n" + " lacc.xyz = " I_MATERIALS " [chan].xyz;\n" + " }\n" + "\n"); + out.Write(" uint light_mask = %s | (%s << 4u);\n", + BitfieldExtract("colorreg", LitChannel().lightMask0_3).c_str(), + BitfieldExtract("colorreg", LitChannel().lightMask4_7).c_str()); + out.Write(" uint attnfunc = %s;\n", + BitfieldExtract("colorreg", LitChannel().attnfunc).c_str()); + out.Write(" uint diffusefunc = %s;\n", + BitfieldExtract("colorreg", LitChannel().diffusefunc).c_str()); + out.Write( + " for (uint light_index = 0u; light_index < 8u; light_index++) {\n" + " if ((light_mask & (1u << light_index)) != 0u)\n" + " lacc.xyz += CalculateLighting(light_index, attnfunc, diffusefunc, %s, %s).xyz;\n", + world_pos_var, normal_var); + out.Write(" }\n" + " }\n" + "\n"); + + out.Write(" if (%s != 0u) {\n", + BitfieldExtract("alphareg", LitChannel().enablelighting).c_str()); + out.Write(" if (%s != 0u) {\n", BitfieldExtract("alphareg", LitChannel().ambsource).c_str()); + out.Write(" if ((components & (%uu << chan)) != 0u) // VB_HAS_COL0\n", VB_HAS_COL0); + out.Write(" lacc.w = int(round(((chan == 0u) ? 
%s.w : %s.w) * 255.0));\n", in_color_0_var, + in_color_1_var); + out.Write(" else if ((components & %uu) != 0u) // VB_HAS_COLO0\n", VB_HAS_COL0); + out.Write(" lacc.w = int(round(%s.w * 255.0));\n", in_color_0_var); + out.Write(" else\n" + " lacc.w = 255;\n" + " } else {\n" + " lacc.w = " I_MATERIALS " [chan].w;\n" + " }\n" + "\n"); + out.Write(" uint light_mask = %s | (%s << 4u);\n", + BitfieldExtract("alphareg", LitChannel().lightMask0_3).c_str(), + BitfieldExtract("alphareg", LitChannel().lightMask4_7).c_str()); + out.Write(" uint attnfunc = %s;\n", + BitfieldExtract("alphareg", LitChannel().attnfunc).c_str()); + out.Write(" uint diffusefunc = %s;\n", + BitfieldExtract("alphareg", LitChannel().diffusefunc).c_str()); + out.Write(" for (uint light_index = 0u; light_index < 8u; light_index++) {\n\n" + " if ((light_mask & (1u << light_index)) != 0u)\n\n" + " lacc.w += CalculateLighting(light_index, attnfunc, diffusefunc, %s, %s).w;\n", + world_pos_var, normal_var); + out.Write(" }\n" + " }\n" + "\n"); + + out.Write(" lacc = clamp(lacc, 0, 255);\n" + "\n" + " // Hopefully GPUs that can support dynamic indexing will optimize this.\n" + " float4 lit_color = float4((mat * (lacc + (lacc >> 7))) >> 8) / 255.0;\n" + " switch (chan) {\n" + " case 0u: %s = lit_color; break;\n", + out_color_0_var); + out.Write(" case 1u: %s = lit_color; break;\n", out_color_1_var); + out.Write(" }\n" + "}\n" + "\n"); + + out.Write("if (xfmem_numColorChans < 2u && (components & %uu) == 0u)\n", VB_HAS_COL1); + out.Write(" %s = %s;\n\n", out_color_1_var, out_color_0_var); +} +} diff --git a/Source/Core/VideoCommon/UberShaderCommon.h b/Source/Core/VideoCommon/UberShaderCommon.h new file mode 100644 index 0000000000..a623e9d58d --- /dev/null +++ b/Source/Core/VideoCommon/UberShaderCommon.h @@ -0,0 +1,30 @@ +// Copyright 2017 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license.txt file included. 
+ +#pragma once + +#include "VideoCommon/ShaderGenCommon.h" +#include "VideoCommon/VideoCommon.h" + +namespace UberShader +{ +// Common functions across all ubershaders +void WriteUberShaderCommonHeader(ShaderCode& out, APIType api_type, + const ShaderHostConfig& host_config); + +// Vertex lighting +void WriteLightingFunction(ShaderCode& out); +void WriteVertexLighting(ShaderCode& out, APIType api_type, const char* world_pos_var, + const char* normal_var, const char* in_color_0_var, + const char* in_color_1_var, const char* out_color_0_var, + const char* out_color_1_var); + +// bitfieldExtract generator for BitField types +template +std::string BitfieldExtract(const std::string& source, T type) +{ + return StringFromFormat("bitfieldExtract(%s, %u, %u)", source.c_str(), + static_cast(type.StartBit()), static_cast(type.NumBits())); +} +} // namespace UberShader diff --git a/Source/Core/VideoCommon/UberShaderPixel.cpp b/Source/Core/VideoCommon/UberShaderPixel.cpp new file mode 100644 index 0000000000..23d6725b3b --- /dev/null +++ b/Source/Core/VideoCommon/UberShaderPixel.cpp @@ -0,0 +1,1101 @@ +// Copyright 2015 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license.txt file included. 
+ +#include "VideoCommon/UberShaderPixel.h" +#include "VideoCommon/BPMemory.h" +#include "VideoCommon/DriverDetails.h" +#include "VideoCommon/NativeVertexFormat.h" +#include "VideoCommon/UberShaderCommon.h" +#include "VideoCommon/XFMemory.h" + +namespace UberShader +{ +PixelShaderUid GetPixelShaderUid() +{ + PixelShaderUid out; + pixel_ubershader_uid_data* uid_data = out.GetUidData(); + memset(uid_data, 0, sizeof(*uid_data)); + uid_data->num_texgens = xfmem.numTexGen.numTexGens; + uid_data->early_depth = + bpmem.UseEarlyDepthTest() && + (g_ActiveConfig.bFastDepthCalc || bpmem.alpha_test.TestResult() == AlphaTest::UNDETERMINED) && + !(bpmem.zmode.testenable && bpmem.genMode.zfreeze); + uid_data->per_pixel_depth = + (bpmem.ztex2.op != ZTEXTURE_DISABLE && bpmem.UseLateDepthTest()) || + (!g_ActiveConfig.bFastDepthCalc && bpmem.zmode.testenable && !uid_data->early_depth) || + (bpmem.zmode.testenable && bpmem.genMode.zfreeze); + return out; +} + +ShaderCode GenPixelShader(APIType ApiType, const ShaderHostConfig& host_config, + const pixel_ubershader_uid_data* uid_data) +{ + const bool per_pixel_lighting = host_config.per_pixel_lighting; + const bool msaa = host_config.msaa; + const bool ssaa = host_config.ssaa; + const bool stereo = host_config.stereo; + const bool use_dual_source = host_config.backend_dual_source_blend; + const bool early_depth = uid_data->early_depth != 0; + const bool per_pixel_depth = uid_data->per_pixel_depth != 0; + const bool bounding_box = + host_config.bounding_box && g_ActiveConfig.BBoxUseFragmentShaderImplementation(); + const u32 numTexgen = uid_data->num_texgens; + ShaderCode out; + + out.Write("// Pixel UberShader for %u texgens%s%s\n", numTexgen, + early_depth ? ", early-depth" : "", per_pixel_depth ? 
", per-pixel depth" : ""); + WritePixelShaderCommonHeader(out, ApiType, numTexgen, per_pixel_lighting, bounding_box); + WriteUberShaderCommonHeader(out, ApiType, host_config); + if (per_pixel_lighting) + WriteLightingFunction(out); + + // Shader inputs/outputs in GLSL (HLSL is in main). + if (ApiType == APIType::OpenGL || ApiType == APIType::Vulkan) + { + if (use_dual_source) + { + if (DriverDetails::HasBug(DriverDetails::BUG_BROKEN_FRAGMENT_SHADER_INDEX_DECORATION)) + { + out.Write("FRAGMENT_OUTPUT_LOCATION(0) out vec4 ocol0;\n"); + out.Write("FRAGMENT_OUTPUT_LOCATION(1) out vec4 ocol1;\n"); + } + else + { + out.Write("FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 0) out vec4 ocol0;\n"); + out.Write("FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 1) out vec4 ocol1;\n"); + } + } + else + { + out.Write("FRAGMENT_OUTPUT_LOCATION(0) out vec4 ocol0;\n"); + } + + if (per_pixel_depth) + out.Write("#define depth gl_FragDepth\n"); + + if (host_config.backend_geometry_shaders || ApiType == APIType::Vulkan) + { + out.Write("VARYING_LOCATION(0) in VertexData {\n"); + GenerateVSOutputMembers(out, ApiType, numTexgen, per_pixel_lighting, + GetInterpolationQualifier(msaa, ssaa)); + + if (stereo) + out.Write(" flat int layer;\n"); + + out.Write("};\n\n"); + } + else + { + out.Write("%s in float4 colors_0;\n", GetInterpolationQualifier(msaa, ssaa)); + out.Write("%s in float4 colors_1;\n", GetInterpolationQualifier(msaa, ssaa)); + // compute window position if needed because binding semantic WPOS is not widely supported + // Let's set up attributes + for (u32 i = 0; i < numTexgen; ++i) + out.Write("%s in float3 tex%d;\n", GetInterpolationQualifier(msaa, ssaa), i); + out.Write("%s in float4 clipPos;\n", GetInterpolationQualifier(msaa, ssaa)); + if (per_pixel_lighting) + { + out.Write("%s in float3 Normal;\n", GetInterpolationQualifier(msaa, ssaa)); + out.Write("%s in float3 WorldPos;\n", GetInterpolationQualifier(msaa, ssaa)); + } + } + } + + // Uniform index -> texture coordinates + if (numTexgen > 0) + 
{ + if (ApiType != APIType::D3D) + { + out.Write("float3 selectTexCoord(uint index) {\n"); + } + else + { + out.Write("float3 selectTexCoord(uint index"); + for (u32 i = 0; i < numTexgen; i++) + out.Write(", float3 tex%u", i); + out.Write(") {\n"); + } + + out.Write(" switch (index) {\n"); + for (u32 i = 0; i < numTexgen; i++) + { + out.Write(" case %uu:\n" + " return tex%u;\n", + i, i); + } + out.Write(" default:\n" + " return float3(0.0, 0.0, 0.0);\n" + " }\n" + "}\n\n"); + } + + // ===================== + // Texture Sampling + // ===================== + + if (host_config.backend_dynamic_sampler_indexing) + { + // Doesn't look like directx supports this. Oh well the code path is here just incase it + // supports this in the future. + out.Write("int4 sampleTexture(uint sampler_num, float2 uv) {\n"); + if (ApiType == APIType::OpenGL || ApiType == APIType::Vulkan) + out.Write(" return iround(texture(samp[sampler_num], float3(uv, 0.0)) * 255.0);\n"); + else if (ApiType == APIType::D3D) + out.Write(" return iround(Tex[sampler_num].Sample(samp[sampler_num], float3(uv, 0.0)) * " + "255.0);\n"); + out.Write("}\n\n"); + } + else + { + out.Write("int4 sampleTexture(uint sampler_num, float2 uv) {\n" + " // This is messy, but DirectX, OpenGl 3.3 and Opengl ES 3.0 doesn't support " + "dynamic indexing of the sampler array\n" + " // With any luck the shader compiler will optimise this if the hardware supports " + "dynamic indexing.\n" + " switch(sampler_num) {\n"); + for (int i = 0; i < 8; i++) + { + if (ApiType == APIType::OpenGL || ApiType == APIType::Vulkan) + out.Write(" case %du: return iround(texture(samp[%d], float3(uv, 0.0)) * 255.0);\n", i, i); + else if (ApiType == APIType::D3D) + out.Write(" case %du: return iround(Tex[%d].Sample(samp[%d], float3(uv, 0.0)) * 255.0);\n", + i, i, i); + } + out.Write(" }\n" + "}\n\n"); + } + + // ====================== + // Arbatary Swizzling + // ====================== + + out.Write("int4 Swizzle(uint s, int4 color) {\n" + " // AKA: 
Color Channel Swapping\n" + "\n" + " int4 ret;\n"); + out.Write(" ret.r = color[%s];\n", + BitfieldExtract("bpmem_tevksel(s * 2u)", TevKSel().swap1).c_str()); + out.Write(" ret.g = color[%s];\n", + BitfieldExtract("bpmem_tevksel(s * 2u)", TevKSel().swap2).c_str()); + out.Write(" ret.b = color[%s];\n", + BitfieldExtract("bpmem_tevksel(s * 2u + 1u)", TevKSel().swap1).c_str()); + out.Write(" ret.a = color[%s];\n", + BitfieldExtract("bpmem_tevksel(s * 2u + 1u)", TevKSel().swap2).c_str()); + out.Write(" return ret;\n" + "}\n\n"); + + // ====================== + // Indirect Wrappping + // ====================== + out.Write("int Wrap(int coord, uint mode) {\n" + " if (mode == 0u) // ITW_OFF\n" + " return coord;\n" + " else if (mode < 6u) // ITW_256 to ITW_16\n" + " return coord & (0xfffe >> mode);\n" + " else // ITW_0\n" + " return 0;\n" + "}\n\n"); + + // ====================== + // Indirect Lookup + // ====================== + auto LookupIndirectTexture = [&out](const char* out_var_name, const char* in_index_name) { + out.Write( + "{\n" + " uint iref = bpmem_iref(%s);\n" + " if ( iref != 0u)\n" + " {\n" + " uint texcoord = bitfieldExtract(iref, 0, 3);\n" + " uint texmap = bitfieldExtract(iref, 8, 3);\n" + " float3 uv = getTexCoord(texcoord);\n" + " int2 fixedPoint_uv = int2((uv.z == 0.0 ? 
uv.xy : (uv.xy / uv.z)) * " I_TEXDIMS + "[texcoord].zw);\n" + "\n" + " if ((%s & 1u) == 0u)\n" + " fixedPoint_uv = fixedPoint_uv >> " I_INDTEXSCALE "[%s >> 1].xy;\n" + " else\n" + " fixedPoint_uv = fixedPoint_uv >> " I_INDTEXSCALE "[%s >> 1].zw;\n" + "\n" + " %s = sampleTexture(texmap, float2(fixedPoint_uv) * " I_TEXDIMS "[texmap].xy).abg;\n" + " }\n" + " else\n" + " {\n" + " %s = int3(0, 0, 0);\n" + " }\n" + "}\n", + in_index_name, in_index_name, in_index_name, in_index_name, out_var_name, out_var_name); + }; + + // ====================== + // TEV's Special Lerp + // ====================== + auto WriteTevLerp = [&out](const char* components) { + out.Write("// TEV's Linear Interpolate, plus bias, add/subtract and scale\n" + "int%s tevLerp%s(int%s A, int%s B, int%s C, int%s D, uint bias, bool op, bool alpha, " + "uint shift) {\n" + " // Scale C from 0..255 to 0..256\n" + " C += C >> 7;\n" + "\n" + " // Add bias to D\n" + " if (bias == 1u) D += 128;\n" + " else if (bias == 2u) D -= 128;\n" + "\n" + " int%s lerp = (A << 8) + (B - A)*C;\n" + " if (shift != 3u) {\n" + " lerp = lerp << shift;\n" + " D = D << shift;\n" + " }\n" + "\n" + " if ((shift == 3u) == alpha)\n" + " lerp = lerp + (op ? 
127 : 128);\n" + "\n" + " int%s result = lerp >> 8;\n" + "\n" + " // Add/Subtract D\n" + " if(op) // Subtract\n" + " result = D - result;\n" + " else // Add\n" + " result = D + result;\n" + "\n" + " // Most of the Shift was moved inside the lerp for improved percision\n" + " // But we still do the divide by 2 here\n" + " if (shift == 3u)\n" + " result = result >> 1;\n" + " return result;\n" + "}\n\n", + components, components, components, components, components, components, components, + components); + }; + WriteTevLerp(""); // int + WriteTevLerp("3"); // int3 + + // ======================= + // TEV's Color Compare + // ======================= + + out.Write( + "// Implements operations 0-5 of tev's compare mode,\n" + "// which are common to both color and alpha channels\n" + "bool tevCompare(uint op, int3 color_A, int3 color_B) {\n" + " switch (op) {\n" + " case 0u: // TEVCMP_R8_GT\n" + " return (color_A.r > color_B.r);\n" + " case 1u: // TEVCMP_R8_EQ\n" + " return (color_A.r == color_B.r);\n" + " case 2u: // TEVCMP_GR16_GT\n" + " int A_16 = (color_A.r | (color_A.g << 8));\n" + " int B_16 = (color_B.r | (color_B.g << 8));\n" + " return A_16 > B_16;\n" + " case 3u: // TEVCMP_GR16_EQ\n" + " return (color_A.r == color_B.r && color_A.g == color_B.g);\n" + " case 4u: // TEVCMP_BGR24_GT\n" + " int A_24 = (color_A.r | (color_A.g << 8) | (color_A.b << 16));\n" + " int B_24 = (color_B.r | (color_B.g << 8) | (color_B.b << 16));\n" + " return A_24 > B_24;\n" + " case 5u: // TEVCMP_BGR24_EQ\n" + " return (color_A.r == color_B.r && color_A.g == color_B.g && color_A.b == color_B.b);\n" + " default:\n" + " return false;\n" + " }\n" + "}\n\n"); + + // ================= + // Alpha Compare + // ================= + + out.Write("// Helper function for Alpha Test\n" + "bool alphaCompare(int a, int b, uint compare) {\n" + " switch (compare) {\n" + " case 0u: // NEVER\n" + " return false;\n" + " case 1u: // LESS\n" + " return a < b;\n" + " case 2u: // EQUAL\n" + " return a == b;\n" + " 
case 3u: // LEQUAL\n" + " return a <= b;\n" + " case 4u: // GREATER\n" + " return a > b;\n" + " case 5u: // NEQUAL;\n" + " return a != b;\n" + " case 6u: // GEQUAL\n" + " return a >= b;\n" + " case 7u: // ALWAYS\n" + " return true;\n" + " }\n" + "}\n\n"); + + // ================= + // Input Selects + // ================= + + out.Write("struct State {\n" + " int4 Reg[4];\n" + " int4 TexColor;\n" + " int AlphaBump;\n" + "};\n" + "struct StageState {\n" + " uint stage;\n" + " uint order;\n" + " uint cc;\n" + " uint ac;\n"); + + out.Write("};\n" + "\n" + "int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1);\n" + "int4 getKonstColor(State s, StageState ss);\n" + "\n" + "int3 selectColorInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint " + "index) {\n" + " switch (index) {\n" + " case 0u: // prev.rgb\n" + " return s.Reg[0].rgb;\n" + " case 1u: // prev.aaa\n" + " return s.Reg[0].aaa;\n" + " case 2u: // c0.rgb\n" + " return s.Reg[1].rgb;\n" + " case 3u: // c0.aaa\n" + " return s.Reg[1].aaa;\n" + " case 4u: // c1.rgb\n" + " return s.Reg[2].rgb;\n" + " case 5u: // c1.aaa\n" + " return s.Reg[2].aaa;\n" + " case 6u: // c2.rgb\n" + " return s.Reg[3].rgb;\n" + " case 7u: // c2.aaa\n" + " return s.Reg[3].aaa;\n" + " case 8u:\n" + " return s.TexColor.rgb;\n" + " case 9u:\n" + " return s.TexColor.aaa;\n" + " case 10u:\n" + " return getRasColor(s, ss, colors_0, colors_1).rgb;\n" + " case 11u:\n" + " return getRasColor(s, ss, colors_0, colors_1).aaa;\n" + " case 12u: // One\n" + " return int3(255, 255, 255);\n" + " case 13u: // Half\n" + " return int3(128, 128, 128);\n" + " case 14u:\n" + " return getKonstColor(s, ss).rgb;\n" + " case 15u: // Zero\n" + " return int3(0, 0, 0);\n" + " }\n" + "}\n" + "\n" + "int selectAlphaInput(State s, StageState ss, float4 colors_0, float4 colors_1, uint " + "index) {\n" + " switch (index) {\n" + " case 0u: // prev.a\n" + " return s.Reg[0].a;\n" + " case 1u: // c0.a\n" + " return s.Reg[1].a;\n" + " case 2u: 
// c1.a\n" + " return s.Reg[2].a;\n" + " case 3u: // c2.a\n" + " return s.Reg[3].a;\n" + " case 4u:\n" + " return s.TexColor.a;\n" + " case 5u:\n" + " return getRasColor(s, ss, colors_0, colors_1).a;\n" + " case 6u:\n" + " return getKonstColor(s, ss).a;\n" + " case 7u: // Zero\n" + " return 0;\n" + " }\n" + "}\n" + "\n" + "int4 getTevReg(in State s, uint index) {\n" + " switch (index) {\n" + " case 0u: // prev\n" + " return s.Reg[0];\n" + " case 1u: // c0\n" + " return s.Reg[1];\n" + " case 2u: // c1\n" + " return s.Reg[2];\n" + " case 3u: // c2\n" + " return s.Reg[3];\n" + " default: // prev\n" + " return s.Reg[0];\n" + " }\n" + "}\n" + "\n" + "void setRegColor(inout State s, uint index, int3 color) {\n" + " switch (index) {\n" + " case 0u: // prev\n" + " s.Reg[0].rgb = color;\n" + " break;\n" + " case 1u: // c0\n" + " s.Reg[1].rgb = color;\n" + " break;\n" + " case 2u: // c1\n" + " s.Reg[2].rgb = color;\n" + " break;\n" + " case 3u: // c2\n" + " s.Reg[3].rgb = color;\n" + " break;\n" + " }\n" + "}\n" + "\n" + "void setRegAlpha(inout State s, uint index, int alpha) {\n" + " switch (index) {\n" + " case 0u: // prev\n" + " s.Reg[0].a = alpha;\n" + " break;\n" + " case 1u: // c0\n" + " s.Reg[1].a = alpha;\n" + " break;\n" + " case 2u: // c1\n" + " s.Reg[2].a = alpha;\n" + " break;\n" + " case 3u: // c2\n" + " s.Reg[3].a = alpha;\n" + " break;\n" + " }\n" + "}\n" + "\n"); + + // Since the texture coodinate variables aren't global, we need to pass + // them to the select function in D3D. 
+ if (numTexgen > 0) + { + if (ApiType != APIType::D3D) + { + out.Write("#define getTexCoord(index) selectTexCoord((index))\n\n"); + } + else + { + out.Write("#define getTexCoord(index) selectTexCoord((index)"); + for (u32 i = 0; i < numTexgen; i++) + out.Write(", tex%u", i); + out.Write(")\n\n"); + } + } + + if (ApiType == APIType::OpenGL || ApiType == APIType::Vulkan) + { + if (early_depth && host_config.backend_early_z) + out.Write("FORCE_EARLY_Z;\n"); + + out.Write("void main()\n{\n"); + out.Write(" float4 rawpos = gl_FragCoord;\n"); + } + else // D3D + { + if (early_depth && host_config.backend_early_z) + out.Write("[earlydepthstencil]\n"); + + out.Write("void main(\n" + " out float4 ocol0 : SV_Target0,\n" + " out float4 ocol1 : SV_Target1,\n" + " %s\n", + per_pixel_depth ? "\n out float depth : SV_Depth," : ""); + out.Write(" in float4 rawpos : SV_Position,\n"); + + out.Write(" in %s float4 colors_0 : COLOR0,\n", GetInterpolationQualifier(msaa, ssaa)); + out.Write(" in %s float4 colors_1 : COLOR1", GetInterpolationQualifier(msaa, ssaa)); + + // compute window position if needed because binding semantic WPOS is not widely supported + for (u32 i = 0; i < numTexgen; ++i) + out.Write(",\n in %s float3 tex%u : TEXCOORD%u", GetInterpolationQualifier(msaa, ssaa), i, + i); + out.Write("\n,\n in %s float4 clipPos : TEXCOORD%u", GetInterpolationQualifier(msaa, ssaa), + numTexgen); + if (per_pixel_lighting) + { + out.Write(",\n in %s float3 Normal : TEXCOORD%u", GetInterpolationQualifier(msaa, ssaa), + numTexgen + 1); + out.Write(",\n in %s float3 WorldPos : TEXCOORD%u", GetInterpolationQualifier(msaa, ssaa), + numTexgen + 2); + } + out.Write(",\n in float clipDist0 : SV_ClipDistance0\n"); + out.Write(",\n in float clipDist1 : SV_ClipDistance1\n"); + if (stereo) + out.Write(",\n in uint layer : SV_RenderTargetArrayIndex\n"); + out.Write("\n ) {\n"); + } + + out.Write(" int3 tevcoord = int3(0, 0, 0);\n" + " State s;\n" + " s.TexColor = int4(0, 0, 0, 0);\n" + " 
s.AlphaBump = 0;\n" + "\n"); + for (int i = 0; i < 4; i++) + out.Write(" s.Reg[%d] = " I_COLORS "[%d];\n", i, i); + + const char* color_input_prefix = ""; + if (per_pixel_lighting) + { + out.Write(" float4 lit_colors_0 = colors_0;\n"); + out.Write(" float4 lit_colors_1 = colors_1;\n"); + out.Write(" float3 lit_normal = normalize(Normal.xyz);\n"); + out.Write(" float3 lit_pos = WorldPos.xyz;\n"); + WriteVertexLighting(out, ApiType, "lit_pos", "lit_normal", "colors_0", "colors_1", + "lit_colors_0", "lit_colors_1"); + color_input_prefix = "lit_"; + } + + out.Write(" uint num_stages = %s;\n\n", + BitfieldExtract("bpmem_genmode", bpmem.genMode.numtevstages).c_str()); + + out.Write(" // Main tev loop\n"); + if (ApiType == APIType::D3D) + { + // Tell DirectX we don't want this loop unrolled (it crashes if it tries to) + out.Write(" [loop]\n"); + } + + out.Write(" for(uint stage = 0u; stage <= num_stages; stage++)\n" + " {\n" + " StageState ss;\n" + " ss.stage = stage;\n" + " ss.cc = bpmem_combiners(stage).x;\n" + " ss.ac = bpmem_combiners(stage).y;\n" + " ss.order = bpmem_tevorder(stage>>1);\n" + " if ((stage & 1u) == 1u)\n" + " ss.order = ss.order >> %d;\n\n", + int(TwoTevStageOrders().enable1.StartBit() - TwoTevStageOrders().enable0.StartBit())); + + // Disable texturing when there are no texgens (for now) + if (numTexgen != 0) + { + out.Write(" uint tex_coord = %s;\n", + BitfieldExtract("ss.order", TwoTevStageOrders().texcoord0).c_str()); + out.Write(" float3 uv = getTexCoord(tex_coord);\n" + " int2 fixedPoint_uv = int2((uv.z == 0.0 ? 
uv.xy : (uv.xy / uv.z)) * " I_TEXDIMS + "[tex_coord].zw);\n" + "\n" + " bool texture_enabled = (ss.order & %du) != 0u;\n", + 1 << TwoTevStageOrders().enable0.StartBit()); + out.Write("\n" + " // Indirect textures\n" + " uint tevind = bpmem_tevind(stage);\n" + " if (tevind != 0u)\n" + " {\n" + " uint bs = %s;\n", + BitfieldExtract("tevind", TevStageIndirect().bs).c_str()); + out.Write(" uint fmt = %s;\n", BitfieldExtract("tevind", TevStageIndirect().fmt).c_str()); + out.Write(" uint bias = %s;\n", + BitfieldExtract("tevind", TevStageIndirect().bias).c_str()); + out.Write(" uint bt = %s;\n", BitfieldExtract("tevind", TevStageIndirect().bt).c_str()); + out.Write(" uint mid = %s;\n", BitfieldExtract("tevind", TevStageIndirect().mid).c_str()); + out.Write("\n"); + out.Write(" int3 indcoord;\n"); + LookupIndirectTexture("indcoord", "bt"); + out.Write(" if (bs != 0u)\n" + " s.AlphaBump = indcoord[bs - 1u];\n" + " switch(fmt)\n" + " {\n" + " case %iu:\n", + ITF_8); + out.Write(" indcoord.x = indcoord.x + ((bias & 1u) != 0u ? -128 : 0);\n" + " indcoord.y = indcoord.y + ((bias & 2u) != 0u ? -128 : 0);\n" + " indcoord.z = indcoord.z + ((bias & 4u) != 0u ? -128 : 0);\n" + " s.AlphaBump = s.AlphaBump & 0xf8;\n" + " break;\n" + " case %iu:\n", + ITF_5); + out.Write(" indcoord.x = (indcoord.x & 0x1f) + ((bias & 1u) != 0u ? 1 : 0);\n" + " indcoord.y = (indcoord.y & 0x1f) + ((bias & 2u) != 0u ? 1 : 0);\n" + " indcoord.z = (indcoord.z & 0x1f) + ((bias & 4u) != 0u ? 1 : 0);\n" + " s.AlphaBump = s.AlphaBump & 0xe0;\n" + " break;\n" + " case %iu:\n", + ITF_4); + out.Write(" indcoord.x = (indcoord.x & 0x0f) + ((bias & 1u) != 0u ? 1 : 0);\n" + " indcoord.y = (indcoord.y & 0x0f) + ((bias & 2u) != 0u ? 1 : 0);\n" + " indcoord.z = (indcoord.z & 0x0f) + ((bias & 4u) != 0u ? 1 : 0);\n" + " s.AlphaBump = s.AlphaBump & 0xf0;\n" + " break;\n" + " case %iu:\n", + ITF_3); + out.Write(" indcoord.x = (indcoord.x & 0x07) + ((bias & 1u) != 0u ? 
1 : 0);\n" + " indcoord.y = (indcoord.y & 0x07) + ((bias & 2u) != 0u ? 1 : 0);\n" + " indcoord.z = (indcoord.z & 0x07) + ((bias & 4u) != 0u ? 1 : 0);\n" + " s.AlphaBump = s.AlphaBump & 0xf8;\n" + " break;\n" + " }\n" + "\n" + " // Matrix multiply\n" + " int2 indtevtrans = int2(0, 0);\n" + " if ((mid & 3u) != 0u)\n" + " {\n" + " uint mtxidx = 2u * ((mid & 3u) - 1u);\n" + " int shift = " I_INDTEXMTX "[mtxidx].w;\n" + "\n" + " switch (mid >> 2)\n" + " {\n" + " case 0u: // 3x2 S0.10 matrix\n" + " indtevtrans = int2(idot(" I_INDTEXMTX + "[mtxidx].xyz, indcoord), idot(" I_INDTEXMTX "[mtxidx + 1u].xyz, indcoord)) >> 3;\n" + " break;\n" + " case 1u: // S matrix, S17.7 format\n" + " indtevtrans = (fixedPoint_uv * indcoord.xx) >> 8;\n" + " break;\n" + " case 2u: // T matrix, S17.7 format\n" + " indtevtrans = (fixedPoint_uv * indcoord.yy) >> 8;\n" + " break;\n" + " }\n" + "\n" + " if (shift >= 0)\n" + " indtevtrans = indtevtrans >> shift;\n" + " else\n" + " indtevtrans = indtevtrans << ((-shift) & 31);\n" + " }\n" + "\n" + " // Wrapping\n" + " uint sw = %s;\n", + BitfieldExtract("tevind", TevStageIndirect().sw).c_str()); + out.Write(" uint tw = %s; \n", BitfieldExtract("tevind", TevStageIndirect().tw).c_str()); + out.Write( + " int2 wrapped_coord = int2(Wrap(fixedPoint_uv.x, sw), Wrap(fixedPoint_uv.y, tw));\n" + "\n" + " if ((tevind & %du) != 0u) // add previous tevcoord\n", + 1 << TevStageIndirect().fb_addprev.StartBit()); + out.Write(" tevcoord.xy += wrapped_coord + indtevtrans;\n" + " else\n" + " tevcoord.xy = wrapped_coord + indtevtrans;\n" + "\n" + " // Emulate s24 overflows\n" + " tevcoord.xy = (tevcoord.xy << 8) >> 8;\n" + " }\n" + " else if (texture_enabled)\n" + " {\n" + " tevcoord.xy = fixedPoint_uv;\n" + " }\n" + "\n" + " // Sample texture for stage\n" + " if(texture_enabled) {\n" + " uint sampler_num = %s;\n", + BitfieldExtract("ss.order", TwoTevStageOrders().texmap0).c_str()); + out.Write("\n" + " float2 uv = (float2(tevcoord.xy)) * " I_TEXDIMS 
"[sampler_num].xy;\n" + "\n" + " int4 color = sampleTexture(sampler_num, uv);\n" + "\n" + " uint swap = %s;\n", + BitfieldExtract("ss.ac", TevStageCombiner().alphaC.tswap).c_str()); + out.Write(" s.TexColor = Swizzle(swap, color);\n"); + out.Write(" } else {\n" + " // Texture is disabled\n" + " s.TexColor = int4(255, 255, 255, 255);\n" + " }\n" + "\n"); + } + + out.Write(" // This is the Meat of TEV\n" + " {\n" + " // Color Combiner\n"); + out.Write(" uint color_a = %s;\n", + BitfieldExtract("ss.cc", TevStageCombiner().colorC.a).c_str()); + out.Write(" uint color_b = %s;\n", + BitfieldExtract("ss.cc", TevStageCombiner().colorC.b).c_str()); + out.Write(" uint color_c = %s;\n", + BitfieldExtract("ss.cc", TevStageCombiner().colorC.c).c_str()); + out.Write(" uint color_d = %s;\n", + BitfieldExtract("ss.cc", TevStageCombiner().colorC.d).c_str()); + + out.Write(" uint color_bias = %s;\n", + BitfieldExtract("ss.cc", TevStageCombiner().colorC.bias).c_str()); + out.Write(" bool color_op = bool(%s);\n", + BitfieldExtract("ss.cc", TevStageCombiner().colorC.op).c_str()); + out.Write(" bool color_clamp = bool(%s);\n", + BitfieldExtract("ss.cc", TevStageCombiner().colorC.clamp).c_str()); + out.Write(" uint color_shift = %s;\n", + BitfieldExtract("ss.cc", TevStageCombiner().colorC.shift).c_str()); + out.Write(" uint color_dest = %s;\n", + BitfieldExtract("ss.cc", TevStageCombiner().colorC.dest).c_str()); + + out.Write(" uint color_compare_op = color_shift << 1 | uint(color_op);\n" + "\n" + " int3 color_A = selectColorInput(s, ss, %scolors_0, %scolors_1, color_a) & " + "int3(255, 255, 255);\n" + " int3 color_B = selectColorInput(s, ss, %scolors_0, %scolors_1, color_b) & " + "int3(255, 255, 255);\n" + " int3 color_C = selectColorInput(s, ss, %scolors_0, %scolors_1, color_c) & " + "int3(255, 255, 255);\n" + " int3 color_D = selectColorInput(s, ss, %scolors_0, %scolors_1, color_d); // 10 " + "bits + sign\n" + "\n", // TODO: do we need to sign extend? 
+ color_input_prefix, + color_input_prefix, color_input_prefix, color_input_prefix, color_input_prefix, + color_input_prefix, color_input_prefix, color_input_prefix); + out.Write( + " int3 color;\n" + " if(color_bias != 3u) { // Normal mode\n" + " color = tevLerp3(color_A, color_B, color_C, color_D, color_bias, color_op, false, " + "color_shift);\n" + " } else { // Compare mode\n" + " // op 6 and 7 do a select per color channel\n" + " if (color_compare_op == 6u) {\n" + " // TEVCMP_RGB8_GT\n" + " color.r = (color_A.r > color_B.r) ? color_C.r : 0;\n" + " color.g = (color_A.g > color_B.g) ? color_C.g : 0;\n" + " color.b = (color_A.b > color_B.b) ? color_C.b : 0;\n" + " } else if (color_compare_op == 7u) {\n" + " // TEVCMP_RGB8_EQ\n" + " color.r = (color_A.r == color_B.r) ? color_C.r : 0;\n" + " color.g = (color_A.g == color_B.g) ? color_C.g : 0;\n" + " color.b = (color_A.b == color_B.b) ? color_C.b : 0;\n" + " } else {\n" + " // The remaining ops do one compare which selects all 3 channels\n" + " color = tevCompare(color_compare_op, color_A, color_B) ? 
color_C : int3(0, 0, " + "0);\n" + " }\n" + " color = color_D + color;\n" + " }\n" + "\n" + " // Clamp result\n" + " if (color_clamp)\n" + " color = clamp(color, 0, 255);\n" + " else\n" + " color = clamp(color, -1024, 1023);\n" + "\n" + " // Write result to the correct input register of the next stage\n" + " setRegColor(s, color_dest, color);\n" + "\n"); + + // Alpha combiner + out.Write(" // Alpha Combiner\n"); + out.Write(" uint alpha_a = %s;\n", + BitfieldExtract("ss.ac", TevStageCombiner().alphaC.a).c_str()); + out.Write(" uint alpha_b = %s;\n", + BitfieldExtract("ss.ac", TevStageCombiner().alphaC.b).c_str()); + out.Write(" uint alpha_c = %s;\n", + BitfieldExtract("ss.ac", TevStageCombiner().alphaC.c).c_str()); + out.Write(" uint alpha_d = %s;\n", + BitfieldExtract("ss.ac", TevStageCombiner().alphaC.d).c_str()); + + out.Write(" uint alpha_bias = %s;\n", + BitfieldExtract("ss.ac", TevStageCombiner().alphaC.bias).c_str()); + out.Write(" bool alpha_op = bool(%s);\n", + BitfieldExtract("ss.ac", TevStageCombiner().alphaC.op).c_str()); + out.Write(" bool alpha_clamp = bool(%s);\n", + BitfieldExtract("ss.ac", TevStageCombiner().alphaC.clamp).c_str()); + out.Write(" uint alpha_shift = %s;\n", + BitfieldExtract("ss.ac", TevStageCombiner().alphaC.shift).c_str()); + out.Write(" uint alpha_dest = %s;\n", + BitfieldExtract("ss.ac", TevStageCombiner().alphaC.dest).c_str()); + + out.Write( + " uint alpha_compare_op = alpha_shift << 1 | uint(alpha_op);\n" + "\n" + " int alpha_A;\n" + " int alpha_B;\n" + " if (alpha_bias != 3u || alpha_compare_op > 5u) {\n" + " // Small optimisation here: alpha_A and alpha_B are unused by compare ops 0-5\n" + " alpha_A = selectAlphaInput(s, ss, %scolors_0, %scolors_1, alpha_a) & 255;\n" + " alpha_B = selectAlphaInput(s, ss, %scolors_0, %scolors_1, alpha_b) & 255;\n" + " };\n" + " int alpha_C = selectAlphaInput(s, ss, %scolors_0, %scolors_1, alpha_c) & 255;\n" + " int alpha_D = selectAlphaInput(s, ss, %scolors_0, %scolors_1, alpha_d); // 10 bits 
+ " + "sign\n" + "\n", // TODO: do we need to sign extend? + color_input_prefix, + color_input_prefix, color_input_prefix, color_input_prefix, color_input_prefix, + color_input_prefix, color_input_prefix, color_input_prefix); + out.Write("\n" + " int alpha;\n" + " if(alpha_bias != 3u) { // Normal mode\n" + " alpha = tevLerp(alpha_A, alpha_B, alpha_C, alpha_D, alpha_bias, alpha_op, " + "true, alpha_shift);\n" + " } else { // Compare mode\n" + " if (alpha_compare_op == 6u) {\n" + " // TEVCMP_A8_GT\n" + " alpha = (alpha_A > alpha_B) ? alpha_C : 0;\n" + " } else if (alpha_compare_op == 7u) {\n" + " // TEVCMP_A8_EQ\n" + " alpha = (alpha_A == alpha_B) ? alpha_C : 0;\n" + " } else {\n" + " // All remaining alpha compare ops actually compare the color channels\n" + " alpha = tevCompare(alpha_compare_op, color_A, color_B) ? alpha_C : 0;\n" + " }\n" + " alpha = alpha_D + alpha;\n" + " }\n" + "\n" + " // Clamp result\n" + " if (alpha_clamp)\n" + " alpha = clamp(alpha, 0, 255);\n" + " else\n" + " alpha = clamp(alpha, -1024, 1023);\n" + "\n" + " // Write result to the correct input register of the next stage\n" + " setRegAlpha(s, alpha_dest, alpha);\n" + " }\n"); + + out.Write(" } // Main tev loop\n" + "\n"); + + // Select the output color and alpha registers from the last stage. 
+ out.Write(" int4 TevResult;\n"); + out.Write( + " TevResult.xyz = getTevReg(s, %s).xyz;\n", + BitfieldExtract("bpmem_combiners(num_stages).x", TevStageCombiner().colorC.dest).c_str()); + out.Write( + " TevResult.w = getTevReg(s, %s).w;\n", + BitfieldExtract("bpmem_combiners(num_stages).y", TevStageCombiner().alphaC.dest).c_str()); + + out.Write(" TevResult &= 255;\n\n"); + + if (host_config.fast_depth_calc) + { + if (ApiType == APIType::D3D || ApiType == APIType::Vulkan) + out.Write(" int zCoord = int((1.0 - rawpos.z) * 16777216.0);\n"); + else + out.Write(" int zCoord = int(rawpos.z * 16777216.0);\n"); + out.Write(" zCoord = clamp(zCoord, 0, 0xFFFFFF);\n" + "\n"); + } + else + { + out.Write("\tint zCoord = " I_ZBIAS "[1].x + int((clipPos.z / clipPos.w) * float(" I_ZBIAS + "[1].y));\n"); + } + + // =========== + // ZFreeze + // =========== + + if (per_pixel_depth) + { + // Zfreeze forces early depth off + out.Write(" // ZFreeze\n" + " if ((bpmem_genmode & %du) != 0u) {\n", + 1 << GenMode().zfreeze.StartBit()); + out.Write(" float2 screenpos = rawpos.xy * " I_EFBSCALE ".xy;\n"); + if (ApiType == APIType::OpenGL) + out.Write(" // Opengl has reversed vertical screenspace coordiantes\n" + " screenpos.y = 528.0 - screenpos.y;\n"); + + out.Write(" zCoord = int(" I_ZSLOPE ".z + " I_ZSLOPE ".x * screenpos.x + " I_ZSLOPE + ".y * screenpos.y);\n" + " }\n" + "\n"); + } + + // ================= + // Depth Texture + // ================= + + out.Write(" // Depth Texture\n" + " int early_zCoord = zCoord;\n" + " if (bpmem_ztex_op != 0u) {\n" + " int ztex = int(" I_ZBIAS "[1].w); // fixed bias\n" + "\n" + " // Whatever texture was in our last stage, it's now our depth texture\n" + " ztex += idot(s.TexColor.xyzw, " I_ZBIAS "[0].xyzw);\n" + " ztex += (bpmem_ztex_op == 1u) ? 
zCoord : 0;\n" + " zCoord = ztex & 0xFFFFFF;\n" + " }\n" + "\n"); + + if (per_pixel_depth) + { + out.Write(" // If early depth is enabled, write to zbuffer before depth textures\n"); + out.Write(" // If early depth isn't enabled, we write to the zbuffer here\n"); + out.Write(" int zbuffer_zCoord = bpmem_early_ztest ? early_zCoord : zCoord;\n"); + if (ApiType == APIType::D3D || ApiType == APIType::Vulkan) + out.Write(" depth = 1.0 - float(zbuffer_zCoord) / 16777216.0;\n"); + else + out.Write(" depth = float(zbuffer_zCoord) / 16777216.0;\n"); + } + + out.Write(" // Alpha Test\n" + " if (bpmem_alphaTest != 0u) {\n" + " bool comp0 = alphaCompare(TevResult.a, " I_ALPHA ".r, %s);\n", + BitfieldExtract("bpmem_alphaTest", AlphaTest().comp0).c_str()); + out.Write(" bool comp1 = alphaCompare(TevResult.a, " I_ALPHA ".g, %s);\n", + BitfieldExtract("bpmem_alphaTest", AlphaTest().comp1).c_str()); + out.Write("\n" + " // These if statements are written weirdly to work around intel and qualcom bugs " + "with handling booleans.\n" + " switch (%s) {\n", + BitfieldExtract("bpmem_alphaTest", AlphaTest().logic).c_str()); + out.Write(" case 0u: // AND\n" + " if (comp0 && comp1) break; else discard; break;\n" + " case 1u: // OR\n" + " if (comp0 || comp1) break; else discard; break;\n" + " case 2u: // XOR\n" + " if (comp0 != comp1) break; else discard; break;\n" + " case 3u: // XNOR\n" + " if (comp0 == comp1) break; else discard; break;\n" + " }\n" + " }\n" + "\n"); + + // ========= + // Dithering + // ========= + out.Write(" if (bpmem_dither) {\n" + " // Flipper uses a standard 2x2 Bayer Matrix for 6 bit dithering\n" + " // Here the matrix is encoded into the two factor constants\n" + " int2 dither = int2(rawpos.xy) & 1;\n" + " TevResult.rgb = (TevResult.rgb - (TevResult.rgb >> 6)) + abs(dither.y * 3 - " + "dither.x * 2);\n" + " }\n\n"); + + // ========= + // Fog + // ========= + + // FIXME: Fog is implemented the same as ShaderGen, but ShaderGen's fog is all hacks. 
+ // Should be fixed point, and should not make guesses about Range-Based adjustments. + out.Write(" // Fog\n" + " uint fog_function = %s;\n", + BitfieldExtract("bpmem_fogParam3", FogParam3().fsel).c_str()); + out.Write(" if (fog_function != 0u) {\n" + " // TODO: This all needs to be converted from float to fixed point\n" + " float ze;\n" + " if (%s == 0u) {\n", + BitfieldExtract("bpmem_fogParam3", FogParam3().proj).c_str()); + out.Write(" // perspective\n" + " // ze = A/(B - (Zs >> B_SHF)\n" + " ze = (" I_FOGF "[1].x * 16777216.0) / float(" I_FOGI ".y - (zCoord >> " I_FOGI + ".w));\n" + " } else {\n" + " // orthographic\n" + " // ze = a*Zs (here, no B_SHF)\n" + " ze = " I_FOGF "[1].x * float(zCoord) / 16777216.0;\n" + " }\n" + "\n" + " if (bool(%s)) {\n", + BitfieldExtract("bpmem_fogRangeBase", FogRangeParams::RangeBase().Enabled).c_str()); + out.Write(" // x_adjust = sqrt((x-center)^2 + k^2)/k\n" + " // ze *= x_adjust\n" + " // TODO Instead of this theoretical calculation, we should use the\n" + " // coefficient table given in the fog range BP registers!\n" + " float x_adjust = (2.0 * (rawpos.x / " I_FOGF "[0].y)) - 1.0 - " I_FOGF + "[0].x; \n" + " x_adjust = sqrt(x_adjust * x_adjust + " I_FOGF "[0].z * " I_FOGF + "[0].z) / " I_FOGF "[0].z;\n" + " ze *= x_adjust;\n" + " }\n" + "\n" + " float fog = clamp(ze - " I_FOGF "[1].z, 0.0, 1.0);\n" + "\n" + " if (fog_function > 3u) {\n" + " switch (fog_function) {\n" + " case 4u:\n" + " fog = 1.0 - exp2(-8.0 * fog);\n" + " break;\n" + " case 5u:\n" + " fog = 1.0 - exp2(-8.0 * fog * fog);\n" + " break;\n" + " case 6u:\n" + " fog = exp2(-8.0 * (1.0 - fog));\n" + " break;\n" + " case 7u:\n" + " fog = 1.0 - fog;\n" + " fog = exp2(-8.0 * fog * fog);\n" + " break;\n" + " }\n" + " }\n" + "\n" + " int ifog = iround(fog * 256.0);\n" + " TevResult.rgb = (TevResult.rgb * (256 - ifog) + " I_FOGCOLOR ".rgb * ifog) >> 8;\n" + " }\n" + "\n"); + + // TODO: Do we still want to support two pass alpha blending? 
+ out.Write(" if (bpmem_rgba6_format)\n" + " ocol0.rgb = float3(TevResult.rgb >> 2) / 63.0;\n" + " else\n" + " ocol0.rgb = float3(TevResult.rgb) / 255.0;\n" + "\n" + " if (bpmem_dstalpha != 0u)\n"); + out.Write(" ocol0.a = float(%s >> 2) / 63.0;\n", + BitfieldExtract("bpmem_dstalpha", ConstantAlpha().alpha).c_str()); + out.Write(" else\n" + " ocol0.a = float(TevResult.a >> 2) / 63.0;\n" + " \n"); + + if (use_dual_source) + { + out.Write(" // Dest alpha override (dual source blending)\n" + " // Colors will be blended against the alpha from ocol1 and\n" + " // the alpha from ocol0 will be written to the framebuffer.\n" + " ocol1 = float4(0.0, 0.0, 0.0, float(TevResult.a) / 255.0);\n"); + } + + if (bounding_box) + { + const char* atomic_op = + (ApiType == APIType::OpenGL || ApiType == APIType::Vulkan) ? "atomic" : "Interlocked"; + out.Write(" if (bpmem_bounding_box) {\n"); + out.Write(" if(bbox_data[0] > int(rawpos.x)) %sMin(bbox_data[0], int(rawpos.x));\n", + atomic_op); + out.Write(" if(bbox_data[1] < int(rawpos.x)) %sMax(bbox_data[1], int(rawpos.x));\n", + atomic_op); + out.Write(" if(bbox_data[2] > int(rawpos.y)) %sMin(bbox_data[2], int(rawpos.y));\n", + atomic_op); + out.Write(" if(bbox_data[3] < int(rawpos.y)) %sMax(bbox_data[3], int(rawpos.y));\n", + atomic_op); + out.Write(" }\n"); + } + + out.Write("}\n" + "\n" + "int4 getRasColor(State s, StageState ss, float4 colors_0, float4 colors_1) {\n" + " // Select Ras for stage\n" + " uint ras = %s;\n", + BitfieldExtract("ss.order", TwoTevStageOrders().colorchan0).c_str()); + out.Write(" if (ras < 2u) { // Lighting Channel 0 or 1\n" + " int4 color = iround(((ras == 0u) ? 
colors_0 : colors_1) * 255.0);\n" + " uint swap = %s;\n", + BitfieldExtract("ss.ac", TevStageCombiner().alphaC.rswap).c_str()); + out.Write(" return Swizzle(swap, color);\n"); + out.Write(" } else if (ras == 5u) { // Alpha Bumb\n" + " return int4(s.AlphaBump, s.AlphaBump, s.AlphaBump, s.AlphaBump);\n" + " } else if (ras == 6u) { // Normalzied Alpha Bump\n" + " int normalized = s.AlphaBump | s.AlphaBump >> 5;\n" + " return int4(normalized, normalized, normalized, normalized);\n" + " } else {\n" + " return int4(0, 0, 0, 0);\n" + " }\n" + "}\n" + "\n" + "int4 getKonstColor(State s, StageState ss) {\n" + " // Select Konst for stage\n" + " // TODO: a switch case might be better here than an dynamically" + " // indexed uniform lookup\n" + " uint tevksel = bpmem_tevksel(ss.stage>>1);\n" + " if ((ss.stage & 1u) == 0u)\n" + " return int4(konstLookup[%s].rgb, konstLookup[%s].a);\n", + BitfieldExtract("tevksel", bpmem.tevksel[0].kcsel0).c_str(), + BitfieldExtract("tevksel", bpmem.tevksel[0].kasel0).c_str()); + out.Write(" else\n" + " return int4(konstLookup[%s].rgb, konstLookup[%s].a);\n", + BitfieldExtract("tevksel", bpmem.tevksel[0].kcsel1).c_str(), + BitfieldExtract("tevksel", bpmem.tevksel[0].kasel1).c_str()); + out.Write("}\n"); + + return out; +} + +void EnumeratePixelShaderUids(const std::function<void(const PixelShaderUid&)>& callback) +{ + PixelShaderUid uid; + std::memset(&uid, 0, sizeof(uid)); + + for (u32 texgens = 0; texgens <= 8; texgens++) + { + auto* puid = uid.GetUidData(); + puid->num_texgens = texgens; + + for (u32 early_depth = 0; early_depth < 2; early_depth++) + { + puid->early_depth = early_depth != 0; + for (u32 per_pixel_depth = 0; per_pixel_depth < 2; per_pixel_depth++) + { + // Don't generate shaders where we have early depth tests enabled, and write gl_FragDepth. 
+ if (early_depth && per_pixel_depth) + continue; + + puid->per_pixel_depth = per_pixel_depth != 0; + callback(uid); + } + } + } +} +} diff --git a/Source/Core/VideoCommon/UberShaderPixel.h b/Source/Core/VideoCommon/UberShaderPixel.h new file mode 100644 index 0000000000..d7dc8109e8 --- /dev/null +++ b/Source/Core/VideoCommon/UberShaderPixel.h @@ -0,0 +1,34 @@ +// Copyright 2015 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license.txt file included. + +#pragma once + +#include <functional> +#include "VideoCommon/PixelShaderGen.h" + +namespace UberShader +{ +#pragma pack(1) +// Bitfields that identify one pixel ubershader variant. +struct pixel_ubershader_uid_data +{ + u32 num_texgens : 4; + u32 early_depth : 1; + u32 per_pixel_depth : 1; + + u32 NumValues() const { return sizeof(pixel_ubershader_uid_data); } +}; +#pragma pack() + +typedef ShaderUid<pixel_ubershader_uid_data> PixelShaderUid; + +PixelShaderUid GetPixelShaderUid(); + +ShaderCode GenPixelShader(APIType ApiType, const ShaderHostConfig& host_config, + const pixel_ubershader_uid_data* uid_data); + +// Invokes callback once for every UID with num_texgens in [0, 8] and each early_depth / +// per_pixel_depth combination, skipping the one where both are enabled. +void EnumeratePixelShaderUids(const std::function<void(const PixelShaderUid&)>& callback); +} diff --git a/Source/Core/VideoCommon/UberShaderVertex.cpp b/Source/Core/VideoCommon/UberShaderVertex.cpp new file mode 100644 index 0000000000..ebc9c80f0e --- /dev/null +++ b/Source/Core/VideoCommon/UberShaderVertex.cpp @@ -0,0 +1,469 @@ +// Copyright 2015 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license.txt file included. 
+ +#include "VideoCommon/UberShaderVertex.h" +#include "VideoCommon/DriverDetails.h" +#include "VideoCommon/NativeVertexFormat.h" +#include "VideoCommon/UberShaderCommon.h" +#include "VideoCommon/VertexShaderGen.h" +#include "VideoCommon/VideoConfig.h" +#include "VideoCommon/XFMemory.h" + +namespace UberShader +{ +// Builds a zero-initialised vertex ubershader UID whose only populated field is the texgen +// count read from xfmem. +VertexShaderUid GetVertexShaderUid() +{ + VertexShaderUid out; + vertex_ubershader_uid_data* uid_data = out.GetUidData(); + memset(uid_data, 0, sizeof(*uid_data)); + uid_data->num_texgens = xfmem.numTexGen.numTexGens; + return out; +} + +static void GenVertexShaderTexGens(APIType ApiType, u32 numTexgen, ShaderCode& out); + +ShaderCode GenVertexShader(APIType ApiType, const ShaderHostConfig& host_config, + const vertex_ubershader_uid_data* uid_data) +{ + const bool msaa = host_config.msaa; + const bool ssaa = host_config.ssaa; + const bool per_pixel_lighting = host_config.per_pixel_lighting; + const bool vertex_rounding = host_config.vertex_rounding; + const u32 numTexgen = uid_data->num_texgens; + ShaderCode out; + + out.Write("// Vertex UberShader\n\n"); + out.Write("%s", s_lighting_struct); + + // uniforms + if (ApiType == APIType::OpenGL || ApiType == APIType::Vulkan) + out.Write("UBO_BINDING(std140, 2) uniform VSBlock {\n"); + else + out.Write("cbuffer VSBlock {\n"); + out.Write(s_shader_uniforms); + out.Write("};\n"); + + out.Write("struct VS_OUTPUT {\n"); + GenerateVSOutputMembers(out, ApiType, numTexgen, per_pixel_lighting, ""); + out.Write("};\n\n"); + + WriteUberShaderCommonHeader(out, ApiType, host_config); + WriteLightingFunction(out); + + if (ApiType == APIType::OpenGL || ApiType == APIType::Vulkan) + { + out.Write("ATTRIBUTE_LOCATION(%d) in float4 rawpos;\n", SHADER_POSITION_ATTRIB); + out.Write("ATTRIBUTE_LOCATION(%d) in uint4 posmtx;\n", SHADER_POSMTX_ATTRIB); + out.Write("ATTRIBUTE_LOCATION(%d) in float3 rawnorm0;\n", 
SHADER_NORM0_ATTRIB); + out.Write("ATTRIBUTE_LOCATION(%d) in float3 rawnorm1;\n", SHADER_NORM1_ATTRIB); + 
out.Write("ATTRIBUTE_LOCATION(%d) in float3 rawnorm2;\n", SHADER_NORM2_ATTRIB); + out.Write("ATTRIBUTE_LOCATION(%d) in float4 rawcolor0;\n", SHADER_COLOR0_ATTRIB); + out.Write("ATTRIBUTE_LOCATION(%d) in float4 rawcolor1;\n", SHADER_COLOR1_ATTRIB); + for (int i = 0; i < 8; ++i) + out.Write("ATTRIBUTE_LOCATION(%d) in float3 rawtex%d;\n", SHADER_TEXTURE0_ATTRIB + i, i); + + // We need to always use output blocks for Vulkan, but geometry shaders are also optional. + if (host_config.backend_geometry_shaders || ApiType == APIType::Vulkan) + { + out.Write("VARYING_LOCATION(0) out VertexData {\n"); + GenerateVSOutputMembers(out, ApiType, numTexgen, per_pixel_lighting, + GetInterpolationQualifier(msaa, ssaa, true, false)); + out.Write("} vs;\n"); + } + else + { + // Let's set up attributes + for (u32 i = 0; i < numTexgen; ++i) + out.Write("%s out float3 tex%u;\n", GetInterpolationQualifier(msaa, ssaa), i); + + out.Write("%s out float4 clipPos;\n", GetInterpolationQualifier(msaa, ssaa)); + if (per_pixel_lighting) + { + out.Write("%s out float3 Normal;\n", GetInterpolationQualifier(msaa, ssaa)); + out.Write("%s out float3 WorldPos;\n", GetInterpolationQualifier(msaa, ssaa)); + } + out.Write("%s out float4 colors_0;\n", GetInterpolationQualifier(msaa, ssaa)); + out.Write("%s out float4 colors_1;\n", GetInterpolationQualifier(msaa, ssaa)); + } + + out.Write("void main()\n{\n"); + } + else // D3D + { + out.Write("VS_OUTPUT main(\n"); + + // inputs + out.Write(" float3 rawnorm0 : NORMAL0,\n"); + out.Write(" float3 rawnorm1 : NORMAL1,\n"); + out.Write(" float3 rawnorm2 : NORMAL2,\n"); + out.Write(" float4 rawcolor0 : COLOR0,\n"); + out.Write(" float4 rawcolor1 : COLOR1,\n"); + for (int i = 0; i < 8; ++i) + out.Write(" float3 rawtex%d : TEXCOORD%d,\n", i, i); + out.Write(" uint posmtx : BLENDINDICES,\n"); + out.Write(" float4 rawpos : POSITION) {\n"); + } + + out.Write("VS_OUTPUT o;\n" + "\n"); + + // Transforms + out.Write("// Position matrix\n" + "float4 P0;\n" + "float4 P1;\n" + 
"float4 P2;\n" + "\n" + "// Normal matrix\n" + "float3 N0;\n" + "float3 N1;\n" + "float3 N2;\n" + "\n" + "if ((components & %uu) != 0u) {// VB_HAS_POSMTXIDX\n", + VB_HAS_POSMTXIDX); + out.Write(" // Vertex format has a per-vertex matrix\n" + " int posidx = int(posmtx.r);\n" + " P0 = " I_TRANSFORMMATRICES "[posidx];\n" + " P1 = " I_TRANSFORMMATRICES "[posidx+1];\n" + " P2 = " I_TRANSFORMMATRICES "[posidx+2];\n" + "\n" + " int normidx = posidx >= 32 ? (posidx - 32) : posidx;\n" + " N0 = " I_NORMALMATRICES "[normidx].xyz;\n" + " N1 = " I_NORMALMATRICES "[normidx+1].xyz;\n" + " N2 = " I_NORMALMATRICES "[normidx+2].xyz;\n" + "} else {\n" + " // One shared matrix\n" + " P0 = " I_POSNORMALMATRIX "[0];\n" + " P1 = " I_POSNORMALMATRIX "[1];\n" + " P2 = " I_POSNORMALMATRIX "[2];\n" + " N0 = " I_POSNORMALMATRIX "[3].xyz;\n" + " N1 = " I_POSNORMALMATRIX "[4].xyz;\n" + " N2 = " I_POSNORMALMATRIX "[5].xyz;\n" + "}\n" + "\n" + "float4 pos = float4(dot(P0, rawpos), dot(P1, rawpos), dot(P2, rawpos), 1.0);\n" + "o.pos = float4(dot(" I_PROJECTION "[0], pos), dot(" I_PROJECTION + "[1], pos), dot(" I_PROJECTION "[2], pos), dot(" I_PROJECTION "[3], pos));\n" + "\n" + "// Only the first normal gets normalized (TODO: why?)\n" + "float3 _norm0 = float3(0.0, 0.0, 0.0);\n" + "if ((components & %uu) != 0u) // VB_HAS_NRM0\n", + VB_HAS_NRM0); + out.Write( + " _norm0 = normalize(float3(dot(N0, rawnorm0), dot(N1, rawnorm0), dot(N2, rawnorm0)));\n" + "\n" + "float3 _norm1 = float3(0.0, 0.0, 0.0);\n" + "if ((components & %uu) != 0u) // VB_HAS_NRM1\n", + VB_HAS_NRM1); + out.Write(" _norm1 = float3(dot(N0, rawnorm1), dot(N1, rawnorm1), dot(N2, rawnorm1));\n" + "\n" + "float3 _norm2 = float3(0.0, 0.0, 0.0);\n" + "if ((components & %uu) != 0u) // VB_HAS_NRM2\n", + VB_HAS_NRM2); + out.Write(" _norm2 = float3(dot(N0, rawnorm2), dot(N1, rawnorm2), dot(N2, rawnorm2));\n" + "\n"); + + // Hardware Lighting + WriteVertexLighting(out, ApiType, "pos.xyz", "_norm0", "rawcolor0", "rawcolor1", "o.colors_0", + 
"o.colors_1"); + + // Texture Coordinates + if (numTexgen > 0) + GenVertexShaderTexGens(ApiType, numTexgen, out); + + // clipPos/w needs to be done in pixel shader, not here + out.Write("o.clipPos = o.pos;\n"); + + if (per_pixel_lighting) + { + out.Write("o.Normal = _norm0;\n"); + out.Write("o.WorldPos = pos.xyz;\n"); + out.Write("if ((components & %uu) != 0u) // VB_HAS_COL0\n", VB_HAS_COL0); + out.Write(" o.colors_0 = rawcolor0;\n"); + out.Write("if ((components & %uu) != 0u) // VB_HAS_COL1\n", VB_HAS_COL1); + out.Write(" o.colors_1 = rawcolor1;\n"); + } + + // If we can disable the incorrect depth clipping planes using depth clamping, then we can do + // our own depth clipping and calculate the depth range before the perspective divide if + // necessary. + if (host_config.backend_depth_clamp) + { + // Since we're adjusting z for the depth range before the perspective divide, we have to do our + // own clipping. We want to clip so that -w <= z <= 0, which matches the console -1..0 range. + // We adjust our depth value for clipping purposes to match the perspective projection in the + // software backend, which is a hack to fix Sonic Adventure and Unleashed games. + out.Write("float clipDepth = o.pos.z * (1.0 - 1e-7);\n"); + out.Write("o.clipDist0 = clipDepth + o.pos.w;\n"); // Near: z < -w + out.Write("o.clipDist1 = -clipDepth;\n"); // Far: z > 0 + } + + // Write the true depth value. If the game uses depth textures, then the pixel shader will + // override it with the correct values if not then early z culling will improve speed. + // There are two different ways to do this, when the depth range is oversized, we process + // the depth range in the vertex shader, if not we let the host driver handle it. + // + // Adjust z for the depth range. We're using an equation which incorperates a depth inversion, + // so we can map the console -1..0 range to the 0..1 range used in the depth buffer. 
+ // We have to handle the depth range in the vertex shader instead of after the perspective + // divide, because some games will use a depth range larger than what is allowed by the + // graphics API. These large depth ranges will still be clipped to the 0..1 range, so these + // games effectively add a depth bias to the values written to the depth buffer. + out.Write("o.pos.z = o.pos.w * " I_PIXELCENTERCORRECTION ".w - " + "o.pos.z * " I_PIXELCENTERCORRECTION ".z;\n"); + + if (!host_config.backend_clip_control) + { + // If the graphics API doesn't support a depth range of 0..1, then we need to map z to + // the -1..1 range. Unfortunately we have to use a subtraction, which is a lossy floating-point + // operation that can introduce a round-trip error. + out.Write("o.pos.z = o.pos.z * 2.0 - o.pos.w;\n"); + } + + // Correct for negative viewports by mirroring all vertices. We need to negate the height here, + // since the viewport height is already negated by the render backend. + out.Write("o.pos.xy *= sign(" I_PIXELCENTERCORRECTION ".xy * float2(1.0, -1.0));\n"); + + // The console GPU places the pixel center at 7/12 in screen space unless + // antialiasing is enabled, while D3D and OpenGL place it at 0.5. This results + // in some primitives being placed one pixel too far to the bottom-right, + // which in turn can be critical if it happens for clear quads. + // Hence, we compensate for this pixel center difference so that primitives + // get rasterized correctly. + out.Write("o.pos.xy = o.pos.xy - o.pos.w * " I_PIXELCENTERCORRECTION ".xy;\n"); + + if (vertex_rounding) + { + // By now our position is in clip space. However, higher resolutions than the Wii outputs + // cause an additional pixel offset. Due to a higher pixel density we need to correct this + // by converting our clip-space position into the Wii's screen-space. + // Acquire the right pixel and then convert it back. 
+ out.Write("if (o.pos.w == 1.0f)\n"); + out.Write("{\n"); + + out.Write("\tfloat ss_pixel_x = ((o.pos.x + 1.0f) * (" I_VIEWPORT_SIZE ".x * 0.5f));\n"); + out.Write("\tfloat ss_pixel_y = ((o.pos.y + 1.0f) * (" I_VIEWPORT_SIZE ".y * 0.5f));\n"); + + out.Write("\tss_pixel_x = round(ss_pixel_x);\n"); + out.Write("\tss_pixel_y = round(ss_pixel_y);\n"); + + out.Write("\to.pos.x = ((ss_pixel_x / (" I_VIEWPORT_SIZE ".x * 0.5f)) - 1.0f);\n"); + out.Write("\to.pos.y = ((ss_pixel_y / (" I_VIEWPORT_SIZE ".y * 0.5f)) - 1.0f);\n"); + out.Write("}\n"); + } + + if (ApiType == APIType::OpenGL || ApiType == APIType::Vulkan) + { + if (host_config.backend_geometry_shaders || ApiType == APIType::Vulkan) + { + AssignVSOutputMembers(out, "vs", "o", numTexgen, per_pixel_lighting); + } + else + { + // TODO: Pass interface blocks between shader stages even if geometry shaders + // are not supported, however that will require at least OpenGL 3.2 support. + for (u32 i = 0; i < numTexgen; ++i) + out.Write("tex%d.xyz = o.tex%d;\n", i, i); + out.Write("clipPos = o.clipPos;\n"); + if (per_pixel_lighting) + { + out.Write("Normal = o.Normal;\n"); + out.Write("WorldPos = o.WorldPos;\n"); + } + out.Write("colors_0 = o.colors_0;\n"); + out.Write("colors_1 = o.colors_1;\n"); + } + + if (host_config.backend_depth_clamp) + { + out.Write("gl_ClipDistance[0] = o.clipDist0;\n"); + out.Write("gl_ClipDistance[1] = o.clipDist1;\n"); + } + + // Vulkan NDC space has Y pointing down (right-handed NDC space). + if (ApiType == APIType::Vulkan) + out.Write("gl_Position = float4(o.pos.x, -o.pos.y, o.pos.z, o.pos.w);\n"); + else + out.Write("gl_Position = o.pos;\n"); + } + else // D3D + { + out.Write("return o;\n"); + } + out.Write("}\n"); + + return out; +} + +void GenVertexShaderTexGens(APIType ApiType, u32 numTexgen, ShaderCode& out) +{ + // The HLSL compiler complains that the output texture coordinates are uninitialized when trying + // to dynamically index them. 
+ for (u32 i = 0; i < numTexgen; i++) + out.Write("o.tex%u = float3(0.0, 0.0, 0.0);\n", i); + + out.Write("// Texture coordinate generation\n"); + if (numTexgen == 1) + out.Write("{ const uint texgen = 0u;\n"); + else + out.Write("%sfor (uint texgen = 0u; texgen < %uu; texgen++) {\n", + ApiType == APIType::D3D ? "[loop] " : "", numTexgen); + + out.Write(" // Texcoord transforms\n"); + out.Write(" float4 coord = float4(0.0, 0.0, 1.0, 1.0);\n" + " uint texMtxInfo = xfmem_texMtxInfo(texgen);\n"); + out.Write(" switch (%s) {\n", BitfieldExtract("texMtxInfo", TexMtxInfo().sourcerow).c_str()); + out.Write(" case %uu: // XF_SRCGEOM_INROW\n", XF_SRCGEOM_INROW); + out.Write(" coord.xyz = rawpos.xyz;\n"); + out.Write(" break;\n\n"); + out.Write(" case %uu: // XF_SRCNORMAL_INROW\n", XF_SRCNORMAL_INROW); + out.Write( + " coord.xyz = ((components & %uu /* VB_HAS_NRM0 */) != 0u) ? rawnorm0.xyz : coord.xyz;", + VB_HAS_NRM0); + out.Write(" break;\n\n"); + out.Write(" case %uu: // XF_SRCBINORMAL_T_INROW\n", XF_SRCBINORMAL_T_INROW); + out.Write( + " coord.xyz = ((components & %uu /* VB_HAS_NRM1 */) != 0u) ? rawnorm1.xyz : coord.xyz;", + VB_HAS_NRM1); + out.Write(" break;\n\n"); + out.Write(" case %uu: // XF_SRCBINORMAL_B_INROW\n", XF_SRCBINORMAL_B_INROW); + out.Write( + " coord.xyz = ((components & %uu /* VB_HAS_NRM2 */) != 0u) ? rawnorm2.xyz : coord.xyz;", + VB_HAS_NRM2); + out.Write(" break;\n\n"); + for (u32 i = 0; i < 8; i++) + { + out.Write(" case %uu: // XF_SRCTEX%u_INROW\n", XF_SRCTEX0_INROW + i, i); + out.Write( + " coord = ((components & %uu /* VB_HAS_UV%u */) != 0u) ? 
float4(rawtex%u.x, rawtex%u.y, " + "1.0, 1.0) : coord;\n", + VB_HAS_UV0 << i, i, i, i); + out.Write(" break;\n\n"); + } + out.Write(" }\n"); + out.Write("\n"); + + out.Write(" // Input form of AB11 sets z element to 1.0\n"); + out.Write(" if (%s == %uu) // inputform == XF_TEXINPUT_AB11\n", + BitfieldExtract("texMtxInfo", TexMtxInfo().inputform).c_str(), XF_TEXINPUT_AB11); + out.Write(" coord.z = 1.0f;\n"); + out.Write("\n"); + + out.Write(" // first transformation\n"); + out.Write(" uint texgentype = %s;\n", + BitfieldExtract("texMtxInfo", TexMtxInfo().texgentype).c_str()); + out.Write(" float3 output_tex;\n" + " switch (texgentype)\n" + " {\n"); + out.Write(" case %uu: // XF_TEXGEN_EMBOSS_MAP\n", XF_TEXGEN_EMBOSS_MAP); + out.Write(" {\n"); + out.Write(" uint light = %s;\n", + BitfieldExtract("texMtxInfo", TexMtxInfo().embosslightshift).c_str()); + out.Write(" uint source = %s;\n", + BitfieldExtract("texMtxInfo", TexMtxInfo().embosssourceshift).c_str()); + out.Write(" switch (source) {\n"); + for (u32 i = 0; i < numTexgen; i++) + out.Write(" case %uu: output_tex.xyz = o.tex%u; break;\n", i, i); + out.Write(" default: output_tex.xyz = float3(0.0, 0.0, 0.0); break;\n" + " }\n"); + out.Write(" if ((components & %uu) != 0u) { // VB_HAS_NRM1 | VB_HAS_NRM2\n", + VB_HAS_NRM1 | VB_HAS_NRM2); // Should this be VB_HAS_NRM1 | VB_HAS_NRM2 + out.Write(" float3 ldir = normalize(" I_LIGHTS "[light].pos.xyz - pos.xyz);\n" + " output_tex.xyz += float3(dot(ldir, _norm1), dot(ldir, _norm2), 0.0);\n" + " }\n" + " }\n" + " break;\n\n"); + out.Write(" case %uu: // XF_TEXGEN_COLOR_STRGBC0\n", XF_TEXGEN_COLOR_STRGBC0); + out.Write(" output_tex.xyz = float3(o.colors_0.x, o.colors_0.y, 1.0);\n" + " break;\n\n"); + out.Write(" case %uu: // XF_TEXGEN_COLOR_STRGBC1\n", XF_TEXGEN_COLOR_STRGBC1); + out.Write(" output_tex.xyz = float3(o.colors_1.x, o.colors_1.y, 1.0);\n" + " break;\n\n"); + out.Write(" default: // Also XF_TEXGEN_REGULAR\n" + " {\n"); + out.Write(" if ((components & (%uu /* 
VB_HAS_TEXMTXIDX0 */ << texgen)) != 0u) {\n", + VB_HAS_TEXMTXIDX0); + out.Write(" // This is messy, due to dynamic indexing of the input texture coordinates.\n" + " // Hopefully the compiler will unroll this whole loop anyway and the switch.\n" + " int tmp = 0;\n" + " switch (texgen) {\n"); + for (u32 i = 0; i < numTexgen; i++) + out.Write(" case %uu: tmp = int(rawtex%u.z); break;\n", i, i); + out.Write(" }\n" + "\n"); + out.Write(" if (%s == %uu) {\n", + BitfieldExtract("texMtxInfo", TexMtxInfo().projection).c_str(), XF_TEXPROJ_STQ); + out.Write(" output_tex.xyz = float3(dot(coord, " I_TRANSFORMMATRICES "[tmp]),\n" + " dot(coord, " I_TRANSFORMMATRICES "[tmp + 1]),\n" + " dot(coord, " I_TRANSFORMMATRICES "[tmp + 2]));\n" + " } else {\n" + " output_tex.xyz = float3(dot(coord, " I_TRANSFORMMATRICES "[tmp]),\n" + " dot(coord, " I_TRANSFORMMATRICES "[tmp + 1]),\n" + " 1.0);\n" + " }\n" + " } else {\n"); + out.Write(" if (%s == %uu) {\n", + BitfieldExtract("texMtxInfo", TexMtxInfo().projection).c_str(), XF_TEXPROJ_STQ); + out.Write(" output_tex.xyz = float3(dot(coord, " I_TEXMATRICES "[3u * texgen]),\n" + " dot(coord, " I_TEXMATRICES "[3u * texgen + 1u]),\n" + " dot(coord, " I_TEXMATRICES "[3u * texgen + 2u]));\n" + " } else {\n" + " output_tex.xyz = float3(dot(coord, " I_TEXMATRICES "[3u * texgen]),\n" + " dot(coord, " I_TEXMATRICES "[3u * texgen + 1u]),\n" + " 1.0);\n" + " }\n" + " }\n" + " }\n" + " break;\n\n" + " }\n" + "\n"); + + out.Write(" if (xfmem_dualTexInfo != 0u) {\n"); + out.Write(" uint postMtxInfo = xfmem_postMtxInfo(texgen);"); + out.Write(" uint base_index = %s;\n", + BitfieldExtract("postMtxInfo", PostMtxInfo().index).c_str()); + out.Write(" float4 P0 = " I_POSTTRANSFORMMATRICES "[base_index & 0x3fu];\n" + " float4 P1 = " I_POSTTRANSFORMMATRICES "[(base_index + 1u) & 0x3fu];\n" + " float4 P2 = " I_POSTTRANSFORMMATRICES "[(base_index + 2u) & 0x3fu];\n" + "\n"); + out.Write(" if (%s != 0u)\n", BitfieldExtract("postMtxInfo", 
PostMtxInfo().normalize).c_str()); + out.Write(" output_tex.xyz = normalize(output_tex.xyz);\n" + "\n" + " // multiply by postmatrix\n" + " output_tex.xyz = float3(dot(P0.xyz, output_tex.xyz) + P0.w,\n" + " dot(P1.xyz, output_tex.xyz) + P1.w,\n" + " dot(P2.xyz, output_tex.xyz) + P2.w);\n" + " }\n\n"); + + // When q is 0, the GameCube appears to have a special case + // This can be seen in devkitPro's neheGX Lesson08 example for Wii + // Makes differences in Rogue Squadron 3 (Hoth sky) and The Last Story (shadow culling) + out.Write(" if (texgentype == %uu && output_tex.z == 0.0) // XF_TEXGEN_REGULAR\n", + XF_TEXGEN_REGULAR); + out.Write( + " output_tex.xy = clamp(output_tex.xy / 2.0f, float2(-1.0f,-1.0f), float2(1.0f,1.0f));\n" + "\n"); + + out.Write(" // Hopefully GPUs that can support dynamic indexing will optimize this.\n"); + out.Write(" switch (texgen) {\n"); + for (u32 i = 0; i < numTexgen; i++) + out.Write(" case %uu: o.tex%u = output_tex; break;\n", i, i); + out.Write(" }\n" + "}\n"); +} + +void EnumerateVertexShaderUids(const std::function& callback) +{ + VertexShaderUid uid; + std::memset(&uid, 0, sizeof(uid)); + + for (u32 texgens = 0; texgens <= 8; texgens++) + { + auto* vuid = uid.GetUidData(); + vuid->num_texgens = texgens; + callback(uid); + } +} +} diff --git a/Source/Core/VideoCommon/UberShaderVertex.h b/Source/Core/VideoCommon/UberShaderVertex.h new file mode 100644 index 0000000000..daebaa3f77 --- /dev/null +++ b/Source/Core/VideoCommon/UberShaderVertex.h @@ -0,0 +1,28 @@ +// Copyright 2015 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license.txt file included. 
+ +#pragma once + +#include +#include "VideoCommon/PixelShaderGen.h" + +namespace UberShader +{ +#pragma pack(1) +struct vertex_ubershader_uid_data +{ + u32 num_texgens : 4; + + u32 NumValues() const { return sizeof(vertex_ubershader_uid_data); } +}; +#pragma pack() + +typedef ShaderUid VertexShaderUid; + +VertexShaderUid GetVertexShaderUid(); + +ShaderCode GenVertexShader(APIType api_type, const ShaderHostConfig& host_config, + const vertex_ubershader_uid_data* uid_data); +void EnumerateVertexShaderUids(const std::function& callback); +} diff --git a/Source/Core/VideoCommon/VertexLoaderManager.cpp b/Source/Core/VideoCommon/VertexLoaderManager.cpp index d3f51ddfd9..bb52197585 100644 --- a/Source/Core/VideoCommon/VertexLoaderManager.cpp +++ b/Source/Core/VideoCommon/VertexLoaderManager.cpp @@ -44,13 +44,6 @@ static VertexLoaderMap s_vertex_loader_map; u8* cached_arraybases[12]; -// Used in the Vulkan backend - -NativeVertexFormatMap* GetNativeVertexFormatMap() -{ - return &s_native_vertex_map; -} - void Init() { MarkAllDirty(); @@ -133,6 +126,75 @@ void MarkAllDirty() g_preprocess_cp_state.attr_dirty = BitSet32::AllTrue(8); } +NativeVertexFormat* GetOrCreateMatchingFormat(const PortableVertexDeclaration& decl) +{ + auto iter = s_native_vertex_map.find(decl); + if (iter == s_native_vertex_map.end()) + { + std::unique_ptr fmt = g_vertex_manager->CreateNativeVertexFormat(decl); + auto ipair = s_native_vertex_map.emplace(decl, std::move(fmt)); + iter = ipair.first; + } + + return iter->second.get(); +} + +NativeVertexFormat* GetUberVertexFormat(const PortableVertexDeclaration& decl) +{ + // The padding in the structs can cause the memcmp() in the map to create duplicates. + // Avoid this by initializing the padding to zero. 
+ PortableVertexDeclaration new_decl; + std::memset(&new_decl, 0, sizeof(new_decl)); + new_decl.stride = decl.stride; + + auto MakeDummyAttribute = [](AttributeFormat& attr, VarType type, int components, bool integer) { + attr.type = type; + attr.components = components; + attr.offset = 0; + attr.enable = true; + attr.integer = integer; + }; + auto CopyAttribute = [](AttributeFormat& attr, const AttributeFormat& src) { + attr.type = src.type; + attr.components = src.components; + attr.offset = src.offset; + attr.enable = src.enable; + attr.integer = src.integer; + }; + + if (decl.position.enable) + CopyAttribute(new_decl.position, decl.position); + else + MakeDummyAttribute(new_decl.position, VAR_FLOAT, 1, false); + for (size_t i = 0; i < ArraySize(new_decl.normals); i++) + { + if (decl.normals[i].enable) + CopyAttribute(new_decl.normals[i], decl.normals[i]); + else + MakeDummyAttribute(new_decl.normals[i], VAR_FLOAT, 1, false); + } + for (size_t i = 0; i < ArraySize(new_decl.colors); i++) + { + if (decl.colors[i].enable) + CopyAttribute(new_decl.colors[i], decl.colors[i]); + else + MakeDummyAttribute(new_decl.colors[i], VAR_UNSIGNED_BYTE, 4, false); + } + for (size_t i = 0; i < ArraySize(new_decl.texcoords); i++) + { + if (decl.texcoords[i].enable) + CopyAttribute(new_decl.texcoords[i], decl.texcoords[i]); + else + MakeDummyAttribute(new_decl.texcoords[i], VAR_FLOAT, 1, false); + } + if (decl.posmtx.enable) + CopyAttribute(new_decl.posmtx, decl.posmtx); + else + MakeDummyAttribute(new_decl.posmtx, VAR_UNSIGNED_BYTE, 1, true); + + return GetOrCreateMatchingFormat(new_decl); +} + static VertexLoaderBase* RefreshLoader(int vtx_attr_group, bool preprocess = false) { CPState* state = preprocess ? 
&g_preprocess_cp_state : &g_main_cp_state; @@ -208,6 +270,7 @@ int RunVertices(int vtx_attr_group, int primitive, int count, DataReader src, bo } s_current_vtx_fmt = loader->m_native_vertex_format; g_current_components = loader->m_native_components; + VertexShaderManager::SetVertexFormat(loader->m_native_components); // if cull mode is CULL_ALL, tell VertexManager to skip triangles and quads. // They still need to go through vertex loading, because we need to calculate a zfreeze refrence diff --git a/Source/Core/VideoCommon/VertexLoaderManager.h b/Source/Core/VideoCommon/VertexLoaderManager.h index 30ceb41aec..b46d354e7d 100644 --- a/Source/Core/VideoCommon/VertexLoaderManager.h +++ b/Source/Core/VideoCommon/VertexLoaderManager.h @@ -24,7 +24,15 @@ void Clear(); void MarkAllDirty(); -NativeVertexFormatMap* GetNativeVertexFormatMap(); +// Creates or obtains a pointer to a VertexFormat representing decl. +// If this results in a VertexFormat being created, if the game later uses a matching vertex +// declaration, the one that was previously created will be used. +NativeVertexFormat* GetOrCreateMatchingFormat(const PortableVertexDeclaration& decl); + +// For vertex ubershaders, all attributes need to be present, even when the vertex +// format does not contain them. This function returns a vertex format with dummy +// offsets set to the unused attributes. 
+NativeVertexFormat* GetUberVertexFormat(const PortableVertexDeclaration& decl); // Returns -1 if buf_size is insufficient, else the amount of bytes consumed int RunVertices(int vtx_attr_group, int primitive, int count, DataReader src, bool is_preprocess); diff --git a/Source/Core/VideoCommon/VertexManagerBase.cpp b/Source/Core/VideoCommon/VertexManagerBase.cpp index b6088ab2e4..7807cdb98c 100644 --- a/Source/Core/VideoCommon/VertexManagerBase.cpp +++ b/Source/Core/VideoCommon/VertexManagerBase.cpp @@ -193,22 +193,24 @@ void VertexManagerBase::Flush() g_video_backend->CheckInvalidState(); #if defined(_DEBUG) || defined(DEBUGFAST) - PRIM_LOG("frame%d:\n texgen=%d, numchan=%d, dualtex=%d, ztex=%d, cole=%d, alpe=%d, ze=%d", + PRIM_LOG("frame%d:\n texgen=%u, numchan=%u, dualtex=%u, ztex=%u, cole=%u, alpe=%u, ze=%u", g_ActiveConfig.iSaveTargetId, xfmem.numTexGen.numTexGens, xfmem.numChan.numColorChans, - xfmem.dualTexTrans.enabled, bpmem.ztex2.op, (int)bpmem.blendmode.colorupdate, - (int)bpmem.blendmode.alphaupdate, (int)bpmem.zmode.updateenable); + xfmem.dualTexTrans.enabled, bpmem.ztex2.op.Value(), bpmem.blendmode.colorupdate.Value(), + bpmem.blendmode.alphaupdate.Value(), bpmem.zmode.updateenable.Value()); - for (unsigned int i = 0; i < xfmem.numChan.numColorChans; ++i) + for (u32 i = 0; i < xfmem.numChan.numColorChans; ++i) { LitChannel* ch = &xfmem.color[i]; - PRIM_LOG("colchan%d: matsrc=%d, light=0x%x, ambsrc=%d, diffunc=%d, attfunc=%d", i, - ch->matsource, ch->GetFullLightMask(), ch->ambsource, ch->diffusefunc, ch->attnfunc); + PRIM_LOG("colchan%u: matsrc=%u, light=0x%x, ambsrc=%u, diffunc=%u, attfunc=%u", i, + ch->matsource.Value(), ch->GetFullLightMask(), ch->ambsource.Value(), + ch->diffusefunc.Value(), ch->attnfunc.Value()); ch = &xfmem.alpha[i]; - PRIM_LOG("alpchan%d: matsrc=%d, light=0x%x, ambsrc=%d, diffunc=%d, attfunc=%d", i, - ch->matsource, ch->GetFullLightMask(), ch->ambsource, ch->diffusefunc, ch->attnfunc); + PRIM_LOG("alpchan%u: matsrc=%u, 
light=0x%x, ambsrc=%u, diffunc=%u, attfunc=%u", i, + ch->matsource.Value(), ch->GetFullLightMask(), ch->ambsource.Value(), + ch->diffusefunc.Value(), ch->attnfunc.Value()); } - for (unsigned int i = 0; i < xfmem.numTexGen.numTexGens; ++i) + for (u32 i = 0; i < xfmem.numTexGen.numTexGens; ++i) { TexMtxInfo tinfo = xfmem.texMtxInfo[i]; if (tinfo.texgentype != XF_TEXGEN_EMBOSS_MAP) @@ -216,16 +218,17 @@ void VertexManagerBase::Flush() if (tinfo.texgentype != XF_TEXGEN_REGULAR) tinfo.projection = 0; - PRIM_LOG("txgen%d: proj=%d, input=%d, gentype=%d, srcrow=%d, embsrc=%d, emblght=%d, " - "postmtx=%d, postnorm=%d", - i, tinfo.projection, tinfo.inputform, tinfo.texgentype, tinfo.sourcerow, - tinfo.embosssourceshift, tinfo.embosslightshift, xfmem.postMtxInfo[i].index, - xfmem.postMtxInfo[i].normalize); + PRIM_LOG("txgen%u: proj=%u, input=%u, gentype=%u, srcrow=%u, embsrc=%u, emblght=%u, " + "postmtx=%u, postnorm=%u", + i, tinfo.projection.Value(), tinfo.inputform.Value(), tinfo.texgentype.Value(), + tinfo.sourcerow.Value(), tinfo.embosssourceshift.Value(), + tinfo.embosslightshift.Value(), xfmem.postMtxInfo[i].index.Value(), + xfmem.postMtxInfo[i].normalize.Value()); } - PRIM_LOG("pixel: tev=%d, ind=%d, texgen=%d, dstalpha=%d, alphatest=0x%x", - (int)bpmem.genMode.numtevstages + 1, (int)bpmem.genMode.numindstages, - (int)bpmem.genMode.numtexgens, (u32)bpmem.dstalpha.enable, + PRIM_LOG("pixel: tev=%u, ind=%u, texgen=%u, dstalpha=%u, alphatest=0x%x", + bpmem.genMode.numtevstages.Value() + 1, bpmem.genMode.numindstages.Value(), + bpmem.genMode.numtexgens.Value(), bpmem.dstalpha.enable.Value(), (bpmem.alpha_test.hex >> 16) & 0xff); #endif diff --git a/Source/Core/VideoCommon/VertexShaderGen.cpp b/Source/Core/VideoCommon/VertexShaderGen.cpp index 2ee9532254..a02f801896 100644 --- a/Source/Core/VideoCommon/VertexShaderGen.cpp +++ b/Source/Core/VideoCommon/VertexShaderGen.cpp @@ -114,16 +114,16 @@ ShaderCode GenerateVertexShaderCode(APIType api_type, const ShaderHostConfig& ho 
out.Write("ATTRIBUTE_LOCATION(%d) in float3 rawnorm2;\n", SHADER_NORM2_ATTRIB); if (uid_data->components & VB_HAS_COL0) - out.Write("ATTRIBUTE_LOCATION(%d) in float4 color0;\n", SHADER_COLOR0_ATTRIB); + out.Write("ATTRIBUTE_LOCATION(%d) in float4 rawcolor0;\n", SHADER_COLOR0_ATTRIB); if (uid_data->components & VB_HAS_COL1) - out.Write("ATTRIBUTE_LOCATION(%d) in float4 color1;\n", SHADER_COLOR1_ATTRIB); + out.Write("ATTRIBUTE_LOCATION(%d) in float4 rawcolor1;\n", SHADER_COLOR1_ATTRIB); for (int i = 0; i < 8; ++i) { u32 hastexmtx = (uid_data->components & (VB_HAS_TEXMTXIDX0 << i)); if ((uid_data->components & (VB_HAS_UV0 << i)) || hastexmtx) { - out.Write("ATTRIBUTE_LOCATION(%d) in float%d tex%d;\n", SHADER_TEXTURE0_ATTRIB + i, + out.Write("ATTRIBUTE_LOCATION(%d) in float%d rawtex%d;\n", SHADER_TEXTURE0_ATTRIB + i, hastexmtx ? 3 : 2, i); } } @@ -143,7 +143,7 @@ ShaderCode GenerateVertexShaderCode(APIType api_type, const ShaderHostConfig& ho { if (i < uid_data->numTexGens) { - out.Write("%s out float3 uv%u;\n", GetInterpolationQualifier(msaa, ssaa), i); + out.Write("%s out float3 tex%u;\n", GetInterpolationQualifier(msaa, ssaa), i); } } out.Write("%s out float4 clipPos;\n", GetInterpolationQualifier(msaa, ssaa)); @@ -170,14 +170,14 @@ ShaderCode GenerateVertexShaderCode(APIType api_type, const ShaderHostConfig& ho if (uid_data->components & VB_HAS_NRM2) out.Write(" float3 rawnorm2 : NORMAL2,\n"); if (uid_data->components & VB_HAS_COL0) - out.Write(" float4 color0 : COLOR0,\n"); + out.Write(" float4 rawcolor0 : COLOR0,\n"); if (uid_data->components & VB_HAS_COL1) - out.Write(" float4 color1 : COLOR1,\n"); + out.Write(" float4 rawcolor1 : COLOR1,\n"); for (int i = 0; i < 8; ++i) { u32 hastexmtx = (uid_data->components & (VB_HAS_TEXMTXIDX0 << i)); if ((uid_data->components & (VB_HAS_UV0 << i)) || hastexmtx) - out.Write(" float%d tex%d : TEXCOORD%d,\n", hastexmtx ? 3 : 2, i, i); + out.Write(" float%d rawtex%d : TEXCOORD%d,\n", hastexmtx ? 
3 : 2, i, i); } if (uid_data->components & VB_HAS_POSMTXIDX) out.Write(" uint4 posmtx : BLENDINDICES,\n"); @@ -242,18 +242,18 @@ ShaderCode GenerateVertexShaderCode(APIType api_type, const ShaderHostConfig& ho if (uid_data->numColorChans == 0) { if (uid_data->components & VB_HAS_COL0) - out.Write("o.colors_0 = color0;\n"); + out.Write("o.colors_0 = rawcolor0;\n"); else out.Write("o.colors_0 = float4(1.0, 1.0, 1.0, 1.0);\n"); } GenerateLightingShaderCode(out, uid_data->lighting, uid_data->components, uid_data->numColorChans, - "color", "o.colors_"); + "rawcolor", "o.colors_"); if (uid_data->numColorChans < 2) { if (uid_data->components & VB_HAS_COL1) - out.Write("o.colors_1 = color1;\n"); + out.Write("o.colors_1 = rawcolor1;\n"); else out.Write("o.colors_1 = o.colors_0;\n"); } @@ -296,7 +296,7 @@ ShaderCode GenerateVertexShaderCode(APIType api_type, const ShaderHostConfig& ho default: _assert_(texinfo.sourcerow <= XF_SRCTEX7_INROW); if (uid_data->components & (VB_HAS_UV0 << (texinfo.sourcerow - XF_SRCTEX0_INROW))) - out.Write("coord = float4(tex%d.x, tex%d.y, 1.0, 1.0);\n", + out.Write("coord = float4(rawtex%d.x, rawtex%d.y, 1.0, 1.0);\n", texinfo.sourcerow - XF_SRCTEX0_INROW, texinfo.sourcerow - XF_SRCTEX0_INROW); break; } @@ -338,7 +338,7 @@ ShaderCode GenerateVertexShaderCode(APIType api_type, const ShaderHostConfig& ho default: if (uid_data->components & (VB_HAS_TEXMTXIDX0 << i)) { - out.Write("int tmp = int(tex%d.z);\n", i); + out.Write("int tmp = int(rawtex%d.z);\n", i); if (((uid_data->texMtxInfo_n_projection >> i) & 1) == XF_TEXPROJ_STQ) out.Write("o.tex%d.xyz = float3(dot(coord, " I_TRANSFORMMATRICES "[tmp]), dot(coord, " I_TRANSFORMMATRICES @@ -407,10 +407,10 @@ ShaderCode GenerateVertexShaderCode(APIType api_type, const ShaderHostConfig& ho out.Write("o.WorldPos = pos.xyz;\n"); if (uid_data->components & VB_HAS_COL0) - out.Write("o.colors_0 = color0;\n"); + out.Write("o.colors_0 = rawcolor0;\n"); if (uid_data->components & VB_HAS_COL1) - 
out.Write("o.colors_1 = color1;\n"); + out.Write("o.colors_1 = rawcolor1;\n"); } // If we can disable the incorrect depth clipping planes using depth clamping, then we can do @@ -495,7 +495,7 @@ ShaderCode GenerateVertexShaderCode(APIType api_type, const ShaderHostConfig& ho // TODO: Pass interface blocks between shader stages even if geometry shaders // are not supported, however that will require at least OpenGL 3.2 support. for (unsigned int i = 0; i < uid_data->numTexGens; ++i) - out.Write("uv%d.xyz = o.tex%d;\n", i, i); + out.Write("tex%d.xyz = o.tex%d;\n", i, i); out.Write("clipPos = o.clipPos;\n"); if (per_pixel_lighting) { diff --git a/Source/Core/VideoCommon/VertexShaderManager.cpp b/Source/Core/VideoCommon/VertexShaderManager.cpp index ea2e8a528d..ad20ab2332 100644 --- a/Source/Core/VideoCommon/VertexShaderManager.cpp +++ b/Source/Core/VideoCommon/VertexShaderManager.cpp @@ -30,6 +30,7 @@ alignas(16) static float g_fProjectionMatrix[16]; // track changes static bool bTexMatricesChanged[2], bPosNormalMatrixChanged, bProjectionChanged, bViewportChanged; +static bool bTexMtxInfoChanged, bLightingConfigChanged; static BitSet32 nMaterialsChanged; static int nTransformMatricesChanged[2]; // min,max static int nNormalMatricesChanged[2]; // min,max @@ -193,8 +194,10 @@ void VertexShaderManager::Init() bPosNormalMatrixChanged = false; bProjectionChanged = true; bViewportChanged = false; + bTexMtxInfoChanged = false; + bLightingConfigChanged = false; - xfmem = {}; + std::memset(&xfmem, 0, sizeof(xfmem)); constants = {}; ResetView(); @@ -561,6 +564,32 @@ void VertexShaderManager::SetConstants() dirty = true; } + + if (bTexMtxInfoChanged) + { + bTexMtxInfoChanged = false; + constants.xfmem_dualTexInfo = xfmem.dualTexTrans.enabled; + for (size_t i = 0; i < ArraySize(xfmem.texMtxInfo); i++) + constants.xfmem_pack1[i][0] = xfmem.texMtxInfo[i].hex; + for (size_t i = 0; i < ArraySize(xfmem.postMtxInfo); i++) + constants.xfmem_pack1[i][1] = xfmem.postMtxInfo[i].hex; + + 
dirty = true; + } + + if (bLightingConfigChanged) + { + bLightingConfigChanged = false; + + for (size_t i = 0; i < 2; i++) + { + constants.xfmem_pack1[i][2] = xfmem.color[i].hex; + constants.xfmem_pack1[i][3] = xfmem.alpha[i].hex; + } + constants.xfmem_numColorChans = xfmem.numChan.numColorChans; + + dirty = true; + } } void VertexShaderManager::InvalidateXFRange(int start, int end) @@ -758,6 +787,27 @@ void VertexShaderManager::ResetView() bProjectionChanged = true; } +void VertexShaderManager::SetVertexFormat(u32 components) +{ + if (components != constants.components) + { + constants.components = components; + dirty = true; + } +} + +void VertexShaderManager::SetTexMatrixInfoChanged(int index) +{ + // TODO: Should we track this with more precision, like which indices changed? + // The whole vertex constants are probably going to be uploaded regardless. + bTexMtxInfoChanged = true; +} + +void VertexShaderManager::SetLightingConfigChanged() +{ + bLightingConfigChanged = true; +} + void VertexShaderManager::TransformToClipSpace(const float* data, float* out, u32 MtxIdx) { const float* world_matrix = &xfmem.posMatrices[(MtxIdx & 0x3f) * 4]; @@ -800,6 +850,8 @@ void VertexShaderManager::DoState(PointerWrap& p) p.Do(bPosNormalMatrixChanged); p.Do(bProjectionChanged); p.Do(bViewportChanged); + p.Do(bTexMtxInfoChanged); + p.Do(bLightingConfigChanged); p.Do(constants); diff --git a/Source/Core/VideoCommon/VertexShaderManager.h b/Source/Core/VideoCommon/VertexShaderManager.h index 86042437c3..b2c707db1f 100644 --- a/Source/Core/VideoCommon/VertexShaderManager.h +++ b/Source/Core/VideoCommon/VertexShaderManager.h @@ -36,6 +36,10 @@ public: static void RotateView(float x, float y); static void ResetView(); + static void SetVertexFormat(u32 components); + static void SetTexMatrixInfoChanged(int index); + static void SetLightingConfigChanged(); + // data: 3 floats representing the X, Y and Z vertex model coordinates and the posmatrix index. 
// out: 4 floats which will be initialized with the corresponding clip space coordinates // NOTE: g_fProjectionMatrix must be up to date when this is called diff --git a/Source/Core/VideoCommon/VideoCommon.vcxproj b/Source/Core/VideoCommon/VideoCommon.vcxproj index e33ccd9d7f..d8c43256e1 100644 --- a/Source/Core/VideoCommon/VideoCommon.vcxproj +++ b/Source/Core/VideoCommon/VideoCommon.vcxproj @@ -38,6 +38,7 @@ + @@ -66,12 +67,15 @@ + + + @@ -94,6 +98,7 @@ + @@ -107,6 +112,8 @@ + + @@ -131,6 +138,7 @@ + @@ -172,4 +180,4 @@ - \ No newline at end of file + diff --git a/Source/Core/VideoCommon/VideoCommon.vcxproj.filters b/Source/Core/VideoCommon/VideoCommon.vcxproj.filters index f4ec59eb1d..8e5b9fef31 100644 --- a/Source/Core/VideoCommon/VideoCommon.vcxproj.filters +++ b/Source/Core/VideoCommon/VideoCommon.vcxproj.filters @@ -176,6 +176,18 @@ Shader Generators + + Util + + + Shader Generators + + + Shader Generators + + + Shader Generators + @@ -332,8 +344,20 @@ Base + + Util + + + Shader Generators + + + Shader Generators + + + Shader Generators + - \ No newline at end of file + diff --git a/Source/Core/VideoCommon/VideoConfig.cpp b/Source/Core/VideoCommon/VideoConfig.cpp index 355d2486b4..c277386f71 100644 --- a/Source/Core/VideoCommon/VideoConfig.cpp +++ b/Source/Core/VideoCommon/VideoConfig.cpp @@ -4,6 +4,7 @@ #include +#include "Common/CPUDetect.h" #include "Common/CommonTypes.h" #include "Common/StringUtil.h" #include "Core/Config/GraphicsSettings.h" @@ -93,6 +94,13 @@ void VideoConfig::Refresh() bBackendMultithreading = Config::Get(Config::GFX_BACKEND_MULTITHREADING); iCommandBufferExecuteInterval = Config::Get(Config::GFX_COMMAND_BUFFER_EXECUTE_INTERVAL); bShaderCache = Config::Get(Config::GFX_SHADER_CACHE); + bBackgroundShaderCompiling = Config::Get(Config::GFX_BACKGROUND_SHADER_COMPILING); + bDisableSpecializedShaders = Config::Get(Config::GFX_DISABLE_SPECIALIZED_SHADERS); + bPrecompileUberShaders = Config::Get(Config::GFX_PRECOMPILE_UBER_SHADERS); + 
iShaderCompilerThreads = Config::Get(Config::GFX_SHADER_COMPILER_THREADS); + iShaderPrecompilerThreads = Config::Get(Config::GFX_SHADER_PRECOMPILER_THREADS); + bForceVertexUberShaders = Config::Get(Config::GFX_FORCE_VERTEX_UBER_SHADERS); + bForcePixelUberShaders = Config::Get(Config::GFX_FORCE_PIXEL_UBER_SHADERS); bZComploc = Config::Get(Config::GFX_SW_ZCOMPLOC); bZFreeze = Config::Get(Config::GFX_SW_ZFREEZE); @@ -188,3 +196,37 @@ bool VideoConfig::IsVSync() { return bVSync && !Core::GetIsThrottlerTempDisabled(); } + +static u32 GetNumAutoShaderCompilerThreads() +{ + // Automatic number. We use clamp(cpus - 3, 1, 4). + return static_cast(std::min(std::max(cpu_info.num_cores - 3, 1), 4)); +} + +u32 VideoConfig::GetShaderCompilerThreads() const +{ + if (iShaderCompilerThreads >= 0) + return static_cast(iShaderCompilerThreads); + else + return GetNumAutoShaderCompilerThreads(); +} + +u32 VideoConfig::GetShaderPrecompilerThreads() const +{ + if (iShaderPrecompilerThreads >= 0) + return static_cast(iShaderPrecompilerThreads); + else + return GetNumAutoShaderCompilerThreads(); +} + +bool VideoConfig::CanPrecompileUberShaders() const +{ + // We don't want to precompile ubershaders if they're never going to be used. + return bPrecompileUberShaders && (bBackgroundShaderCompiling || bDisableSpecializedShaders); +} + +bool VideoConfig::CanBackgroundCompileShaders() const +{ + // We require precompiled ubershaders to background compile shaders. + return bBackgroundShaderCompiling && bPrecompileUberShaders; +} diff --git a/Source/Core/VideoCommon/VideoConfig.h b/Source/Core/VideoCommon/VideoConfig.h index 1216bcc431..f876e63802 100644 --- a/Source/Core/VideoCommon/VideoConfig.h +++ b/Source/Core/VideoCommon/VideoConfig.h @@ -168,6 +168,36 @@ struct VideoConfig final // Currently only supported with Vulkan. 
int iCommandBufferExecuteInterval; + // The following options determine the ubershader mode: + // No ubershaders: + // - bBackgroundShaderCompiling = false + // - bDisableSpecializedShaders = false + // Hybrid/background compiling: + // - bBackgroundShaderCompiling = true + // - bDisableSpecializedShaders = false + // Ubershaders only: + // - bBackgroundShaderCompiling = false + // - bDisableSpecializedShaders = true + + // Enable background shader compiling, use ubershaders while waiting. + bool bBackgroundShaderCompiling; + + // Use ubershaders only, don't compile specialized shaders. + bool bDisableSpecializedShaders; + + // Precompile ubershader variants at boot/config reload time. + bool bPrecompileUberShaders; + + // Number of shader compiler threads. + // 0 disables background compilation. + // -1 uses an automatic number based on the CPU threads. + int iShaderCompilerThreads; + int iShaderPrecompilerThreads; + + // Temporary toggling of ubershaders, for debugging + bool bForceVertexUberShaders; + bool bForcePixelUberShaders; + // Static config per API // TODO: Move this out of VideoConfig struct @@ -204,6 +234,8 @@ struct VideoConfig final bool bSupportsInternalResolutionFrameDumps; bool bSupportsGPUTextureDecoding; bool bSupportsST3CTextures; + bool bSupportsBitfield; // Needed by UberShaders, so must stay in VideoCommon + bool bSupportsDynamicSamplerIndexing; // Needed by UberShaders, so must stay in VideoCommon } backend_info; // Utility @@ -224,6 +256,10 @@ struct VideoConfig final return backend_info.bSupportsGPUTextureDecoding && bEnableGPUTextureDecoding; } bool UseVertexRounding() const { return bVertexRounding && iEFBScale != SCALE_1X; } + u32 GetShaderCompilerThreads() const; + u32 GetShaderPrecompilerThreads() const; + bool CanPrecompileUberShaders() const; + bool CanBackgroundCompileShaders() const; }; extern VideoConfig g_Config; diff --git a/Source/Core/VideoCommon/XFMemory.h b/Source/Core/VideoCommon/XFMemory.h index b4e4ec7325..461a63e368 
100644 --- a/Source/Core/VideoCommon/XFMemory.h +++ b/Source/Core/VideoCommon/XFMemory.h @@ -4,6 +4,7 @@ #pragma once +#include "Common/BitField.h" #include "Common/CommonTypes.h" #include "VideoCommon/CPMemory.h" @@ -132,27 +133,15 @@ enum union LitChannel { - struct - { - u32 matsource : 1; - u32 enablelighting : 1; - u32 lightMask0_3 : 4; - u32 ambsource : 1; - u32 diffusefunc : 2; // LIGHTDIF_X - u32 attnfunc : 2; // LIGHTATTN_X - u32 lightMask4_7 : 4; - }; - struct - { - u32 hex : 15; - u32 unused : 17; - }; - struct - { - u32 dummy0 : 7; - u32 lightparams : 4; - u32 dummy1 : 21; - }; + BitField<0, 1, u32> matsource; + BitField<1, 1, u32> enablelighting; + BitField<2, 4, u32> lightMask0_3; + BitField<6, 1, u32> ambsource; + BitField<7, 2, u32> diffusefunc; // LIGHTDIF_X + BitField<9, 2, u32> attnfunc; // LIGHTATTN_X + BitField<11, 4, u32> lightMask4_7; + u32 hex; + unsigned int GetFullLightMask() const { return enablelighting ? (lightMask0_3 | (lightMask4_7 << 4)) : 0; @@ -173,28 +162,22 @@ union INVTXSPEC union TexMtxInfo { - struct - { - u32 unknown : 1; - u32 projection : 1; // XF_TEXPROJ_X - u32 inputform : 1; // XF_TEXINPUT_X - u32 unknown2 : 1; - u32 texgentype : 3; // XF_TEXGEN_X - u32 sourcerow : 5; // XF_SRCGEOM_X - u32 embosssourceshift : 3; // what generated texcoord to use - u32 embosslightshift : 3; // light index that is used - }; + BitField<0, 1, u32> unknown; // + BitField<1, 1, u32> projection; // XF_TEXPROJ_X + BitField<2, 1, u32> inputform; // XF_TEXINPUT_X + BitField<3, 1, u32> unknown2; // + BitField<4, 3, u32> texgentype; // XF_TEXGEN_X + BitField<7, 5, u32> sourcerow; // XF_SRCGEOM_X + BitField<12, 3, u32> embosssourceshift; // what generated texcoord to use + BitField<15, 3, u32> embosslightshift; // light index that is used u32 hex; }; union PostMtxInfo { - struct - { - u32 index : 6; // base row of dual transform matrix - u32 unused : 2; - u32 normalize : 1; // normalize before send operation - }; + BitField<0, 6, u32> index; // base 
row of dual transform matrix + BitField<6, 2, u32> unused; // + BitField<8, 1, u32> normalize; // normalize before send operation u32 hex; }; diff --git a/Source/Core/VideoCommon/XFStructs.cpp b/Source/Core/VideoCommon/XFStructs.cpp index dbe3f21e02..41a3a3e447 100644 --- a/Source/Core/VideoCommon/XFStructs.cpp +++ b/Source/Core/VideoCommon/XFStructs.cpp @@ -56,6 +56,7 @@ static void XFRegWritten(int transferSize, u32 baseAddress, DataReader src) case XFMEM_SETNUMCHAN: if (xfmem.numChan.numColorChans != (newValue & 3)) g_vertex_manager->Flush(); + VertexShaderManager::SetLightingConfigChanged(); break; case XFMEM_SETCHAN0_AMBCOLOR: // Channel Ambient Color @@ -88,11 +89,13 @@ static void XFRegWritten(int transferSize, u32 baseAddress, DataReader src) case XFMEM_SETCHAN1_ALPHA: if (((u32*)&xfmem)[address] != (newValue & 0x7fff)) g_vertex_manager->Flush(); + VertexShaderManager::SetLightingConfigChanged(); break; case XFMEM_DUALTEX: if (xfmem.dualTexTrans.enabled != (newValue & 1)) g_vertex_manager->Flush(); + VertexShaderManager::SetTexMatrixInfoChanged(-1); break; case XFMEM_SETMATRIXINDA: @@ -146,6 +149,7 @@ static void XFRegWritten(int transferSize, u32 baseAddress, DataReader src) case XFMEM_SETTEXMTXINFO + 6: case XFMEM_SETTEXMTXINFO + 7: g_vertex_manager->Flush(); + VertexShaderManager::SetTexMatrixInfoChanged(address - XFMEM_SETTEXMTXINFO); nextAddress = XFMEM_SETTEXMTXINFO + 8; break; @@ -159,6 +163,7 @@ static void XFRegWritten(int transferSize, u32 baseAddress, DataReader src) case XFMEM_SETPOSMTXINFO + 6: case XFMEM_SETPOSMTXINFO + 7: g_vertex_manager->Flush(); + VertexShaderManager::SetTexMatrixInfoChanged(address - XFMEM_SETPOSMTXINFO); nextAddress = XFMEM_SETPOSMTXINFO + 8; break;