diff --git a/Source/Android/app/src/main/java/org/dolphinemu/dolphinemu/ui/settings/SettingsFragmentPresenter.java b/Source/Android/app/src/main/java/org/dolphinemu/dolphinemu/ui/settings/SettingsFragmentPresenter.java index 5285ab2a09..1ad529afd8 100644 --- a/Source/Android/app/src/main/java/org/dolphinemu/dolphinemu/ui/settings/SettingsFragmentPresenter.java +++ b/Source/Android/app/src/main/java/org/dolphinemu/dolphinemu/ui/settings/SettingsFragmentPresenter.java @@ -286,6 +286,7 @@ public final class SettingsFragmentPresenter BooleanSetting ignoreFormat = new BooleanSetting(SettingsFile.KEY_IGNORE_FORMAT, SettingsFile.SECTION_GFX_HACKS, SettingsFile.SETTINGS_GFX, ignoreFormatValue); Setting efbToTexture = mSettings.get(SettingsFile.SETTINGS_GFX).get(SettingsFile.SECTION_GFX_HACKS).getSetting(SettingsFile.KEY_EFB_TEXTURE); Setting texCacheAccuracy = mSettings.get(SettingsFile.SETTINGS_GFX).get(SettingsFile.SECTION_GFX_SETTINGS).getSetting(SettingsFile.KEY_TEXCACHE_ACCURACY); + Setting gpuTextureDecoding = mSettings.get(SettingsFile.SETTINGS_GFX).get(SettingsFile.SECTION_GFX_SETTINGS).getSetting(SettingsFile.KEY_GPU_TEXTURE_DECODING); IntSetting xfb = new IntSetting(SettingsFile.KEY_XFB, SettingsFile.SECTION_GFX_HACKS, SettingsFile.SETTINGS_GFX, xfbValue); Setting fastDepth = mSettings.get(SettingsFile.SETTINGS_GFX).get(SettingsFile.SECTION_GFX_HACKS).getSetting(SettingsFile.KEY_FAST_DEPTH); Setting aspectRatio = mSettings.get(SettingsFile.SETTINGS_GFX).get(SettingsFile.SECTION_GFX_SETTINGS).getSetting(SettingsFile.KEY_ASPECT_RATIO); @@ -297,6 +298,7 @@ public final class SettingsFragmentPresenter sl.add(new HeaderSetting(null, null, R.string.texture_cache, 0)); sl.add(new SingleChoiceSetting(SettingsFile.KEY_TEXCACHE_ACCURACY, SettingsFile.SECTION_GFX_SETTINGS, SettingsFile.SETTINGS_GFX, R.string.texture_cache_accuracy, R.string.texture_cache_accuracy_descrip, R.array.textureCacheAccuracyEntries, R.array.textureCacheAccuracyValues, 128, texCacheAccuracy)); + 
sl.add(new CheckBoxSetting(SettingsFile.KEY_GPU_TEXTURE_DECODING, SettingsFile.SECTION_GFX_SETTINGS, SettingsFile.SETTINGS_GFX, R.string.gpu_texture_decoding, R.string.gpu_texture_decoding_descrip, false, gpuTextureDecoding)); sl.add(new HeaderSetting(null, null, R.string.external_frame_buffer, 0)); sl.add(new SingleChoiceSetting(SettingsFile.KEY_XFB_METHOD, SettingsFile.SECTION_GFX_HACKS, SettingsFile.SETTINGS_GFX, R.string.external_frame_buffer, R.string.external_frame_buffer_descrip, R.array.externalFrameBufferEntries, R.array.externalFrameBufferValues, 0, xfb)); diff --git a/Source/Android/app/src/main/java/org/dolphinemu/dolphinemu/utils/SettingsFile.java b/Source/Android/app/src/main/java/org/dolphinemu/dolphinemu/utils/SettingsFile.java index 707e345d26..76043fbb3f 100644 --- a/Source/Android/app/src/main/java/org/dolphinemu/dolphinemu/utils/SettingsFile.java +++ b/Source/Android/app/src/main/java/org/dolphinemu/dolphinemu/utils/SettingsFile.java @@ -73,6 +73,7 @@ public final class SettingsFile public static final String KEY_IGNORE_FORMAT = "EFBEmulateFormatChanges"; public static final String KEY_EFB_TEXTURE = "EFBToTextureEnable"; public static final String KEY_TEXCACHE_ACCURACY = "SafeTextureCacheColorSamples"; + public static final String KEY_GPU_TEXTURE_DECODING = "EnableGPUTextureDecoding"; public static final String KEY_XFB = "UseXFB"; public static final String KEY_XFB_REAL = "UseRealXFB"; public static final String KEY_FAST_DEPTH = "FastDepthCalc"; diff --git a/Source/Android/app/src/main/res/values/strings.xml b/Source/Android/app/src/main/res/values/strings.xml index f41d8fe8a2..10be6058bb 100644 --- a/Source/Android/app/src/main/res/values/strings.xml +++ b/Source/Android/app/src/main/res/values/strings.xml @@ -168,6 +168,8 @@ Texture Cache Texture Cache Accuracy The safer the selection, the less likely the emulator will be missing any texture updates from RAM. 
+ GPU Texture Decoding + Decodes textures on the GPU using compute shaders where supported. May improve performance in some scenarios. External Frame Buffer Determines how the XFB will be emulated. Disable Destination Alpha diff --git a/Source/Core/Common/Common.vcxproj b/Source/Core/Common/Common.vcxproj index 13256bdce7..a00a8d488e 100644 --- a/Source/Core/Common/Common.vcxproj +++ b/Source/Core/Common/Common.vcxproj @@ -72,6 +72,7 @@ + @@ -83,9 +84,11 @@ + + diff --git a/Source/Core/Common/Common.vcxproj.filters b/Source/Core/Common/Common.vcxproj.filters index cb6fba5ad5..ff6b9ad7fc 100644 --- a/Source/Core/Common/Common.vcxproj.filters +++ b/Source/Core/Common/Common.vcxproj.filters @@ -238,6 +238,16 @@ + + + GL\GLExtensions + + + GL\GLExtensions + + + GL\GLExtensions + @@ -303,6 +313,7 @@ + diff --git a/Source/Core/Common/GL/GLExtensions/ARB_compute_shader.h b/Source/Core/Common/GL/GLExtensions/ARB_compute_shader.h new file mode 100644 index 0000000000..b27b8091e4 --- /dev/null +++ b/Source/Core/Common/GL/GLExtensions/ARB_compute_shader.h @@ -0,0 +1,53 @@ +/* +** Copyright (c) 2013-2015 The Khronos Group Inc. +** +** Permission is hereby granted, free of charge, to any person obtaining a +** copy of this software and/or associated documentation files (the +** "Materials"), to deal in the Materials without restriction, including +** without limitation the rights to use, copy, modify, merge, publish, +** distribute, sublicense, and/or sell copies of the Materials, and to +** permit persons to whom the Materials are furnished to do so, subject to +** the following conditions: +** +** The above copyright notice and this permission notice shall be included +** in all copies or substantial portions of the Materials. +** +** THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +** EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +** MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+** IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +** CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +** TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +** MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. +*/ + +#include "Common/GL/GLExtensions/gl_common.h" + +#define GL_COMPUTE_SHADER 0x91B9 +#define GL_MAX_COMPUTE_UNIFORM_BLOCKS 0x91BB +#define GL_MAX_COMPUTE_TEXTURE_IMAGE_UNITS 0x91BC +#define GL_MAX_COMPUTE_IMAGE_UNIFORMS 0x91BD +#define GL_MAX_COMPUTE_SHARED_MEMORY_SIZE 0x8262 +#define GL_MAX_COMPUTE_UNIFORM_COMPONENTS 0x8263 +#define GL_MAX_COMPUTE_ATOMIC_COUNTER_BUFFERS 0x8264 +#define GL_MAX_COMPUTE_ATOMIC_COUNTERS 0x8265 +#define GL_MAX_COMBINED_COMPUTE_UNIFORM_COMPONENTS 0x8266 +#define GL_MAX_COMPUTE_WORK_GROUP_INVOCATIONS 0x90EB +#define GL_MAX_COMPUTE_WORK_GROUP_COUNT 0x91BE +#define GL_MAX_COMPUTE_WORK_GROUP_SIZE 0x91BF +#define GL_COMPUTE_WORK_GROUP_SIZE 0x8267 +#define GL_UNIFORM_BLOCK_REFERENCED_BY_COMPUTE_SHADER 0x90EC +#define GL_ATOMIC_COUNTER_BUFFER_REFERENCED_BY_COMPUTE_SHADER 0x90ED +#define GL_DISPATCH_INDIRECT_BUFFER 0x90EE +#define GL_DISPATCH_INDIRECT_BUFFER_BINDING 0x90EF +#define GL_COMPUTE_SHADER_BIT 0x00000020 + +typedef void(APIENTRYP PFNDOLDISPATCHCOMPUTEPROC)(GLuint num_groups_x, GLuint num_groups_y, + GLuint num_groups_z); +typedef void(APIENTRYP PFNDOLDISPATCHCOMPUTEINDIRECTPROC)(GLintptr indirect); + +extern PFNDOLDISPATCHCOMPUTEPROC dolDispatchCompute; +extern PFNDOLDISPATCHCOMPUTEINDIRECTPROC dolDispatchComputeIndirect; + +#define glDispatchCompute dolDispatchCompute +#define glDispatchComputeIndirect dolDispatchComputeIndirect diff --git a/Source/Core/Common/GL/GLExtensions/ARB_shader_image_load_store.h b/Source/Core/Common/GL/GLExtensions/ARB_shader_image_load_store.h new file mode 100644 index 0000000000..ca366f3c5b --- /dev/null +++ b/Source/Core/Common/GL/GLExtensions/ARB_shader_image_load_store.h @@ -0,0 +1,100 @@ +/* +** Copyright (c) 2013-2015 The 
Khronos Group Inc. +** +** Permission is hereby granted, free of charge, to any person obtaining a +** copy of this software and/or associated documentation files (the +** "Materials"), to deal in the Materials without restriction, including +** without limitation the rights to use, copy, modify, merge, publish, +** distribute, sublicense, and/or sell copies of the Materials, and to +** permit persons to whom the Materials are furnished to do so, subject to +** the following conditions: +** +** The above copyright notice and this permission notice shall be included +** in all copies or substantial portions of the Materials. +** +** THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +** EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +** MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +** IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +** CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +** TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +** MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 
+*/ + +#include "Common/GL/GLExtensions/gl_common.h" + +#define GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT 0x00000001 +#define GL_ELEMENT_ARRAY_BARRIER_BIT 0x00000002 +#define GL_UNIFORM_BARRIER_BIT 0x00000004 +#define GL_TEXTURE_FETCH_BARRIER_BIT 0x00000008 +#define GL_SHADER_IMAGE_ACCESS_BARRIER_BIT 0x00000020 +#define GL_COMMAND_BARRIER_BIT 0x00000040 +#define GL_PIXEL_BUFFER_BARRIER_BIT 0x00000080 +#define GL_TEXTURE_UPDATE_BARRIER_BIT 0x00000100 +#define GL_BUFFER_UPDATE_BARRIER_BIT 0x00000200 +#define GL_FRAMEBUFFER_BARRIER_BIT 0x00000400 +#define GL_TRANSFORM_FEEDBACK_BARRIER_BIT 0x00000800 +#define GL_ATOMIC_COUNTER_BARRIER_BIT 0x00001000 +#define GL_ALL_BARRIER_BITS 0xFFFFFFFF +#define GL_MAX_IMAGE_UNITS 0x8F38 +#define GL_MAX_COMBINED_IMAGE_UNITS_AND_FRAGMENT_OUTPUTS 0x8F39 +#define GL_IMAGE_BINDING_NAME 0x8F3A +#define GL_IMAGE_BINDING_LEVEL 0x8F3B +#define GL_IMAGE_BINDING_LAYERED 0x8F3C +#define GL_IMAGE_BINDING_LAYER 0x8F3D +#define GL_IMAGE_BINDING_ACCESS 0x8F3E +#define GL_IMAGE_1D 0x904C +#define GL_IMAGE_2D 0x904D +#define GL_IMAGE_3D 0x904E +#define GL_IMAGE_2D_RECT 0x904F +#define GL_IMAGE_CUBE 0x9050 +#define GL_IMAGE_BUFFER 0x9051 +#define GL_IMAGE_1D_ARRAY 0x9052 +#define GL_IMAGE_2D_ARRAY 0x9053 +#define GL_IMAGE_CUBE_MAP_ARRAY 0x9054 +#define GL_IMAGE_2D_MULTISAMPLE 0x9055 +#define GL_IMAGE_2D_MULTISAMPLE_ARRAY 0x9056 +#define GL_INT_IMAGE_1D 0x9057 +#define GL_INT_IMAGE_2D 0x9058 +#define GL_INT_IMAGE_3D 0x9059 +#define GL_INT_IMAGE_2D_RECT 0x905A +#define GL_INT_IMAGE_CUBE 0x905B +#define GL_INT_IMAGE_BUFFER 0x905C +#define GL_INT_IMAGE_1D_ARRAY 0x905D +#define GL_INT_IMAGE_2D_ARRAY 0x905E +#define GL_INT_IMAGE_CUBE_MAP_ARRAY 0x905F +#define GL_INT_IMAGE_2D_MULTISAMPLE 0x9060 +#define GL_INT_IMAGE_2D_MULTISAMPLE_ARRAY 0x9061 +#define GL_UNSIGNED_INT_IMAGE_1D 0x9062 +#define GL_UNSIGNED_INT_IMAGE_2D 0x9063 +#define GL_UNSIGNED_INT_IMAGE_3D 0x9064 +#define GL_UNSIGNED_INT_IMAGE_2D_RECT 0x9065 +#define GL_UNSIGNED_INT_IMAGE_CUBE 0x9066 +#define 
GL_UNSIGNED_INT_IMAGE_BUFFER 0x9067 +#define GL_UNSIGNED_INT_IMAGE_1D_ARRAY 0x9068 +#define GL_UNSIGNED_INT_IMAGE_2D_ARRAY 0x9069 +#define GL_UNSIGNED_INT_IMAGE_CUBE_MAP_ARRAY 0x906A +#define GL_UNSIGNED_INT_IMAGE_2D_MULTISAMPLE 0x906B +#define GL_UNSIGNED_INT_IMAGE_2D_MULTISAMPLE_ARRAY 0x906C +#define GL_MAX_IMAGE_SAMPLES 0x906D +#define GL_IMAGE_BINDING_FORMAT 0x906E +#define GL_IMAGE_FORMAT_COMPATIBILITY_TYPE 0x90C7 +#define GL_IMAGE_FORMAT_COMPATIBILITY_BY_SIZE 0x90C8 +#define GL_IMAGE_FORMAT_COMPATIBILITY_BY_CLASS 0x90C9 +#define GL_MAX_VERTEX_IMAGE_UNIFORMS 0x90CA +#define GL_MAX_TESS_CONTROL_IMAGE_UNIFORMS 0x90CB +#define GL_MAX_TESS_EVALUATION_IMAGE_UNIFORMS 0x90CC +#define GL_MAX_GEOMETRY_IMAGE_UNIFORMS 0x90CD +#define GL_MAX_FRAGMENT_IMAGE_UNIFORMS 0x90CE +#define GL_MAX_COMBINED_IMAGE_UNIFORMS 0x90CF + +typedef void(APIENTRYP PFNDOLBINDIMAGETEXTUREPROC)(GLuint unit, GLuint texture, GLint level, + GLboolean layered, GLint layer, GLenum access, + GLenum format); +typedef void(APIENTRYP PFNDOLMEMORYBARRIERPROC)(GLbitfield barriers); + +extern PFNDOLBINDIMAGETEXTUREPROC dolBindImageTexture; +extern PFNDOLMEMORYBARRIERPROC dolMemoryBarrier; + +#define glBindImageTexture dolBindImageTexture +#define glMemoryBarrier dolMemoryBarrier diff --git a/Source/Core/Common/GL/GLExtensions/ARB_texture_storage.h b/Source/Core/Common/GL/GLExtensions/ARB_texture_storage.h new file mode 100644 index 0000000000..1686a11248 --- /dev/null +++ b/Source/Core/Common/GL/GLExtensions/ARB_texture_storage.h @@ -0,0 +1,41 @@ +/* +** Copyright (c) 2013-2015 The Khronos Group Inc. 
+** +** Permission is hereby granted, free of charge, to any person obtaining a +** copy of this software and/or associated documentation files (the +** "Materials"), to deal in the Materials without restriction, including +** without limitation the rights to use, copy, modify, merge, publish, +** distribute, sublicense, and/or sell copies of the Materials, and to +** permit persons to whom the Materials are furnished to do so, subject to +** the following conditions: +** +** The above copyright notice and this permission notice shall be included +** in all copies or substantial portions of the Materials. +** +** THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +** EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +** MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +** IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +** CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +** TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +** MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 
+*/ + +#include "Common/GL/GLExtensions/gl_common.h" + +#define GL_TEXTURE_IMMUTABLE_FORMAT 0x912F + +typedef void(APIENTRYP PFNDOLTEXSTORAGE1DPROC)(GLenum target, GLsizei levels, GLenum internalformat, + GLsizei width); +typedef void(APIENTRYP PFNDOLTEXSTORAGE2DPROC)(GLenum target, GLsizei levels, GLenum internalformat, + GLsizei width, GLsizei height); +typedef void(APIENTRYP PFNDOLTEXSTORAGE3DPROC)(GLenum target, GLsizei levels, GLenum internalformat, + GLsizei width, GLsizei height, GLsizei depth); + +extern PFNDOLTEXSTORAGE1DPROC dolTexStorage1D; +extern PFNDOLTEXSTORAGE2DPROC dolTexStorage2D; +extern PFNDOLTEXSTORAGE3DPROC dolTexStorage3D; + +#define glTexStorage1D dolTexStorage1D +#define glTexStorage2D dolTexStorage2D +#define glTexStorage3D dolTexStorage3D diff --git a/Source/Core/Common/GL/GLExtensions/GLExtensions.cpp b/Source/Core/Common/GL/GLExtensions/GLExtensions.cpp index 546bfa1b69..cba40b35dc 100644 --- a/Source/Core/Common/GL/GLExtensions/GLExtensions.cpp +++ b/Source/Core/Common/GL/GLExtensions/GLExtensions.cpp @@ -653,19 +653,12 @@ PFNDOLDRAWELEMENTSINSTANCEDBASEVERTEXBASEINSTANCEPROC dolDrawElementsInstancedBaseVertexBaseInstance; PFNDOLGETINTERNALFORMATIVPROC dolGetInternalformativ; PFNDOLGETACTIVEATOMICCOUNTERBUFFERIVPROC dolGetActiveAtomicCounterBufferiv; -PFNDOLBINDIMAGETEXTUREPROC dolBindImageTexture; -PFNDOLMEMORYBARRIERPROC dolMemoryBarrier; -PFNDOLTEXSTORAGE1DPROC dolTexStorage1D; -PFNDOLTEXSTORAGE2DPROC dolTexStorage2D; -PFNDOLTEXSTORAGE3DPROC dolTexStorage3D; PFNDOLDRAWTRANSFORMFEEDBACKINSTANCEDPROC dolDrawTransformFeedbackInstanced; PFNDOLDRAWTRANSFORMFEEDBACKSTREAMINSTANCEDPROC dolDrawTransformFeedbackStreamInstanced; // gl_4_3 PFNDOLCLEARBUFFERDATAPROC dolClearBufferData; PFNDOLCLEARBUFFERSUBDATAPROC dolClearBufferSubData; -PFNDOLDISPATCHCOMPUTEPROC dolDispatchCompute; -PFNDOLDISPATCHCOMPUTEINDIRECTPROC dolDispatchComputeIndirect; PFNDOLFRAMEBUFFERPARAMETERIPROC dolFramebufferParameteri; PFNDOLGETFRAMEBUFFERPARAMETERIVPROC 
dolGetFramebufferParameteriv; PFNDOLGETINTERNALFORMATI64VPROC dolGetInternalformati64v; @@ -905,6 +898,11 @@ PFNDOLTEXIMAGE3DMULTISAMPLEPROC dolTexImage3DMultisample; PFNDOLGETMULTISAMPLEFVPROC dolGetMultisamplefv; PFNDOLSAMPLEMASKIPROC dolSampleMaski; +// ARB_texture_storage +PFNDOLTEXSTORAGE1DPROC dolTexStorage1D; +PFNDOLTEXSTORAGE2DPROC dolTexStorage2D; +PFNDOLTEXSTORAGE3DPROC dolTexStorage3D; + // ARB_texture_storage_multisample PFNDOLTEXSTORAGE2DMULTISAMPLEPROC dolTexStorage2DMultisample; PFNDOLTEXSTORAGE3DMULTISAMPLEPROC dolTexStorage3DMultisample; @@ -989,6 +987,14 @@ PFNDOLDEPTHRANGEDNVPROC dolDepthRangedNV; PFNDOLCLEARDEPTHDNVPROC dolClearDepthdNV; PFNDOLDEPTHBOUNDSDNVPROC dolDepthBoundsdNV; +// ARB_shader_image_load_store +PFNDOLBINDIMAGETEXTUREPROC dolBindImageTexture; +PFNDOLMEMORYBARRIERPROC dolMemoryBarrier; + +// ARB_compute_shader +PFNDOLDISPATCHCOMPUTEPROC dolDispatchCompute; +PFNDOLDISPATCHCOMPUTEINDIRECTPROC dolDispatchComputeIndirect; + // Creates a GLFunc object that requires a feature #define GLFUNC_REQUIRES(x, y) \ { \ @@ -1681,6 +1687,11 @@ const GLFunc gl_function_array[] = { GLFUNC_REQUIRES(glGetMultisamplefv, "GL_ARB_texture_multisample"), GLFUNC_REQUIRES(glSampleMaski, "GL_ARB_texture_multisample"), + // ARB_texture_storage + GLFUNC_REQUIRES(glTexStorage1D, "GL_ARB_texture_storage !VERSION_4_2"), + GLFUNC_REQUIRES(glTexStorage2D, "GL_ARB_texture_storage !VERSION_4_2 |VERSION_GLES_3"), + GLFUNC_REQUIRES(glTexStorage3D, "GL_ARB_texture_storage !VERSION_4_2 |VERSION_GLES_3"), + // ARB_texture_storage_multisample GLFUNC_REQUIRES(glTexStorage2DMultisample, "GL_ARB_texture_storage_multisample !VERSION_4_3 |VERSION_GLES_3_1"), @@ -1848,6 +1859,17 @@ const GLFunc gl_function_array[] = { GLFUNC_REQUIRES(glDepthRangedNV, "GL_NV_depth_buffer_float"), GLFUNC_REQUIRES(glClearDepthdNV, "GL_NV_depth_buffer_float"), GLFUNC_REQUIRES(glDepthBoundsdNV, "GL_NV_depth_buffer_float"), + + // ARB_shader_image_load_store + GLFUNC_REQUIRES(glBindImageTexture, + 
"GL_ARB_shader_image_load_store !VERSION_4_2 |VERSION_GLES_3_1"), + GLFUNC_REQUIRES(glMemoryBarrier, + "GL_ARB_shader_image_load_store !VERSION_4_2 |VERSION_GLES_3_1"), + + // ARB_compute_shader + GLFUNC_REQUIRES(glDispatchCompute, "GL_ARB_compute_shader !VERSION_4_3 |VERSION_GLES_3_1"), + GLFUNC_REQUIRES(glDispatchComputeIndirect, + "GL_ARB_compute_shader !VERSION_4_3 |VERSION_GLES_3_1"), }; namespace GLExtensions diff --git a/Source/Core/Common/GL/GLExtensions/GLExtensions.h b/Source/Core/Common/GL/GLExtensions/GLExtensions.h index 4c58167700..49e9fc282f 100644 --- a/Source/Core/Common/GL/GLExtensions/GLExtensions.h +++ b/Source/Core/Common/GL/GLExtensions/GLExtensions.h @@ -12,6 +12,7 @@ #include "Common/GL/GLExtensions/ARB_blend_func_extended.h" #include "Common/GL/GLExtensions/ARB_buffer_storage.h" #include "Common/GL/GLExtensions/ARB_clip_control.h" +#include "Common/GL/GLExtensions/ARB_compute_shader.h" #include "Common/GL/GLExtensions/ARB_copy_image.h" #include "Common/GL/GLExtensions/ARB_debug_output.h" #include "Common/GL/GLExtensions/ARB_draw_elements_base_vertex.h" @@ -21,9 +22,11 @@ #include "Common/GL/GLExtensions/ARB_occlusion_query2.h" #include "Common/GL/GLExtensions/ARB_sample_shading.h" #include "Common/GL/GLExtensions/ARB_sampler_objects.h" +#include "Common/GL/GLExtensions/ARB_shader_image_load_store.h" #include "Common/GL/GLExtensions/ARB_shader_storage_buffer_object.h" #include "Common/GL/GLExtensions/ARB_sync.h" #include "Common/GL/GLExtensions/ARB_texture_multisample.h" +#include "Common/GL/GLExtensions/ARB_texture_storage.h" #include "Common/GL/GLExtensions/ARB_texture_storage_multisample.h" #include "Common/GL/GLExtensions/ARB_uniform_buffer_object.h" #include "Common/GL/GLExtensions/ARB_vertex_array_object.h" diff --git a/Source/Core/Common/GL/GLExtensions/gl_4_2.h b/Source/Core/Common/GL/GLExtensions/gl_4_2.h index bb9286bf32..e4eab3f475 100644 --- a/Source/Core/Common/GL/GLExtensions/gl_4_2.h +++ 
b/Source/Core/Common/GL/GLExtensions/gl_4_2.h @@ -66,75 +66,10 @@ #define GL_ACTIVE_ATOMIC_COUNTER_BUFFERS 0x92D9 #define GL_UNIFORM_ATOMIC_COUNTER_BUFFER_INDEX 0x92DA #define GL_UNSIGNED_INT_ATOMIC_COUNTER 0x92DB -#define GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT 0x00000001 -#define GL_ELEMENT_ARRAY_BARRIER_BIT 0x00000002 -#define GL_UNIFORM_BARRIER_BIT 0x00000004 -#define GL_TEXTURE_FETCH_BARRIER_BIT 0x00000008 -#define GL_SHADER_IMAGE_ACCESS_BARRIER_BIT 0x00000020 -#define GL_COMMAND_BARRIER_BIT 0x00000040 -#define GL_PIXEL_BUFFER_BARRIER_BIT 0x00000080 -#define GL_TEXTURE_UPDATE_BARRIER_BIT 0x00000100 -#define GL_BUFFER_UPDATE_BARRIER_BIT 0x00000200 -#define GL_FRAMEBUFFER_BARRIER_BIT 0x00000400 -#define GL_TRANSFORM_FEEDBACK_BARRIER_BIT 0x00000800 -#define GL_ATOMIC_COUNTER_BARRIER_BIT 0x00001000 -#define GL_ALL_BARRIER_BITS 0xFFFFFFFF -#define GL_MAX_IMAGE_UNITS 0x8F38 -#define GL_MAX_COMBINED_IMAGE_UNITS_AND_FRAGMENT_OUTPUTS 0x8F39 -#define GL_IMAGE_BINDING_NAME 0x8F3A -#define GL_IMAGE_BINDING_LEVEL 0x8F3B -#define GL_IMAGE_BINDING_LAYERED 0x8F3C -#define GL_IMAGE_BINDING_LAYER 0x8F3D -#define GL_IMAGE_BINDING_ACCESS 0x8F3E -#define GL_IMAGE_1D 0x904C -#define GL_IMAGE_2D 0x904D -#define GL_IMAGE_3D 0x904E -#define GL_IMAGE_2D_RECT 0x904F -#define GL_IMAGE_CUBE 0x9050 -#define GL_IMAGE_BUFFER 0x9051 -#define GL_IMAGE_1D_ARRAY 0x9052 -#define GL_IMAGE_2D_ARRAY 0x9053 -#define GL_IMAGE_CUBE_MAP_ARRAY 0x9054 -#define GL_IMAGE_2D_MULTISAMPLE 0x9055 -#define GL_IMAGE_2D_MULTISAMPLE_ARRAY 0x9056 -#define GL_INT_IMAGE_1D 0x9057 -#define GL_INT_IMAGE_2D 0x9058 -#define GL_INT_IMAGE_3D 0x9059 -#define GL_INT_IMAGE_2D_RECT 0x905A -#define GL_INT_IMAGE_CUBE 0x905B -#define GL_INT_IMAGE_BUFFER 0x905C -#define GL_INT_IMAGE_1D_ARRAY 0x905D -#define GL_INT_IMAGE_2D_ARRAY 0x905E -#define GL_INT_IMAGE_CUBE_MAP_ARRAY 0x905F -#define GL_INT_IMAGE_2D_MULTISAMPLE 0x9060 -#define GL_INT_IMAGE_2D_MULTISAMPLE_ARRAY 0x9061 -#define GL_UNSIGNED_INT_IMAGE_1D 0x9062 -#define 
GL_UNSIGNED_INT_IMAGE_2D 0x9063 -#define GL_UNSIGNED_INT_IMAGE_3D 0x9064 -#define GL_UNSIGNED_INT_IMAGE_2D_RECT 0x9065 -#define GL_UNSIGNED_INT_IMAGE_CUBE 0x9066 -#define GL_UNSIGNED_INT_IMAGE_BUFFER 0x9067 -#define GL_UNSIGNED_INT_IMAGE_1D_ARRAY 0x9068 -#define GL_UNSIGNED_INT_IMAGE_2D_ARRAY 0x9069 -#define GL_UNSIGNED_INT_IMAGE_CUBE_MAP_ARRAY 0x906A -#define GL_UNSIGNED_INT_IMAGE_2D_MULTISAMPLE 0x906B -#define GL_UNSIGNED_INT_IMAGE_2D_MULTISAMPLE_ARRAY 0x906C -#define GL_MAX_IMAGE_SAMPLES 0x906D -#define GL_IMAGE_BINDING_FORMAT 0x906E -#define GL_IMAGE_FORMAT_COMPATIBILITY_TYPE 0x90C7 -#define GL_IMAGE_FORMAT_COMPATIBILITY_BY_SIZE 0x90C8 -#define GL_IMAGE_FORMAT_COMPATIBILITY_BY_CLASS 0x90C9 -#define GL_MAX_VERTEX_IMAGE_UNIFORMS 0x90CA -#define GL_MAX_TESS_CONTROL_IMAGE_UNIFORMS 0x90CB -#define GL_MAX_TESS_EVALUATION_IMAGE_UNIFORMS 0x90CC -#define GL_MAX_GEOMETRY_IMAGE_UNIFORMS 0x90CD -#define GL_MAX_FRAGMENT_IMAGE_UNIFORMS 0x90CE -#define GL_MAX_COMBINED_IMAGE_UNIFORMS 0x90CF #define GL_COMPRESSED_RGBA_BPTC_UNORM 0x8E8C #define GL_COMPRESSED_SRGB_ALPHA_BPTC_UNORM 0x8E8D #define GL_COMPRESSED_RGB_BPTC_SIGNED_FLOAT 0x8E8E #define GL_COMPRESSED_RGB_BPTC_UNSIGNED_FLOAT 0x8E8F -#define GL_TEXTURE_IMMUTABLE_FORMAT 0x912F typedef void(APIENTRYP PFNDOLDRAWARRAYSINSTANCEDBASEINSTANCEPROC)(GLenum mode, GLint first, GLsizei count, @@ -152,16 +87,6 @@ typedef void(APIENTRYP PFNDOLGETINTERNALFORMATIVPROC)(GLenum target, GLenum inte GLenum pname, GLsizei bufSize, GLint* params); typedef void(APIENTRYP PFNDOLGETACTIVEATOMICCOUNTERBUFFERIVPROC)(GLuint program, GLuint bufferIndex, GLenum pname, GLint* params); -typedef void(APIENTRYP PFNDOLBINDIMAGETEXTUREPROC)(GLuint unit, GLuint texture, GLint level, - GLboolean layered, GLint layer, GLenum access, - GLenum format); -typedef void(APIENTRYP PFNDOLMEMORYBARRIERPROC)(GLbitfield barriers); -typedef void(APIENTRYP PFNDOLTEXSTORAGE1DPROC)(GLenum target, GLsizei levels, GLenum internalformat, - GLsizei width); -typedef void(APIENTRYP 
PFNDOLTEXSTORAGE2DPROC)(GLenum target, GLsizei levels, GLenum internalformat, - GLsizei width, GLsizei height); -typedef void(APIENTRYP PFNDOLTEXSTORAGE3DPROC)(GLenum target, GLsizei levels, GLenum internalformat, - GLsizei width, GLsizei height, GLsizei depth); typedef void(APIENTRYP PFNDOLDRAWTRANSFORMFEEDBACKINSTANCEDPROC)(GLenum mode, GLuint id, GLsizei instancecount); typedef void(APIENTRYP PFNDOLDRAWTRANSFORMFEEDBACKSTREAMINSTANCEDPROC)(GLenum mode, GLuint id, @@ -174,11 +99,6 @@ extern PFNDOLDRAWELEMENTSINSTANCEDBASEVERTEXBASEINSTANCEPROC dolDrawElementsInstancedBaseVertexBaseInstance; extern PFNDOLGETINTERNALFORMATIVPROC dolGetInternalformativ; extern PFNDOLGETACTIVEATOMICCOUNTERBUFFERIVPROC dolGetActiveAtomicCounterBufferiv; -extern PFNDOLBINDIMAGETEXTUREPROC dolBindImageTexture; -extern PFNDOLMEMORYBARRIERPROC dolMemoryBarrier; -extern PFNDOLTEXSTORAGE1DPROC dolTexStorage1D; -extern PFNDOLTEXSTORAGE2DPROC dolTexStorage2D; -extern PFNDOLTEXSTORAGE3DPROC dolTexStorage3D; extern PFNDOLDRAWTRANSFORMFEEDBACKINSTANCEDPROC dolDrawTransformFeedbackInstanced; extern PFNDOLDRAWTRANSFORMFEEDBACKSTREAMINSTANCEDPROC dolDrawTransformFeedbackStreamInstanced; @@ -187,10 +107,5 @@ extern PFNDOLDRAWTRANSFORMFEEDBACKSTREAMINSTANCEDPROC dolDrawTransformFeedbackSt #define glDrawElementsInstancedBaseVertexBaseInstance dolDrawElementsInstancedBaseVertexBaseInstance #define glGetInternalformativ dolGetInternalformativ #define glGetActiveAtomicCounterBufferiv dolGetActiveAtomicCounterBufferiv -#define glBindImageTexture dolBindImageTexture -#define glMemoryBarrier dolMemoryBarrier -#define glTexStorage1D dolTexStorage1D -#define glTexStorage2D dolTexStorage2D -#define glTexStorage3D dolTexStorage3D #define glDrawTransformFeedbackInstanced dolDrawTransformFeedbackInstanced #define glDrawTransformFeedbackStreamInstanced dolDrawTransformFeedbackStreamInstanced diff --git a/Source/Core/Common/GL/GLExtensions/gl_4_3.h b/Source/Core/Common/GL/GLExtensions/gl_4_3.h index 
e81b24e370..98b18ad246 100644 --- a/Source/Core/Common/GL/GLExtensions/gl_4_3.h +++ b/Source/Core/Common/GL/GLExtensions/gl_4_3.h @@ -38,24 +38,6 @@ #define GL_PRIMITIVE_RESTART_FIXED_INDEX 0x8D69 #define GL_ANY_SAMPLES_PASSED_CONSERVATIVE 0x8D6A #define GL_MAX_ELEMENT_INDEX 0x8D6B -#define GL_COMPUTE_SHADER 0x91B9 -#define GL_MAX_COMPUTE_UNIFORM_BLOCKS 0x91BB -#define GL_MAX_COMPUTE_TEXTURE_IMAGE_UNITS 0x91BC -#define GL_MAX_COMPUTE_IMAGE_UNIFORMS 0x91BD -#define GL_MAX_COMPUTE_SHARED_MEMORY_SIZE 0x8262 -#define GL_MAX_COMPUTE_UNIFORM_COMPONENTS 0x8263 -#define GL_MAX_COMPUTE_ATOMIC_COUNTER_BUFFERS 0x8264 -#define GL_MAX_COMPUTE_ATOMIC_COUNTERS 0x8265 -#define GL_MAX_COMBINED_COMPUTE_UNIFORM_COMPONENTS 0x8266 -#define GL_MAX_COMPUTE_WORK_GROUP_INVOCATIONS 0x90EB -#define GL_MAX_COMPUTE_WORK_GROUP_COUNT 0x91BE -#define GL_MAX_COMPUTE_WORK_GROUP_SIZE 0x91BF -#define GL_COMPUTE_WORK_GROUP_SIZE 0x8267 -#define GL_UNIFORM_BLOCK_REFERENCED_BY_COMPUTE_SHADER 0x90EC -#define GL_ATOMIC_COUNTER_BUFFER_REFERENCED_BY_COMPUTE_SHADER 0x90ED -#define GL_DISPATCH_INDIRECT_BUFFER 0x90EE -#define GL_DISPATCH_INDIRECT_BUFFER_BINDING 0x90EF -#define GL_COMPUTE_SHADER_BIT 0x00000020 #define GL_DEBUG_OUTPUT_SYNCHRONOUS 0x8242 #define GL_DEBUG_NEXT_LOGGED_MESSAGE_LENGTH 0x8243 #define GL_DEBUG_CALLBACK_FUNCTION 0x8244 @@ -287,9 +269,6 @@ typedef void(APIENTRYP PFNDOLCLEARBUFFERDATAPROC)(GLenum target, GLenum internal typedef void(APIENTRYP PFNDOLCLEARBUFFERSUBDATAPROC)(GLenum target, GLenum internalformat, GLintptr offset, GLsizeiptr size, GLenum format, GLenum type, const void* data); -typedef void(APIENTRYP PFNDOLDISPATCHCOMPUTEPROC)(GLuint num_groups_x, GLuint num_groups_y, - GLuint num_groups_z); -typedef void(APIENTRYP PFNDOLDISPATCHCOMPUTEINDIRECTPROC)(GLintptr indirect); typedef void(APIENTRYP PFNDOLFRAMEBUFFERPARAMETERIPROC)(GLenum target, GLenum pname, GLint param); typedef void(APIENTRYP PFNDOLGETFRAMEBUFFERPARAMETERIVPROC)(GLenum target, GLenum pname, GLint* params); @@ 
-348,8 +327,6 @@ typedef void(APIENTRYP PFNDOLVERTEXBINDINGDIVISORPROC)(GLuint bindingindex, GLui extern PFNDOLCLEARBUFFERDATAPROC dolClearBufferData; extern PFNDOLCLEARBUFFERSUBDATAPROC dolClearBufferSubData; -extern PFNDOLDISPATCHCOMPUTEPROC dolDispatchCompute; -extern PFNDOLDISPATCHCOMPUTEINDIRECTPROC dolDispatchComputeIndirect; extern PFNDOLFRAMEBUFFERPARAMETERIPROC dolFramebufferParameteri; extern PFNDOLGETFRAMEBUFFERPARAMETERIVPROC dolGetFramebufferParameteriv; extern PFNDOLGETINTERNALFORMATI64VPROC dolGetInternalformati64v; @@ -378,8 +355,6 @@ extern PFNDOLVERTEXBINDINGDIVISORPROC dolVertexBindingDivisor; #define glClearBufferData dolClearBufferData #define glClearBufferSubData dolClearBufferSubData -#define glDispatchCompute dolDispatchCompute -#define glDispatchComputeIndirect dolDispatchComputeIndirect #define glFramebufferParameteri dolFramebufferParameteri #define glGetFramebufferParameteriv dolGetFramebufferParameteriv #define glGetInternalformati64v dolGetInternalformati64v diff --git a/Source/Core/DolphinWX/VideoConfigDiag.cpp b/Source/Core/DolphinWX/VideoConfigDiag.cpp index fd6db1942c..49a004b73c 100644 --- a/Source/Core/DolphinWX/VideoConfigDiag.cpp +++ b/Source/Core/DolphinWX/VideoConfigDiag.cpp @@ -284,6 +284,10 @@ static wxString true_color_desc = wxTRANSLATE("Forces the game to render the RGB color channels in 24-bit, thereby increasing " "quality by reducing color banding.\nIt has no impact on performance and causes " "few graphical issues.\n\n\nIf unsure, leave this checked."); +static wxString gpu_texture_decoding_desc = + wxTRANSLATE("Enables texture decoding using the GPU instead of the CPU. This may result in " + "performance gains in some scenarios, or systems where the CPU is the bottleneck." + "\n\nIf unsure, leave this unchecked."); #if !defined(__APPLE__) // Search for available resolutions - TODO: Move to Common? 
@@ -755,6 +759,15 @@ VideoConfigDiag::VideoConfigDiag(wxWindow* parent, const std::string& title) slide_szr->Add(new wxStaticText(page_hacks, wxID_ANY, _("Fast")), 0, wxALIGN_CENTER_VERTICAL); szr_safetex->Add(slide_szr, 1, wxEXPAND | wxLEFT | wxRIGHT, space5); + + if (vconfig.backend_info.bSupportsGPUTextureDecoding) + { + szr_safetex->Add(CreateCheckBox(page_hacks, _("GPU Texture Decoding"), + wxGetTranslation(gpu_texture_decoding_desc), + vconfig.bEnableGPUTextureDecoding), + 1, wxEXPAND | wxLEFT | wxRIGHT, space5); + } + if (slider_pos == -1) { stc_slider->Disable(); diff --git a/Source/Core/VideoBackends/D3D/main.cpp b/Source/Core/VideoBackends/D3D/main.cpp index e8a51d3441..bc0af68180 100644 --- a/Source/Core/VideoBackends/D3D/main.cpp +++ b/Source/Core/VideoBackends/D3D/main.cpp @@ -67,6 +67,7 @@ void VideoBackend::InitBackendInfo() g_Config.backend_info.bSupportsPrimitiveRestart = true; g_Config.backend_info.bSupportsOversizedViewports = false; g_Config.backend_info.bSupportsGeometryShaders = true; + g_Config.backend_info.bSupportsComputeShaders = false; g_Config.backend_info.bSupports3DVision = true; g_Config.backend_info.bSupportsPostProcessing = false; g_Config.backend_info.bSupportsPaletteConversion = true; @@ -75,6 +76,7 @@ void VideoBackend::InitBackendInfo() g_Config.backend_info.bSupportsReversedDepthRange = false; g_Config.backend_info.bSupportsMultithreading = false; g_Config.backend_info.bSupportsInternalResolutionFrameDumps = false; + g_Config.backend_info.bSupportsGPUTextureDecoding = false; IDXGIFactory* factory; IDXGIAdapter* ad; diff --git a/Source/Core/VideoBackends/D3D12/main.cpp b/Source/Core/VideoBackends/D3D12/main.cpp index 7a48647c0e..2c95bff055 100644 --- a/Source/Core/VideoBackends/D3D12/main.cpp +++ b/Source/Core/VideoBackends/D3D12/main.cpp @@ -70,6 +70,7 @@ void VideoBackend::InitBackendInfo() g_Config.backend_info.bSupportsPrimitiveRestart = true; g_Config.backend_info.bSupportsOversizedViewports = false; 
g_Config.backend_info.bSupportsGeometryShaders = true; + g_Config.backend_info.bSupportsComputeShaders = false; g_Config.backend_info.bSupports3DVision = true; g_Config.backend_info.bSupportsPostProcessing = false; g_Config.backend_info.bSupportsPaletteConversion = true; @@ -78,6 +79,7 @@ void VideoBackend::InitBackendInfo() g_Config.backend_info.bSupportsReversedDepthRange = false; g_Config.backend_info.bSupportsMultithreading = false; g_Config.backend_info.bSupportsInternalResolutionFrameDumps = false; + g_Config.backend_info.bSupportsGPUTextureDecoding = false; IDXGIFactory* factory; IDXGIAdapter* ad; diff --git a/Source/Core/VideoBackends/Null/NullBackend.cpp b/Source/Core/VideoBackends/Null/NullBackend.cpp index 1fe0914a91..60a6f637ac 100644 --- a/Source/Core/VideoBackends/Null/NullBackend.cpp +++ b/Source/Core/VideoBackends/Null/NullBackend.cpp @@ -30,6 +30,7 @@ void VideoBackend::InitBackendInfo() g_Config.backend_info.bSupportsPrimitiveRestart = true; g_Config.backend_info.bSupportsOversizedViewports = true; g_Config.backend_info.bSupportsGeometryShaders = true; + g_Config.backend_info.bSupportsComputeShaders = false; g_Config.backend_info.bSupports3DVision = false; g_Config.backend_info.bSupportsEarlyZ = true; g_Config.backend_info.bSupportsBindingLayout = true; @@ -43,6 +44,7 @@ void VideoBackend::InitBackendInfo() g_Config.backend_info.bSupportsReversedDepthRange = true; g_Config.backend_info.bSupportsMultithreading = false; g_Config.backend_info.bSupportsInternalResolutionFrameDumps = false; + g_Config.backend_info.bSupportsGPUTextureDecoding = false; // aamodes: We only support 1 sample, so no MSAA g_Config.backend_info.Adapters.clear(); diff --git a/Source/Core/VideoBackends/OGL/FramebufferManager.cpp b/Source/Core/VideoBackends/OGL/FramebufferManager.cpp index 5fc1651285..59a8679fd0 100644 --- a/Source/Core/VideoBackends/OGL/FramebufferManager.cpp +++ b/Source/Core/VideoBackends/OGL/FramebufferManager.cpp @@ -65,7 +65,7 @@ GLuint 
FramebufferManager::CreateTexture(GLenum texture_type, GLenum internal_fo } else if (texture_type == GL_TEXTURE_2D_MULTISAMPLE_ARRAY) { - if (g_ogl_config.bSupports3DTextureStorage) + if (g_ogl_config.bSupports3DTextureStorageMultisample) glTexStorage3DMultisample(texture_type, m_msaaSamples, internal_format, m_targetWidth, m_targetHeight, m_EFBLayers, false); else @@ -74,7 +74,7 @@ GLuint FramebufferManager::CreateTexture(GLenum texture_type, GLenum internal_fo } else if (texture_type == GL_TEXTURE_2D_MULTISAMPLE) { - if (g_ogl_config.bSupports2DTextureStorage) + if (g_ogl_config.bSupports2DTextureStorageMultisample) glTexStorage2DMultisample(texture_type, m_msaaSamples, internal_format, m_targetWidth, m_targetHeight, false); else diff --git a/Source/Core/VideoBackends/OGL/GPUTimer.h b/Source/Core/VideoBackends/OGL/GPUTimer.h new file mode 100644 index 0000000000..50724ab06f --- /dev/null +++ b/Source/Core/VideoBackends/OGL/GPUTimer.h @@ -0,0 +1,105 @@ +// Copyright 2016 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license.txt file included. + +#pragma once + +#include "Common/GL/GLExtensions/GLExtensions.h" + +#ifndef GL_TIME_ELAPSED +#define GL_TIME_ELAPSED 0x88BF +#endif + +namespace OGL +{ +/* + * This class can be used to measure the time it takes for the GPU to perform a draw call + * or compute dispatch. To use: + * + * - Create an instance of GPUTimer before issuing the draw call. + * (this can be before or after any binding that needs to be done) + * + * - (optionally) call Begin(). This is not needed for a single draw call. + * + * - Issue the draw call or compute dispatch as normal. + * + * - (optionally) call End(). This is not necessary for a single draw call. + * + * - Call GetTime{Seconds,Milliseconds,Nanoseconds} to determine how long the operation + * took to execute on the GPU. 
+ * + * NOTE: When the timer is read back, this will force a GL flush, so the more often a timer is used, + * the larger of a performance impact it will have. Only one timer can be active at any time, due to + * using GL_TIME_ELAPSED. This is not enforced by the class, however. + * + */ +class GPUTimer final +{ +public: + GPUTimer() + { + glGenQueries(1, &m_query_id); + Begin(); + } + + ~GPUTimer() + { + End(); + glDeleteQueries(1, &m_query_id); + } + + void Begin() + { + if (m_started) + glEndQuery(GL_TIME_ELAPSED); + + glBeginQuery(GL_TIME_ELAPSED, m_query_id); + m_started = true; + } + + void End() + { + if (!m_started) + return; + + glEndQuery(GL_TIME_ELAPSED); + m_started = false; + } + + double GetTimeSeconds() + { + GetResult(); + return static_cast(m_result) / 1000000000.0; + } + + double GetTimeMilliseconds() + { + GetResult(); + return static_cast(m_result) / 1000000.0; + } + + u32 GetTimeNanoseconds() + { + GetResult(); + return m_result; + } + +private: + void GetResult() + { + if (m_has_result) + return; + + if (m_started) + End(); + + glGetQueryObjectuiv(m_query_id, GL_QUERY_RESULT, &m_result); + m_has_result = true; + } + + GLuint m_query_id; + GLuint m_result = 0; + bool m_started = false; + bool m_has_result = false; +}; +} // namespace OGL diff --git a/Source/Core/VideoBackends/OGL/OGL.vcxproj b/Source/Core/VideoBackends/OGL/OGL.vcxproj index 0c234a8e71..3b945793a4 100644 --- a/Source/Core/VideoBackends/OGL/OGL.vcxproj +++ b/Source/Core/VideoBackends/OGL/OGL.vcxproj @@ -53,6 +53,7 @@ + @@ -79,4 +80,4 @@ - + \ No newline at end of file diff --git a/Source/Core/VideoBackends/OGL/OGL.vcxproj.filters b/Source/Core/VideoBackends/OGL/OGL.vcxproj.filters index 282934d830..201a4045f8 100644 --- a/Source/Core/VideoBackends/OGL/OGL.vcxproj.filters +++ b/Source/Core/VideoBackends/OGL/OGL.vcxproj.filters @@ -90,8 +90,11 @@ + + GLUtil + - + \ No newline at end of file diff --git a/Source/Core/VideoBackends/OGL/ProgramShaderCache.cpp 
b/Source/Core/VideoBackends/OGL/ProgramShaderCache.cpp index 508d6e41ed..3d4aeea2fc 100644 --- a/Source/Core/VideoBackends/OGL/ProgramShaderCache.cpp +++ b/Source/Core/VideoBackends/OGL/ProgramShaderCache.cpp @@ -65,6 +65,8 @@ static std::string GetGLSLVersionString() return "#version 330"; case GLSL_400: return "#version 400"; + case GLSL_430: + return "#version 430"; default: // Shouldn't ever hit this return "#version ERROR"; @@ -103,27 +105,30 @@ void SHADER::SetProgramVariables() } } -void SHADER::SetProgramBindings() +void SHADER::SetProgramBindings(bool is_compute) { - if (g_ActiveConfig.backend_info.bSupportsDualSourceBlend) + if (!is_compute) { - // So we do support extended blending - // So we need to set a few more things here. - // Bind our out locations - glBindFragDataLocationIndexed(glprogid, 0, 0, "ocol0"); - glBindFragDataLocationIndexed(glprogid, 0, 1, "ocol1"); + if (g_ActiveConfig.backend_info.bSupportsDualSourceBlend) + { + // So we do support extended blending + // So we need to set a few more things here. 
+ // Bind our out locations + glBindFragDataLocationIndexed(glprogid, 0, 0, "ocol0"); + glBindFragDataLocationIndexed(glprogid, 0, 1, "ocol1"); + } + // Need to set some attribute locations + glBindAttribLocation(glprogid, SHADER_POSITION_ATTRIB, "rawpos"); + + glBindAttribLocation(glprogid, SHADER_POSMTX_ATTRIB, "posmtx"); + + glBindAttribLocation(glprogid, SHADER_COLOR0_ATTRIB, "color0"); + glBindAttribLocation(glprogid, SHADER_COLOR1_ATTRIB, "color1"); + + glBindAttribLocation(glprogid, SHADER_NORM0_ATTRIB, "rawnorm0"); + glBindAttribLocation(glprogid, SHADER_NORM1_ATTRIB, "rawnorm1"); + glBindAttribLocation(glprogid, SHADER_NORM2_ATTRIB, "rawnorm2"); } - // Need to set some attribute locations - glBindAttribLocation(glprogid, SHADER_POSITION_ATTRIB, "rawpos"); - - glBindAttribLocation(glprogid, SHADER_POSMTX_ATTRIB, "posmtx"); - - glBindAttribLocation(glprogid, SHADER_COLOR0_ATTRIB, "color0"); - glBindAttribLocation(glprogid, SHADER_COLOR1_ATTRIB, "color1"); - - glBindAttribLocation(glprogid, SHADER_NORM0_ATTRIB, "rawnorm0"); - glBindAttribLocation(glprogid, SHADER_NORM1_ATTRIB, "rawnorm1"); - glBindAttribLocation(glprogid, SHADER_NORM2_ATTRIB, "rawnorm2"); for (int i = 0; i < 8; i++) { @@ -281,7 +286,7 @@ bool ProgramShaderCache::CompileShader(SHADER& shader, const std::string& vcode, if (g_ogl_config.bSupportsGLSLCache) glProgramParameteri(pid, GL_PROGRAM_BINARY_RETRIEVABLE_HINT, GL_TRUE); - shader.SetProgramBindings(); + shader.SetProgramBindings(false); glLinkProgram(pid); @@ -296,10 +301,10 @@ bool ProgramShaderCache::CompileShader(SHADER& shader, const std::string& vcode, glGetProgramiv(pid, GL_INFO_LOG_LENGTH, &length); if (linkStatus != GL_TRUE || (length > 1 && DEBUG_GLSL)) { - GLsizei charsWritten; - GLchar* infoLog = new GLchar[length]; - glGetProgramInfoLog(pid, length, &charsWritten, infoLog); - ERROR_LOG(VIDEO, "Program info log:\n%s", infoLog); + std::string info_log; + info_log.resize(length); + glGetProgramInfoLog(pid, length, &length, 
&info_log[0]); + ERROR_LOG(VIDEO, "Program info log:\n%s", info_log.c_str()); std::string filename = StringFromFormat("%sbad_p_%d.txt", File::GetUserPath(D_DUMP_IDX).c_str(), num_failures++); @@ -308,7 +313,7 @@ bool ProgramShaderCache::CompileShader(SHADER& shader, const std::string& vcode, file << s_glsl_header << vcode << s_glsl_header << pcode; if (!gcode.empty()) file << s_glsl_header << gcode; - file << infoLog; + file << info_log; file.close(); if (linkStatus != GL_TRUE) @@ -316,10 +321,8 @@ bool ProgramShaderCache::CompileShader(SHADER& shader, const std::string& vcode, PanicAlert("Failed to link shaders: %s\n" "Debug info (%s, %s, %s):\n%s", filename.c_str(), g_ogl_config.gl_vendor, g_ogl_config.gl_renderer, - g_ogl_config.gl_version, infoLog); + g_ogl_config.gl_version, info_log.c_str()); } - - delete[] infoLog; } if (linkStatus != GL_TRUE) { @@ -336,6 +339,73 @@ bool ProgramShaderCache::CompileShader(SHADER& shader, const std::string& vcode, return true; } +bool ProgramShaderCache::CompileComputeShader(SHADER& shader, const std::string& code) +{ + // We need to enable GL_ARB_compute_shader for drivers that support the extension, + // but not GLSL 4.3. Mesa is one example. 
+ std::string header; + if (g_ActiveConfig.backend_info.bSupportsComputeShaders && + g_ogl_config.eSupportedGLSLVersion < GLSL_430) + { + header = "#extension GL_ARB_compute_shader : enable\n"; + } + + GLuint shader_id = CompileSingleShader(GL_COMPUTE_SHADER, header + code); + if (!shader_id) + return false; + + GLuint pid = shader.glprogid = glCreateProgram(); + glAttachShader(pid, shader_id); + if (g_ogl_config.bSupportsGLSLCache) + glProgramParameteri(pid, GL_PROGRAM_BINARY_RETRIEVABLE_HINT, GL_TRUE); + + shader.SetProgramBindings(true); + + glLinkProgram(pid); + + // original shaders aren't needed any more + glDeleteShader(shader_id); + + GLint linkStatus; + glGetProgramiv(pid, GL_LINK_STATUS, &linkStatus); + GLsizei length = 0; + glGetProgramiv(pid, GL_INFO_LOG_LENGTH, &length); + if (linkStatus != GL_TRUE || (length > 1 && DEBUG_GLSL)) + { + std::string info_log; + info_log.resize(length); + glGetProgramInfoLog(pid, length, &length, &info_log[0]); + ERROR_LOG(VIDEO, "Program info log:\n%s", info_log.c_str()); + + std::string filename = + StringFromFormat("%sbad_p_%d.txt", File::GetUserPath(D_DUMP_IDX).c_str(), num_failures++); + std::ofstream file; + OpenFStream(file, filename, std::ios_base::out); + file << s_glsl_header << code; + file << info_log; + file.close(); + + if (linkStatus != GL_TRUE) + { + PanicAlert("Failed to link shaders: %s\n" + "Debug info (%s, %s, %s):\n%s", + filename.c_str(), g_ogl_config.gl_vendor, g_ogl_config.gl_renderer, + g_ogl_config.gl_version, info_log.c_str()); + } + } + if (linkStatus != GL_TRUE) + { + // Compile failed + ERROR_LOG(VIDEO, "Program linking failed; see info log"); + + // Don't try to use this shader + glDeleteProgram(pid); + return false; + } + + return true; +} + GLuint ProgramShaderCache::CompileSingleShader(GLuint type, const std::string& code) { GLuint result = glCreateShader(type); @@ -351,31 +421,43 @@ GLuint ProgramShaderCache::CompileSingleShader(GLuint type, const std::string& c if (compileStatus != 
GL_TRUE || (length > 1 && DEBUG_GLSL)) { - GLsizei charsWritten; - GLchar* infoLog = new GLchar[length]; - glGetShaderInfoLog(result, length, &charsWritten, infoLog); - ERROR_LOG(VIDEO, "%s Shader info log:\n%s", - type == GL_VERTEX_SHADER ? "VS" : type == GL_FRAGMENT_SHADER ? "PS" : "GS", infoLog); + std::string info_log; + info_log.resize(length); + glGetShaderInfoLog(result, length, &length, &info_log[0]); + + const char* prefix = ""; + switch (type) + { + case GL_VERTEX_SHADER: + prefix = "vs"; + break; + case GL_GEOMETRY_SHADER: + prefix = "gs"; + break; + case GL_FRAGMENT_SHADER: + prefix = "ps"; + break; + case GL_COMPUTE_SHADER: + prefix = "cs"; + break; + } + + ERROR_LOG(VIDEO, "%s Shader info log:\n%s", prefix, info_log.c_str()); std::string filename = StringFromFormat( - "%sbad_%s_%04i.txt", File::GetUserPath(D_DUMP_IDX).c_str(), - type == GL_VERTEX_SHADER ? "vs" : type == GL_FRAGMENT_SHADER ? "ps" : "gs", num_failures++); + "%sbad_%s_%04i.txt", File::GetUserPath(D_DUMP_IDX).c_str(), prefix, num_failures++); std::ofstream file; OpenFStream(file, filename, std::ios_base::out); - file << s_glsl_header << code << infoLog; + file << s_glsl_header << code << info_log; file.close(); if (compileStatus != GL_TRUE) { PanicAlert("Failed to compile %s shader: %s\n" "Debug info (%s, %s, %s):\n%s", - type == GL_VERTEX_SHADER ? "vertex" : type == GL_FRAGMENT_SHADER ? 
"pixel" : - "geometry", - filename.c_str(), g_ogl_config.gl_vendor, g_ogl_config.gl_renderer, - g_ogl_config.gl_version, infoLog); + prefix, filename.c_str(), g_ogl_config.gl_vendor, g_ogl_config.gl_renderer, + g_ogl_config.gl_version, info_log.c_str()); } - - delete[] infoLog; } if (compileStatus != GL_TRUE) { @@ -539,11 +621,9 @@ void ProgramShaderCache::CreateHeader() std::string earlyz_string = ""; if (g_ActiveConfig.backend_info.bSupportsEarlyZ) { - if (g_ogl_config.bSupportsEarlyFragmentTests) + if (g_ogl_config.bSupportsImageLoadStore) { earlyz_string = "#define FORCE_EARLY_Z layout(early_fragment_tests) in\n"; - if (!is_glsles) // GLES supports this by default - earlyz_string += "#extension GL_ARB_shader_image_load_store : enable\n"; } else if (g_ogl_config.bSupportsConservativeDepth) { @@ -569,6 +649,7 @@ void ProgramShaderCache::CreateHeader() "%s\n" // texture buffer "%s\n" // ES texture buffer "%s\n" // ES dual source blend + "%s\n" // shader image load store // Precision defines for GLSL ES "%s\n" @@ -576,6 +657,7 @@ void ProgramShaderCache::CreateHeader() "%s\n" "%s\n" "%s\n" + "%s\n" // Silly differences "#define float2 vec2\n" @@ -638,12 +720,17 @@ void ProgramShaderCache::CreateHeader() "" , + g_ogl_config.bSupportsImageLoadStore && + ((!is_glsles && v < GLSL_430) || (is_glsles && v < GLSLES_310)) ? + "#extension GL_ARB_shader_image_load_store : enable" : + "", is_glsles ? "precision highp float;" : "", is_glsles ? "precision highp int;" : "", is_glsles ? "precision highp sampler2DArray;" : "", (is_glsles && g_ActiveConfig.backend_info.bSupportsPaletteConversion) ? "precision highp usamplerBuffer;" : "", - v > GLSLES_300 ? "precision highp sampler2DMS;" : ""); + v > GLSLES_300 ? "precision highp sampler2DMS;" : "", + v >= GLSLES_310 ? 
"precision highp image2DArray;" : ""); } void ProgramShaderCache::ProgramShaderCacheInserter::Read(const SHADERUID& key, const u8* value, diff --git a/Source/Core/VideoBackends/OGL/ProgramShaderCache.h b/Source/Core/VideoBackends/OGL/ProgramShaderCache.h index c471db63ed..a8b2bfcbc1 100644 --- a/Source/Core/VideoBackends/OGL/ProgramShaderCache.h +++ b/Source/Core/VideoBackends/OGL/ProgramShaderCache.h @@ -46,7 +46,7 @@ struct SHADER std::string strvprog, strpprog, strgprog; void SetProgramVariables(); - void SetProgramBindings(); + void SetProgramBindings(bool is_compute); void Bind(); }; @@ -67,6 +67,7 @@ public: static bool CompileShader(SHADER& shader, const std::string& vcode, const std::string& pcode, const std::string& gcode = ""); + static bool CompileComputeShader(SHADER& shader, const std::string& code); static GLuint CompileSingleShader(GLuint type, const std::string& code); static void UploadConstants(); diff --git a/Source/Core/VideoBackends/OGL/Render.cpp b/Source/Core/VideoBackends/OGL/Render.cpp index 198dc7f05f..18626eaa0e 100644 --- a/Source/Core/VideoBackends/OGL/Render.cpp +++ b/Source/Core/VideoBackends/OGL/Render.cpp @@ -451,15 +451,16 @@ Renderer::Renderer() g_ogl_config.bSupportViewportFloat = GLExtensions::Supports("GL_ARB_viewport_array"); g_ogl_config.bSupportsDebug = GLExtensions::Supports("GL_KHR_debug") || GLExtensions::Supports("GL_ARB_debug_output"); - g_ogl_config.bSupports3DTextureStorage = + g_ogl_config.bSupportsTextureStorage = GLExtensions::Supports("GL_ARB_texture_storage"); + g_ogl_config.bSupports3DTextureStorageMultisample = GLExtensions::Supports("GL_ARB_texture_storage_multisample") || GLExtensions::Supports("GL_OES_texture_storage_multisample_2d_array"); - g_ogl_config.bSupports2DTextureStorage = + g_ogl_config.bSupports2DTextureStorageMultisample = GLExtensions::Supports("GL_ARB_texture_storage_multisample"); - g_ogl_config.bSupportsEarlyFragmentTests = - GLExtensions::Supports("GL_ARB_shader_image_load_store"); + 
g_ogl_config.bSupportsImageLoadStore = GLExtensions::Supports("GL_ARB_shader_image_load_store"); g_ogl_config.bSupportsConservativeDepth = GLExtensions::Supports("GL_ARB_conservative_depth"); g_ogl_config.bSupportsAniso = GLExtensions::Supports("GL_EXT_texture_filter_anisotropic"); + g_Config.backend_info.bSupportsComputeShaders = GLExtensions::Supports("GL_ARB_compute_shader"); if (GLInterface->GetMode() == GLInterfaceMode::MODE_OPENGLES3) { @@ -486,6 +487,7 @@ Renderer::Renderer() { g_ogl_config.eSupportedGLSLVersion = GLSLES_300; g_ogl_config.bSupportsAEP = false; + g_ogl_config.bSupportsTextureStorage = true; g_Config.backend_info.bSupportsGeometryShaders = false; } else if (GLExtensions::Version() == 310) @@ -493,16 +495,18 @@ Renderer::Renderer() g_ogl_config.eSupportedGLSLVersion = GLSLES_310; g_ogl_config.bSupportsAEP = GLExtensions::Supports("GL_ANDROID_extension_pack_es31a"); g_Config.backend_info.bSupportsBindingLayout = true; - g_ogl_config.bSupportsEarlyFragmentTests = true; + g_ogl_config.bSupportsImageLoadStore = true; g_Config.backend_info.bSupportsGeometryShaders = g_ogl_config.bSupportsAEP; + g_Config.backend_info.bSupportsComputeShaders = true; g_Config.backend_info.bSupportsGSInstancing = g_Config.backend_info.bSupportsGeometryShaders && g_ogl_config.SupportedESPointSize > 0; g_Config.backend_info.bSupportsSSAA = g_ogl_config.bSupportsAEP; g_Config.backend_info.bSupportsFragmentStoresAndAtomics = true; g_ogl_config.bSupportsMSAA = true; - g_ogl_config.bSupports2DTextureStorage = true; + g_ogl_config.bSupportsTextureStorage = true; + g_ogl_config.bSupports2DTextureStorageMultisample = true; if (g_ActiveConfig.iStereoMode > 0 && g_ActiveConfig.iMultisamples > 1 && - !g_ogl_config.bSupports3DTextureStorage) + !g_ogl_config.bSupports3DTextureStorageMultisample) { // GLES 3.1 can't support stereo rendering and MSAA OSD::AddMessage("MSAA Stereo rendering isn't supported by your GPU.", 10000); @@ -514,8 +518,9 @@ Renderer::Renderer() 
g_ogl_config.eSupportedGLSLVersion = GLSLES_320; g_ogl_config.bSupportsAEP = GLExtensions::Supports("GL_ANDROID_extension_pack_es31a"); g_Config.backend_info.bSupportsBindingLayout = true; - g_ogl_config.bSupportsEarlyFragmentTests = true; + g_ogl_config.bSupportsImageLoadStore = true; g_Config.backend_info.bSupportsGeometryShaders = true; + g_Config.backend_info.bSupportsComputeShaders = true; g_Config.backend_info.bSupportsGSInstancing = g_ogl_config.SupportedESPointSize > 0; g_Config.backend_info.bSupportsPaletteConversion = true; g_Config.backend_info.bSupportsSSAA = true; @@ -524,8 +529,9 @@ Renderer::Renderer() g_ogl_config.bSupportsGLBaseVertex = true; g_ogl_config.bSupportsDebug = true; g_ogl_config.bSupportsMSAA = true; - g_ogl_config.bSupports2DTextureStorage = true; - g_ogl_config.bSupports3DTextureStorage = true; + g_ogl_config.bSupportsTextureStorage = true; + g_ogl_config.bSupports2DTextureStorageMultisample = true; + g_ogl_config.bSupports3DTextureStorageMultisample = true; } } else @@ -541,8 +547,7 @@ Renderer::Renderer() else if (GLExtensions::Version() == 300) { g_ogl_config.eSupportedGLSLVersion = GLSL_130; - g_ogl_config.bSupportsEarlyFragmentTests = - false; // layout keyword is only supported on glsl150+ + g_ogl_config.bSupportsImageLoadStore = false; // layout keyword is only supported on glsl150+ g_ogl_config.bSupportsConservativeDepth = false; // layout keyword is only supported on glsl150+ g_Config.backend_info.bSupportsGeometryShaders = @@ -551,8 +556,7 @@ Renderer::Renderer() else if (GLExtensions::Version() == 310) { g_ogl_config.eSupportedGLSLVersion = GLSL_140; - g_ogl_config.bSupportsEarlyFragmentTests = - false; // layout keyword is only supported on glsl150+ + g_ogl_config.bSupportsImageLoadStore = false; // layout keyword is only supported on glsl150+ g_ogl_config.bSupportsConservativeDepth = false; // layout keyword is only supported on glsl150+ g_Config.backend_info.bSupportsGeometryShaders = @@ -566,10 +570,28 @@ 
Renderer::Renderer() { g_ogl_config.eSupportedGLSLVersion = GLSL_330; } + else if (GLExtensions::Version() >= 430) + { + // TODO: We should really parse the GL_SHADING_LANGUAGE_VERSION token. + g_ogl_config.eSupportedGLSLVersion = GLSL_430; + g_ogl_config.bSupportsTextureStorage = true; + g_ogl_config.bSupportsImageLoadStore = true; + g_Config.backend_info.bSupportsSSAA = true; + + // Compute shaders are core in GL4.3. + g_Config.backend_info.bSupportsComputeShaders = true; + } else { g_ogl_config.eSupportedGLSLVersion = GLSL_400; g_Config.backend_info.bSupportsSSAA = true; + + if (GLExtensions::Version() == 420) + { + // Texture storage and shader image load/store are core in GL4.2. + g_ogl_config.bSupportsTextureStorage = true; + g_ogl_config.bSupportsImageLoadStore = true; + } } // Desktop OpenGL can't have the Android Extension Pack @@ -578,12 +600,19 @@ Renderer::Renderer() // Either method can do early-z tests. See PixelShaderGen for details. g_Config.backend_info.bSupportsEarlyZ = - g_ogl_config.bSupportsEarlyFragmentTests || g_ogl_config.bSupportsConservativeDepth; + g_ogl_config.bSupportsImageLoadStore || g_ogl_config.bSupportsConservativeDepth; glGetIntegerv(GL_MAX_SAMPLES, &g_ogl_config.max_samples); if (g_ogl_config.max_samples < 1 || !g_ogl_config.bSupportsMSAA) g_ogl_config.max_samples = 1; + // We require texel buffers, image load store, and compute shaders to enable GPU texture decoding. + // If the driver doesn't expose the extensions, but supports GL4.3/GLES3.1, it will still be + // enabled in the version check below. 
+ g_Config.backend_info.bSupportsGPUTextureDecoding = + g_Config.backend_info.bSupportsPaletteConversion && + g_Config.backend_info.bSupportsComputeShaders && g_ogl_config.bSupportsImageLoadStore; + if (g_ogl_config.bSupportsDebug) { if (GLExtensions::Supports("GL_KHR_debug")) diff --git a/Source/Core/VideoBackends/OGL/Render.h b/Source/Core/VideoBackends/OGL/Render.h index e3c2ba13c3..8ec6a21e0d 100644 --- a/Source/Core/VideoBackends/OGL/Render.h +++ b/Source/Core/VideoBackends/OGL/Render.h @@ -22,7 +22,8 @@ enum GLSL_VERSION GLSL_140, GLSL_150, GLSL_330, - GLSL_400, // and above + GLSL_400, // and above + GLSL_430, GLSLES_300, // GLES 3.0 GLSLES_310, // GLES 3.1 GLSLES_320, // GLES 3.2 @@ -51,10 +52,11 @@ struct VideoConfig bool bSupportsCopySubImage; u8 SupportedESPointSize; ES_TEXBUF_TYPE SupportedESTextureBuffer; - bool bSupports2DTextureStorage; - bool bSupports3DTextureStorage; - bool bSupportsEarlyFragmentTests; + bool bSupportsTextureStorage; + bool bSupports2DTextureStorageMultisample; + bool bSupports3DTextureStorageMultisample; bool bSupportsConservativeDepth; + bool bSupportsImageLoadStore; bool bSupportsAniso; const char* gl_vendor; diff --git a/Source/Core/VideoBackends/OGL/TextureCache.cpp b/Source/Core/VideoBackends/OGL/TextureCache.cpp index 2cf0623939..bd6f99b2e8 100644 --- a/Source/Core/VideoBackends/OGL/TextureCache.cpp +++ b/Source/Core/VideoBackends/OGL/TextureCache.cpp @@ -16,6 +16,7 @@ #include "Common/StringUtil.h" #include "VideoBackends/OGL/FramebufferManager.h" +#include "VideoBackends/OGL/GPUTimer.h" #include "VideoBackends/OGL/ProgramShaderCache.h" #include "VideoBackends/OGL/Render.h" #include "VideoBackends/OGL/SamplerCache.h" @@ -23,6 +24,7 @@ #include "VideoBackends/OGL/TextureConverter.h" #include "VideoCommon/ImageWrite.h" +#include "VideoCommon/TextureConversionShader.h" #include "VideoCommon/TextureDecoder.h" #include "VideoCommon/VideoConfig.h" @@ -49,6 +51,26 @@ static GLuint s_palette_buffer_offset_uniform[3]; static GLuint 
s_palette_multiplier_uniform[3]; static GLuint s_palette_copy_position_uniform[3]; +struct TextureDecodingProgramInfo +{ + const TextureConversionShader::DecodingShaderInfo* base_info = nullptr; + SHADER program; + GLint uniform_dst_size = -1; + GLint uniform_src_size = -1; + GLint uniform_src_row_stride = -1; + GLint uniform_src_offset = -1; + GLint uniform_palette_offset = -1; + bool valid = false; +}; + +//#define TIME_TEXTURE_DECODING 1 + +static std::map, TextureDecodingProgramInfo> s_texture_decoding_program_info; +static std::array + s_texture_decoding_buffer_views; +static void CreateTextureDecodingResources(); +static void DestroyTextureDecodingResources(); + bool SaveTexture(const std::string& filename, u32 textarget, u32 tex, int virtual_width, int virtual_height, unsigned int level) { @@ -119,12 +141,22 @@ TextureCache::TCacheEntryBase* TextureCache::CreateTexture(const TCacheEntryConf glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_MAX_LEVEL, config.levels - 1); + if (g_ogl_config.bSupportsTextureStorage) + { + glTexStorage3D(GL_TEXTURE_2D_ARRAY, config.levels, GL_RGBA8, config.width, config.height, + config.layers); + } + if (config.rendertarget) { - for (u32 level = 0; level <= config.levels; level++) + if (!g_ogl_config.bSupportsTextureStorage) { - glTexImage3D(GL_TEXTURE_2D_ARRAY, level, GL_RGBA, config.width, config.height, config.layers, - 0, GL_RGBA, GL_UNSIGNED_BYTE, nullptr); + for (u32 level = 0; level < config.levels; level++) + { + glTexImage3D(GL_TEXTURE_2D_ARRAY, level, GL_RGBA, std::max(config.width >> level, 1u), + std::max(config.height >> level, 1u), config.layers, 0, GL_RGBA, + GL_UNSIGNED_BYTE, nullptr); + } } glGenFramebuffers(1, &entry->framebuffer); FramebufferManager::SetFramebuffer(entry->framebuffer); @@ -187,8 +219,16 @@ void TextureCache::TCacheEntry::Load(const u8* buffer, u32 width, u32 height, u3 if (expanded_width != width) glPixelStorei(GL_UNPACK_ROW_LENGTH, expanded_width); - glTexImage3D(GL_TEXTURE_2D_ARRAY, level, 
GL_RGBA, width, height, 1, 0, GL_RGBA, GL_UNSIGNED_BYTE, - buffer); + if (g_ogl_config.bSupportsTextureStorage) + { + glTexSubImage3D(GL_TEXTURE_2D_ARRAY, level, 0, 0, 0, width, height, 1, GL_RGBA, + GL_UNSIGNED_BYTE, buffer); + } + else + { + glTexImage3D(GL_TEXTURE_2D_ARRAY, level, GL_RGBA, width, height, 1, 0, GL_RGBA, + GL_UNSIGNED_BYTE, buffer); + } if (expanded_width != width) glPixelStorei(GL_UNPACK_ROW_LENGTH, 0); @@ -267,26 +307,31 @@ TextureCache::TextureCache() if (g_ActiveConfig.backend_info.bSupportsPaletteConversion) { - s32 buffer_size = 1024 * 1024; + s32 buffer_size_mb = (g_ActiveConfig.backend_info.bSupportsGPUTextureDecoding ? 32 : 1); + s32 buffer_size = buffer_size_mb * 1024 * 1024; s32 max_buffer_size = 0; - // The minimum MAX_TEXTURE_BUFFER_SIZE that the spec mandates - // is 65KB, we are asking for a 1MB buffer here. - // Make sure to check the maximum size and if it is below 1MB - // then use the maximum the hardware supports instead. + // The minimum MAX_TEXTURE_BUFFER_SIZE that the spec mandates is 65KB, we are asking for a 1MB + // buffer here. This buffer is also used as storage for undecoded textures when compute shader + // texture decoding is enabled, in which case the requested size is 32MB. glGetIntegerv(GL_MAX_TEXTURE_BUFFER_SIZE, &max_buffer_size); + + // Clamp the buffer size to the maximum size that the driver supports. 
buffer_size = std::min(buffer_size, max_buffer_size); s_palette_stream_buffer = StreamBuffer::Create(GL_TEXTURE_BUFFER, buffer_size); glGenTextures(1, &s_palette_resolv_texture); glBindTexture(GL_TEXTURE_BUFFER, s_palette_resolv_texture); glTexBuffer(GL_TEXTURE_BUFFER, GL_R16UI, s_palette_stream_buffer->m_buffer); + + CreateTextureDecodingResources(); } } TextureCache::~TextureCache() { DeleteShaders(); + DestroyTextureDecodingResources(); if (g_ActiveConfig.backend_info.bSupportsPaletteConversion) { @@ -588,4 +633,159 @@ void TextureCache::ConvertTexture(TCacheEntryBase* _entry, TCacheEntryBase* _unc FramebufferManager::SetFramebuffer(0); g_renderer->RestoreAPIState(); } + +static const std::string decoding_vertex_shader = R"( +void main() +{ + vec2 rawpos = vec2(gl_VertexID&1, gl_VertexID&2); + gl_Position = vec4(rawpos*2.0-1.0, 0.0, 1.0); +} +)"; + +void CreateTextureDecodingResources() +{ + static const GLenum gl_view_types[TextureConversionShader::BUFFER_FORMAT_COUNT] = { + GL_R8UI, // BUFFER_FORMAT_R8_UINT + GL_R16UI, // BUFFER_FORMAT_R16_UINT + GL_RG32UI, // BUFFER_FORMAT_R32G32_UINT + }; + + glGenTextures(TextureConversionShader::BUFFER_FORMAT_COUNT, + s_texture_decoding_buffer_views.data()); + for (size_t i = 0; i < TextureConversionShader::BUFFER_FORMAT_COUNT; i++) + { + glBindTexture(GL_TEXTURE_BUFFER, s_texture_decoding_buffer_views[i]); + glTexBuffer(GL_TEXTURE_BUFFER, gl_view_types[i], s_palette_stream_buffer->m_buffer); + } +} + +void DestroyTextureDecodingResources() +{ + glDeleteTextures(TextureConversionShader::BUFFER_FORMAT_COUNT, + s_texture_decoding_buffer_views.data()); + s_texture_decoding_buffer_views.fill(0); + s_texture_decoding_program_info.clear(); +} + +bool TextureCache::SupportsGPUTextureDecode(TextureFormat format, TlutFormat palette_format) +{ + auto key = std::make_pair(static_cast(format), static_cast(palette_format)); + auto iter = s_texture_decoding_program_info.find(key); + if (iter != s_texture_decoding_program_info.end()) + 
return iter->second.valid; + + TextureDecodingProgramInfo info; + info.base_info = TextureConversionShader::GetDecodingShaderInfo(format); + if (!info.base_info) + { + s_texture_decoding_program_info.emplace(key, info); + return false; + } + + std::string shader_source = + TextureConversionShader::GenerateDecodingShader(format, palette_format, APIType::OpenGL); + if (shader_source.empty()) + { + s_texture_decoding_program_info.emplace(key, info); + return false; + } + + if (!ProgramShaderCache::CompileComputeShader(info.program, shader_source)) + { + s_texture_decoding_program_info.emplace(key, info); + return false; + } + + info.uniform_dst_size = glGetUniformLocation(info.program.glprogid, "u_dst_size"); + info.uniform_src_size = glGetUniformLocation(info.program.glprogid, "u_src_size"); + info.uniform_src_offset = glGetUniformLocation(info.program.glprogid, "u_src_offset"); + info.uniform_src_row_stride = glGetUniformLocation(info.program.glprogid, "u_src_row_stride"); + info.uniform_palette_offset = glGetUniformLocation(info.program.glprogid, "u_palette_offset"); + info.valid = true; + s_texture_decoding_program_info.emplace(key, info); + return true; +} + +void TextureCache::DecodeTextureOnGPU(TCacheEntryBase* entry, u32 dst_level, const u8* data, + size_t data_size, TextureFormat format, u32 width, u32 height, + u32 aligned_width, u32 aligned_height, u32 row_stride, + const u8* palette, TlutFormat palette_format) +{ + auto key = std::make_pair(static_cast(format), static_cast(palette_format)); + auto iter = s_texture_decoding_program_info.find(key); + if (iter == s_texture_decoding_program_info.end()) + return; + +#ifdef TIME_TEXTURE_DECODING + GPUTimer timer; +#endif + + // Copy to GPU-visible buffer, aligned to the data type. + auto info = iter->second; + u32 bytes_per_buffer_elem = + TextureConversionShader::GetBytesPerBufferElement(info.base_info->buffer_format); + + // Only copy palette if it is required. 
+ bool has_palette = info.base_info->palette_size > 0; + u32 total_upload_size = static_cast(data_size); + u32 palette_offset = total_upload_size; + if (has_palette) + { + // Align to u16. + if ((total_upload_size % sizeof(u16)) != 0) + { + total_upload_size++; + palette_offset++; + } + + total_upload_size += info.base_info->palette_size; + } + + // Allocate space in stream buffer, and copy texture + palette across. + auto buffer = s_palette_stream_buffer->Map(total_upload_size, bytes_per_buffer_elem); + memcpy(buffer.first, data, data_size); + if (has_palette) + memcpy(buffer.first + palette_offset, palette, info.base_info->palette_size); + s_palette_stream_buffer->Unmap(total_upload_size); + + info.program.Bind(); + + // Calculate stride in buffer elements + u32 row_stride_in_elements = row_stride / bytes_per_buffer_elem; + u32 offset_in_elements = buffer.second / bytes_per_buffer_elem; + u32 palette_offset_in_elements = (buffer.second + palette_offset) / sizeof(u16); + if (info.uniform_dst_size >= 0) + glUniform2ui(info.uniform_dst_size, width, height); + if (info.uniform_src_size >= 0) + glUniform2ui(info.uniform_src_size, aligned_width, aligned_height); + if (info.uniform_src_offset >= 0) + glUniform1ui(info.uniform_src_offset, offset_in_elements); + if (info.uniform_src_row_stride >= 0) + glUniform1ui(info.uniform_src_row_stride, row_stride_in_elements); + if (info.uniform_palette_offset >= 0) + glUniform1ui(info.uniform_palette_offset, palette_offset_in_elements); + + glActiveTexture(GL_TEXTURE9); + glBindTexture(GL_TEXTURE_BUFFER, s_texture_decoding_buffer_views[info.base_info->buffer_format]); + + if (has_palette) + { + // Use an R16UI view for the palette. 
+ glActiveTexture(GL_TEXTURE10); + glBindTexture(GL_TEXTURE_BUFFER, s_palette_resolv_texture); + } + + auto dispatch_groups = TextureConversionShader::GetDispatchCount(info.base_info, width, height); + glBindImageTexture(0, static_cast(entry)->texture, dst_level, GL_TRUE, 0, + GL_WRITE_ONLY, GL_RGBA8); + glDispatchCompute(dispatch_groups.first, dispatch_groups.second, 1); + glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT); + + TextureCache::SetStage(); + +#ifdef TIME_TEXTURE_DECODING + WARN_LOG(VIDEO, "Decode texture format %u size %ux%u took %.4fms", static_cast(format), + width, height, timer.GetTimeMilliseconds()); +#endif +} } diff --git a/Source/Core/VideoBackends/OGL/TextureCache.h b/Source/Core/VideoBackends/OGL/TextureCache.h index 66f58cae0b..cfd267caae 100644 --- a/Source/Core/VideoBackends/OGL/TextureCache.h +++ b/Source/Core/VideoBackends/OGL/TextureCache.h @@ -23,6 +23,12 @@ public: static void DisableStage(unsigned int stage); static void SetStage(); + bool SupportsGPUTextureDecode(TextureFormat format, TlutFormat palette_format) override; + void DecodeTextureOnGPU(TCacheEntryBase* entry, u32 dst_level, const u8* data, size_t data_size, + TextureFormat format, u32 width, u32 height, u32 aligned_width, + u32 aligned_height, u32 row_stride, const u8* palette, + TlutFormat palette_format) override; + private: struct TCacheEntry : TCacheEntryBase { diff --git a/Source/Core/VideoBackends/OGL/main.cpp b/Source/Core/VideoBackends/OGL/main.cpp index 300d096978..7b5ccbe93e 100644 --- a/Source/Core/VideoBackends/OGL/main.cpp +++ b/Source/Core/VideoBackends/OGL/main.cpp @@ -101,6 +101,7 @@ void VideoBackend::InitBackendInfo() g_Config.backend_info.bSupportsExclusiveFullscreen = false; g_Config.backend_info.bSupportsOversizedViewports = true; g_Config.backend_info.bSupportsGeometryShaders = true; + g_Config.backend_info.bSupportsComputeShaders = false; g_Config.backend_info.bSupports3DVision = false; g_Config.backend_info.bSupportsPostProcessing = true; 
g_Config.backend_info.bSupportsSSAA = true; @@ -108,6 +109,11 @@ void VideoBackend::InitBackendInfo() g_Config.backend_info.bSupportsMultithreading = false; g_Config.backend_info.bSupportsInternalResolutionFrameDumps = true; + // TODO: There is a bug here, if texel buffers are not supported the graphics options + // will show the option when it is not supported. The only way around this would be + // creating a context when calling this function to determine what is available. + g_Config.backend_info.bSupportsGPUTextureDecoding = true; + // Overwritten in Render.cpp later g_Config.backend_info.bSupportsDualSourceBlend = true; g_Config.backend_info.bSupportsPrimitiveRestart = true; diff --git a/Source/Core/VideoBackends/Software/SWmain.cpp b/Source/Core/VideoBackends/Software/SWmain.cpp index 96ebdd8adb..eb70f4059f 100644 --- a/Source/Core/VideoBackends/Software/SWmain.cpp +++ b/Source/Core/VideoBackends/Software/SWmain.cpp @@ -131,7 +131,9 @@ void VideoSoftware::InitBackendInfo() g_Config.backend_info.bSupportsOversizedViewports = true; g_Config.backend_info.bSupportsPrimitiveRestart = false; g_Config.backend_info.bSupportsMultithreading = false; + g_Config.backend_info.bSupportsComputeShaders = false; g_Config.backend_info.bSupportsInternalResolutionFrameDumps = false; + g_Config.backend_info.bSupportsGPUTextureDecoding = false; // aamodes g_Config.backend_info.AAModes = {1}; diff --git a/Source/Core/VideoBackends/Vulkan/CommandBufferManager.cpp b/Source/Core/VideoBackends/Vulkan/CommandBufferManager.cpp index 27a3976b83..460ec919aa 100644 --- a/Source/Core/VideoBackends/Vulkan/CommandBufferManager.cpp +++ b/Source/Core/VideoBackends/Vulkan/CommandBufferManager.cpp @@ -91,7 +91,8 @@ bool CommandBufferManager::CreateCommandBuffers() VkDescriptorPoolSize pool_sizes[] = {{VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC, 500000}, {VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, 500000}, {VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 16}, - {VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, 1024}}; 
+ {VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, 1024}, + {VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, 1024}}; VkDescriptorPoolCreateInfo pool_create_info = {VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, nullptr, diff --git a/Source/Core/VideoBackends/Vulkan/Constants.h b/Source/Core/VideoBackends/Vulkan/Constants.h index f65aad6cfc..8507d23342 100644 --- a/Source/Core/VideoBackends/Vulkan/Constants.h +++ b/Source/Core/VideoBackends/Vulkan/Constants.h @@ -30,6 +30,7 @@ enum DESCRIPTOR_SET_LAYOUT DESCRIPTOR_SET_LAYOUT_PIXEL_SHADER_SAMPLERS, DESCRIPTOR_SET_LAYOUT_SHADER_STORAGE_BUFFERS, DESCRIPTOR_SET_LAYOUT_TEXEL_BUFFERS, + DESCRIPTOR_SET_LAYOUT_COMPUTE, NUM_DESCRIPTOR_SET_LAYOUTS }; @@ -52,6 +53,12 @@ enum DESCRIPTOR_SET_BIND_POINT // - Same as standard, plus 128 bytes of push constants, accessible from all stages. // - Texture Decoding // - Same as push constant, plus a single texel buffer accessible from PS. +// - Compute +// - 1 uniform buffer [set=0, binding=0] +// - 4 combined image samplers [set=0, binding=1-4] +// - 2 texel buffers [set=0, binding=5-6] +// - 1 storage image [set=0, binding=7] +// - 128 bytes of push constants // // All four pipeline layout share the first two descriptor sets (uniform buffers, PS samplers). // The third descriptor set (see bind points above) is used for storage or texel buffers. 
@@ -62,6 +69,7 @@ enum PIPELINE_LAYOUT PIPELINE_LAYOUT_BBOX, PIPELINE_LAYOUT_PUSH_CONSTANT, PIPELINE_LAYOUT_TEXTURE_CONVERSION, + PIPELINE_LAYOUT_COMPUTE, NUM_PIPELINE_LAYOUTS }; diff --git a/Source/Core/VideoBackends/Vulkan/ObjectCache.cpp b/Source/Core/VideoBackends/Vulkan/ObjectCache.cpp index 9c903b0065..1fb083be0f 100644 --- a/Source/Core/VideoBackends/Vulkan/ObjectCache.cpp +++ b/Source/Core/VideoBackends/Vulkan/ObjectCache.cpp @@ -324,6 +324,41 @@ std::pair ObjectCache::GetPipelineWithCacheResult(const Pipeli return {pipeline, false}; } +VkPipeline ObjectCache::CreateComputePipeline(const ComputePipelineInfo& info) +{ + VkComputePipelineCreateInfo pipeline_info = {VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, + nullptr, + 0, + {VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + nullptr, 0, VK_SHADER_STAGE_COMPUTE_BIT, info.cs, + "main", nullptr}, + info.pipeline_layout, + VK_NULL_HANDLE, + -1}; + + VkPipeline pipeline; + VkResult res = vkCreateComputePipelines(g_vulkan_context->GetDevice(), VK_NULL_HANDLE, 1, + &pipeline_info, nullptr, &pipeline); + if (res != VK_SUCCESS) + { + LOG_VULKAN_ERROR(res, "vkCreateComputePipelines failed: "); + return VK_NULL_HANDLE; + } + + return pipeline; +} + +VkPipeline ObjectCache::GetComputePipeline(const ComputePipelineInfo& info) +{ + auto iter = m_compute_pipeline_objects.find(info); + if (iter != m_compute_pipeline_objects.end()) + return iter->second; + + VkPipeline pipeline = CreateComputePipeline(info); + m_compute_pipeline_objects.emplace(info, pipeline); + return pipeline; +} + std::string ObjectCache::GetDiskCacheFileName(const char* type) { return StringFromFormat("%svulkan-%s-%s.cache", File::GetUserPath(D_SHADERCACHE_IDX).c_str(), @@ -477,6 +512,13 @@ void ObjectCache::DestroyPipelineCache() } m_pipeline_objects.clear(); + for (const auto& it : m_compute_pipeline_objects) + { + if (it.second != VK_NULL_HANDLE) + vkDestroyPipeline(g_vulkan_context->GetDevice(), it.second, nullptr); + } + 
m_compute_pipeline_objects.clear(); + vkDestroyPipelineCache(g_vulkan_context->GetDevice(), m_pipeline_cache, nullptr); m_pipeline_cache = VK_NULL_HANDLE; } @@ -725,6 +767,17 @@ bool ObjectCache::CreateDescriptorSetLayouts() {0, VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, 1, VK_SHADER_STAGE_FRAGMENT_BIT}, }; + static const VkDescriptorSetLayoutBinding compute_set_bindings[] = { + {0, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC, 1, VK_SHADER_STAGE_COMPUTE_BIT}, + {1, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, 1, VK_SHADER_STAGE_COMPUTE_BIT}, + {2, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, 1, VK_SHADER_STAGE_COMPUTE_BIT}, + {3, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, 1, VK_SHADER_STAGE_COMPUTE_BIT}, + {4, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, 1, VK_SHADER_STAGE_COMPUTE_BIT}, + {5, VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, 1, VK_SHADER_STAGE_COMPUTE_BIT}, + {6, VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, 1, VK_SHADER_STAGE_COMPUTE_BIT}, + {7, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, 1, VK_SHADER_STAGE_COMPUTE_BIT}, + }; + static const VkDescriptorSetLayoutCreateInfo create_infos[NUM_DESCRIPTOR_SET_LAYOUTS] = { {VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, nullptr, 0, static_cast(ArraySize(ubo_set_bindings)), ubo_set_bindings}, @@ -733,7 +786,9 @@ bool ObjectCache::CreateDescriptorSetLayouts() {VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, nullptr, 0, static_cast(ArraySize(ssbo_set_bindings)), ssbo_set_bindings}, {VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, nullptr, 0, - static_cast(ArraySize(texel_buffer_set_bindings)), texel_buffer_set_bindings}}; + static_cast(ArraySize(texel_buffer_set_bindings)), texel_buffer_set_bindings}, + {VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, nullptr, 0, + static_cast(ArraySize(compute_set_bindings)), compute_set_bindings}}; for (size_t i = 0; i < NUM_DESCRIPTOR_SET_LAYOUTS; i++) { @@ -774,8 +829,11 @@ bool ObjectCache::CreatePipelineLayouts() m_descriptor_set_layouts[DESCRIPTOR_SET_LAYOUT_UNIFORM_BUFFERS], 
m_descriptor_set_layouts[DESCRIPTOR_SET_LAYOUT_PIXEL_SHADER_SAMPLERS], m_descriptor_set_layouts[DESCRIPTOR_SET_LAYOUT_TEXEL_BUFFERS]}; + VkDescriptorSetLayout compute_sets[] = {m_descriptor_set_layouts[DESCRIPTOR_SET_LAYOUT_COMPUTE]}; VkPushConstantRange push_constant_range = { VK_SHADER_STAGE_VERTEX_BIT | VK_SHADER_STAGE_FRAGMENT_BIT, 0, PUSH_CONSTANT_BUFFER_SIZE}; + VkPushConstantRange compute_push_constant_range = {VK_SHADER_STAGE_COMPUTE_BIT, 0, + PUSH_CONSTANT_BUFFER_SIZE}; // Info for each pipeline layout VkPipelineLayoutCreateInfo pipeline_layout_info[NUM_PIPELINE_LAYOUTS] = { @@ -794,7 +852,11 @@ bool ObjectCache::CreatePipelineLayouts() // Texture Conversion {VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, nullptr, 0, static_cast(ArraySize(texture_conversion_sets)), texture_conversion_sets, 1, - &push_constant_range}}; + &push_constant_range}, + + // Compute + {VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, nullptr, 0, + static_cast(ArraySize(compute_sets)), compute_sets, 1, &compute_push_constant_range}}; for (size_t i = 0; i < NUM_PIPELINE_LAYOUTS; i++) { @@ -1007,6 +1069,31 @@ bool operator<(const SamplerState& lhs, const SamplerState& rhs) return lhs.bits < rhs.bits; } +std::size_t ComputePipelineInfoHash::operator()(const ComputePipelineInfo& key) const +{ + return static_cast(XXH64(&key, sizeof(key), 0)); +} + +bool operator==(const ComputePipelineInfo& lhs, const ComputePipelineInfo& rhs) +{ + return std::memcmp(&lhs, &rhs, sizeof(lhs)) == 0; +} + +bool operator!=(const ComputePipelineInfo& lhs, const ComputePipelineInfo& rhs) +{ + return !operator==(lhs, rhs); +} + +bool operator<(const ComputePipelineInfo& lhs, const ComputePipelineInfo& rhs) +{ + return std::memcmp(&lhs, &rhs, sizeof(lhs)) < 0; +} + +bool operator>(const ComputePipelineInfo& lhs, const ComputePipelineInfo& rhs) +{ + return std::memcmp(&lhs, &rhs, sizeof(lhs)) > 0; +} + bool ObjectCache::CompileSharedShaders() { static const char PASSTHROUGH_VERTEX_SHADER_SOURCE[] = R"( diff --git 
a/Source/Core/VideoBackends/Vulkan/ObjectCache.h b/Source/Core/VideoBackends/Vulkan/ObjectCache.h index 546d1439a5..11d436fc35 100644 --- a/Source/Core/VideoBackends/Vulkan/ObjectCache.h +++ b/Source/Core/VideoBackends/Vulkan/ObjectCache.h @@ -56,6 +56,22 @@ bool operator!=(const SamplerState& lhs, const SamplerState& rhs); bool operator>(const SamplerState& lhs, const SamplerState& rhs); bool operator<(const SamplerState& lhs, const SamplerState& rhs); +struct ComputePipelineInfo +{ + VkPipelineLayout pipeline_layout; + VkShaderModule cs; +}; + +struct ComputePipelineInfoHash +{ + std::size_t operator()(const ComputePipelineInfo& key) const; +}; + +bool operator==(const ComputePipelineInfo& lhs, const ComputePipelineInfo& rhs); +bool operator!=(const ComputePipelineInfo& lhs, const ComputePipelineInfo& rhs); +bool operator<(const ComputePipelineInfo& lhs, const ComputePipelineInfo& rhs); +bool operator>(const ComputePipelineInfo& lhs, const ComputePipelineInfo& rhs); + class ObjectCache { public: @@ -114,6 +130,12 @@ public: // otherwise for a cache hit it will be true. std::pair GetPipelineWithCacheResult(const PipelineInfo& info); + // Creates a compute pipeline, and does not track the handle. + VkPipeline CreateComputePipeline(const ComputePipelineInfo& info); + + // Find a pipeline by the specified description, if not found, attempts to create it + VkPipeline GetComputePipeline(const ComputePipelineInfo& info); + // Saves the pipeline cache to disk. Call when shutting down. 
void SavePipelineCache(); @@ -166,6 +188,8 @@ private: ShaderCache m_ps_cache; std::unordered_map m_pipeline_objects; + std::unordered_map + m_compute_pipeline_objects; VkPipelineCache m_pipeline_cache = VK_NULL_HANDLE; std::string m_pipeline_cache_filename; diff --git a/Source/Core/VideoBackends/Vulkan/ShaderCompiler.cpp b/Source/Core/VideoBackends/Vulkan/ShaderCompiler.cpp index 2265a34364..d4d095bb78 100644 --- a/Source/Core/VideoBackends/Vulkan/ShaderCompiler.cpp +++ b/Source/Core/VideoBackends/Vulkan/ShaderCompiler.cpp @@ -35,7 +35,7 @@ static const TBuiltInResource* GetCompilerResourceLimits(); // Compile a shader to SPIR-V via glslang static bool CompileShaderToSPV(SPIRVCodeVector* out_code, EShLanguage stage, const char* stage_filename, const char* source_code, - size_t source_code_length, bool prepend_header); + size_t source_code_length, const char* header, size_t header_length); // Regarding the UBO bind points, we subtract one from the binding index because // the OpenGL backend requires UBO #0 for non-block uniforms (at least on NV). @@ -73,9 +73,32 @@ static const char SHADER_HEADER[] = R"( #define gl_VertexID gl_VertexIndex #define gl_InstanceID gl_InstanceIndex )"; +static const char COMPUTE_SHADER_HEADER[] = R"( + // Target GLSL 4.5. + #version 450 core + // All resources are packed into one descriptor set for compute. 
+ #define UBO_BINDING(packing, x) layout(packing, set = 0, binding = (0 + x)) + #define SAMPLER_BINDING(x) layout(set = 0, binding = (1 + x)) + #define TEXEL_BUFFER_BINDING(x) layout(set = 0, binding = (5 + x)) + #define IMAGE_BINDING(format, x) layout(format, set = 0, binding = (7 + x)) + + // hlsl to glsl function translation + #define float2 vec2 + #define float3 vec3 + #define float4 vec4 + #define uint2 uvec2 + #define uint3 uvec3 + #define uint4 uvec4 + #define int2 ivec2 + #define int3 ivec3 + #define int4 ivec4 + #define frac fract + #define lerp mix +)"; bool CompileShaderToSPV(SPIRVCodeVector* out_code, EShLanguage stage, const char* stage_filename, - const char* source_code, size_t source_code_length, bool prepend_header) + const char* source_code, size_t source_code_length, const char* header, + size_t header_length) { if (!InitializeGlslang()) return false; @@ -91,10 +114,10 @@ bool CompileShaderToSPV(SPIRVCodeVector* out_code, EShLanguage stage, const char std::string full_source_code; const char* pass_source_code = source_code; int pass_source_code_length = static_cast(source_code_length); - if (prepend_header) + if (header_length > 0) { - full_source_code.reserve(sizeof(SHADER_HEADER) + source_code_length); - full_source_code.append(SHADER_HEADER, sizeof(SHADER_HEADER) - 1); + full_source_code.reserve(header_length + source_code_length); + full_source_code.append(header, header_length); full_source_code.append(source_code, source_code_length); pass_source_code = full_source_code.c_str(); pass_source_code_length = static_cast(full_source_code.length()); @@ -318,21 +341,28 @@ bool CompileVertexShader(SPIRVCodeVector* out_code, const char* source_code, size_t source_code_length, bool prepend_header) { return CompileShaderToSPV(out_code, EShLangVertex, "vs", source_code, source_code_length, - prepend_header); + SHADER_HEADER, sizeof(SHADER_HEADER) - 1); } bool CompileGeometryShader(SPIRVCodeVector* out_code, const char* source_code, size_t 
source_code_length, bool prepend_header) { return CompileShaderToSPV(out_code, EShLangGeometry, "gs", source_code, source_code_length, - prepend_header); + SHADER_HEADER, sizeof(SHADER_HEADER) - 1); } bool CompileFragmentShader(SPIRVCodeVector* out_code, const char* source_code, size_t source_code_length, bool prepend_header) { return CompileShaderToSPV(out_code, EShLangFragment, "ps", source_code, source_code_length, - prepend_header); + SHADER_HEADER, sizeof(SHADER_HEADER) - 1); +} + +bool CompileComputeShader(SPIRVCodeVector* out_code, const char* source_code, + size_t source_code_length, bool prepend_header) +{ + return CompileShaderToSPV(out_code, EShLangCompute, "cs", source_code, source_code_length, + COMPUTE_SHADER_HEADER, sizeof(COMPUTE_SHADER_HEADER) - 1); } } // namespace ShaderCompiler diff --git a/Source/Core/VideoBackends/Vulkan/ShaderCompiler.h b/Source/Core/VideoBackends/Vulkan/ShaderCompiler.h index 96bd9081bf..197dc1787c 100644 --- a/Source/Core/VideoBackends/Vulkan/ShaderCompiler.h +++ b/Source/Core/VideoBackends/Vulkan/ShaderCompiler.h @@ -29,5 +29,9 @@ bool CompileGeometryShader(SPIRVCodeVector* out_code, const char* source_code, bool CompileFragmentShader(SPIRVCodeVector* out_code, const char* source_code, size_t source_code_length, bool prepend_header = true); +// Compile a compute shader to SPIR-V. 
+bool CompileComputeShader(SPIRVCodeVector* out_code, const char* source_code, + size_t source_code_length, bool prepend_header = true); + } // namespace ShaderCompiler } // namespace Vulkan diff --git a/Source/Core/VideoBackends/Vulkan/Texture2D.cpp b/Source/Core/VideoBackends/Vulkan/Texture2D.cpp index 9dda089b21..9b8111aa94 100644 --- a/Source/Core/VideoBackends/Vulkan/Texture2D.cpp +++ b/Source/Core/VideoBackends/Vulkan/Texture2D.cpp @@ -4,6 +4,7 @@ #include +#include "Common/Assert.h" #include "VideoBackends/Vulkan/CommandBufferManager.h" #include "VideoBackends/Vulkan/Texture2D.h" #include "VideoBackends/Vulkan/VulkanContext.h" @@ -273,10 +274,132 @@ void Texture2D::TransitionToLayout(VkCommandBuffer command_buffer, VkImageLayout break; } + // If we were using a compute layout, the stages need to reflect that + switch (m_compute_layout) + { + case ComputeImageLayout::Undefined: + break; + case ComputeImageLayout::ReadOnly: + barrier.srcAccessMask = VK_ACCESS_SHADER_READ_BIT; + srcStageMask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + break; + case ComputeImageLayout::WriteOnly: + barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT; + srcStageMask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + break; + case ComputeImageLayout::ReadWrite: + barrier.srcAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT; + srcStageMask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + break; + } + m_compute_layout = ComputeImageLayout::Undefined; + vkCmdPipelineBarrier(command_buffer, srcStageMask, dstStageMask, 0, 0, nullptr, 0, nullptr, 1, &barrier); m_layout = new_layout; } +void Texture2D::TransitionToLayout(VkCommandBuffer command_buffer, ComputeImageLayout new_layout) +{ + _assert_(new_layout != ComputeImageLayout::Undefined); + if (m_compute_layout == new_layout) + return; + + VkImageMemoryBarrier barrier = { + VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, // VkStructureType sType + nullptr, // const void* pNext + 0, // VkAccessFlags srcAccessMask + 0, // VkAccessFlags 
dstAccessMask + m_layout, // VkImageLayout oldLayout + VK_IMAGE_LAYOUT_GENERAL, // VkImageLayout newLayout + VK_QUEUE_FAMILY_IGNORED, // uint32_t srcQueueFamilyIndex + VK_QUEUE_FAMILY_IGNORED, // uint32_t dstQueueFamilyIndex + m_image, // VkImage image + {static_cast(Util::IsDepthFormat(m_format) ? VK_IMAGE_ASPECT_DEPTH_BIT : + VK_IMAGE_ASPECT_COLOR_BIT), + 0, m_levels, 0, m_layers} // VkImageSubresourceRange subresourceRange + }; + + VkPipelineStageFlags srcStageMask, dstStageMask; + switch (m_layout) + { + case VK_IMAGE_LAYOUT_UNDEFINED: + // Layout undefined therefore contents undefined, and we don't care what happens to it. + barrier.srcAccessMask = 0; + srcStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT; + break; + + case VK_IMAGE_LAYOUT_PREINITIALIZED: + // Image has been pre-initialized by the host, so ensure all writes have completed. + barrier.srcAccessMask = VK_ACCESS_HOST_WRITE_BIT; + srcStageMask = VK_PIPELINE_STAGE_HOST_BIT; + break; + + case VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL: + // Image was being used as a color attachment, so ensure all writes have completed. + barrier.srcAccessMask = + VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; + srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; + break; + + case VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL: + // Image was being used as a depthstencil attachment, so ensure all writes have completed. + barrier.srcAccessMask = + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; + srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; + break; + + case VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL: + // Image was being used as a shader resource, make sure all reads have finished. + barrier.srcAccessMask = VK_ACCESS_SHADER_READ_BIT; + srcStageMask = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; + break; + + case VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL: + // Image was being used as a copy source, ensure all reads have finished. 
+ barrier.srcAccessMask = VK_ACCESS_TRANSFER_READ_BIT; + srcStageMask = VK_PIPELINE_STAGE_TRANSFER_BIT; + break; + + case VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL: + // Image was being used as a copy destination, ensure all writes have finished. + barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + srcStageMask = VK_PIPELINE_STAGE_TRANSFER_BIT; + break; + + default: + srcStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT; + break; + } + + switch (new_layout) + { + case ComputeImageLayout::ReadOnly: + barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; + barrier.newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + dstStageMask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + break; + case ComputeImageLayout::WriteOnly: + barrier.dstAccessMask = VK_ACCESS_SHADER_WRITE_BIT; + barrier.newLayout = VK_IMAGE_LAYOUT_GENERAL; + dstStageMask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + break; + case ComputeImageLayout::ReadWrite: + barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT; + barrier.newLayout = VK_IMAGE_LAYOUT_GENERAL; + dstStageMask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + break; + default: + dstStageMask = 0; + break; + } + + m_layout = barrier.newLayout; + m_compute_layout = new_layout; + + vkCmdPipelineBarrier(command_buffer, srcStageMask, dstStageMask, 0, 0, nullptr, 0, nullptr, 1, + &barrier); +} + } // namespace Vulkan diff --git a/Source/Core/VideoBackends/Vulkan/Texture2D.h b/Source/Core/VideoBackends/Vulkan/Texture2D.h index bf0a8a70ab..3fce48d758 100644 --- a/Source/Core/VideoBackends/Vulkan/Texture2D.h +++ b/Source/Core/VideoBackends/Vulkan/Texture2D.h @@ -17,6 +17,15 @@ class ObjectCache; class Texture2D { public: + // Custom image layouts, mainly used for switching to/from compute + enum class ComputeImageLayout + { + Undefined, + ReadOnly, + WriteOnly, + ReadWrite + }; + Texture2D(u32 width, u32 height, u32 levels, u32 layers, VkFormat format, VkSampleCountFlagBits samples, VkImageViewType view_type, VkImage image, VkDeviceMemory 
device_memory, VkImageView view); @@ -50,6 +59,7 @@ public: void OverrideImageLayout(VkImageLayout new_layout); void TransitionToLayout(VkCommandBuffer command_buffer, VkImageLayout new_layout); + void TransitionToLayout(VkCommandBuffer command_buffer, ComputeImageLayout new_layout); private: u32 m_width; @@ -60,6 +70,7 @@ private: VkSampleCountFlagBits m_samples; VkImageViewType m_view_type; VkImageLayout m_layout = VK_IMAGE_LAYOUT_UNDEFINED; + ComputeImageLayout m_compute_layout = ComputeImageLayout::Undefined; VkImage m_image; VkDeviceMemory m_device_memory; diff --git a/Source/Core/VideoBackends/Vulkan/TextureCache.cpp b/Source/Core/VideoBackends/Vulkan/TextureCache.cpp index 5a73106476..b951b78aa2 100644 --- a/Source/Core/VideoBackends/Vulkan/TextureCache.cpp +++ b/Source/Core/VideoBackends/Vulkan/TextureCache.cpp @@ -138,6 +138,21 @@ void TextureCache::CopyRectangleFromTexture(TCacheEntry* dst_texture, ScaleTextureRectangle(dst_texture, dst_rect, src_texture, src_rect); } +bool TextureCache::SupportsGPUTextureDecode(TextureFormat format, TlutFormat palette_format) +{ + return m_texture_converter->SupportsTextureDecoding(format, palette_format); +} + +void TextureCache::DecodeTextureOnGPU(TCacheEntryBase* entry, u32 dst_level, const u8* data, + size_t data_size, TextureFormat format, u32 width, u32 height, + u32 aligned_width, u32 aligned_height, u32 row_stride, + const u8* palette, TlutFormat palette_format) +{ + m_texture_converter->DecodeTexture(static_cast(entry), dst_level, data, data_size, + format, width, height, aligned_width, aligned_height, + row_stride, palette, palette_format); +} + void TextureCache::CopyTextureRectangle(TCacheEntry* dst_texture, const MathUtil::Rectangle& dst_rect, Texture2D* src_texture, diff --git a/Source/Core/VideoBackends/Vulkan/TextureCache.h b/Source/Core/VideoBackends/Vulkan/TextureCache.h index f014492d7b..b433d7d9e0 100644 --- a/Source/Core/VideoBackends/Vulkan/TextureCache.h +++ 
b/Source/Core/VideoBackends/Vulkan/TextureCache.h @@ -66,6 +66,13 @@ public: void CopyRectangleFromTexture(TCacheEntry* dst_texture, const MathUtil::Rectangle& dst_rect, Texture2D* src_texture, const MathUtil::Rectangle& src_rect); + bool SupportsGPUTextureDecode(TextureFormat format, TlutFormat palette_format) override; + + void DecodeTextureOnGPU(TCacheEntryBase* entry, u32 dst_level, const u8* data, size_t data_size, + TextureFormat format, u32 width, u32 height, u32 aligned_width, + u32 aligned_height, u32 row_stride, const u8* palette, + TlutFormat palette_format) override; + private: bool CreateRenderPasses(); diff --git a/Source/Core/VideoBackends/Vulkan/TextureConverter.cpp b/Source/Core/VideoBackends/Vulkan/TextureConverter.cpp index 8f69b386e0..cbe4322b1d 100644 --- a/Source/Core/VideoBackends/Vulkan/TextureConverter.cpp +++ b/Source/Core/VideoBackends/Vulkan/TextureConverter.cpp @@ -42,8 +42,12 @@ TextureConverter::~TextureConverter() vkDestroyShaderModule(g_vulkan_context->GetDevice(), it, nullptr); } + if (m_texel_buffer_view_r8_uint != VK_NULL_HANDLE) + vkDestroyBufferView(g_vulkan_context->GetDevice(), m_texel_buffer_view_r8_uint, nullptr); if (m_texel_buffer_view_r16_uint != VK_NULL_HANDLE) vkDestroyBufferView(g_vulkan_context->GetDevice(), m_texel_buffer_view_r16_uint, nullptr); + if (m_texel_buffer_view_r32g32_uint != VK_NULL_HANDLE) + vkDestroyBufferView(g_vulkan_context->GetDevice(), m_texel_buffer_view_r32g32_uint, nullptr); if (m_texel_buffer_view_rgba8_unorm != VK_NULL_HANDLE) vkDestroyBufferView(g_vulkan_context->GetDevice(), m_texel_buffer_view_rgba8_unorm, nullptr); @@ -59,6 +63,12 @@ TextureConverter::~TextureConverter() vkDestroyShaderModule(g_vulkan_context->GetDevice(), shader, nullptr); } + for (const auto& it : m_decoding_pipelines) + { + if (it.second.compute_shader != VK_NULL_HANDLE) + vkDestroyShaderModule(g_vulkan_context->GetDevice(), it.second.compute_shader, nullptr); + } + if (m_rgb_to_yuyv_shader != VK_NULL_HANDLE) 
vkDestroyShaderModule(g_vulkan_context->GetDevice(), m_rgb_to_yuyv_shader, nullptr); if (m_yuyv_to_rgb_shader != VK_NULL_HANDLE) @@ -103,6 +113,12 @@ bool TextureConverter::Initialize() return false; } + if (!CreateDecodingTexture()) + { + PanicAlert("Failed to create decoding texture"); + return false; + } + if (!CompileYUYVConversionShaders()) { PanicAlert("Failed to compile YUYV conversion shaders"); @@ -371,6 +387,152 @@ void TextureConverter::DecodeYUYVTextureFromMemory(TextureCache::TCacheEntry* ds draw.EndRenderPass(); } +bool TextureConverter::SupportsTextureDecoding(TextureFormat format, TlutFormat palette_format) +{ + auto key = std::make_pair(format, palette_format); + auto iter = m_decoding_pipelines.find(key); + if (iter != m_decoding_pipelines.end()) + return iter->second.valid; + + TextureDecodingPipeline pipeline; + pipeline.base_info = TextureConversionShader::GetDecodingShaderInfo(format); + pipeline.compute_shader = VK_NULL_HANDLE; + pipeline.valid = false; + + if (!pipeline.base_info) + { + m_decoding_pipelines.emplace(key, pipeline); + return false; + } + + std::string shader_source = + TextureConversionShader::GenerateDecodingShader(format, palette_format, APIType::Vulkan); + + pipeline.compute_shader = Util::CompileAndCreateComputeShader(shader_source, true); + if (pipeline.compute_shader == VK_NULL_HANDLE) + { + m_decoding_pipelines.emplace(key, pipeline); + return false; + } + + pipeline.valid = true; + m_decoding_pipelines.emplace(key, pipeline); + return true; +} + +void TextureConverter::DecodeTexture(TextureCache::TCacheEntry* entry, u32 dst_level, + const u8* data, size_t data_size, TextureFormat format, + u32 width, u32 height, u32 aligned_width, u32 aligned_height, + u32 row_stride, const u8* palette, TlutFormat palette_format) +{ + auto key = std::make_pair(format, palette_format); + auto iter = m_decoding_pipelines.find(key); + if (iter == m_decoding_pipelines.end()) + return; + + struct PushConstants + { + u32 dst_size[2]; + u32 
src_size[2]; + u32 src_offset; + u32 src_row_stride; + u32 palette_offset; + }; + + // Copy to GPU-visible buffer, aligned to the data type + auto info = iter->second; + u32 bytes_per_buffer_elem = + TextureConversionShader::GetBytesPerBufferElement(info.base_info->buffer_format); + + // Calculate total data size, including palette. + // Only copy palette if it is required. + u32 total_upload_size = static_cast(data_size); + u32 palette_size = iter->second.base_info->palette_size; + u32 palette_offset = total_upload_size; + bool has_palette = palette_size > 0; + if (has_palette) + { + // Align to u16. + if ((total_upload_size % sizeof(u16)) != 0) + { + total_upload_size++; + palette_offset++; + } + + total_upload_size += palette_size; + } + + // Allocate space for upload, if it fails, execute the buffer. + if (!m_texel_buffer->ReserveMemory(total_upload_size, bytes_per_buffer_elem)) + { + Util::ExecuteCurrentCommandsAndRestoreState(true, false); + if (!m_texel_buffer->ReserveMemory(total_upload_size, bytes_per_buffer_elem)) + PanicAlert("Failed to reserve memory for encoded texture upload"); + } + + // Copy/commit upload buffer. + u32 texel_buffer_offset = static_cast(m_texel_buffer->GetCurrentOffset()); + std::memcpy(m_texel_buffer->GetCurrentHostPointer(), data, data_size); + if (has_palette) + std::memcpy(m_texel_buffer->GetCurrentHostPointer() + palette_offset, palette, palette_size); + m_texel_buffer->CommitMemory(total_upload_size); + + // Determine uniforms. + PushConstants constants = { + {width, height}, + {aligned_width, aligned_height}, + texel_buffer_offset / bytes_per_buffer_elem, + row_stride / bytes_per_buffer_elem, + static_cast((texel_buffer_offset + palette_offset) / sizeof(u16))}; + + // Determine view to use for texel buffers. 
+ VkBufferView data_view = VK_NULL_HANDLE; + switch (iter->second.base_info->buffer_format) + { + case TextureConversionShader::BUFFER_FORMAT_R8_UINT: + data_view = m_texel_buffer_view_r8_uint; + break; + case TextureConversionShader::BUFFER_FORMAT_R16_UINT: + data_view = m_texel_buffer_view_r16_uint; + break; + case TextureConversionShader::BUFFER_FORMAT_R32G32_UINT: + data_view = m_texel_buffer_view_r32g32_uint; + break; + default: + break; + } + + // Place compute shader dispatches together in the init command buffer. + // That way we don't have to pay a penalty for switching from graphics->compute, + // or end/restart our render pass. + VkCommandBuffer command_buffer = g_command_buffer_mgr->GetCurrentInitCommandBuffer(); + + // Dispatch compute to temporary texture. + ComputeShaderDispatcher dispatcher(command_buffer, + g_object_cache->GetPipelineLayout(PIPELINE_LAYOUT_COMPUTE), + iter->second.compute_shader); + m_decoding_texture->TransitionToLayout(command_buffer, Texture2D::ComputeImageLayout::WriteOnly); + dispatcher.SetPushConstants(&constants, sizeof(constants)); + dispatcher.SetStorageImage(m_decoding_texture->GetView(), m_decoding_texture->GetLayout()); + dispatcher.SetTexelBuffer(0, data_view); + if (has_palette) + dispatcher.SetTexelBuffer(1, m_texel_buffer_view_r16_uint); + auto groups = TextureConversionShader::GetDispatchCount(iter->second.base_info, width, height); + dispatcher.Dispatch(groups.first, groups.second, 1); + + // Copy from temporary texture to final destination. 
+ m_decoding_texture->TransitionToLayout(command_buffer, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL); + entry->GetTexture()->TransitionToLayout(command_buffer, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL); + VkImageCopy image_copy = {{VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1}, + {0, 0, 0}, + {VK_IMAGE_ASPECT_COLOR_BIT, dst_level, 0, 1}, + {0, 0, 0}, + {width, height, 1}}; + vkCmdCopyImage(command_buffer, m_decoding_texture->GetImage(), + VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, entry->GetTexture()->GetImage(), + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &image_copy); +} + bool TextureConverter::CreateTexelBuffer() { // Prefer an 8MB buffer if possible, but use less if the device doesn't support this. @@ -386,9 +548,13 @@ bool TextureConverter::CreateTexelBuffer() return false; // Create views of the formats that we will be using. + m_texel_buffer_view_r8_uint = CreateTexelBufferView(VK_FORMAT_R8_UINT); m_texel_buffer_view_r16_uint = CreateTexelBufferView(VK_FORMAT_R16_UINT); + m_texel_buffer_view_r32g32_uint = CreateTexelBufferView(VK_FORMAT_R32G32_UINT); m_texel_buffer_view_rgba8_unorm = CreateTexelBufferView(VK_FORMAT_R8G8B8A8_UNORM); - return m_texel_buffer_view_r16_uint != VK_NULL_HANDLE && + return m_texel_buffer_view_r8_uint != VK_NULL_HANDLE && + m_texel_buffer_view_r16_uint != VK_NULL_HANDLE && + m_texel_buffer_view_r32g32_uint != VK_NULL_HANDLE && m_texel_buffer_view_rgba8_unorm != VK_NULL_HANDLE; } @@ -611,6 +777,15 @@ bool TextureConverter::CreateEncodingDownloadTexture() return m_encoding_download_texture && m_encoding_download_texture->Map(); } +bool TextureConverter::CreateDecodingTexture() +{ + m_decoding_texture = Texture2D::Create( + DECODING_TEXTURE_WIDTH, DECODING_TEXTURE_HEIGHT, 1, 1, VK_FORMAT_R8G8B8A8_UNORM, + VK_SAMPLE_COUNT_1_BIT, VK_IMAGE_VIEW_TYPE_2D_ARRAY, VK_IMAGE_TILING_OPTIMAL, + VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT); + return static_cast(m_decoding_texture); +} + bool TextureConverter::CompileYUYVConversionShaders() { static const 
char RGB_TO_YUYV_SHADER_SOURCE[] = R"( diff --git a/Source/Core/VideoBackends/Vulkan/TextureConverter.h b/Source/Core/VideoBackends/Vulkan/TextureConverter.h index 651d511849..39543e0f17 100644 --- a/Source/Core/VideoBackends/Vulkan/TextureConverter.h +++ b/Source/Core/VideoBackends/Vulkan/TextureConverter.h @@ -5,11 +5,14 @@ #pragma once #include +#include #include +#include #include "Common/CommonTypes.h" #include "VideoBackends/Vulkan/StreamBuffer.h" #include "VideoBackends/Vulkan/TextureCache.h" +#include "VideoCommon/TextureConversionShader.h" #include "VideoCommon/TextureDecoder.h" #include "VideoCommon/VideoCommon.h" @@ -45,6 +48,12 @@ public: void DecodeYUYVTextureFromMemory(TextureCache::TCacheEntry* dst_texture, const void* src_ptr, u32 src_width, u32 src_stride, u32 src_height); + bool SupportsTextureDecoding(TextureFormat format, TlutFormat palette_format); + void DecodeTexture(TextureCache::TCacheEntry* entry, u32 dst_level, const u8* data, + size_t data_size, TextureFormat format, u32 width, u32 height, + u32 aligned_width, u32 aligned_height, u32 row_stride, const u8* palette, + TlutFormat palette_format); + private: static const u32 NUM_TEXTURE_ENCODING_SHADERS = 64; static const u32 ENCODING_TEXTURE_WIDTH = EFB_WIDTH * 4; @@ -52,6 +61,10 @@ private: static const VkFormat ENCODING_TEXTURE_FORMAT = VK_FORMAT_B8G8R8A8_UNORM; static const size_t NUM_PALETTE_CONVERSION_SHADERS = 3; + // Maximum size of a texture based on BP registers. + static const u32 DECODING_TEXTURE_WIDTH = 1024; + static const u32 DECODING_TEXTURE_HEIGHT = 1024; + bool CreateTexelBuffer(); VkBufferView CreateTexelBufferView(VkFormat format) const; @@ -62,6 +75,8 @@ private: bool CreateEncodingTexture(); bool CreateEncodingDownloadTexture(); + bool CreateDecodingTexture(); + bool CompileYUYVConversionShaders(); // Allocates storage in the texel command buffer of the specified size. 
@@ -77,7 +92,9 @@ private: // Shared between conversion types std::unique_ptr m_texel_buffer; + VkBufferView m_texel_buffer_view_r8_uint = VK_NULL_HANDLE; VkBufferView m_texel_buffer_view_r16_uint = VK_NULL_HANDLE; + VkBufferView m_texel_buffer_view_r32g32_uint = VK_NULL_HANDLE; VkBufferView m_texel_buffer_view_rgba8_unorm = VK_NULL_HANDLE; size_t m_texel_buffer_size = 0; @@ -91,6 +108,16 @@ private: VkFramebuffer m_encoding_render_framebuffer = VK_NULL_HANDLE; std::unique_ptr m_encoding_download_texture; + // Texture decoding - GX format in memory->RGBA8 + struct TextureDecodingPipeline + { + const TextureConversionShader::DecodingShaderInfo* base_info; + VkShaderModule compute_shader; + bool valid; + }; + std::map, TextureDecodingPipeline> m_decoding_pipelines; + std::unique_ptr m_decoding_texture; + // XFB encoding/decoding shaders VkShaderModule m_rgb_to_yuyv_shader = VK_NULL_HANDLE; VkShaderModule m_yuyv_to_rgb_shader = VK_NULL_HANDLE; diff --git a/Source/Core/VideoBackends/Vulkan/Util.cpp b/Source/Core/VideoBackends/Vulkan/Util.cpp index f49ca90580..02ad129972 100644 --- a/Source/Core/VideoBackends/Vulkan/Util.cpp +++ b/Source/Core/VideoBackends/Vulkan/Util.cpp @@ -250,6 +250,18 @@ VkShaderModule CompileAndCreateFragmentShader(const std::string& source_code, bo return CreateShaderModule(code.data(), code.size()); } +VkShaderModule CompileAndCreateComputeShader(const std::string& source_code, bool prepend_header) +{ + ShaderCompiler::SPIRVCodeVector code; + if (!ShaderCompiler::CompileComputeShader(&code, source_code.c_str(), source_code.length(), + prepend_header)) + { + return VK_NULL_HANDLE; + } + + return CreateShaderModule(code.data(), code.size()); +} + } // namespace Util UtilityShaderDraw::UtilityShaderDraw(VkCommandBuffer command_buffer, @@ -670,4 +682,157 @@ bool UtilityShaderDraw::BindPipeline() return true; } +ComputeShaderDispatcher::ComputeShaderDispatcher(VkCommandBuffer command_buffer, + VkPipelineLayout pipeline_layout, + VkShaderModule 
compute_shader) + : m_command_buffer(command_buffer) +{ + // Populate minimal pipeline state + m_pipeline_info.pipeline_layout = pipeline_layout; + m_pipeline_info.cs = compute_shader; +} + +u8* ComputeShaderDispatcher::AllocateUniformBuffer(size_t size) +{ + if (!g_object_cache->GetUtilityShaderUniformBuffer()->ReserveMemory( + size, g_vulkan_context->GetUniformBufferAlignment(), true, true, true)) + PanicAlert("Failed to allocate util uniforms"); + + return g_object_cache->GetUtilityShaderUniformBuffer()->GetCurrentHostPointer(); +} + +void ComputeShaderDispatcher::CommitUniformBuffer(size_t size) +{ + m_uniform_buffer.buffer = g_object_cache->GetUtilityShaderUniformBuffer()->GetBuffer(); + m_uniform_buffer.offset = 0; + m_uniform_buffer.range = size; + m_uniform_buffer_offset = + static_cast(g_object_cache->GetUtilityShaderUniformBuffer()->GetCurrentOffset()); + + g_object_cache->GetUtilityShaderUniformBuffer()->CommitMemory(size); +} + +void ComputeShaderDispatcher::SetPushConstants(const void* data, size_t data_size) +{ + _assert_(static_cast(data_size) < PUSH_CONSTANT_BUFFER_SIZE); + + vkCmdPushConstants(m_command_buffer, m_pipeline_info.pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, + 0, static_cast(data_size), data); +} + +void ComputeShaderDispatcher::SetSampler(size_t index, VkImageView view, VkSampler sampler) +{ + m_samplers[index].sampler = sampler; + m_samplers[index].imageView = view; + m_samplers[index].imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; +} + +void ComputeShaderDispatcher::SetStorageImage(VkImageView view, VkImageLayout image_layout) +{ + m_storage_image.sampler = VK_NULL_HANDLE; + m_storage_image.imageView = view; + m_storage_image.imageLayout = image_layout; +} + +void ComputeShaderDispatcher::SetTexelBuffer(size_t index, VkBufferView view) +{ + m_texel_buffers[index] = view; +} + +void ComputeShaderDispatcher::Dispatch(u32 groups_x, u32 groups_y, u32 groups_z) +{ + BindDescriptors(); + if (!BindPipeline()) + return; + + 
vkCmdDispatch(m_command_buffer, groups_x, groups_y, groups_z); +} + +void ComputeShaderDispatcher::BindDescriptors() +{ + VkDescriptorSet set = g_command_buffer_mgr->AllocateDescriptorSet( + g_object_cache->GetDescriptorSetLayout(DESCRIPTOR_SET_LAYOUT_COMPUTE)); + if (set == VK_NULL_HANDLE) + { + PanicAlert("Failed to allocate descriptor set for compute dispatch"); + return; + } + + // Reserve enough descriptors to write every binding. + std::array set_writes = {}; + u32 num_set_writes = 0; + + if (m_uniform_buffer.buffer != VK_NULL_HANDLE) + { + set_writes[num_set_writes++] = {VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + nullptr, + set, + 0, + 0, + 1, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC, + nullptr, + &m_uniform_buffer, + nullptr}; + } + + // Samplers + for (size_t i = 0; i < m_samplers.size(); i++) + { + const VkDescriptorImageInfo& info = m_samplers[i]; + if (info.imageView != VK_NULL_HANDLE && info.sampler != VK_NULL_HANDLE) + { + set_writes[num_set_writes++] = {VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + nullptr, + set, + static_cast(1 + i), + 0, + 1, + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + &info, + nullptr, + nullptr}; + } + } + + for (size_t i = 0; i < m_texel_buffers.size(); i++) + { + if (m_texel_buffers[i] != VK_NULL_HANDLE) + { + set_writes[num_set_writes++] = { + VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, nullptr, set, 5 + static_cast(i), 0, 1, + VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, nullptr, nullptr, &m_texel_buffers[i]}; + } + } + + if (m_storage_image.imageView != VK_NULL_HANDLE) + { + set_writes[num_set_writes++] = { + VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, nullptr, set, 7, 0, 1, + VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &m_storage_image, nullptr, nullptr}; + } + + if (num_set_writes > 0) + { + vkUpdateDescriptorSets(g_vulkan_context->GetDevice(), num_set_writes, set_writes.data(), 0, + nullptr); + } + + vkCmdBindDescriptorSets(m_command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, + m_pipeline_info.pipeline_layout, 0, 1, &set, 1, 
&m_uniform_buffer_offset); +} + +bool ComputeShaderDispatcher::BindPipeline() +{ + VkPipeline pipeline = g_object_cache->GetComputePipeline(m_pipeline_info); + if (pipeline == VK_NULL_HANDLE) + { + PanicAlert("Failed to get pipeline for backend compute dispatch"); + return false; + } + + vkCmdBindPipeline(m_command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + return true; +} + } // namespace Vulkan diff --git a/Source/Core/VideoBackends/Vulkan/Util.h b/Source/Core/VideoBackends/Vulkan/Util.h index 7ee8e82356..f5385932bd 100644 --- a/Source/Core/VideoBackends/Vulkan/Util.h +++ b/Source/Core/VideoBackends/Vulkan/Util.h @@ -63,6 +63,10 @@ VkShaderModule CompileAndCreateGeometryShader(const std::string& source_code, // Compile a fragment shader and create a shader module, discarding the intermediate SPIR-V. VkShaderModule CompileAndCreateFragmentShader(const std::string& source_code, bool prepend_header = true); + +// Compile a compute shader and create a shader module, discarding the intermediate SPIR-V. 
+VkShaderModule CompileAndCreateComputeShader(const std::string& source_code, + bool prepend_header = true); } // Utility shader vertex format @@ -188,4 +192,41 @@ private: PipelineInfo m_pipeline_info = {}; }; +class ComputeShaderDispatcher +{ +public: + ComputeShaderDispatcher(VkCommandBuffer command_buffer, VkPipelineLayout pipeline_layout, + VkShaderModule compute_shader); + + u8* AllocateUniformBuffer(size_t size); + void CommitUniformBuffer(size_t size); + + void SetPushConstants(const void* data, size_t data_size); + + void SetSampler(size_t index, VkImageView view, VkSampler sampler); + + void SetTexelBuffer(size_t index, VkBufferView view); + + void SetStorageImage(VkImageView view, VkImageLayout image_layout); + + void Dispatch(u32 groups_x, u32 groups_y, u32 groups_z); + +private: + void BindDescriptors(); + bool BindPipeline(); + + VkCommandBuffer m_command_buffer = VK_NULL_HANDLE; + + VkDescriptorBufferInfo m_uniform_buffer = {}; + u32 m_uniform_buffer_offset = 0; + + std::array m_samplers = {}; + + std::array m_texel_buffers = {}; + + VkDescriptorImageInfo m_storage_image = {}; + + ComputePipelineInfo m_pipeline_info = {}; +}; + } // namespace Vulkan diff --git a/Source/Core/VideoBackends/Vulkan/VulkanContext.cpp b/Source/Core/VideoBackends/Vulkan/VulkanContext.cpp index 74e7786130..b7ef583786 100644 --- a/Source/Core/VideoBackends/Vulkan/VulkanContext.cpp +++ b/Source/Core/VideoBackends/Vulkan/VulkanContext.cpp @@ -234,6 +234,8 @@ void VulkanContext::PopulateBackendInfo(VideoConfig* config) config->backend_info.bSupportsPaletteConversion = true; // Assumed support. config->backend_info.bSupportsClipControl = true; // Assumed support. config->backend_info.bSupportsMultithreading = true; // Assumed support. + config->backend_info.bSupportsComputeShaders = true; // Assumed support. + config->backend_info.bSupportsGPUTextureDecoding = true; // Assumed support. config->backend_info.bSupportsInternalResolutionFrameDumps = true; // Assumed support. 
config->backend_info.bSupportsPostProcessing = false; // No support yet. config->backend_info.bSupportsDualSourceBlend = false; // Dependent on features. diff --git a/Source/Core/VideoCommon/TextureCacheBase.cpp b/Source/Core/VideoCommon/TextureCacheBase.cpp index eabb005600..862dd7c699 100644 --- a/Source/Core/VideoCommon/TextureCacheBase.cpp +++ b/Source/Core/VideoCommon/TextureCacheBase.cpp @@ -110,7 +110,8 @@ void TextureCacheBase::OnConfigChanged(VideoConfig& config) if (config.iSafeTextureCache_ColorSamples != backup_config.color_samples || config.bTexFmtOverlayEnable != backup_config.texfmt_overlay || config.bTexFmtOverlayCenter != backup_config.texfmt_overlay_center || - config.bHiresTextures != backup_config.hires_textures) + config.bHiresTextures != backup_config.hires_textures || + config.bEnableGPUTextureDecoding != backup_config.gpu_texture_decoding) { Invalidate(); @@ -209,6 +210,7 @@ void TextureCacheBase::SetBackupConfig(const VideoConfig& config) backup_config.cache_hires_textures = config.bCacheHiresTextures; backup_config.stereo_3d = config.iStereoMode > 0; backup_config.efb_mono_depth = config.bStereoEFBMonoDepth; + backup_config.gpu_texture_decoding = config.bEnableGPUTextureDecoding; } TextureCacheBase::TCacheEntryBase* TextureCacheBase::ApplyPaletteToEntry(TCacheEntryBase* entry, @@ -526,6 +528,7 @@ TextureCacheBase::TCacheEntryBase* TextureCacheBase::Load(const u32 stage) const u32 texture_size = TexDecoder_GetTextureSizeInBytes(expandedWidth, expandedHeight, texformat); + u32 bytes_per_block = (bsw * bsh * TexDecoder_GetTexelSizeInNibbles(texformat)) / 2; u32 additional_mips_size = 0; // not including level 0, which is texture_size // GPUs don't like when the specified mipmap count would require more than one 1x1-sized LOD in @@ -755,6 +758,17 @@ TextureCacheBase::TCacheEntryBase* TextureCacheBase::Load(const u32 stage) // how many levels the allocated texture shall have const u32 texLevels = hires_tex ? 
(u32)hires_tex->m_levels.size() : tex_levels; + // We can decode on the GPU if it is a supported format and the flag is enabled. + // Currently we don't decode RGBA8 textures from Tmem, as that would require copying from both + // banks, and if we're doing a copy we may as well just do the whole thing on the CPU, since + // there's no conversion between formats. In the future this could be extended with a separate + // shader, however. + bool decode_on_gpu = + !hires_tex && g_ActiveConfig.UseGPUTextureDecoding() && + g_texture_cache->SupportsGPUTextureDecode(static_cast(texformat), + static_cast(tlutfmt)) && + !(from_tmem && texformat == GX_TF_RGBA8); + // create the entry/texture TCacheEntryConfig config; config.width = width; @@ -769,17 +783,29 @@ TextureCacheBase::TCacheEntryBase* TextureCacheBase::Load(const u32 stage) if (!hires_tex) { - if (!(texformat == GX_TF_RGBA8 && from_tmem)) + const u8* tlut = &texMem[tlutaddr]; + if (decode_on_gpu) { - const u8* tlut = &texMem[tlutaddr]; - TexDecoder_Decode(temp, src_data, expandedWidth, expandedHeight, texformat, tlut, - (TlutFormat)tlutfmt); + u32 row_stride = bytes_per_block * (expandedWidth / bsw); + g_texture_cache->DecodeTextureOnGPU( + entry, 0, src_data, texture_size, static_cast(texformat), width, height, + expandedWidth, expandedHeight, row_stride, tlut, static_cast(tlutfmt)); } else { - u8* src_data_gb = - &texMem[bpmem.tex[stage / 4].texImage2[stage % 4].tmem_odd * TMEM_LINE_SIZE]; - TexDecoder_DecodeRGBA8FromTmem(temp, src_data, src_data_gb, expandedWidth, expandedHeight); + if (!(texformat == GX_TF_RGBA8 && from_tmem)) + { + TexDecoder_Decode(temp, src_data, expandedWidth, expandedHeight, texformat, tlut, + (TlutFormat)tlutfmt); + } + else + { + u8* src_data_gb = + &texMem[bpmem.tex[stage / 4].texImage2[stage % 4].tmem_odd * TMEM_LINE_SIZE]; + TexDecoder_DecodeRGBA8FromTmem(temp, src_data, src_data_gb, expandedWidth, expandedHeight); + } + + entry->Load(temp, width, height, expandedWidth, 0); } } @@ 
-797,9 +823,6 @@ TextureCacheBase::TCacheEntryBase* TextureCacheBase::Load(const u32 stage) entry->is_efb_copy = false; entry->is_custom_tex = hires_tex != nullptr; - // load texture - entry->Load(temp, width, height, expandedWidth, 0); - std::string basename = ""; if (g_ActiveConfig.bDumpTextures && !hires_tex) { @@ -840,13 +863,27 @@ TextureCacheBase::TCacheEntryBase* TextureCacheBase::Load(const u32 stage) const u32 expanded_mip_height = Common::AlignUp(mip_height, bsh); const u8*& mip_src_data = from_tmem ? ((level % 2) ? ptr_odd : ptr_even) : src_data; - const u8* tlut = &texMem[tlutaddr]; - TexDecoder_Decode(temp, mip_src_data, expanded_mip_width, expanded_mip_height, texformat, - tlut, (TlutFormat)tlutfmt); - mip_src_data += + size_t mip_size = TexDecoder_GetTextureSizeInBytes(expanded_mip_width, expanded_mip_height, texformat); + const u8* tlut = &texMem[tlutaddr]; - entry->Load(temp, mip_width, mip_height, expanded_mip_width, level); + if (decode_on_gpu) + { + // row_stride is bytes per row of blocks, so derive it from the block-aligned mip width. + u32 row_stride = bytes_per_block * (expanded_mip_width / bsw); + g_texture_cache->DecodeTextureOnGPU(entry, level, mip_src_data, mip_size, + static_cast(texformat), mip_width, + mip_height, expanded_mip_width, expanded_mip_height, + row_stride, tlut, static_cast(tlutfmt)); + } + else + { + TexDecoder_Decode(temp, mip_src_data, expanded_mip_width, expanded_mip_height, texformat, + tlut, (TlutFormat)tlutfmt); + entry->Load(temp, mip_width, mip_height, expanded_mip_width, level); + } + + mip_src_data += mip_size; if (g_ActiveConfig.bDumpTextures) DumpTexture(entry, basename, level); diff --git a/Source/Core/VideoCommon/TextureCacheBase.h b/Source/Core/VideoCommon/TextureCacheBase.h index 3adec5ed47..3000491924 100644 --- a/Source/Core/VideoCommon/TextureCacheBase.h +++ b/Source/Core/VideoCommon/TextureCacheBase.h @@ -171,6 +171,23 @@ public: virtual void ConvertTexture(TCacheEntryBase* entry, TCacheEntryBase* unconverted, void* palette, TlutFormat format) = 0; + // Returns true if the texture data and palette 
formats are supported by the GPU decoder. + virtual bool SupportsGPUTextureDecode(TextureFormat format, TlutFormat palette_format) + { + return false; + } + + // Decodes the specified data to the GPU texture specified by entry. + // width, height are the size of the image in pixels. + // aligned_width, aligned_height are the size of the image in pixels, aligned to the block size. + // row_stride is the number of bytes for a row of blocks, not pixels. + virtual void DecodeTextureOnGPU(TCacheEntryBase* entry, u32 dst_level, const u8* data, + size_t data_size, TextureFormat format, u32 width, u32 height, + u32 aligned_width, u32 aligned_height, u32 row_stride, + const u8* palette, TlutFormat palette_format) + { + } + protected: TextureCacheBase(); @@ -225,6 +242,7 @@ private: bool copy_cache_enable; bool stereo_3d; bool efb_mono_depth; + bool gpu_texture_decoding; }; BackupConfig backup_config = {}; }; diff --git a/Source/Core/VideoCommon/TextureConversionShader.cpp b/Source/Core/VideoCommon/TextureConversionShader.cpp index bd4e1349df..57b9f4d932 100644 --- a/Source/Core/VideoCommon/TextureConversionShader.cpp +++ b/Source/Core/VideoCommon/TextureConversionShader.cpp @@ -2,9 +2,13 @@ // Licensed under GPLv2+ // Refer to the license.txt file included. +#include #include #include +#include +#include +#include "Common/CommonFuncs.h" #include "Common/CommonTypes.h" #include "Common/MathUtil.h" #include "Common/MsgHandler.h" @@ -720,4 +724,546 @@ const char* GenerateEncodingShader(u32 format, APIType ApiType) return text; } +// NOTE: In these uniforms, a row refers to a row of blocks, not texels. 
+static const char decoding_shader_header[] = R"( +#ifdef VULKAN + +layout(std140, push_constant) uniform PushConstants { + uvec2 dst_size; + uvec2 src_size; + uint src_offset; + uint src_row_stride; + uint palette_offset; +} push_constants; +#define u_dst_size (push_constants.dst_size) +#define u_src_size (push_constants.src_size) +#define u_src_offset (push_constants.src_offset) +#define u_src_row_stride (push_constants.src_row_stride) +#define u_palette_offset (push_constants.palette_offset) + +TEXEL_BUFFER_BINDING(0) uniform usamplerBuffer s_input_buffer; +TEXEL_BUFFER_BINDING(1) uniform usamplerBuffer s_palette_buffer; + +IMAGE_BINDING(rgba8, 0) uniform writeonly image2DArray output_image; + +#else + +uniform uvec2 u_dst_size; +uniform uvec2 u_src_size; +uniform uint u_src_offset; +uniform uint u_src_row_stride; +uniform uint u_palette_offset; + +SAMPLER_BINDING(9) uniform usamplerBuffer s_input_buffer; +SAMPLER_BINDING(10) uniform usamplerBuffer s_palette_buffer; + +layout(rgba8, binding = 0) uniform writeonly image2DArray output_image; + +#endif + +uint Swap16(uint v) +{ + // Convert BE to LE. 
+ return ((v >> 8) | (v << 8)) & 0xFFFFu; +} + +uint Convert3To8(uint v) +{ + // Swizzle bits: 00000123 -> 12312312 + return (v << 5) | (v << 2) | (v >> 1); +} +uint Convert4To8(uint v) +{ + // Swizzle bits: 00001234 -> 12341234 + return (v << 4) | v; +} +uint Convert5To8(uint v) +{ + // Swizzle bits: 00012345 -> 12345123 + return (v << 3) | (v >> 2); +} +uint Convert6To8(uint v) +{ + // Swizzle bits: 00123456 -> 12345612 + return (v << 2) | (v >> 4); +} + +uint GetTiledTexelOffset(uvec2 block_size, uvec2 coords) +{ + uvec2 block = coords / block_size; + uvec2 offset = coords % block_size; + uint buffer_pos = u_src_offset; + buffer_pos += block.y * u_src_row_stride; + buffer_pos += block.x * (block_size.x * block_size.y); + buffer_pos += offset.y * block_size.x; + buffer_pos += offset.x; + return buffer_pos; +} + +uvec4 GetPaletteColor(uint index) +{ + // Fetch and swap BE to LE. + uint val = Swap16(texelFetch(s_palette_buffer, int(u_palette_offset + index)).x); + + uvec4 color; +#if defined(PALETTE_FORMAT_IA8) + uint a = bitfieldExtract(val, 8, 8); + uint i = bitfieldExtract(val, 0, 8); + color = uvec4(i, i, i, a); +#elif defined(PALETTE_FORMAT_RGB565) + color.x = Convert5To8(bitfieldExtract(val, 11, 5)); + color.y = Convert6To8(bitfieldExtract(val, 5, 6)); + color.z = Convert5To8(bitfieldExtract(val, 0, 5)); + color.a = 255u; + +#elif defined(PALETTE_FORMAT_RGB5A3) + if ((val & 0x8000u) != 0u) + { + color.x = Convert5To8(bitfieldExtract(val, 10, 5)); + color.y = Convert5To8(bitfieldExtract(val, 5, 5)); + color.z = Convert5To8(bitfieldExtract(val, 0, 5)); + color.a = 255u; + } + else + { + color.a = Convert3To8(bitfieldExtract(val, 12, 3)); + color.r = Convert4To8(bitfieldExtract(val, 8, 4)); + color.g = Convert4To8(bitfieldExtract(val, 4, 4)); + color.b = Convert4To8(bitfieldExtract(val, 0, 4)); + } +#else + // Not used. 
+ color = uvec4(0, 0, 0, 0); +#endif + + return color; +} + +vec4 GetPaletteColorNormalized(uint index) +{ + uvec4 color = GetPaletteColor(index); + return vec4(color) / 255.0; +} + +)"; + +static const std::map s_decoding_shader_info{ + {GX_TF_I4, + {BUFFER_FORMAT_R8_UINT, 0, 8, 8, false, + R"( + layout(local_size_x = 8, local_size_y = 8) in; + + void main() + { + uvec2 coords = gl_GlobalInvocationID.xy; + + // Tiled in 8x8 blocks, 4 bits per pixel + // We need to do the tiling manually here because the texel size is smaller than + // the size of the buffer elements. + uint2 block = coords.xy / 8u; + uint2 offset = coords.xy % 8u; + uint buffer_pos = u_src_offset; + buffer_pos += block.y * u_src_row_stride; + buffer_pos += block.x * 32u; + buffer_pos += offset.y * 4u; + buffer_pos += offset.x / 2u; + + // Select high nibble for odd texels, low for even. + uint val = texelFetch(s_input_buffer, int(buffer_pos)).x; + uint i; + if ((coords.x & 1u) == 0u) + i = Convert4To8((val >> 4)); + else + i = Convert4To8((val & 0x0Fu)); + + uvec4 color = uvec4(i, i, i, i); + vec4 norm_color = vec4(color) / 255.0; + + imageStore(output_image, ivec3(ivec2(coords), 0), norm_color); + } + + )"}}, + {GX_TF_IA4, + {BUFFER_FORMAT_R8_UINT, 0, 8, 8, false, + R"( + layout(local_size_x = 8, local_size_y = 8) in; + + void main() + { + uvec2 coords = gl_GlobalInvocationID.xy; + + // Tiled in 8x4 blocks, 8 bits per pixel + uint buffer_pos = GetTiledTexelOffset(uvec2(8u, 4u), coords); + uint val = texelFetch(s_input_buffer, int(buffer_pos)).x; + uint i = Convert4To8((val & 0x0Fu)); + uint a = Convert4To8((val >> 4)); + uvec4 color = uvec4(i, i, i, a); + vec4 norm_color = vec4(color) / 255.0; + + imageStore(output_image, ivec3(ivec2(coords), 0), norm_color); + } + )"}}, + {GX_TF_I8, + {BUFFER_FORMAT_R8_UINT, 0, 8, 8, false, + R"( + layout(local_size_x = 8, local_size_y = 8) in; + + void main() + { + uvec2 coords = gl_GlobalInvocationID.xy; + + // Tiled in 8x4 blocks, 8 bits per pixel + uint 
buffer_pos = GetTiledTexelOffset(uvec2(8u, 4u), coords); + uint i = texelFetch(s_input_buffer, int(buffer_pos)).x; + uvec4 color = uvec4(i, i, i, i); + vec4 norm_color = vec4(color) / 255.0; + + imageStore(output_image, ivec3(ivec2(coords), 0), norm_color); + } + )"}}, + {GX_TF_IA8, + {BUFFER_FORMAT_R16_UINT, 0, 8, 8, false, + R"( + layout(local_size_x = 8, local_size_y = 8) in; + + void main() + { + uvec2 coords = gl_GlobalInvocationID.xy; + + // Tiled in 4x4 blocks, 16 bits per pixel + uint buffer_pos = GetTiledTexelOffset(uvec2(4u, 4u), coords); + uint val = texelFetch(s_input_buffer, int(buffer_pos)).x; + uint a = (val & 0xFFu); + uint i = (val >> 8); + uvec4 color = uvec4(i, i, i, a); + vec4 norm_color = vec4(color) / 255.0; + imageStore(output_image, ivec3(ivec2(coords), 0), norm_color); + } + )"}}, + {GX_TF_RGB565, + {BUFFER_FORMAT_R16_UINT, 0, 8, 8, false, + R"( + layout(local_size_x = 8, local_size_y = 8) in; + + void main() + { + uvec2 coords = gl_GlobalInvocationID.xy; + + // Tiled in 4x4 blocks + uint buffer_pos = GetTiledTexelOffset(uvec2(4u, 4u), coords); + uint val = Swap16(texelFetch(s_input_buffer, int(buffer_pos)).x); + + uvec4 color; + color.x = Convert5To8(bitfieldExtract(val, 11, 5)); + color.y = Convert6To8(bitfieldExtract(val, 5, 6)); + color.z = Convert5To8(bitfieldExtract(val, 0, 5)); + color.a = 255u; + + vec4 norm_color = vec4(color) / 255.0; + imageStore(output_image, ivec3(ivec2(coords), 0), norm_color); + } + + )"}}, + {GX_TF_RGB5A3, + {BUFFER_FORMAT_R16_UINT, 0, 8, 8, false, + R"( + layout(local_size_x = 8, local_size_y = 8) in; + + void main() + { + uvec2 coords = gl_GlobalInvocationID.xy; + + // Tiled in 4x4 blocks + uint buffer_pos = GetTiledTexelOffset(uvec2(4u, 4u), coords); + uint val = Swap16(texelFetch(s_input_buffer, int(buffer_pos)).x); + + uvec4 color; + if ((val & 0x8000u) != 0u) + { + color.x = Convert5To8(bitfieldExtract(val, 10, 5)); + color.y = Convert5To8(bitfieldExtract(val, 5, 5)); + color.z = 
Convert5To8(bitfieldExtract(val, 0, 5)); + color.a = 255u; + } + else + { + color.a = Convert3To8(bitfieldExtract(val, 12, 3)); + color.r = Convert4To8(bitfieldExtract(val, 8, 4)); + color.g = Convert4To8(bitfieldExtract(val, 4, 4)); + color.b = Convert4To8(bitfieldExtract(val, 0, 4)); + } + + vec4 norm_color = vec4(color) / 255.0; + imageStore(output_image, ivec3(ivec2(coords), 0), norm_color); + } + + )"}}, + {GX_TF_RGBA8, + {BUFFER_FORMAT_R16_UINT, 0, 8, 8, false, + R"( + layout(local_size_x = 8, local_size_y = 8) in; + + void main() + { + uvec2 coords = gl_GlobalInvocationID.xy; + + // Tiled in 4x4 blocks + // We can't use the normal calculation function, as these are packed as the AR channels + // for the entire block, then the GB channels afterwards. + uint2 block = coords.xy / 4u; + uint2 offset = coords.xy % 4u; + uint buffer_pos = u_src_offset; + + // Our buffer has 16-bit elements, so the offsets here are half what they would be in bytes. + buffer_pos += block.y * u_src_row_stride; + buffer_pos += block.x * 32u; + buffer_pos += offset.y * 4u; + buffer_pos += offset.x; + + // The two GB channels follow after the block's AR channels. + uint val1 = texelFetch(s_input_buffer, int(buffer_pos + 0u)).x; + uint val2 = texelFetch(s_input_buffer, int(buffer_pos + 16u)).x; + + uvec4 color; + color.a = (val1 & 0xFFu); + color.r = (val1 >> 8); + color.g = (val2 & 0xFFu); + color.b = (val2 >> 8); + + vec4 norm_color = vec4(color) / 255.0; + imageStore(output_image, ivec3(ivec2(coords), 0), norm_color); + } + )"}}, + {GX_TF_CMPR, + {BUFFER_FORMAT_R32G32_UINT, 0, 64, 1, true, + R"( + // In the compute version of this decoder, we flatten the blocks to a one-dimension array. + // Each group is subdivided into 16, and the first thread in each group fetches the DXT data. + // All threads then calculate the possible colors for the block and write to the output image. 
+ + #define GROUP_SIZE 64u + #define BLOCK_SIZE_X 4u + #define BLOCK_SIZE_Y 4u + #define BLOCK_SIZE (BLOCK_SIZE_X * BLOCK_SIZE_Y) + #define BLOCKS_PER_GROUP (GROUP_SIZE / BLOCK_SIZE) + + layout(local_size_x = GROUP_SIZE, local_size_y = 1) in; + + shared uvec2 shared_temp[BLOCKS_PER_GROUP]; + + uint DXTBlend(uint v1, uint v2) + { + // 3/8 blend, which is close to 1/3 + return ((v1 * 3u + v2 * 5u) >> 3); + } + + void main() + { + uint local_thread_id = gl_LocalInvocationID.x; + uint block_in_group = local_thread_id / BLOCK_SIZE; + uint thread_in_block = local_thread_id % BLOCK_SIZE; + uint block_index = gl_WorkGroupID.x * BLOCKS_PER_GROUP + block_in_group; + + // Annoyingly, we can't precalculate this as a uniform because the DXT block size differs + // from the block size of the overall texture (4 vs 8). We can however use a multiply and + // subtraction to avoid the modulo for calculating the block's X coordinate. + uint blocks_wide = u_src_size.x / BLOCK_SIZE_X; + uvec2 block_coords; + block_coords.y = block_index / blocks_wide; + block_coords.x = block_index - (block_coords.y * blocks_wide); + + // Only the first thread for each block reads from the texel buffer. + if (thread_in_block == 0u) + { + // Calculate tiled block coordinates. + uvec2 tile_block_coords = block_coords / 2u; + uvec2 subtile_block_coords = block_coords % 2u; + uint buffer_pos = u_src_offset; + buffer_pos += tile_block_coords.y * u_src_row_stride; + buffer_pos += tile_block_coords.x * 4u; + buffer_pos += subtile_block_coords.y * 2u; + buffer_pos += subtile_block_coords.x; + + // Read the entire DXT block to shared memory. + uvec2 raw_data = texelFetch(s_input_buffer, int(buffer_pos)).xy; + shared_temp[block_in_group] = raw_data; + } + + // Ensure store is completed before the remaining threads in the block continue. + memoryBarrierShared(); + barrier(); + + // Unpack colors and swap BE to LE. 
+ uvec2 raw_data = shared_temp[block_in_group]; + uint swapped = ((raw_data.x & 0xFF00FF00u) >> 8) | ((raw_data.x & 0x00FF00FFu) << 8); + uint c1 = swapped & 0xFFFFu; + uint c2 = swapped >> 16; + + // Expand 5/6 bit channels to 8-bits per channel. + uint blue1 = Convert5To8(bitfieldExtract(c1, 0, 5)); + uint blue2 = Convert5To8(bitfieldExtract(c2, 0, 5)); + uint green1 = Convert6To8(bitfieldExtract(c1, 5, 6)); + uint green2 = Convert6To8(bitfieldExtract(c2, 5, 6)); + uint red1 = Convert5To8(bitfieldExtract(c1, 11, 5)); + uint red2 = Convert5To8(bitfieldExtract(c2, 11, 5)); + + // Determine the four colors the block can use. + // It's quicker to just precalculate all four colors rather than branching on the index. + // NOTE: These must be masked with 0xFF. This is done at the normalization stage below. + uvec4 color0, color1, color2, color3; + color0 = uvec4(red1, green1, blue1, 255u); + color1 = uvec4(red2, green2, blue2, 255u); + if (c1 > c2) + { + color2 = uvec4(DXTBlend(red2, red1), DXTBlend(green2, green1), DXTBlend(blue2, blue1), 255u); + color3 = uvec4(DXTBlend(red1, red2), DXTBlend(green1, green2), DXTBlend(blue1, blue2), 255u); + } + else + { + color2 = uvec4((red1 + red2) / 2u, (green1 + green2) / 2u, (blue1 + blue2) / 2u, 255u); + color3 = uvec4((red1 + red2) / 2u, (green1 + green2) / 2u, (blue1 + blue2) / 2u, 0u); + } + + // Calculate the texel coordinates that we will write to. + // The divides/modulo here should be turned into a shift/binary AND. + uint local_y = thread_in_block / BLOCK_SIZE_X; + uint local_x = thread_in_block % BLOCK_SIZE_X; + uint global_x = block_coords.x * BLOCK_SIZE_X + local_x; + uint global_y = block_coords.y * BLOCK_SIZE_Y + local_y; + + // Use the coordinates within the block to shift the 32-bit value containing + // all 16 indices to a single 2-bit index. + uint index = bitfieldExtract(raw_data.y, int((local_y * 8u) + (6u - local_x * 2u)), 2); + + // Select the un-normalized color from the precalculated color array. 
+ // Using a switch statement here removes the need for dynamic indexing of an array. + uvec4 color; + switch (index) + { + case 0u: color = color0; break; + case 1u: color = color1; break; + case 2u: color = color2; break; + case 3u: color = color3; break; + default: color = color0; break; + } + + // Normalize and write to the output image. + vec4 norm_color = vec4(color & 0xFFu) / 255.0; + imageStore(output_image, ivec3(ivec2(uvec2(global_x, global_y)), 0), norm_color); + } + )"}}, + {GX_TF_C4, + {BUFFER_FORMAT_R8_UINT, static_cast(TexDecoder_GetPaletteSize(GX_TF_C4)), 8, 8, false, + R"( + layout(local_size_x = 8, local_size_y = 8) in; + + void main() + { + uvec2 coords = gl_GlobalInvocationID.xy; + + // Tiled in 8x8 blocks, 4 bits per pixel + // We need to do the tiling manually here because the texel size is smaller than + // the size of the buffer elements. + uvec2 block = coords.xy / 8u; + uvec2 offset = coords.xy % 8u; + uint buffer_pos = u_src_offset; + buffer_pos += block.y * u_src_row_stride; + buffer_pos += block.x * 32u; + buffer_pos += offset.y * 4u; + buffer_pos += offset.x / 2u; + + // Select high nibble for even texels, low for odd. + uint val = texelFetch(s_input_buffer, int(buffer_pos)).x; + uint index = ((coords.x & 1u) == 0u) ? 
(val >> 4) : (val & 0x0Fu); + vec4 norm_color = GetPaletteColorNormalized(index); + imageStore(output_image, ivec3(ivec2(coords), 0), norm_color); + } + + )"}}, + {GX_TF_C8, + {BUFFER_FORMAT_R8_UINT, static_cast(TexDecoder_GetPaletteSize(GX_TF_C8)), 8, 8, false, + R"( + layout(local_size_x = 8, local_size_y = 8) in; + + void main() + { + uvec2 coords = gl_GlobalInvocationID.xy; + + // Tiled in 8x4 blocks, 8 bits per pixel + uint buffer_pos = GetTiledTexelOffset(uvec2(8u, 4u), coords); + uint index = texelFetch(s_input_buffer, int(buffer_pos)).x; + vec4 norm_color = GetPaletteColorNormalized(index); + imageStore(output_image, ivec3(ivec2(coords), 0), norm_color); + } + )"}}, + {GX_TF_C14X2, + {BUFFER_FORMAT_R16_UINT, static_cast(TexDecoder_GetPaletteSize(GX_TF_C14X2)), 8, 8, false, + R"( + layout(local_size_x = 8, local_size_y = 8) in; + + void main() + { + uvec2 coords = gl_GlobalInvocationID.xy; + + // Tiled in 4x4 blocks, 16 bits per pixel (byte-swapped on read; the low 14 bits are the palette index) + uint buffer_pos = GetTiledTexelOffset(uvec2(4u, 4u), coords); + uint index = Swap16(texelFetch(s_input_buffer, int(buffer_pos)).x) & 0x3FFFu; + vec4 norm_color = GetPaletteColorNormalized(index); + imageStore(output_image, ivec3(ivec2(coords), 0), norm_color); + } + )"}}}; + +static const std::array s_buffer_bytes_per_texel = {{ + 1, // BUFFER_FORMAT_R8_UINT + 2, // BUFFER_FORMAT_R16_UINT + 8, // BUFFER_FORMAT_R32G32_UINT +}}; + +const DecodingShaderInfo* GetDecodingShaderInfo(u32 format) +{ + auto iter = s_decoding_shader_info.find(static_cast(format)); + return iter != s_decoding_shader_info.end() ? &iter->second : nullptr; +} + +u32 GetBytesPerBufferElement(BufferFormat buffer_format) +{ + return s_buffer_bytes_per_texel[buffer_format]; +} + +std::pair GetDispatchCount(const DecodingShaderInfo* info, u32 width, u32 height) +{ + // Flatten to a single dimension? 
+ if (info->group_flatten) + return {(width * height + (info->group_size_x - 1)) / info->group_size_x, 1}; + + return {(width + (info->group_size_x - 1)) / info->group_size_x, + (height + (info->group_size_y - 1)) / info->group_size_y}; +} + +std::string GenerateDecodingShader(u32 format, u32 palette_format, APIType api_type) +{ + const DecodingShaderInfo* info = GetDecodingShaderInfo(format); + if (!info) + return ""; + + std::stringstream ss; + switch (palette_format) + { + case GX_TL_IA8: + ss << "#define PALETTE_FORMAT_IA8 1\n"; + break; + case GX_TL_RGB565: + ss << "#define PALETTE_FORMAT_RGB565 1\n"; + break; + case GX_TL_RGB5A3: + ss << "#define PALETTE_FORMAT_RGB5A3 1\n"; + break; + } + + ss << decoding_shader_header; + ss << info->shader_body; + + return ss.str(); +} + } // namespace diff --git a/Source/Core/VideoCommon/TextureConversionShader.h b/Source/Core/VideoCommon/TextureConversionShader.h index 714cf773c3..cc65a2d201 100644 --- a/Source/Core/VideoCommon/TextureConversionShader.h +++ b/Source/Core/VideoCommon/TextureConversionShader.h @@ -4,6 +4,9 @@ #pragma once +#include +#include + #include "Common/CommonTypes.h" enum class APIType; @@ -13,4 +16,40 @@ namespace TextureConversionShader u16 GetEncodedSampleCount(u32 format); const char* GenerateEncodingShader(u32 format, APIType ApiType); -} + +// View format of the input data to the texture decoding shader. +enum BufferFormat +{ + BUFFER_FORMAT_R8_UINT, + BUFFER_FORMAT_R16_UINT, + BUFFER_FORMAT_R32G32_UINT, + BUFFER_FORMAT_COUNT +}; + +// Information required to compile and dispatch a texture decoding shader. +struct DecodingShaderInfo +{ + BufferFormat buffer_format; + u32 palette_size; + u32 group_size_x; + u32 group_size_y; + bool group_flatten; + const char* shader_body; +}; + +// Obtain shader information for the specified texture format. +// If this format does not have a shader written for it, returns nullptr. 
+const DecodingShaderInfo* GetDecodingShaderInfo(u32 format); + +// Determine how many bytes there are in each element of the texel buffer. +// Needed for alignment and stride calculations. +u32 GetBytesPerBufferElement(BufferFormat buffer_format); + +// Determine how many thread groups should be dispatched for an image of the specified width/height. +// First is the number of X groups, second is the number of Y groups, Z is always one. +std::pair GetDispatchCount(const DecodingShaderInfo* info, u32 width, u32 height); + +// Returns the GLSL string containing the texture decoding shader for the specified format. +std::string GenerateDecodingShader(u32 format, u32 palette_format, APIType api_type); + +} // namespace TextureConversionShader diff --git a/Source/Core/VideoCommon/VideoConfig.cpp b/Source/Core/VideoCommon/VideoConfig.cpp index 7579c90c6e..08b814fd28 100644 --- a/Source/Core/VideoCommon/VideoConfig.cpp +++ b/Source/Core/VideoCommon/VideoConfig.cpp @@ -81,6 +81,7 @@ void VideoConfig::Load(const std::string& ini_file) settings->Get("DumpPath", &sDumpPath, ""); settings->Get("BitrateKbps", &iBitrateKbps, 2500); settings->Get("InternalResolutionFrameDumps", &bInternalResolutionFrameDumps, false); + settings->Get("EnableGPUTextureDecoding", &bEnableGPUTextureDecoding, false); settings->Get("EnablePixelLighting", &bEnablePixelLighting, false); settings->Get("FastDepthCalc", &bFastDepthCalc, true); settings->Get("MSAA", &iMultisamples, 1); @@ -305,6 +306,7 @@ void VideoConfig::Save(const std::string& ini_file) settings->Set("DumpPath", sDumpPath); settings->Set("BitrateKbps", iBitrateKbps); settings->Set("InternalResolutionFrameDumps", bInternalResolutionFrameDumps); + settings->Set("EnableGPUTextureDecoding", bEnableGPUTextureDecoding); settings->Set("EnablePixelLighting", bEnablePixelLighting); settings->Set("FastDepthCalc", bFastDepthCalc); settings->Set("MSAA", iMultisamples); diff --git a/Source/Core/VideoCommon/VideoConfig.h 
b/Source/Core/VideoCommon/VideoConfig.h index a3a44f2ec7..c70cf104c8 100644 --- a/Source/Core/VideoCommon/VideoConfig.h +++ b/Source/Core/VideoCommon/VideoConfig.h @@ -108,6 +108,7 @@ struct VideoConfig final bool bInternalResolutionFrameDumps; bool bFreeLook; bool bBorderlessFullscreen; + bool bEnableGPUTextureDecoding; int iBitrateKbps; // Hacks @@ -181,6 +182,7 @@ struct VideoConfig final bool bSupportsPrimitiveRestart; bool bSupportsOversizedViewports; bool bSupportsGeometryShaders; + bool bSupportsComputeShaders; bool bSupports3DVision; bool bSupportsEarlyZ; // needed by PixelShaderGen, so must stay in VideoCommon bool bSupportsBindingLayout; // Needed by ShaderGen, so must stay in VideoCommon @@ -195,6 +197,7 @@ struct VideoConfig final bool bSupportsReversedDepthRange; bool bSupportsMultithreading; bool bSupportsInternalResolutionFrameDumps; + bool bSupportsGPUTextureDecoding; } backend_info; // Utility @@ -210,6 +213,10 @@ struct VideoConfig final return false; return backend_info.bSupportsBBox && backend_info.bSupportsFragmentStoresAndAtomics; } + bool UseGPUTextureDecoding() const + { + return backend_info.bSupportsGPUTextureDecoding && bEnableGPUTextureDecoding; + } }; extern VideoConfig g_Config;