dolphin/Source/Core/VideoBackends/OGL/TextureCache.cpp
Markus Wick 491c10ec96 VideoBackends: Use VideoCommon shader generators for efb2tex copies.
This will generate one shader per copy format. For now, it is the same
shader with the colmat hard coded. So it should already improve the GPU
performance a bit, but a rewrite of the shader generator is suggested.

Half of the patch is done by linkmauve1:
VideoCommon: Reorganise the shader writes.
2017-12-02 15:17:39 +01:00

550 lines
19 KiB
C++

// Copyright 2008 Dolphin Emulator Project
// Licensed under GPLv2+
// Refer to the license.txt file included.
#include <algorithm>
#include <cmath>
#include <cstring>
#include <fstream>
#include <memory>
#include <vector>
#include "Common/Assert.h"
#include "Common/GL/GLInterfaceBase.h"
#include "Common/MsgHandler.h"
#include "Common/StringUtil.h"
#include "VideoBackends/OGL/FramebufferManager.h"
#include "VideoBackends/OGL/GPUTimer.h"
#include "VideoBackends/OGL/OGLTexture.h"
#include "VideoBackends/OGL/ProgramShaderCache.h"
#include "VideoBackends/OGL/Render.h"
#include "VideoBackends/OGL/SamplerCache.h"
#include "VideoBackends/OGL/StreamBuffer.h"
#include "VideoBackends/OGL/TextureCache.h"
#include "VideoBackends/OGL/TextureConverter.h"
#include "VideoCommon/ImageWrite.h"
#include "VideoCommon/TextureConversionShader.h"
#include "VideoCommon/TextureConverterShaderGen.h"
#include "VideoCommon/TextureDecoder.h"
#include "VideoCommon/VideoCommon.h"
#include "VideoCommon/VideoConfig.h"
namespace OGL
{
constexpr const char* vertex_program =
"out vec3 %c_uv0;\n"
"SAMPLER_BINDING(9) uniform sampler2DArray samp9;\n"
"uniform vec4 copy_position;\n" // left, top, right, bottom
"void main()\n"
"{\n"
" vec2 rawpos = vec2(gl_VertexID&1, gl_VertexID&2);\n"
" %c_uv0 = vec3(mix(copy_position.xy, copy_position.zw, rawpos) / vec2(textureSize(samp9, "
"0).xy), 0.0);\n"
" gl_Position = vec4(rawpos*2.0-1.0, 0.0, 1.0);\n"
"}\n";
constexpr const char* geometry_program = "layout(triangles) in;\n"
"layout(triangle_strip, max_vertices = 6) out;\n"
"in vec3 v_uv0[3];\n"
"out vec3 f_uv0;\n"
"SAMPLER_BINDING(9) uniform sampler2DArray samp9;\n"
"void main()\n"
"{\n"
" int layers = textureSize(samp9, 0).z;\n"
" for (int layer = 0; layer < layers; ++layer) {\n"
" for (int i = 0; i < 3; ++i) {\n"
" f_uv0 = vec3(v_uv0[i].xy, layer);\n"
" gl_Position = gl_in[i].gl_Position;\n"
" gl_Layer = layer;\n"
" EmitVertex();\n"
" }\n"
" EndPrimitive();\n"
" }\n"
"}\n";
//#define TIME_TEXTURE_DECODING 1
void TextureCache::CopyEFB(u8* dst, const EFBCopyParams& params, u32 native_width,
u32 bytes_per_row, u32 num_blocks_y, u32 memory_stride,
const EFBRectangle& src_rect, bool scale_by_half)
{
TextureConverter::EncodeToRamFromTexture(dst, params, native_width, bytes_per_row, num_blocks_y,
memory_stride, src_rect, scale_by_half);
}
TextureCache::TextureCache()
{
CompileShaders();
if (g_ActiveConfig.backend_info.bSupportsPaletteConversion)
{
s32 buffer_size_mb = (g_ActiveConfig.backend_info.bSupportsGPUTextureDecoding ? 32 : 1);
s32 buffer_size = buffer_size_mb * 1024 * 1024;
s32 max_buffer_size = 0;
// The minimum MAX_TEXTURE_BUFFER_SIZE that the spec mandates is 65KB, we are asking for a 1MB
// buffer here. This buffer is also used as storage for undecoded textures when compute shader
// texture decoding is enabled, in which case the requested size is 32MB.
glGetIntegerv(GL_MAX_TEXTURE_BUFFER_SIZE, &max_buffer_size);
// Clamp the buffer size to the maximum size that the driver supports.
buffer_size = std::min(buffer_size, max_buffer_size);
m_palette_stream_buffer = StreamBuffer::Create(GL_TEXTURE_BUFFER, buffer_size);
glGenTextures(1, &m_palette_resolv_texture);
glBindTexture(GL_TEXTURE_BUFFER, m_palette_resolv_texture);
glTexBuffer(GL_TEXTURE_BUFFER, GL_R16UI, m_palette_stream_buffer->m_buffer);
if (g_ActiveConfig.backend_info.bSupportsGPUTextureDecoding)
CreateTextureDecodingResources();
}
}
TextureCache::~TextureCache()
{
DeleteShaders();
if (g_ActiveConfig.backend_info.bSupportsGPUTextureDecoding)
DestroyTextureDecodingResources();
if (g_ActiveConfig.backend_info.bSupportsPaletteConversion)
{
glDeleteTextures(1, &m_palette_resolv_texture);
}
}
TextureCache* TextureCache::GetInstance()
{
return static_cast<TextureCache*>(g_texture_cache.get());
}
const SHADER& TextureCache::GetColorCopyProgram() const
{
return m_colorCopyProgram;
}
GLuint TextureCache::GetColorCopyPositionUniform() const
{
return m_colorCopyPositionUniform;
}
bool TextureCache::CompilePaletteShader(TLUTFormat tlutfmt, const std::string& vcode,
const std::string& pcode, const std::string& gcode)
{
_assert_(IsValidTLUTFormat(tlutfmt));
PaletteShader& shader = m_palette_shaders[static_cast<int>(tlutfmt)];
if (!ProgramShaderCache::CompileShader(shader.shader, vcode, pcode, gcode))
return false;
shader.buffer_offset_uniform =
glGetUniformLocation(shader.shader.glprogid, "texture_buffer_offset");
shader.multiplier_uniform = glGetUniformLocation(shader.shader.glprogid, "multiplier");
shader.copy_position_uniform = glGetUniformLocation(shader.shader.glprogid, "copy_position");
return true;
}
bool TextureCache::CompileShaders()
{
constexpr const char* color_copy_program = "SAMPLER_BINDING(9) uniform sampler2DArray samp9;\n"
"in vec3 f_uv0;\n"
"out vec4 ocol0;\n"
"\n"
"void main(){\n"
" vec4 texcol = texture(samp9, f_uv0);\n"
" ocol0 = texcol;\n"
"}\n";
std::string geo_program = "";
char prefix = 'f';
if (g_ActiveConfig.stereo_mode != StereoMode::Off)
{
geo_program = geometry_program;
prefix = 'v';
}
if (!ProgramShaderCache::CompileShader(m_colorCopyProgram,
StringFromFormat(vertex_program, prefix, prefix),
color_copy_program, geo_program))
{
return false;
}
m_colorCopyPositionUniform = glGetUniformLocation(m_colorCopyProgram.glprogid, "copy_position");
std::string palette_shader =
R"GLSL(
uniform int texture_buffer_offset;
uniform float multiplier;
SAMPLER_BINDING(9) uniform sampler2DArray samp9;
SAMPLER_BINDING(10) uniform usamplerBuffer samp10;
in vec3 f_uv0;
out vec4 ocol0;
int Convert3To8(int v)
{
// Swizzle bits: 00000123 -> 12312312
return (v << 5) | (v << 2) | (v >> 1);
}
int Convert4To8(int v)
{
// Swizzle bits: 00001234 -> 12341234
return (v << 4) | v;
}
int Convert5To8(int v)
{
// Swizzle bits: 00012345 -> 12345123
return (v << 3) | (v >> 2);
}
int Convert6To8(int v)
{
// Swizzle bits: 00123456 -> 12345612
return (v << 2) | (v >> 4);
}
float4 DecodePixel_RGB5A3(int val)
{
int r,g,b,a;
if ((val&0x8000) > 0)
{
r=Convert5To8((val>>10) & 0x1f);
g=Convert5To8((val>>5 ) & 0x1f);
b=Convert5To8((val ) & 0x1f);
a=0xFF;
}
else
{
a=Convert3To8((val>>12) & 0x7);
r=Convert4To8((val>>8 ) & 0xf);
g=Convert4To8((val>>4 ) & 0xf);
b=Convert4To8((val ) & 0xf);
}
return float4(r, g, b, a) / 255.0;
}
float4 DecodePixel_RGB565(int val)
{
int r, g, b, a;
r = Convert5To8((val >> 11) & 0x1f);
g = Convert6To8((val >> 5) & 0x3f);
b = Convert5To8((val) & 0x1f);
a = 0xFF;
return float4(r, g, b, a) / 255.0;
}
float4 DecodePixel_IA8(int val)
{
int i = val & 0xFF;
int a = val >> 8;
return float4(i, i, i, a) / 255.0;
}
void main()
{
int src = int(round(texture(samp9, f_uv0).r * multiplier));
src = int(texelFetch(samp10, src + texture_buffer_offset).r);
src = ((src << 8) & 0xFF00) | (src >> 8);
ocol0 = DECODE(src);
}
)GLSL";
if (g_ActiveConfig.backend_info.bSupportsPaletteConversion)
{
if (!CompilePaletteShader(TLUTFormat::IA8, StringFromFormat(vertex_program, prefix, prefix),
"#define DECODE DecodePixel_IA8" + palette_shader, geo_program))
return false;
if (!CompilePaletteShader(TLUTFormat::RGB565, StringFromFormat(vertex_program, prefix, prefix),
"#define DECODE DecodePixel_RGB565" + palette_shader, geo_program))
return false;
if (!CompilePaletteShader(TLUTFormat::RGB5A3, StringFromFormat(vertex_program, prefix, prefix),
"#define DECODE DecodePixel_RGB5A3" + palette_shader, geo_program))
return false;
}
return true;
}
void TextureCache::DeleteShaders()
{
for (auto& it : m_efb_copy_programs)
it.second.shader.Destroy();
m_efb_copy_programs.clear();
if (g_ActiveConfig.backend_info.bSupportsPaletteConversion)
for (auto& shader : m_palette_shaders)
shader.shader.Destroy();
}
void TextureCache::ConvertTexture(TCacheEntry* destination, TCacheEntry* source,
const void* palette, TLUTFormat tlutfmt)
{
if (!g_ActiveConfig.backend_info.bSupportsPaletteConversion)
return;
_assert_(IsValidTLUTFormat(tlutfmt));
const PaletteShader& palette_shader = m_palette_shaders[static_cast<int>(tlutfmt)];
g_renderer->ResetAPIState();
OGLTexture* source_texture = static_cast<OGLTexture*>(source->texture.get());
OGLTexture* destination_texture = static_cast<OGLTexture*>(destination->texture.get());
glActiveTexture(GL_TEXTURE9);
glBindTexture(GL_TEXTURE_2D_ARRAY, source_texture->GetRawTexIdentifier());
g_sampler_cache->BindNearestSampler(9);
FramebufferManager::SetFramebuffer(destination_texture->GetFramebuffer());
glViewport(0, 0, destination->GetWidth(), destination->GetHeight());
palette_shader.shader.Bind();
// C14 textures are currently unsupported
int size = source->format == TextureFormat::I4 ? 32 : 512;
auto buffer = m_palette_stream_buffer->Map(size);
memcpy(buffer.first, palette, size);
m_palette_stream_buffer->Unmap(size);
glUniform1i(palette_shader.buffer_offset_uniform, buffer.second / 2);
glUniform1f(palette_shader.multiplier_uniform,
source->format == TextureFormat::I4 ? 15.0f : 255.0f);
glUniform4f(palette_shader.copy_position_uniform, 0.0f, 0.0f,
static_cast<float>(source->GetWidth()), static_cast<float>(source->GetHeight()));
glActiveTexture(GL_TEXTURE10);
glBindTexture(GL_TEXTURE_BUFFER, m_palette_resolv_texture);
g_sampler_cache->BindNearestSampler(10);
OpenGL_BindAttributelessVAO();
glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
FramebufferManager::SetFramebuffer(0);
g_renderer->RestoreAPIState();
}
static const std::string decoding_vertex_shader = R"(
void main()
{
vec2 rawpos = vec2(gl_VertexID&1, gl_VertexID&2);
gl_Position = vec4(rawpos*2.0-1.0, 0.0, 1.0);
}
)";
void TextureCache::CreateTextureDecodingResources()
{
static const GLenum gl_view_types[TextureConversionShader::BUFFER_FORMAT_COUNT] = {
GL_R8UI, // BUFFER_FORMAT_R8_UINT
GL_R16UI, // BUFFER_FORMAT_R16_UINT
GL_RG32UI, // BUFFER_FORMAT_R32G32_UINT
GL_RGBA8UI, // BUFFER_FORMAT_RGBA8_UINT
};
glGenTextures(TextureConversionShader::BUFFER_FORMAT_COUNT,
m_texture_decoding_buffer_views.data());
for (size_t i = 0; i < TextureConversionShader::BUFFER_FORMAT_COUNT; i++)
{
glBindTexture(GL_TEXTURE_BUFFER, m_texture_decoding_buffer_views[i]);
glTexBuffer(GL_TEXTURE_BUFFER, gl_view_types[i], m_palette_stream_buffer->m_buffer);
}
}
void TextureCache::DestroyTextureDecodingResources()
{
glDeleteTextures(TextureConversionShader::BUFFER_FORMAT_COUNT,
m_texture_decoding_buffer_views.data());
m_texture_decoding_buffer_views.fill(0);
m_texture_decoding_program_info.clear();
}
bool TextureCache::SupportsGPUTextureDecode(TextureFormat format, TLUTFormat palette_format)
{
auto key = std::make_pair(static_cast<u32>(format), static_cast<u32>(palette_format));
auto iter = m_texture_decoding_program_info.find(key);
if (iter != m_texture_decoding_program_info.end())
return iter->second.valid;
TextureDecodingProgramInfo info;
info.base_info = TextureConversionShader::GetDecodingShaderInfo(format);
if (!info.base_info)
{
m_texture_decoding_program_info.emplace(key, info);
return false;
}
std::string shader_source =
TextureConversionShader::GenerateDecodingShader(format, palette_format, APIType::OpenGL);
if (shader_source.empty())
{
m_texture_decoding_program_info.emplace(key, info);
return false;
}
if (!ProgramShaderCache::CompileComputeShader(info.program, shader_source))
{
m_texture_decoding_program_info.emplace(key, info);
return false;
}
info.uniform_dst_size = glGetUniformLocation(info.program.glprogid, "u_dst_size");
info.uniform_src_size = glGetUniformLocation(info.program.glprogid, "u_src_size");
info.uniform_src_offset = glGetUniformLocation(info.program.glprogid, "u_src_offset");
info.uniform_src_row_stride = glGetUniformLocation(info.program.glprogid, "u_src_row_stride");
info.uniform_palette_offset = glGetUniformLocation(info.program.glprogid, "u_palette_offset");
info.valid = true;
m_texture_decoding_program_info.emplace(key, info);
return true;
}
void TextureCache::DecodeTextureOnGPU(TCacheEntry* entry, u32 dst_level, const u8* data,
size_t data_size, TextureFormat format, u32 width, u32 height,
u32 aligned_width, u32 aligned_height, u32 row_stride,
const u8* palette, TLUTFormat palette_format)
{
auto key = std::make_pair(static_cast<u32>(format), static_cast<u32>(palette_format));
auto iter = m_texture_decoding_program_info.find(key);
if (iter == m_texture_decoding_program_info.end())
return;
#ifdef TIME_TEXTURE_DECODING
GPUTimer timer;
#endif
// Copy to GPU-visible buffer, aligned to the data type.
auto info = iter->second;
u32 bytes_per_buffer_elem =
TextureConversionShader::GetBytesPerBufferElement(info.base_info->buffer_format);
// Only copy palette if it is required.
bool has_palette = info.base_info->palette_size > 0;
u32 total_upload_size = static_cast<u32>(data_size);
u32 palette_offset = total_upload_size;
if (has_palette)
{
// Align to u16.
if ((total_upload_size % sizeof(u16)) != 0)
{
total_upload_size++;
palette_offset++;
}
total_upload_size += info.base_info->palette_size;
}
// Allocate space in stream buffer, and copy texture + palette across.
auto buffer = m_palette_stream_buffer->Map(total_upload_size, bytes_per_buffer_elem);
memcpy(buffer.first, data, data_size);
if (has_palette)
memcpy(buffer.first + palette_offset, palette, info.base_info->palette_size);
m_palette_stream_buffer->Unmap(total_upload_size);
info.program.Bind();
// Calculate stride in buffer elements
u32 row_stride_in_elements = row_stride / bytes_per_buffer_elem;
u32 offset_in_elements = buffer.second / bytes_per_buffer_elem;
u32 palette_offset_in_elements = (buffer.second + palette_offset) / sizeof(u16);
if (info.uniform_dst_size >= 0)
glUniform2ui(info.uniform_dst_size, width, height);
if (info.uniform_src_size >= 0)
glUniform2ui(info.uniform_src_size, aligned_width, aligned_height);
if (info.uniform_src_offset >= 0)
glUniform1ui(info.uniform_src_offset, offset_in_elements);
if (info.uniform_src_row_stride >= 0)
glUniform1ui(info.uniform_src_row_stride, row_stride_in_elements);
if (info.uniform_palette_offset >= 0)
glUniform1ui(info.uniform_palette_offset, palette_offset_in_elements);
glActiveTexture(GL_TEXTURE9);
glBindTexture(GL_TEXTURE_BUFFER, m_texture_decoding_buffer_views[info.base_info->buffer_format]);
if (has_palette)
{
// Use an R16UI view for the palette.
glActiveTexture(GL_TEXTURE10);
glBindTexture(GL_TEXTURE_BUFFER, m_palette_resolv_texture);
}
auto dispatch_groups =
TextureConversionShader::GetDispatchCount(info.base_info, aligned_width, aligned_height);
glBindImageTexture(0, static_cast<OGLTexture*>(entry->texture.get())->GetRawTexIdentifier(),
dst_level, GL_TRUE, 0, GL_WRITE_ONLY, GL_RGBA8);
glDispatchCompute(dispatch_groups.first, dispatch_groups.second, 1);
glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT);
OGLTexture::SetStage();
#ifdef TIME_TEXTURE_DECODING
WARN_LOG(VIDEO, "Decode texture format %u size %ux%u took %.4fms", static_cast<u32>(format),
width, height, timer.GetTimeMilliseconds());
#endif
}
void TextureCache::CopyEFBToCacheEntry(TCacheEntry* entry, bool is_depth_copy,
const EFBRectangle& src_rect, bool scale_by_half,
unsigned int cbuf_id, const float* colmat,
EFBCopyFormat dst_format, bool is_intensity)
{
auto* destination_texture = static_cast<OGLTexture*>(entry->texture.get());
g_renderer->ResetAPIState(); // reset any game specific settings
// Make sure to resolve anything we need to read from.
const GLuint read_texture = is_depth_copy ?
FramebufferManager::ResolveAndGetDepthTarget(src_rect) :
FramebufferManager::ResolveAndGetRenderTarget(src_rect);
FramebufferManager::SetFramebuffer(destination_texture->GetFramebuffer());
OpenGL_BindAttributelessVAO();
glActiveTexture(GL_TEXTURE9);
glBindTexture(GL_TEXTURE_2D_ARRAY, read_texture);
if (scale_by_half)
g_sampler_cache->BindLinearSampler(9);
else
g_sampler_cache->BindNearestSampler(9);
glViewport(0, 0, destination_texture->GetConfig().width, destination_texture->GetConfig().height);
auto uid = GetTextureConverterShaderUid(dst_format, is_depth_copy, is_intensity, scale_by_half);
auto it = m_efb_copy_programs.emplace(uid, EFBCopyShader());
EFBCopyShader& shader = it.first->second;
bool created = it.second;
if (created)
{
ShaderCode code = GenerateTextureConverterShaderCode(APIType::OpenGL, uid.GetUidData());
std::string geo_program = "";
char prefix = 'f';
if (g_ActiveConfig.stereo_mode != StereoMode::Off)
{
geo_program = geometry_program;
prefix = 'v';
}
ProgramShaderCache::CompileShader(shader.shader,
StringFromFormat(vertex_program, prefix, prefix),
code.GetBuffer(), geo_program);
shader.position_uniform = glGetUniformLocation(shader.shader.glprogid, "copy_position");
}
shader.shader.Bind();
TargetRectangle R = g_renderer->ConvertEFBRectangle(src_rect);
glUniform4f(shader.position_uniform, static_cast<float>(R.left), static_cast<float>(R.top),
static_cast<float>(R.right), static_cast<float>(R.bottom));
glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
FramebufferManager::SetFramebuffer(0);
g_renderer->RestoreAPIState();
}
}