// Copyright 2013 Dolphin Emulator Project // Licensed under GPLv2 // Refer to the license.txt file included. // Fast image conversion using OpenGL shaders. #include "Common/FileUtil.h" #include "Core/HW/Memmap.h" #include "VideoBackends/OGL/FramebufferManager.h" #include "VideoBackends/OGL/Globals.h" #include "VideoBackends/OGL/ProgramShaderCache.h" #include "VideoBackends/OGL/Render.h" #include "VideoBackends/OGL/TextureCache.h" #include "VideoBackends/OGL/TextureConverter.h" #include "VideoCommon/DriverDetails.h" #include "VideoCommon/ImageWrite.h" #include "VideoCommon/TextureConversionShader.h" #include "VideoCommon/VideoConfig.h" namespace OGL { namespace TextureConverter { using OGL::TextureCache; static GLuint s_texConvFrameBuffer[2] = {0,0}; static GLuint s_srcTexture = 0; // for decoding from RAM static GLuint s_dstTexture = 0; // for encoding to RAM const int renderBufferWidth = 1024; const int renderBufferHeight = 1024; static SHADER s_rgbToYuyvProgram; static int s_rgbToYuyvUniform_loc; static SHADER s_yuyvToRgbProgram; // Not all slots are taken - but who cares. const u32 NUM_ENCODING_PROGRAMS = 64; static SHADER s_encodingPrograms[NUM_ENCODING_PROGRAMS]; static int s_encodingUniforms[NUM_ENCODING_PROGRAMS]; static GLuint s_PBO = 0; // for readback with different strides void CreatePrograms() { /* TODO: Accuracy Improvements * * This shader doesn't really match what the gamecube does interally in the * copy pipeline. * 1. It uses Opengl's built in filtering when yscaling, someone could work * out how the copypipeline does it's filtering and implement it correctly * in this shader. * 2. Deflickering isn't implemented, a futher filtering over 3 lines. * Isn't really needed on non-interlaced monitors (and would lower quality; * But hey, accuracy!) * 3. Flipper's YUYV conversion implements a 3 pixel horozontal blur on the * UV channels, centering the U channel on the Left pixel and the V channel * on the Right pixel. * The current implementation Centers both UV channels at the same place * inbetween the two Pixels, and only blurs over these two pixels. */ // Output is BGRA because that is slightly faster than RGBA. const char *VProgramRgbToYuyv = "VARYOUT vec2 uv0;\n" "uniform vec4 copy_position;\n" // left, top, right, bottom "uniform sampler2D samp9;\n" "void main()\n" "{\n" " vec2 rawpos = vec2(gl_VertexID&1, gl_VertexID&2);\n" " gl_Position = vec4(rawpos*2.0-1.0, 0.0, 1.0);\n" " uv0 = mix(copy_position.xy, copy_position.zw, rawpos) / vec2(textureSize(samp9, 0));\n" "}\n"; const char *FProgramRgbToYuyv = "uniform sampler2D samp9;\n" "VARYIN vec2 uv0;\n" "out vec4 ocol0;\n" "void main()\n" "{\n" " vec3 c0 = texture(samp9, (uv0 - dFdx(uv0) * 0.25)).rgb;\n" " vec3 c1 = texture(samp9, (uv0 + dFdx(uv0) * 0.25)).rgb;\n" " vec3 c01 = (c0 + c1) * 0.5;\n" " vec3 y_const = vec3(0.257,0.504,0.098);\n" " vec3 u_const = vec3(-0.148,-0.291,0.439);\n" " vec3 v_const = vec3(0.439,-0.368,-0.071);\n" " vec4 const3 = vec4(0.0625,0.5,0.0625,0.5);\n" " ocol0 = vec4(dot(c1,y_const),dot(c01,u_const),dot(c0,y_const),dot(c01, v_const)) + const3;\n" "}\n"; ProgramShaderCache::CompileShader(s_rgbToYuyvProgram, VProgramRgbToYuyv, FProgramRgbToYuyv); s_rgbToYuyvUniform_loc = glGetUniformLocation(s_rgbToYuyvProgram.glprogid, "copy_position"); /* TODO: Accuracy Improvements * * The YVYU to RGB conversion here matches the RGB to YUYV done above, but * if a game modifies or adds images to the XFB then it should be using the * same algorithm as the flipper, and could result in slight colour inaccuracies * when run back through this shader. */ const char *VProgramYuyvToRgb = "void main()\n" "{\n" " vec2 rawpos = vec2(gl_VertexID&1, gl_VertexID&2);\n" " gl_Position = vec4(rawpos*2.0-1.0, 0.0, 1.0);\n" "}\n"; const char *FProgramYuyvToRgb = "uniform sampler2D samp9;\n" "VARYIN vec2 uv0;\n" "out vec4 ocol0;\n" "void main()\n" "{\n" " ivec2 uv = ivec2(gl_FragCoord.xy);\n" // We switch top/bottom here. TODO: move this to screen blit. " ivec2 ts = textureSize(samp9, 0);\n" " vec4 c0 = texelFetch(samp9, ivec2(uv.x>>1, ts.y-uv.y-1), 0);\n" " float y = mix(c0.b, c0.r, (uv.x & 1) == 1);\n" " float yComp = 1.164 * (y - 0.0625);\n" " float uComp = c0.g - 0.5;\n" " float vComp = c0.a - 0.5;\n" " ocol0 = vec4(yComp + (1.596 * vComp),\n" " yComp - (0.813 * vComp) - (0.391 * uComp),\n" " yComp + (2.018 * uComp),\n" " 1.0);\n" "}\n"; ProgramShaderCache::CompileShader(s_yuyvToRgbProgram, VProgramYuyvToRgb, FProgramYuyvToRgb); } SHADER &GetOrCreateEncodingShader(u32 format) { if (format > NUM_ENCODING_PROGRAMS) { PanicAlert("Unknown texture copy format: 0x%x\n", format); return s_encodingPrograms[0]; } if (s_encodingPrograms[format].glprogid == 0) { const char* shader = TextureConversionShader::GenerateEncodingShader(format, API_OPENGL); #if defined(_DEBUG) || defined(DEBUGFAST) if (g_ActiveConfig.iLog & CONF_SAVESHADERS && shader) { static int counter = 0; char szTemp[MAX_PATH]; sprintf(szTemp, "%senc_%04i.txt", File::GetUserPath(D_DUMP_IDX).c_str(), counter++); SaveData(szTemp, shader); } #endif const char *VProgram = "void main()\n" "{\n" " vec2 rawpos = vec2(gl_VertexID&1, gl_VertexID&2);\n" " gl_Position = vec4(rawpos*2.0-1.0, 0.0, 1.0);\n" "}\n"; ProgramShaderCache::CompileShader(s_encodingPrograms[format], VProgram, shader); s_encodingUniforms[format] = glGetUniformLocation(s_encodingPrograms[format].glprogid, "position"); } return s_encodingPrograms[format]; } void Init() { glGenFramebuffers(2, s_texConvFrameBuffer); glActiveTexture(GL_TEXTURE0 + 9); glGenTextures(1, &s_srcTexture); glBindTexture(GL_TEXTURE_2D, s_srcTexture); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, 0); glGenTextures(1, &s_dstTexture); glBindTexture(GL_TEXTURE_2D, s_dstTexture); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, 0); glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, renderBufferWidth, renderBufferHeight, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL); FramebufferManager::SetFramebuffer(s_texConvFrameBuffer[0]); glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, s_dstTexture, 0); FramebufferManager::SetFramebuffer(0); glGenBuffers(1, &s_PBO); CreatePrograms(); } void Shutdown() { glDeleteTextures(1, &s_srcTexture); glDeleteTextures(1, &s_dstTexture); glDeleteBuffers(1, &s_PBO); glDeleteFramebuffers(2, s_texConvFrameBuffer); s_rgbToYuyvProgram.Destroy(); s_yuyvToRgbProgram.Destroy(); for (auto& program : s_encodingPrograms) program.Destroy(); s_srcTexture = 0; s_dstTexture = 0; s_PBO = 0; s_texConvFrameBuffer[0] = 0; s_texConvFrameBuffer[1] = 0; } void EncodeToRamUsingShader(GLuint srcTexture, const TargetRectangle& sourceRc, u8* destAddr, int dstWidth, int dstHeight, int readStride, bool linearFilter) { // switch to texture converter frame buffer // attach render buffer as color destination FramebufferManager::SetFramebuffer(s_texConvFrameBuffer[0]); GL_REPORT_ERRORD(); // set source texture glActiveTexture(GL_TEXTURE0+9); glBindTexture(GL_TEXTURE_2D, srcTexture); if (linearFilter) { glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); } else { glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); } GL_REPORT_ERRORD(); glViewport(0, 0, (GLsizei)dstWidth, (GLsizei)dstHeight); glDrawArrays(GL_TRIANGLE_STRIP, 0, 4); GL_REPORT_ERRORD(); // .. and then read back the results. // TODO: make this less slow. int writeStride = bpmem.copyMipMapStrideChannels * 32; int dstSize = dstWidth*dstHeight*4; int readHeight = readStride / dstWidth / 4; // 4 bytes per pixel int readLoops = dstHeight / readHeight; if (writeStride != readStride && readLoops > 1) { // writing to a texture of a different size // also copy more then one block line, so the different strides matters // copy into one pbo first, map this buffer, and then memcpy into gc memory // in this way, we only have one vram->ram transfer, but maybe a bigger // cpu overhead because of the pbo glBindBuffer(GL_PIXEL_PACK_BUFFER, s_PBO); glBufferData(GL_PIXEL_PACK_BUFFER, dstSize, NULL, GL_STREAM_READ); glReadPixels(0, 0, (GLsizei)dstWidth, (GLsizei)dstHeight, GL_BGRA, GL_UNSIGNED_BYTE, 0); u8* pbo = (u8*)glMapBufferRange(GL_PIXEL_PACK_BUFFER, 0, dstSize, GL_MAP_READ_BIT); for (int i = 0; i < readLoops; i++) { memcpy(destAddr, pbo, readStride); pbo += readStride; destAddr += writeStride; } glUnmapBuffer(GL_PIXEL_PACK_BUFFER); glBindBuffer(GL_PIXEL_PACK_BUFFER, 0); } else { glReadPixels(0, 0, (GLsizei)dstWidth, (GLsizei)dstHeight, GL_BGRA, GL_UNSIGNED_BYTE, destAddr); } GL_REPORT_ERRORD(); } int EncodeToRamFromTexture(u32 address,GLuint source_texture, bool bFromZBuffer, bool bIsIntensityFmt, u32 copyfmt, int bScaleByHalf, const EFBRectangle& source) { u32 format = copyfmt; if (bFromZBuffer) { format |= _GX_TF_ZTF; if (copyfmt == 11) format = GX_TF_Z16; else if (format < GX_TF_Z8 || format > GX_TF_Z24X8) format |= _GX_TF_CTF; } else if (copyfmt > GX_TF_RGBA8 || (copyfmt < GX_TF_RGB565 && !bIsIntensityFmt)) format |= _GX_TF_CTF; SHADER& texconv_shader = GetOrCreateEncodingShader(format); u8 *dest_ptr = Memory::GetPointer(address); int width = (source.right - source.left) >> bScaleByHalf; int height = (source.bottom - source.top) >> bScaleByHalf; int size_in_bytes = TexDecoder_GetTextureSizeInBytes(width, height, format); u16 blkW = TexDecoder_GetBlockWidthInTexels(format) - 1; u16 blkH = TexDecoder_GetBlockHeightInTexels(format) - 1; u16 samples = TextureConversionShader::GetEncodedSampleCount(format); // only copy on cache line boundaries // extra pixels are copied but not displayed in the resulting texture s32 expandedWidth = (width + blkW) & (~blkW); s32 expandedHeight = (height + blkH) & (~blkH); texconv_shader.Bind(); glUniform4i(s_encodingUniforms[format], source.left, source.top, expandedWidth, bScaleByHalf ? 2 : 1); TargetRectangle scaledSource; scaledSource.top = 0; scaledSource.bottom = expandedHeight; scaledSource.left = 0; scaledSource.right = expandedWidth / samples; int cacheBytes = 32; if ((format & 0x0f) == 6) cacheBytes = 64; int readStride = (expandedWidth * cacheBytes) / TexDecoder_GetBlockWidthInTexels(format); EncodeToRamUsingShader(source_texture, scaledSource, dest_ptr, expandedWidth / samples, expandedHeight, readStride, bScaleByHalf > 0 && !bFromZBuffer); return size_in_bytes; // TODO: D3D11 is calculating this value differently! } void EncodeToRamYUYV(GLuint srcTexture, const TargetRectangle& sourceRc, u8* destAddr, int dstWidth, int dstHeight) { g_renderer->ResetAPIState(); s_rgbToYuyvProgram.Bind(); glUniform4f(s_rgbToYuyvUniform_loc, sourceRc.left, sourceRc.top, sourceRc.right, sourceRc.bottom); // We enable linear filtering, because the gamecube does filtering in the vertical direction when // yscale is enabled. // Otherwise we get jaggies when a game uses yscaling (most PAL games) EncodeToRamUsingShader(srcTexture, sourceRc, destAddr, dstWidth / 2, dstHeight, dstWidth*dstHeight*2, true); FramebufferManager::SetFramebuffer(0); TextureCache::DisableStage(0); g_renderer->RestoreAPIState(); GL_REPORT_ERRORD(); } // Should be scale free. void DecodeToTexture(u32 xfbAddr, int srcWidth, int srcHeight, GLuint destTexture) { u8* srcAddr = Memory::GetPointer(xfbAddr); if (!srcAddr) { WARN_LOG(VIDEO, "Tried to decode from invalid memory address"); return; } g_renderer->ResetAPIState(); // reset any game specific settings // switch to texture converter frame buffer // attach destTexture as color destination FramebufferManager::SetFramebuffer(s_texConvFrameBuffer[1]); glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, destTexture, 0); GL_REPORT_FBO_ERROR(); // activate source texture // set srcAddr as data for source texture glActiveTexture(GL_TEXTURE0+9); glBindTexture(GL_TEXTURE_2D, s_srcTexture); glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, srcWidth / 2, srcHeight, 0, GL_BGRA, GL_UNSIGNED_BYTE, srcAddr); glViewport(0, 0, srcWidth, srcHeight); s_yuyvToRgbProgram.Bind(); glDrawArrays(GL_TRIANGLE_STRIP, 0, 4); FramebufferManager::SetFramebuffer(0); g_renderer->RestoreAPIState(); GL_REPORT_ERRORD(); } } // namespace } // namespace OGL