dolphin/Source/Core/VideoBackends/OGL/VertexManager.cpp

151 lines
4.2 KiB
C++
Raw Normal View History

// Copyright 2008 Dolphin Emulator Project
2015-05-18 01:08:10 +02:00
// Licensed under GPLv2+
// Refer to the license.txt file included.
#include "VideoBackends/OGL/VertexManager.h"
#include <fstream>
#include <memory>
#include <string>
#include <vector>
#include "Common/CommonTypes.h"
#include "Common/FileUtil.h"
#include "Common/GL/GLExtensions/GLExtensions.h"
#include "Common/StringUtil.h"
OGL: implement Bounding Box on systems w/o SSBO This commit should have zero performance effect if SSBOs are supported. If they aren't (e.g. on all Macs), this commit alters FramebufferManager to attach a new stencil buffer and VertexManager to draw to it when bounding box is active. `BBoxRead` gets the pixel data from the buffer and dumbly loops through it to find the bounding box. This patch can run Paper Mario: The Thousand-Year Door at almost full speed (50–60 FPS) without Dual-Core enabled for all common bounding box-using actions I tested (going through pipes, Plane Mode, Paper Mode, Prof. Frankly's gate, combat, walking around the overworld, etc.) on my computer (macOS 10.12.3, 2.8 GHz Intel Core i7, 16 GB 1600 MHz DDR3, and Intel Iris 1536 MB). A few more demanding scenes (e.g. the self-building bridge on the way to Petalburg) slow to ~15% of their speed without this patch (though they don't run quite at full speed even on master). The slowdown is caused almost solely by `glReadPixels` in `OGL::BoundingBox::Get`. Other implementation ideas: - Use a stencil buffer that's separate from the depth buffer. This would require ARB_texture_stencil8 / OpenGL 4.4, which isn't available on macOS. - Use `glGetTexImage` instead of `glReadPixels`. This is ~5 FPS slower on my computer, presumably because it has to transfer the entire combined depth-stencil buffer instead of only the stencil data. Getting only stencil data from `glGetTexImage` requires ARB_texture_stencil8 / OpenGL 4.4, which (again) is not available on macOS. - Don't use a PBO, and use `glReadPixels` synchronously. This has no visible performance effect on my computer, and is theoretically slower.
2017-03-05 15:34:30 -08:00
#include "VideoBackends/OGL/BoundingBox.h"
2018-11-27 17:16:53 +10:00
#include "VideoBackends/OGL/OGLPipeline.h"
#include "VideoBackends/OGL/ProgramShaderCache.h"
#include "VideoBackends/OGL/Render.h"
#include "VideoBackends/OGL/StreamBuffer.h"
OGL: implement Bounding Box on systems w/o SSBO This commit should have zero performance effect if SSBOs are supported. If they aren't (e.g. on all Macs), this commit alters FramebufferManager to attach a new stencil buffer and VertexManager to draw to it when bounding box is active. `BBoxRead` gets the pixel data from the buffer and dumbly loops through it to find the bounding box. This patch can run Paper Mario: The Thousand-Year Door at almost full speed (50–60 FPS) without Dual-Core enabled for all common bounding box-using actions I tested (going through pipes, Plane Mode, Paper Mode, Prof. Frankly's gate, combat, walking around the overworld, etc.) on my computer (macOS 10.12.3, 2.8 GHz Intel Core i7, 16 GB 1600 MHz DDR3, and Intel Iris 1536 MB). A few more demanding scenes (e.g. the self-building bridge on the way to Petalburg) slow to ~15% of their speed without this patch (though they don't run quite at full speed even on master). The slowdown is caused almost solely by `glReadPixels` in `OGL::BoundingBox::Get`. Other implementation ideas: - Use a stencil buffer that's separate from the depth buffer. This would require ARB_texture_stencil8 / OpenGL 4.4, which isn't available on macOS. - Use `glGetTexImage` instead of `glReadPixels`. This is ~5 FPS slower on my computer, presumably because it has to transfer the entire combined depth-stencil buffer instead of only the stencil data. Getting only stencil data from `glGetTexImage` requires ARB_texture_stencil8 / OpenGL 4.4, which (again) is not available on macOS. - Don't use a PBO, and use `glReadPixels` synchronously. This has no visible performance effect on my computer, and is theoretically slower.
2017-03-05 15:34:30 -08:00
#include "VideoCommon/BoundingBox.h"
#include "VideoCommon/IndexGenerator.h"
#include "VideoCommon/Statistics.h"
#include "VideoCommon/VertexLoaderManager.h"
#include "VideoCommon/VideoConfig.h"
namespace OGL
{
// This are the initially requested size for the buffers expressed in bytes
const u32 MAX_IBUFFER_SIZE = 2 * 1024 * 1024;
const u32 MAX_VBUFFER_SIZE = 32 * 1024 * 1024;
VertexManager::VertexManager() : m_cpu_v_buffer(MAX_VBUFFER_SIZE), m_cpu_i_buffer(MAX_IBUFFER_SIZE)
{
CreateDeviceObjects();
}
VertexManager::~VertexManager()
{
DestroyDeviceObjects();
}
void VertexManager::CreateDeviceObjects()
{
2018-11-27 17:16:53 +10:00
m_vertex_buffer = StreamBuffer::Create(GL_ARRAY_BUFFER, MAX_VBUFFER_SIZE);
m_index_buffer = StreamBuffer::Create(GL_ELEMENT_ARRAY_BUFFER, MAX_IBUFFER_SIZE);
}
void VertexManager::DestroyDeviceObjects()
{
2018-11-27 17:16:53 +10:00
m_vertex_buffer.reset();
m_index_buffer.reset();
}
2018-11-27 17:16:53 +10:00
void VertexManager::UploadUtilityUniforms(const void* uniforms, u32 uniforms_size)
{
2018-11-27 17:16:53 +10:00
ProgramShaderCache::InvalidateConstants();
ProgramShaderCache::UploadConstants(uniforms, uniforms_size);
}
GLuint VertexManager::GetVertexBufferHandle() const
{
2018-11-27 17:16:53 +10:00
return m_vertex_buffer->m_buffer;
}
GLuint VertexManager::GetIndexBufferHandle() const
{
2018-11-27 17:16:53 +10:00
return m_index_buffer->m_buffer;
}
2018-11-27 17:16:53 +10:00
static void CheckBufferBinding()
{
// The index buffer is part of the VAO state, therefore we need to bind it first.
2018-11-27 17:16:53 +10:00
if (!ProgramShaderCache::IsValidVertexFormatBound())
{
ProgramShaderCache::BindVertexFormat(
static_cast<GLVertexFormat*>(VertexLoaderManager::GetCurrentVertexFormat()));
}
}
2018-11-27 17:16:53 +10:00
void VertexManager::ResetBuffer(u32 vertex_stride, bool cull_all)
2014-01-23 13:41:53 +01:00
{
2018-11-27 17:16:53 +10:00
if (cull_all)
{
// This buffer isn't getting sent to the GPU. Just allocate it on the cpu.
m_cur_buffer_pointer = m_base_buffer_pointer = m_cpu_v_buffer.data();
m_end_buffer_pointer = m_base_buffer_pointer + m_cpu_v_buffer.size();
IndexGenerator::Start((u16*)m_cpu_i_buffer.data());
}
else
{
2018-11-27 17:16:53 +10:00
CheckBufferBinding();
2018-11-27 17:16:53 +10:00
auto buffer = m_vertex_buffer->Map(MAXVBUFFERSIZE, vertex_stride);
m_cur_buffer_pointer = m_base_buffer_pointer = buffer.first;
m_end_buffer_pointer = buffer.first + MAXVBUFFERSIZE;
2018-11-27 17:16:53 +10:00
buffer = m_index_buffer->Map(MAXIBUFFERSIZE * sizeof(u16));
IndexGenerator::Start((u16*)buffer.first);
}
2014-01-23 13:41:53 +01:00
}
2018-11-27 17:16:53 +10:00
void VertexManager::CommitBuffer(u32 num_vertices, u32 vertex_stride, u32 num_indices,
u32* out_base_vertex, u32* out_base_index)
{
2018-11-27 17:16:53 +10:00
u32 vertex_data_size = num_vertices * vertex_stride;
u32 index_data_size = num_indices * sizeof(u16);
2018-11-27 17:16:53 +10:00
*out_base_vertex = vertex_stride > 0 ? (m_vertex_buffer->GetCurrentOffset() / vertex_stride) : 0;
*out_base_index = m_index_buffer->GetCurrentOffset() / sizeof(u16);
2018-11-27 17:16:53 +10:00
CheckBufferBinding();
m_vertex_buffer->Unmap(vertex_data_size);
m_index_buffer->Unmap(index_data_size);
2018-11-27 17:16:53 +10:00
ADDSTAT(stats.thisFrame.bytesVertexStreamed, vertex_data_size);
ADDSTAT(stats.thisFrame.bytesIndexStreamed, index_data_size);
}
2018-11-27 17:16:53 +10:00
void VertexManager::UploadConstants()
{
ProgramShaderCache::UploadConstants();
2018-11-27 17:16:53 +10:00
}
2018-11-27 17:16:53 +10:00
void VertexManager::DrawCurrentBatch(u32 base_index, u32 num_indices, u32 base_vertex)
{
if (::BoundingBox::active && !g_Config.BBoxUseFragmentShaderImplementation())
OGL: implement Bounding Box on systems w/o SSBO This commit should have zero performance effect if SSBOs are supported. If they aren't (e.g. on all Macs), this commit alters FramebufferManager to attach a new stencil buffer and VertexManager to draw to it when bounding box is active. `BBoxRead` gets the pixel data from the buffer and dumbly loops through it to find the bounding box. This patch can run Paper Mario: The Thousand-Year Door at almost full speed (50–60 FPS) without Dual-Core enabled for all common bounding box-using actions I tested (going through pipes, Plane Mode, Paper Mode, Prof. Frankly's gate, combat, walking around the overworld, etc.) on my computer (macOS 10.12.3, 2.8 GHz Intel Core i7, 16 GB 1600 MHz DDR3, and Intel Iris 1536 MB). A few more demanding scenes (e.g. the self-building bridge on the way to Petalburg) slow to ~15% of their speed without this patch (though they don't run quite at full speed even on master). The slowdown is caused almost solely by `glReadPixels` in `OGL::BoundingBox::Get`. Other implementation ideas: - Use a stencil buffer that's separate from the depth buffer. This would require ARB_texture_stencil8 / OpenGL 4.4, which isn't available on macOS. - Use `glGetTexImage` instead of `glReadPixels`. This is ~5 FPS slower on my computer, presumably because it has to transfer the entire combined depth-stencil buffer instead of only the stencil data. Getting only stencil data from `glGetTexImage` requires ARB_texture_stencil8 / OpenGL 4.4, which (again) is not available on macOS. - Don't use a PBO, and use `glReadPixels` synchronously. This has no visible performance effect on my computer, and is theoretically slower.
2017-03-05 15:34:30 -08:00
{
glEnable(GL_STENCIL_TEST);
}
2018-02-25 01:15:35 +10:00
if (m_current_pipeline_object)
{
2018-11-27 17:16:53 +10:00
static_cast<Renderer*>(g_renderer.get())->SetPipeline(m_current_pipeline_object);
static_cast<Renderer*>(g_renderer.get())->DrawIndexed(base_index, num_indices, base_vertex);
2018-02-25 01:15:35 +10:00
}
if (::BoundingBox::active && !g_Config.BBoxUseFragmentShaderImplementation())
OGL: implement Bounding Box on systems w/o SSBO This commit should have zero performance effect if SSBOs are supported. If they aren't (e.g. on all Macs), this commit alters FramebufferManager to attach a new stencil buffer and VertexManager to draw to it when bounding box is active. `BBoxRead` gets the pixel data from the buffer and dumbly loops through it to find the bounding box. This patch can run Paper Mario: The Thousand-Year Door at almost full speed (50–60 FPS) without Dual-Core enabled for all common bounding box-using actions I tested (going through pipes, Plane Mode, Paper Mode, Prof. Frankly's gate, combat, walking around the overworld, etc.) on my computer (macOS 10.12.3, 2.8 GHz Intel Core i7, 16 GB 1600 MHz DDR3, and Intel Iris 1536 MB). A few more demanding scenes (e.g. the self-building bridge on the way to Petalburg) slow to ~15% of their speed without this patch (though they don't run quite at full speed even on master). The slowdown is caused almost solely by `glReadPixels` in `OGL::BoundingBox::Get`. Other implementation ideas: - Use a stencil buffer that's separate from the depth buffer. This would require ARB_texture_stencil8 / OpenGL 4.4, which isn't available on macOS. - Use `glGetTexImage` instead of `glReadPixels`. This is ~5 FPS slower on my computer, presumably because it has to transfer the entire combined depth-stencil buffer instead of only the stencil data. Getting only stencil data from `glGetTexImage` requires ARB_texture_stencil8 / OpenGL 4.4, which (again) is not available on macOS. - Don't use a PBO, and use `glReadPixels` synchronously. This has no visible performance effect on my computer, and is theoretically slower.
2017-03-05 15:34:30 -08:00
{
OGL::BoundingBox::StencilWasUpdated();
glDisable(GL_STENCIL_TEST);
}
g_Config.iSaveTargetId++;
ClearEFBCache();
}
2018-11-27 17:16:53 +10:00
} // namespace OGL