From be1fee6d7476459a4c1fa09e1cb5b42dc005da54 Mon Sep 17 00:00:00 2001 From: degasus Date: Wed, 22 Jan 2014 18:02:55 +0100 Subject: [PATCH] OpenGL: change StreamBuffer in a streaming way This is a bit slower on map_and_* because of flushing and _very_ much slower on buffer(sub)?data because of a new memcpy. But this design allows us to decode directly into a GPU buffer, e.g. vertexloader will profit :) --- .../VideoBackends/OGL/ProgramShaderCache.cpp | 36 ++++----- .../Core/VideoBackends/OGL/StreamBuffer.cpp | 78 ++++++++++--------- Source/Core/VideoBackends/OGL/StreamBuffer.h | 6 +- .../Core/VideoBackends/OGL/VertexManager.cpp | 13 ++-- 4 files changed, 66 insertions(+), 67 deletions(-) diff --git a/Source/Core/VideoBackends/OGL/ProgramShaderCache.cpp b/Source/Core/VideoBackends/OGL/ProgramShaderCache.cpp index 51b6e05063..8f3a545261 100644 --- a/Source/Core/VideoBackends/OGL/ProgramShaderCache.cpp +++ b/Source/Core/VideoBackends/OGL/ProgramShaderCache.cpp @@ -193,29 +193,19 @@ void ProgramShaderCache::UploadConstants() { if(PixelShaderManager::dirty || VertexShaderManager::dirty) { - s_buffer->Alloc(s_ubo_buffer_size); - if (DriverDetails::HasBug(DriverDetails::BUG_BROKENBUFFERSTREAM)) - { - // This is just a hack to support our BUFFERDATA upload method - // as it's broken to uploaded in a splited way - static u8 *tmpbuffer = new u8[s_ubo_buffer_size]; - memcpy(tmpbuffer, &PixelShaderManager::constants, sizeof(PixelShaderConstants)); - memcpy(tmpbuffer+ROUND_UP(sizeof(PixelShaderConstants), s_ubo_align), &VertexShaderManager::constants, sizeof(VertexShaderConstants)); - size_t offset = s_buffer->Upload(tmpbuffer, s_ubo_buffer_size); - glBindBufferRange(GL_UNIFORM_BUFFER, 1, - s_buffer->getBuffer(), offset, sizeof(PixelShaderConstants)); - glBindBufferRange(GL_UNIFORM_BUFFER, 2, - s_buffer->getBuffer(), offset+ROUND_UP(sizeof(PixelShaderConstants), s_ubo_align), sizeof(VertexShaderConstants)); - } - else - { - size_t offset = 
s_buffer->Upload((u8*)&PixelShaderManager::constants, ROUND_UP(sizeof(PixelShaderConstants), s_ubo_align)); - glBindBufferRange(GL_UNIFORM_BUFFER, 1, - s_buffer->getBuffer(), offset, sizeof(PixelShaderConstants)); - offset = s_buffer->Upload((u8*)&VertexShaderManager::constants, ROUND_UP(sizeof(VertexShaderConstants), s_ubo_align)); - glBindBufferRange(GL_UNIFORM_BUFFER, 2, - s_buffer->getBuffer(), offset, sizeof(VertexShaderConstants)); - } + u8* buffer = s_buffer->Map(s_ubo_buffer_size, s_ubo_align); + + memcpy(buffer, + &PixelShaderManager::constants, sizeof(PixelShaderConstants)); + + memcpy(buffer + ROUND_UP(sizeof(PixelShaderConstants), s_ubo_align), + &VertexShaderManager::constants, sizeof(VertexShaderConstants)); + + size_t offset = s_buffer->Unmap(s_ubo_buffer_size); + glBindBufferRange(GL_UNIFORM_BUFFER, 1, s_buffer->getBuffer(), offset, + sizeof(PixelShaderConstants)); + glBindBufferRange(GL_UNIFORM_BUFFER, 2, s_buffer->getBuffer(), offset + ROUND_UP(sizeof(PixelShaderConstants), s_ubo_align), + sizeof(VertexShaderConstants)); PixelShaderManager::dirty = false; VertexShaderManager::dirty = false; diff --git a/Source/Core/VideoBackends/OGL/StreamBuffer.cpp b/Source/Core/VideoBackends/OGL/StreamBuffer.cpp index b643a052c0..28585e07d7 100644 --- a/Source/Core/VideoBackends/OGL/StreamBuffer.cpp +++ b/Source/Core/VideoBackends/OGL/StreamBuffer.cpp @@ -51,20 +51,18 @@ StreamBuffer::~StreamBuffer() #define SLOT(x) ((x)*SYNC_POINTS/m_size) -void StreamBuffer::Alloc ( size_t size, u32 stride ) +u8* StreamBuffer::Map ( size_t size, u32 stride ) { - size_t m_iterator_aligned = m_iterator; - if(m_iterator_aligned && stride) { - m_iterator_aligned--; - m_iterator_aligned = m_iterator_aligned - (m_iterator_aligned % stride) + stride; + if(m_iterator && stride) { + m_iterator--; + m_iterator = m_iterator - (m_iterator % stride) + stride; } - size_t iter_end = m_iterator_aligned + size; switch(m_uploadtype) { case MAP_AND_ORPHAN: - if(iter_end >= m_size) { + 
if(m_iterator + size >= m_size) { glBufferData(m_buffertype, m_size, NULL, GL_STREAM_DRAW); - m_iterator_aligned = 0; + m_iterator = 0; } break; case MAP_AND_SYNC: @@ -78,15 +76,15 @@ void StreamBuffer::Alloc ( size_t size, u32 stride ) m_used_iterator = m_iterator; // wait for new slots to end of buffer - for (size_t i = SLOT(m_free_iterator) + 1; i <= SLOT(iter_end) && i < SYNC_POINTS; i++) + for (size_t i = SLOT(m_free_iterator) + 1; i <= SLOT(m_iterator + size) && i < SYNC_POINTS; i++) { glClientWaitSync(fences[i], GL_SYNC_FLUSH_COMMANDS_BIT, GL_TIMEOUT_IGNORED); glDeleteSync(fences[i]); } - m_free_iterator = iter_end; + m_free_iterator = m_iterator + size; // if buffer is full - if (iter_end >= m_size) { + if (m_iterator + size >= m_size) { // insert waiting slots in unused space at the end of the buffer for (size_t i = SLOT(m_used_iterator); i < SYNC_POINTS; i++) @@ -95,54 +93,58 @@ void StreamBuffer::Alloc ( size_t size, u32 stride ) } // move to the start - m_used_iterator = m_iterator_aligned = m_iterator = 0; // offset 0 is always aligned - iter_end = size; + m_used_iterator = m_iterator = 0; // offset 0 is always aligned // wait for space at the start - for (u32 i = 0; i <= SLOT(iter_end); i++) + for (u32 i = 0; i <= SLOT(m_iterator + size); i++) { glClientWaitSync(fences[i], GL_SYNC_FLUSH_COMMANDS_BIT, GL_TIMEOUT_IGNORED); glDeleteSync(fences[i]); } - m_free_iterator = iter_end; + m_free_iterator = m_iterator + size; } - break; case BUFFERSUBDATA: case BUFFERDATA: - m_iterator_aligned = 0; + m_iterator = 0; break; } - m_iterator = m_iterator_aligned; -} -size_t StreamBuffer::Upload ( u8* data, size_t size ) -{ + // MAP_AND_* methods need to remap this buffer every time switch(m_uploadtype) { - case MAP_AND_SYNC: case MAP_AND_ORPHAN: - pointer = (u8*)glMapBufferRange(m_buffertype, m_iterator, size, GL_MAP_WRITE_BIT | GL_MAP_UNSYNCHRONIZED_BIT); - if(pointer) { - memcpy(pointer, data, size); - glUnmapBuffer(m_buffertype); - } else { - ERROR_LOG(VIDEO, 
"Buffer mapping failed"); - } + case MAP_AND_SYNC: + pointer = (u8*)glMapBufferRange(m_buffertype, m_iterator, size, + GL_MAP_WRITE_BIT | GL_MAP_INVALIDATE_RANGE_BIT | GL_MAP_FLUSH_EXPLICIT_BIT | GL_MAP_UNSYNCHRONIZED_BIT) - m_iterator; break; case PINNED_MEMORY: case BUFFERSTORAGE: - if (pointer) - memcpy(pointer + m_iterator, data, size); - break; case BUFFERSUBDATA: - glBufferSubData(m_buffertype, m_iterator, size, data); - break; case BUFFERDATA: - glBufferData(m_buffertype, size, data, GL_STREAM_DRAW); break; } + return pointer + m_iterator; +} + +size_t StreamBuffer::Unmap(size_t used_size) +{ size_t ret = m_iterator; - m_iterator += size; + switch(m_uploadtype) { + case MAP_AND_SYNC: + case MAP_AND_ORPHAN: + glFlushMappedBufferRange(m_buffertype, 0, used_size); + glUnmapBuffer(m_buffertype); + break; + case PINNED_MEMORY: + case BUFFERSTORAGE: + case BUFFERSUBDATA: + glBufferSubData(m_buffertype, 0, used_size, pointer); + break; + case BUFFERDATA: + glBufferData(m_buffertype, used_size, pointer, GL_STREAM_DRAW); + break; + } + m_iterator += used_size; return ret; } @@ -162,6 +164,7 @@ void StreamBuffer::Init() case BUFFERSUBDATA: glBindBuffer(m_buffertype, m_buffer); glBufferData(m_buffertype, m_size, NULL, GL_STREAM_DRAW); + pointer = new u8[m_size]; break; case PINNED_MEMORY: glGetError(); // errors before this allocation should be ignored @@ -205,6 +208,7 @@ void StreamBuffer::Init() case BUFFERDATA: glBindBuffer(m_buffertype, m_buffer); + pointer = new u8[m_size]; break; } } @@ -216,8 +220,10 @@ void StreamBuffer::Shutdown() DeleteFences(); break; case MAP_AND_ORPHAN: + break; case BUFFERSUBDATA: case BUFFERDATA: + delete [] pointer; break; case PINNED_MEMORY: DeleteFences(); diff --git a/Source/Core/VideoBackends/OGL/StreamBuffer.h b/Source/Core/VideoBackends/OGL/StreamBuffer.h index f2283bb923..abef139546 100644 --- a/Source/Core/VideoBackends/OGL/StreamBuffer.h +++ b/Source/Core/VideoBackends/OGL/StreamBuffer.h @@ -32,10 +32,10 @@ public: 
StreamBuffer(u32 type, size_t size); ~StreamBuffer(); - void Alloc(size_t size, u32 stride = 0); - size_t Upload(u8 *data, size_t size); + u8* Map(size_t size, u32 stride = 0); + size_t Unmap(size_t used_size); // returns the offset of the beginning of the uploaded block - u32 getBuffer() { return m_buffer; } + inline u32 getBuffer() { return m_buffer; } private: void Init(); diff --git a/Source/Core/VideoBackends/OGL/VertexManager.cpp b/Source/Core/VideoBackends/OGL/VertexManager.cpp index 274c9b9fcc..07a6a24196 100644 --- a/Source/Core/VideoBackends/OGL/VertexManager.cpp +++ b/Source/Core/VideoBackends/OGL/VertexManager.cpp @@ -39,7 +39,7 @@ namespace OGL { //This are the initially requested size for the buffers expressed in bytes const u32 MAX_IBUFFER_SIZE = 2*1024*1024; -const u32 MAX_VBUFFER_SIZE = 16*1024*1024; +const u32 MAX_VBUFFER_SIZE = 32*1024*1024; static StreamBuffer *s_vertexBuffer; static StreamBuffer *s_indexBuffer; @@ -85,12 +85,14 @@ void VertexManager::PrepareDrawBuffers(u32 stride) u32 vertex_data_size = IndexGenerator::GetNumVerts() * stride; u32 index_data_size = IndexGenerator::GetIndexLen() * sizeof(u16); - s_vertexBuffer->Alloc(vertex_data_size, stride); - size_t offset = s_vertexBuffer->Upload(GetVertexBuffer(), vertex_data_size); + u8* buffer = s_vertexBuffer->Map(vertex_data_size, stride); + memcpy(buffer, GetVertexBuffer(), vertex_data_size); + size_t offset = s_vertexBuffer->Unmap(vertex_data_size); s_baseVertex = offset / stride; - s_indexBuffer->Alloc(index_data_size); - s_index_offset = s_indexBuffer->Upload((u8*)GetIndexBuffer(), index_data_size); + buffer = s_indexBuffer->Map(index_data_size); + memcpy(buffer, GetIndexBuffer(), index_data_size); + s_index_offset = s_indexBuffer->Unmap(index_data_size); ADDSTAT(stats.thisFrame.bytesVertexStreamed, vertex_data_size); ADDSTAT(stats.thisFrame.bytesIndexStreamed, index_data_size); @@ -234,4 +236,5 @@ void VertexManager::vFlush() GL_REPORT_ERRORD(); } + } // namespace