From d81d2e89155cb6eb5ab20cc7a32bcf8daa4788ea Mon Sep 17 00:00:00 2001 From: degasus Date: Thu, 5 Jun 2014 10:40:17 +0200 Subject: [PATCH 1/4] OGL-StreamBuffer: allocate fences in StreamBuffer directly --- Source/Core/VideoBackends/OGL/StreamBuffer.cpp | 4 ---- Source/Core/VideoBackends/OGL/StreamBuffer.h | 3 ++- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/Source/Core/VideoBackends/OGL/StreamBuffer.cpp b/Source/Core/VideoBackends/OGL/StreamBuffer.cpp index 4a79c8f6b7..85320c14b0 100644 --- a/Source/Core/VideoBackends/OGL/StreamBuffer.cpp +++ b/Source/Core/VideoBackends/OGL/StreamBuffer.cpp @@ -28,7 +28,6 @@ StreamBuffer::StreamBuffer(u32 type, size_t size) m_iterator = 0; m_used_iterator = 0; m_free_iterator = 0; - fences = nullptr; } @@ -61,10 +60,8 @@ StreamBuffer::~StreamBuffer() */ #define SLOT(x) ((x)*SYNC_POINTS/m_size) -static const u32 SYNC_POINTS = 16; void StreamBuffer::CreateFences() { - fences = new GLsync[SYNC_POINTS]; for (u32 i=0; i Date: Thu, 5 Jun 2014 11:06:41 +0200 Subject: [PATCH 2/4] OGL-StreamBuffer: make the SLOT calculation much easier The size of the buffer is now power of 2, so we can use a shift instead of a division. This was at about 2% of the global CPU usage. 
--- .../Core/VideoBackends/OGL/StreamBuffer.cpp | 20 +++++++++---------- Source/Core/VideoBackends/OGL/StreamBuffer.h | 5 ++++- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/Source/Core/VideoBackends/OGL/StreamBuffer.cpp b/Source/Core/VideoBackends/OGL/StreamBuffer.cpp index 85320c14b0..57a1cc0036 100644 --- a/Source/Core/VideoBackends/OGL/StreamBuffer.cpp +++ b/Source/Core/VideoBackends/OGL/StreamBuffer.cpp @@ -23,7 +23,7 @@ static u32 genBuffer() } StreamBuffer::StreamBuffer(u32 type, size_t size) -: m_buffer(genBuffer()), m_buffertype(type), m_size(size) +: m_buffer(genBuffer()), m_buffertype(type), m_size(ROUND_UP_POW2(size)), m_bit_per_slot(Log2(ROUND_UP_POW2(size) / SYNC_POINTS)) { m_iterator = 0; m_used_iterator = 0; @@ -59,19 +59,20 @@ StreamBuffer::~StreamBuffer() * As ring buffers have an ugly behavoir on rollover, have fun to read this code ;) */ -#define SLOT(x) ((x)*SYNC_POINTS/m_size) void StreamBuffer::CreateFences() { - for (u32 i=0; i= m_size) { // insert waiting slots in unused space at the end of the buffer - for (size_t i = SLOT(m_used_iterator); i < SYNC_POINTS; i++) + for (int i = SLOT(m_used_iterator); i < SYNC_POINTS; i++) { fences[i] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); } @@ -106,7 +107,7 @@ void StreamBuffer::AllocMemory(size_t size) m_used_iterator = m_iterator = 0; // offset 0 is always aligned // wait for space at the start - for (u32 i = 0; i <= SLOT(m_iterator + size); i++) + for (int i = 0; i <= SLOT(m_iterator + size); i++) { glClientWaitSync(fences[i], GL_SYNC_FLUSH_COMMANDS_BIT, GL_TIMEOUT_IGNORED); glDeleteSync(fences[i]); @@ -114,7 +115,6 @@ void StreamBuffer::AllocMemory(size_t size) m_free_iterator = m_iterator + size; } } -#undef SLOT void StreamBuffer::Align(u32 stride) { diff --git a/Source/Core/VideoBackends/OGL/StreamBuffer.h b/Source/Core/VideoBackends/OGL/StreamBuffer.h index fa7a29eac2..66747eafd9 100644 --- a/Source/Core/VideoBackends/OGL/StreamBuffer.h +++ 
b/Source/Core/VideoBackends/OGL/StreamBuffer.h @@ -46,7 +46,10 @@ protected: size_t m_free_iterator; private: - static const u32 SYNC_POINTS = 16; + static const int SYNC_POINTS = 16; + inline int SLOT(size_t x) const { return x >> m_bit_per_slot; } + const int m_bit_per_slot; + GLsync fences[SYNC_POINTS]; }; From 606e46ba8d497ccd2719b697c7ed1994710435e5 Mon Sep 17 00:00:00 2001 From: degasus Date: Thu, 5 Jun 2014 11:25:57 +0200 Subject: [PATCH 3/4] OGL-StreamBuffer: move alignment to caller Only the caller knows if alignment is needed at all, so it can be skipped now. --- .../Core/VideoBackends/OGL/StreamBuffer.cpp | 24 +++++-------------- Source/Core/VideoBackends/OGL/StreamBuffer.h | 13 ++++++++-- 2 files changed, 17 insertions(+), 20 deletions(-) diff --git a/Source/Core/VideoBackends/OGL/StreamBuffer.cpp b/Source/Core/VideoBackends/OGL/StreamBuffer.cpp index 57a1cc0036..49a8a0e050 100644 --- a/Source/Core/VideoBackends/OGL/StreamBuffer.cpp +++ b/Source/Core/VideoBackends/OGL/StreamBuffer.cpp @@ -116,14 +116,6 @@ void StreamBuffer::AllocMemory(size_t size) } } -void StreamBuffer::Align(u32 stride) -{ - if (m_iterator && stride) { - m_iterator--; - m_iterator = m_iterator - (m_iterator % stride) + stride; - } -} - /* The usual way to stream data to the gpu. * Described here: https://www.opengl.org/wiki/Buffer_Object_Streaming#Unsynchronized_buffer_mapping * Just do unsync appends until the buffer is full.
@@ -142,8 +134,7 @@ public: ~MapAndOrphan() { } - std::pair Map(size_t size, u32 stride) override { - Align(stride); + std::pair Map(size_t size) override { if (m_iterator + size >= m_size) { glBufferData(m_buffertype, m_size, nullptr, GL_STREAM_DRAW); m_iterator = 0; @@ -180,8 +171,7 @@ public: DeleteFences(); } - std::pair Map(size_t size, u32 stride) override { - Align(stride); + std::pair Map(size_t size) override { AllocMemory(size); u8* pointer = (u8*)glMapBufferRange(m_buffertype, m_iterator, size, GL_MAP_WRITE_BIT | GL_MAP_FLUSH_EXPLICIT_BIT | GL_MAP_UNSYNCHRONIZED_BIT); @@ -230,8 +220,7 @@ public: glBindBuffer(m_buffertype, 0); } - std::pair Map(size_t size, u32 stride) override { - Align(stride); + std::pair Map(size_t size) override { AllocMemory(size); return std::make_pair(m_pointer + m_iterator, m_iterator); } @@ -271,8 +260,7 @@ public: m_pointer = nullptr; } - std::pair Map(size_t size, u32 stride) override { - Align(stride); + std::pair Map(size_t size) override { AllocMemory(size); return std::make_pair(m_pointer + m_iterator, m_iterator); } @@ -303,7 +291,7 @@ public: delete [] m_pointer; } - std::pair Map(size_t size, u32 stride) override { + std::pair Map(size_t size) override { return std::make_pair(m_pointer, 0); } @@ -331,7 +319,7 @@ public: delete [] m_pointer; } - std::pair Map(size_t size, u32 stride) override { + std::pair Map(size_t size) override { return std::make_pair(m_pointer, 0); } diff --git a/Source/Core/VideoBackends/OGL/StreamBuffer.h b/Source/Core/VideoBackends/OGL/StreamBuffer.h index 66747eafd9..d9d12fe8f4 100644 --- a/Source/Core/VideoBackends/OGL/StreamBuffer.h +++ b/Source/Core/VideoBackends/OGL/StreamBuffer.h @@ -26,9 +26,19 @@ public: * Mapping invalidates the current buffer content, * so it isn't allowed to access the old content any more. 
*/ - virtual std::pair Map(size_t size, u32 stride = 0) = 0; + virtual std::pair Map(size_t size) = 0; virtual void Unmap(size_t used_size) = 0; + inline std::pair Map(size_t size, u32 stride) + { + u32 padding = m_iterator % stride; + if (padding) + { + m_iterator += stride - padding; + } + return Map(size); + } + const u32 m_buffer; protected: @@ -36,7 +46,6 @@ protected: void CreateFences(); void DeleteFences(); void AllocMemory(size_t size); - void Align(u32 stride); const u32 m_buffertype; const size_t m_size; From d9eafd94a2f102c879951836229b539283de0426 Mon Sep 17 00:00:00 2001 From: degasus Date: Thu, 5 Jun 2014 11:51:05 +0200 Subject: [PATCH 4/4] OGL-StreamBuffer: replace size_t with u32 Yes, this matters. We align our buffer all the time which needs a division. u64 divisions are just so slow. --- .../Core/VideoBackends/OGL/StreamBuffer.cpp | 42 +++++++++---------- Source/Core/VideoBackends/OGL/StreamBuffer.h | 22 +++++----- 2 files changed, 32 insertions(+), 32 deletions(-) diff --git a/Source/Core/VideoBackends/OGL/StreamBuffer.cpp b/Source/Core/VideoBackends/OGL/StreamBuffer.cpp index 49a8a0e050..5723283b72 100644 --- a/Source/Core/VideoBackends/OGL/StreamBuffer.cpp +++ b/Source/Core/VideoBackends/OGL/StreamBuffer.cpp @@ -22,7 +22,7 @@ static u32 genBuffer() return id; } -StreamBuffer::StreamBuffer(u32 type, size_t size) +StreamBuffer::StreamBuffer(u32 type, u32 size) : m_buffer(genBuffer()), m_buffertype(type), m_size(ROUND_UP_POW2(size)), m_bit_per_slot(Log2(ROUND_UP_POW2(size) / SYNC_POINTS)) { m_iterator = 0; @@ -77,7 +77,7 @@ void StreamBuffer::DeleteFences() glDeleteSync(fences[i]); } } -void StreamBuffer::AllocMemory(size_t size) +void StreamBuffer::AllocMemory(u32 size) { // insert waiting slots for used memory for (int i = SLOT(m_used_iterator); i < SLOT(m_iterator); i++) @@ -126,7 +126,7 @@ void StreamBuffer::AllocMemory(size_t size) class MapAndOrphan : public StreamBuffer { public: - MapAndOrphan(u32 type, size_t size) :
StreamBuffer(type, size) { + MapAndOrphan(u32 type, u32 size) : StreamBuffer(type, size) { glBindBuffer(m_buffertype, m_buffer); glBufferData(m_buffertype, m_size, nullptr, GL_STREAM_DRAW); } @@ -134,7 +134,7 @@ public: ~MapAndOrphan() { } - std::pair Map(size_t size) override { + std::pair Map(u32 size) override { if (m_iterator + size >= m_size) { glBufferData(m_buffertype, m_size, nullptr, GL_STREAM_DRAW); m_iterator = 0; @@ -144,7 +144,7 @@ public: return std::make_pair(pointer, m_iterator); } - void Unmap(size_t used_size) override { + void Unmap(u32 used_size) override { glFlushMappedBufferRange(m_buffertype, 0, used_size); glUnmapBuffer(m_buffertype); m_iterator += used_size; @@ -161,7 +161,7 @@ public: class MapAndSync : public StreamBuffer { public: - MapAndSync(u32 type, size_t size) : StreamBuffer(type, size) { + MapAndSync(u32 type, u32 size) : StreamBuffer(type, size) { CreateFences(); glBindBuffer(m_buffertype, m_buffer); glBufferData(m_buffertype, m_size, nullptr, GL_STREAM_DRAW); @@ -171,14 +171,14 @@ public: DeleteFences(); } - std::pair Map(size_t size) override { + std::pair Map(u32 size) override { AllocMemory(size); u8* pointer = (u8*)glMapBufferRange(m_buffertype, m_iterator, size, GL_MAP_WRITE_BIT | GL_MAP_FLUSH_EXPLICIT_BIT | GL_MAP_UNSYNCHRONIZED_BIT); return std::make_pair(pointer, m_iterator); } - void Unmap(size_t used_size) override { + void Unmap(u32 used_size) override { glFlushMappedBufferRange(m_buffertype, 0, used_size); glUnmapBuffer(m_buffertype); m_iterator += used_size; @@ -201,7 +201,7 @@ public: class BufferStorage : public StreamBuffer { public: - BufferStorage(u32 type, size_t size) : StreamBuffer(type, size) { + BufferStorage(u32 type, u32 size) : StreamBuffer(type, size) { CreateFences(); glBindBuffer(m_buffertype, m_buffer); @@ -220,12 +220,12 @@ public: glBindBuffer(m_buffertype, 0); } - std::pair Map(size_t size) override { + std::pair Map(u32 size) override { AllocMemory(size); return std::make_pair(m_pointer + 
m_iterator, m_iterator); } - void Unmap(size_t used_size) override { + void Unmap(u32 used_size) override { glFlushMappedBufferRange(m_buffertype, m_iterator, used_size); m_iterator += used_size; } @@ -243,7 +243,7 @@ public: class PinnedMemory : public StreamBuffer { public: - PinnedMemory(u32 type, size_t size) : StreamBuffer(type, size) { + PinnedMemory(u32 type, u32 size) : StreamBuffer(type, size) { CreateFences(); m_pointer = (u8*)AllocateAlignedMemory(ROUND_UP(m_size,ALIGN_PINNED_MEMORY), ALIGN_PINNED_MEMORY ); glBindBuffer(GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD, m_buffer); @@ -260,12 +260,12 @@ public: m_pointer = nullptr; } - std::pair Map(size_t size) override { + std::pair Map(u32 size) override { AllocMemory(size); return std::make_pair(m_pointer + m_iterator, m_iterator); } - void Unmap(size_t used_size) override { + void Unmap(u32 used_size) override { m_iterator += used_size; } @@ -281,7 +281,7 @@ public: class BufferSubData : public StreamBuffer { public: - BufferSubData(u32 type, size_t size) : StreamBuffer(type, size) { + BufferSubData(u32 type, u32 size) : StreamBuffer(type, size) { glBindBuffer(m_buffertype, m_buffer); glBufferData(m_buffertype, size, nullptr, GL_STATIC_DRAW); m_pointer = new u8[m_size]; @@ -291,11 +291,11 @@ public: delete [] m_pointer; } - std::pair Map(size_t size) override { + std::pair Map(u32 size) override { return std::make_pair(m_pointer, 0); } - void Unmap(size_t used_size) override { + void Unmap(u32 used_size) override { glBufferSubData(m_buffertype, 0, used_size, m_pointer); } @@ -310,7 +310,7 @@ public: class BufferData : public StreamBuffer { public: - BufferData(u32 type, size_t size) : StreamBuffer(type, size) { + BufferData(u32 type, u32 size) : StreamBuffer(type, size) { glBindBuffer(m_buffertype, m_buffer); m_pointer = new u8[m_size]; } @@ -319,11 +319,11 @@ public: delete [] m_pointer; } - std::pair Map(size_t size) override { + std::pair Map(u32 size) override { return std::make_pair(m_pointer, 0); } - void 
Unmap(size_t used_size) override { + void Unmap(u32 used_size) override { glBufferData(m_buffertype, used_size, m_pointer, GL_STREAM_DRAW); } @@ -331,7 +331,7 @@ public: }; // choose best streaming library based on the supported extensions and known issues -StreamBuffer* StreamBuffer::Create(u32 type, size_t size) +StreamBuffer* StreamBuffer::Create(u32 type, u32 size) { // without basevertex support, only streaming methods whith uploads everything to zero works fine: if (!g_ogl_config.bSupportsGLBaseVertex) diff --git a/Source/Core/VideoBackends/OGL/StreamBuffer.h b/Source/Core/VideoBackends/OGL/StreamBuffer.h index d9d12fe8f4..928af0bbaf 100644 --- a/Source/Core/VideoBackends/OGL/StreamBuffer.h +++ b/Source/Core/VideoBackends/OGL/StreamBuffer.h @@ -15,7 +15,7 @@ namespace OGL class StreamBuffer { public: - static StreamBuffer* Create(u32 type, size_t size); + static StreamBuffer* Create(u32 type, u32 size); virtual ~StreamBuffer(); /* This mapping function will return a pair of: @@ -26,10 +26,10 @@ public: * Mapping invalidates the current buffer content, * so it isn't allowed to access the old content any more. 
*/ - virtual std::pair Map(size_t size) = 0; - virtual void Unmap(size_t used_size) = 0; + virtual std::pair Map(u32 size) = 0; + virtual void Unmap(u32 used_size) = 0; - inline std::pair Map(size_t size, u32 stride) + inline std::pair Map(u32 size, u32 stride) { u32 padding = m_iterator % stride; if (padding) @@ -42,21 +42,21 @@ public: const u32 m_buffer; protected: - StreamBuffer(u32 type, size_t size); + StreamBuffer(u32 type, u32 size); void CreateFences(); void DeleteFences(); - void AllocMemory(size_t size); + void AllocMemory(u32 size); const u32 m_buffertype; - const size_t m_size; + const u32 m_size; - size_t m_iterator; - size_t m_used_iterator; - size_t m_free_iterator; + u32 m_iterator; + u32 m_used_iterator; + u32 m_free_iterator; private: static const int SYNC_POINTS = 16; - inline int SLOT(size_t x) const { return x >> m_bit_per_slot; } + inline int SLOT(u32 x) const { return x >> m_bit_per_slot; } const int m_bit_per_slot; GLsync fences[SYNC_POINTS];