dolphin/Source/Core/VideoBackends/OGL/StreamBuffer.cpp
Markus Wick 227db66e4f OGL: Use glBufferData on Mali.
tl;dr: This PR speedups dolphin on mobiles with the Mali GPU and ES 3.2
drivers by a factor of 10 by using the method with the biggest overhead.
Please keep care not to buy this shit!

The ARM driver team seems to care very well about their customers. But
bad luck, users and open source developers are *not* their customers. So
even device-independent feature requests are just ignored for *years*:

https://community.arm.com/graphics/f/discussions/4645/gl_ext_buffer_storage-support

The bad point, they neither implement any of the other common ways to
stream dynamic content in unextented GL:
- They just ignore the GL_MAP_UNSYNCHRONIZED_BIT flag
- They don't support on-device buffer updates and just stall with
glBufferSubData

It seems like no benchmark is using any dynamic content - and like no
customer cares about anything but benchmarks, or users...

We have a flag to disable the glBufferSubData way, this PR adds the flag
to also disable the unsychronized mapping way. The second one is
available since their ES 3.2 update, but slow as hell.

So how to continue? The last remaining technical way to stream dynamic
content at all is to alloc a new buffer per draw call with glBufferData.
This is very gross, but still a factor 10 speedup compared to stalling
the GPU. Small tests shows that you can expect another 3-5 times speedup
with EXT_buffer_data, so Mali would be on pair with Adreno here. So if
you have bought such a device unfortunately, please try to make noise on
your vendor forums/support and ask for this extension. If you are going
to buy a new mobile, I'd recormend to avoid *any* mobile with a Mali GPU
in it.
2018-02-25 17:12:36 +01:00

395 lines
13 KiB
C++

// Copyright 2013 Dolphin Emulator Project
// Licensed under GPLv2+
// Refer to the license.txt file included.
#include "VideoBackends/OGL/StreamBuffer.h"
#include "Common/Align.h"
#include "Common/CommonFuncs.h"
#include "Common/GL/GLUtil.h"
#include "Common/MemoryUtil.h"
#include "VideoBackends/OGL/Render.h"
#include "VideoCommon/DriverDetails.h"
#include "VideoCommon/OnScreenDisplay.h"
namespace OGL
{
// moved out of constructor, so m_buffer is allowed to be const
static u32 GenBuffer()
{
u32 id;
glGenBuffers(1, &id);
return id;
}
StreamBuffer::StreamBuffer(u32 type, u32 size)
: m_buffer(GenBuffer()), m_buffertype(type), m_size(ROUND_UP_POW2(size)),
m_bit_per_slot(IntLog2(ROUND_UP_POW2(size) / SYNC_POINTS))
{
m_iterator = 0;
m_used_iterator = 0;
m_free_iterator = 0;
}
StreamBuffer::~StreamBuffer()
{
glDeleteBuffers(1, &m_buffer);
}
/* Shared synchronization code for ring buffers
*
* The next three functions are to create/delete/use the OpenGL synchronization.
* ARB_sync (OpenGL 3.2) is used and required.
*
* To reduce overhead, the complete buffer is splitted up into SYNC_POINTS chunks.
* For each of this chunks, there is a fence which checks if this chunk is still in use.
*
* As our API allows to alloc more memory then it has to use, we have to catch how much is already
* written.
*
* m_iterator - writing position
* m_free_iterator - last position checked if free
* m_used_iterator - last position known to be written
*
* So on alloc, we have to wait for all slots between m_free_iterator and m_iterator (and set
* m_free_iterator to m_iterator afterwards).
*
* We also assume that this buffer is accessed by the GPU between the Unmap and Map function,
* so we may create the fences on the start of mapping.
* Some here, new fences for the chunks between m_used_iterator and m_iterator (also update
* m_used_iterator).
*
* As ring buffers have an ugly behavior on rollover, have fun to read this code ;)
*/
void StreamBuffer::CreateFences()
{
for (int i = 0; i < SYNC_POINTS; i++)
{
m_fences[i] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
}
}
void StreamBuffer::DeleteFences()
{
for (int i = Slot(m_free_iterator) + 1; i < SYNC_POINTS; i++)
{
glDeleteSync(m_fences[i]);
}
for (int i = 0; i < Slot(m_iterator); i++)
{
glDeleteSync(m_fences[i]);
}
}
void StreamBuffer::AllocMemory(u32 size)
{
// insert waiting slots for used memory
for (int i = Slot(m_used_iterator); i < Slot(m_iterator); i++)
{
m_fences[i] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
}
m_used_iterator = m_iterator;
// wait for new slots to end of buffer
for (int i = Slot(m_free_iterator) + 1; i <= Slot(m_iterator + size) && i < SYNC_POINTS; i++)
{
glClientWaitSync(m_fences[i], GL_SYNC_FLUSH_COMMANDS_BIT, GL_TIMEOUT_IGNORED);
glDeleteSync(m_fences[i]);
}
// If we allocate a large amount of memory (A), commit a smaller amount, then allocate memory
// smaller than allocation A, we will have already waited for these fences in A, but not used
// the space. In this case, don't set m_free_iterator to a position before that which we know
// is safe to use, which would result in waiting on the same fence(s) next time.
if ((m_iterator + size) > m_free_iterator)
m_free_iterator = m_iterator + size;
// if buffer is full
if (m_iterator + size >= m_size)
{
// insert waiting slots in unused space at the end of the buffer
for (int i = Slot(m_used_iterator); i < SYNC_POINTS; i++)
{
m_fences[i] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
}
// move to the start
m_used_iterator = m_iterator = 0; // offset 0 is always aligned
// wait for space at the start
for (int i = 0; i <= Slot(m_iterator + size); i++)
{
glClientWaitSync(m_fences[i], GL_SYNC_FLUSH_COMMANDS_BIT, GL_TIMEOUT_IGNORED);
glDeleteSync(m_fences[i]);
}
m_free_iterator = m_iterator + size;
}
}
/* The usual way to stream data to the GPU.
* Described here: https://www.opengl.org/wiki/Buffer_Object_Streaming#Unsynchronized_buffer_mapping
* Just do unsync appends until the buffer is full.
* When it's full, orphan (alloc a new buffer and free the old one)
*
* As reallocation is an overhead, this method isn't as fast as it is known to be.
*/
class MapAndOrphan : public StreamBuffer
{
public:
MapAndOrphan(u32 type, u32 size) : StreamBuffer(type, size)
{
glBindBuffer(m_buffertype, m_buffer);
glBufferData(m_buffertype, m_size, nullptr, GL_STREAM_DRAW);
}
~MapAndOrphan() {}
std::pair<u8*, u32> Map(u32 size) override
{
if (m_iterator + size >= m_size)
{
glBufferData(m_buffertype, m_size, nullptr, GL_STREAM_DRAW);
m_iterator = 0;
}
u8* pointer = (u8*)glMapBufferRange(m_buffertype, m_iterator, size,
GL_MAP_WRITE_BIT | GL_MAP_FLUSH_EXPLICIT_BIT |
GL_MAP_UNSYNCHRONIZED_BIT);
return std::make_pair(pointer, m_iterator);
}
void Unmap(u32 used_size) override
{
glFlushMappedBufferRange(m_buffertype, 0, used_size);
glUnmapBuffer(m_buffertype);
m_iterator += used_size;
}
};
/* A modified streaming way without reallocation
* This one fixes the reallocation overhead of the MapAndOrphan one.
* So it alloc a ring buffer on initialization.
* But with this limited resource, we have to care about the CPU-GPU distance.
* Else this fifo may overflow.
* So we had traded orphan vs syncing.
*/
class MapAndSync : public StreamBuffer
{
public:
MapAndSync(u32 type, u32 size) : StreamBuffer(type, size)
{
CreateFences();
glBindBuffer(m_buffertype, m_buffer);
glBufferData(m_buffertype, m_size, nullptr, GL_STREAM_DRAW);
}
~MapAndSync() { DeleteFences(); }
std::pair<u8*, u32> Map(u32 size) override
{
AllocMemory(size);
u8* pointer = (u8*)glMapBufferRange(m_buffertype, m_iterator, size,
GL_MAP_WRITE_BIT | GL_MAP_FLUSH_EXPLICIT_BIT |
GL_MAP_UNSYNCHRONIZED_BIT);
return std::make_pair(pointer, m_iterator);
}
void Unmap(u32 used_size) override
{
glFlushMappedBufferRange(m_buffertype, 0, used_size);
glUnmapBuffer(m_buffertype);
m_iterator += used_size;
}
};
/* Streaming fifo without mapping overhead.
* This one usually requires ARB_buffer_storage (OpenGL 4.4).
* And is usually not available on OpenGL3 GPUs.
*
* ARB_buffer_storage allows us to render from a mapped buffer.
* So we map it persistently in the initialization.
*
* Unsync mapping sounds like an easy task, but it isn't for threaded drivers.
* So every mapping on current close-source driver _will_ end in
* at least a round trip time between two threads.
*
* As persistently mapped buffer can't use orphaning, we also have to sync.
*/
class BufferStorage : public StreamBuffer
{
public:
BufferStorage(u32 type, u32 size, bool _coherent = false)
: StreamBuffer(type, size), coherent(_coherent)
{
CreateFences();
glBindBuffer(m_buffertype, m_buffer);
// PERSISTANT_BIT to make sure that the buffer can be used while mapped
// COHERENT_BIT is set so we don't have to use a MemoryBarrier on write
// CLIENT_STORAGE_BIT is set since we access the buffer more frequently on the client side then
// server side
glBufferStorage(m_buffertype, m_size, nullptr, GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT |
(coherent ? GL_MAP_COHERENT_BIT : 0));
m_pointer = (u8*)glMapBufferRange(
m_buffertype, 0, m_size, GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT |
(coherent ? GL_MAP_COHERENT_BIT : GL_MAP_FLUSH_EXPLICIT_BIT));
}
~BufferStorage()
{
DeleteFences();
glUnmapBuffer(m_buffertype);
glBindBuffer(m_buffertype, 0);
}
std::pair<u8*, u32> Map(u32 size) override
{
AllocMemory(size);
return std::make_pair(m_pointer + m_iterator, m_iterator);
}
void Unmap(u32 used_size) override
{
if (!coherent)
glFlushMappedBufferRange(m_buffertype, m_iterator, used_size);
m_iterator += used_size;
}
u8* m_pointer;
const bool coherent;
};
/* --- AMD only ---
* Another streaming fifo without mapping overhead.
* As we can't orphan without mapping, we have to sync.
*
* This one uses AMD_pinned_memory which is available on all AMD GPUs.
* OpenGL 4.4 drivers should use BufferStorage.
*/
class PinnedMemory : public StreamBuffer
{
public:
PinnedMemory(u32 type, u32 size) : StreamBuffer(type, size)
{
CreateFences();
m_pointer = static_cast<u8*>(Common::AllocateAlignedMemory(
Common::AlignUp(m_size, ALIGN_PINNED_MEMORY), ALIGN_PINNED_MEMORY));
glBindBuffer(GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD, m_buffer);
glBufferData(GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD,
Common::AlignUp(m_size, ALIGN_PINNED_MEMORY), m_pointer, GL_STREAM_COPY);
glBindBuffer(GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD, 0);
glBindBuffer(m_buffertype, m_buffer);
}
~PinnedMemory()
{
DeleteFences();
glBindBuffer(m_buffertype, 0);
glFinish(); // ogl pipeline must be flushed, else this buffer can be in use
Common::FreeAlignedMemory(m_pointer);
m_pointer = nullptr;
}
std::pair<u8*, u32> Map(u32 size) override
{
AllocMemory(size);
return std::make_pair(m_pointer + m_iterator, m_iterator);
}
void Unmap(u32 used_size) override { m_iterator += used_size; }
u8* m_pointer;
static const u32 ALIGN_PINNED_MEMORY = 4096;
};
/* Fifo based on the glBufferSubData call.
* As everything must be copied before glBufferSubData returns,
* an additional memcpy in the driver will be done.
* So this is a huge overhead, only use it if required.
*/
class BufferSubData : public StreamBuffer
{
public:
BufferSubData(u32 type, u32 size) : StreamBuffer(type, size)
{
glBindBuffer(m_buffertype, m_buffer);
glBufferData(m_buffertype, size, nullptr, GL_STATIC_DRAW);
m_pointer = new u8[m_size];
}
~BufferSubData() { delete[] m_pointer; }
std::pair<u8*, u32> Map(u32 size) override { return std::make_pair(m_pointer, 0); }
void Unmap(u32 used_size) override { glBufferSubData(m_buffertype, 0, used_size, m_pointer); }
u8* m_pointer;
};
/* Fifo based on the glBufferData call.
* Some trashy drivers stall in BufferSubData.
* So here we use glBufferData, which realloc this buffer every time.
* This may avoid stalls, but it is a bigger overhead than BufferSubData.
*/
class BufferData : public StreamBuffer
{
public:
BufferData(u32 type, u32 size) : StreamBuffer(type, size)
{
glBindBuffer(m_buffertype, m_buffer);
m_pointer = new u8[m_size];
}
~BufferData() { delete[] m_pointer; }
std::pair<u8*, u32> Map(u32 size) override { return std::make_pair(m_pointer, 0); }
void Unmap(u32 used_size) override
{
glBufferData(m_buffertype, used_size, m_pointer, GL_STREAM_DRAW);
}
u8* m_pointer;
};
// Chooses the best streaming method based on the supported extensions and known issues
std::unique_ptr<StreamBuffer> StreamBuffer::Create(u32 type, u32 size)
{
// without basevertex support, only streaming methods whith uploads everything to zero works fine:
if (!g_ogl_config.bSupportsGLBaseVertex)
{
if (!DriverDetails::HasBug(DriverDetails::BUG_BROKEN_BUFFER_STREAM))
return std::make_unique<BufferSubData>(type, size);
// BufferData is by far the worst way, only use it if needed
return std::make_unique<BufferData>(type, size);
}
// Prefer the syncing buffers over the orphaning one
if (g_ogl_config.bSupportsGLSync)
{
// pinned memory is much faster than buffer storage on AMD cards
if (g_ogl_config.bSupportsGLPinnedMemory &&
!(DriverDetails::HasBug(DriverDetails::BUG_BROKEN_PINNED_MEMORY)))
return std::make_unique<PinnedMemory>(type, size);
// buffer storage works well in most situations
bool coherent = DriverDetails::HasBug(DriverDetails::BUG_BROKEN_EXPLICIT_FLUSH);
if (g_ogl_config.bSupportsGLBufferStorage &&
!(DriverDetails::HasBug(DriverDetails::BUG_BROKEN_BUFFER_STORAGE) &&
type == GL_ARRAY_BUFFER) &&
!(DriverDetails::HasBug(DriverDetails::BUG_INTEL_BROKEN_BUFFER_STORAGE) &&
type == GL_ELEMENT_ARRAY_BUFFER))
return std::make_unique<BufferStorage>(type, size, coherent);
// don't fall back to MapAnd* for Nvidia drivers
if (DriverDetails::HasBug(DriverDetails::BUG_BROKEN_UNSYNC_MAPPING))
{
if (DriverDetails::HasBug(DriverDetails::BUG_BROKEN_BUFFER_STREAM))
return std::make_unique<BufferData>(type, size);
else
return std::make_unique<BufferSubData>(type, size);
}
// mapping fallback
if (g_ogl_config.bSupportsGLSync)
return std::make_unique<MapAndSync>(type, size);
}
// default fallback, should work everywhere, but isn't the best way to do this job
return std::make_unique<MapAndOrphan>(type, size);
}
}