hthh 8be5717a60 Improve PerfQuery accuracy
In TimeSplitters: Future Perfect, PerfQuery is used to detect
the visibility of lights and draw coronas. 25 points are drawn
for each light. However, the returned count was incorrectly
being divided by four leading to dim coronas.

Using 4x antialiasing was a workaround because of a bug where
antialiasing multiplied the PerfQuery results. This commit
fixes that bug too (but only for OpenGL).
2016-07-04 18:54:49 +10:00

228 lines
7.0 KiB
C++

// Copyright 2012 Dolphin Emulator Project
// Licensed under GPLv2+
// Refer to the license.txt file included.
#include <algorithm>
#include "Common/CommonFuncs.h"
#include "Common/CommonTypes.h"
#include "Common/Logging/Log.h"
#include "VideoBackends/D3D12/D3DBase.h"
#include "VideoBackends/D3D12/D3DCommandListManager.h"
#include "VideoBackends/D3D12/PerfQuery.h"
#include "VideoCommon/RenderBase.h"
namespace DX12
{
PerfQuery::PerfQuery()
{
D3D12_QUERY_HEAP_DESC desc = {D3D12_QUERY_HEAP_TYPE_OCCLUSION, PERF_QUERY_BUFFER_SIZE, 0};
CheckHR(D3D::device12->CreateQueryHeap(&desc, IID_PPV_ARGS(&m_query_heap)));
CheckHR(D3D::device12->CreateCommittedResource(
&CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_READBACK), D3D12_HEAP_FLAG_NONE,
&CD3DX12_RESOURCE_DESC::Buffer(QUERY_READBACK_BUFFER_SIZE), D3D12_RESOURCE_STATE_COPY_DEST,
nullptr, IID_PPV_ARGS(&m_query_readback_buffer)));
m_tracking_fence =
D3D::command_list_mgr->RegisterQueueFenceCallback(this, &PerfQuery::QueueFenceCallback);
}
PerfQuery::~PerfQuery()
{
D3D::command_list_mgr->RemoveQueueFenceCallback(this);
SAFE_RELEASE(m_query_heap);
SAFE_RELEASE(m_query_readback_buffer);
}
void PerfQuery::EnableQuery(PerfQueryGroup type)
{
if (m_query_count > m_query_buffer.size() / 2)
WeakFlush();
// all queries already used?
if (m_query_buffer.size() == m_query_count)
{
FlushOne();
// WARN_LOG(VIDEO, "Flushed query buffer early!");
}
if (type == PQG_ZCOMP_ZCOMPLOC || type == PQG_ZCOMP)
{
size_t index = (m_query_read_pos + m_query_count) % m_query_buffer.size();
auto& entry = m_query_buffer[index];
D3D::current_command_list->BeginQuery(m_query_heap, D3D12_QUERY_TYPE_OCCLUSION,
static_cast<UINT>(index));
entry.query_type = type;
entry.fence_value = -1;
++m_query_count;
}
}
void PerfQuery::DisableQuery(PerfQueryGroup type)
{
if (type == PQG_ZCOMP_ZCOMPLOC || type == PQG_ZCOMP)
{
size_t index =
(m_query_read_pos + m_query_count + m_query_buffer.size() - 1) % m_query_buffer.size();
auto& entry = m_query_buffer[index];
D3D::current_command_list->EndQuery(m_query_heap, D3D12_QUERY_TYPE_OCCLUSION,
static_cast<UINT>(index));
D3D::current_command_list->ResolveQueryData(m_query_heap, D3D12_QUERY_TYPE_OCCLUSION,
static_cast<UINT>(index), 1,
m_query_readback_buffer, index * sizeof(UINT64));
entry.fence_value = m_next_fence_value;
}
}
void PerfQuery::ResetQuery()
{
m_query_count = 0;
std::fill_n(m_results, ArraySize(m_results), 0);
}
u32 PerfQuery::GetQueryResult(PerfQueryType type)
{
u32 result = 0;
if (type == PQ_ZCOMP_INPUT_ZCOMPLOC || type == PQ_ZCOMP_OUTPUT_ZCOMPLOC)
result = m_results[PQG_ZCOMP_ZCOMPLOC];
else if (type == PQ_ZCOMP_INPUT || type == PQ_ZCOMP_OUTPUT)
result = m_results[PQG_ZCOMP];
else if (type == PQ_BLEND_INPUT)
result = m_results[PQG_ZCOMP] + m_results[PQG_ZCOMP_ZCOMPLOC];
else if (type == PQ_EFB_COPY_CLOCKS)
result = m_results[PQG_EFB_COPY_CLOCKS];
return result;
}
void PerfQuery::FlushOne()
{
size_t index = m_query_read_pos;
ActiveQuery& entry = m_query_buffer[index];
// Has the command list been executed yet?
if (entry.fence_value == m_next_fence_value)
D3D::command_list_mgr->ExecuteQueuedWork(false);
// Block until the fence is reached
D3D::command_list_mgr->WaitOnCPUForFence(m_tracking_fence, entry.fence_value);
// Copy from readback buffer to local
void* readback_buffer_map;
D3D12_RANGE read_range = {sizeof(UINT64) * index, sizeof(UINT64) * (index + 1)};
CheckHR(m_query_readback_buffer->Map(0, &read_range, &readback_buffer_map));
UINT64 result;
memcpy(&result, reinterpret_cast<u8*>(readback_buffer_map) + sizeof(UINT64) * index,
sizeof(UINT64));
D3D12_RANGE write_range = {};
m_query_readback_buffer->Unmap(0, &write_range);
// NOTE: Reported pixel metrics should be referenced to native resolution
// TODO: Dropping the lower 2 bits from this count should be closer to actual
// hardware behavior when drawing triangles.
m_results[entry.query_type] += (u32)(result * EFB_WIDTH / g_renderer->GetTargetWidth() *
EFB_HEIGHT / g_renderer->GetTargetHeight());
m_query_read_pos = (m_query_read_pos + 1) % m_query_buffer.size();
m_query_count--;
}
UINT64 PerfQuery::FindLastPendingFenceValue() const
{
UINT64 last_fence_value = 0;
u32 query_count = m_query_count;
u32 query_read_pos = m_query_read_pos;
while (query_count > 0)
{
const ActiveQuery& entry = m_query_buffer[query_read_pos];
last_fence_value = std::max(entry.fence_value, last_fence_value);
query_read_pos = (query_read_pos + 1) % m_query_buffer.size();
query_count--;
}
return last_fence_value;
}
void PerfQuery::FlushResults()
{
if (IsFlushed())
return;
// Find the fence value we have to wait for.
UINT64 last_fence_value = FindLastPendingFenceValue();
if (last_fence_value == m_next_fence_value)
D3D::command_list_mgr->ExecuteQueuedWork(false);
// Wait for all queries to be resolved.
D3D::command_list_mgr->WaitOnCPUForFence(m_tracking_fence, last_fence_value);
// Map the whole readback buffer. Shouldn't have much overhead, and saves taking the
// wrapped-around cases into consideration.
void* readback_buffer_map;
D3D12_RANGE read_range = {0, QUERY_READBACK_BUFFER_SIZE};
CheckHR(m_query_readback_buffer->Map(0, &read_range, &readback_buffer_map));
// Read all pending queries.
while (m_query_count > 0)
{
ActiveQuery& entry = m_query_buffer[m_query_read_pos];
UINT64 result;
memcpy(&result, reinterpret_cast<u8*>(readback_buffer_map) + sizeof(UINT64) * m_query_read_pos,
sizeof(UINT64));
// NOTE: Reported pixel metrics should be referenced to native resolution
// TODO: Dropping the lower 2 bits from this count should be closer to actual
// hardware behavior when drawing triangles.
m_results[entry.query_type] += (u32)(result * EFB_WIDTH / g_renderer->GetTargetWidth() *
EFB_HEIGHT / g_renderer->GetTargetHeight());
m_query_read_pos = (m_query_read_pos + 1) % m_query_buffer.size();
m_query_count--;
}
D3D12_RANGE write_range = {};
m_query_readback_buffer->Unmap(0, &write_range);
}
void PerfQuery::WeakFlush()
{
UINT64 completed_fence = m_tracking_fence->GetCompletedValue();
while (!IsFlushed())
{
ActiveQuery& entry = m_query_buffer[m_query_read_pos];
if (entry.fence_value > completed_fence)
break;
FlushOne();
}
}
bool PerfQuery::IsFlushed() const
{
return m_query_count == 0;
}
void PerfQuery::QueueFenceCallback(void* owning_object, UINT64 fence_value)
{
PerfQuery* owning_perf_query = static_cast<PerfQuery*>(owning_object);
owning_perf_query->QueueFence(fence_value);
}
void PerfQuery::QueueFence(UINT64 fence_value)
{
m_next_fence_value = fence_value + 1;
}
} // namespace