From 4269abdc3e9ff2fe0cba7d42aa875afb8a1fa460 Mon Sep 17 00:00:00 2001 From: Stenzek Date: Sun, 6 Mar 2016 21:34:41 +1000 Subject: [PATCH] D3D12: Implement perf query support --- .../D3D12/D3DQueuedCommandList.cpp | 72 ++++++- .../D3D12/D3DQueuedCommandList.h | 30 +++ Source/Core/VideoBackends/D3D12/PerfQuery.cpp | 175 ++++++++++++++++-- Source/Core/VideoBackends/D3D12/PerfQuery.h | 22 ++- .../VideoBackends/D3D12/VertexManager.cpp | 6 +- 5 files changed, 280 insertions(+), 25 deletions(-) diff --git a/Source/Core/VideoBackends/D3D12/D3DQueuedCommandList.cpp b/Source/Core/VideoBackends/D3D12/D3DQueuedCommandList.cpp index 981b2e731c..8a84a0c42b 100644 --- a/Source/Core/VideoBackends/D3D12/D3DQueuedCommandList.cpp +++ b/Source/Core/VideoBackends/D3D12/D3DQueuedCommandList.cpp @@ -274,6 +274,45 @@ void ID3D12QueuedCommandList::BackgroundThreadFunction(ID3D12QueuedCommandList* break; } + case D3DQueueItemType::BeginQuery: + { + command_list->BeginQuery( + reinterpret_cast(item)->BeginQuery.pQueryHeap, + reinterpret_cast(item)->BeginQuery.Type, + reinterpret_cast(item)->BeginQuery.Index + ); + + item += BufferOffsetForQueueItemType(); + break; + } + + case D3DQueueItemType::EndQuery: + { + command_list->EndQuery( + reinterpret_cast(item)->EndQuery.pQueryHeap, + reinterpret_cast(item)->EndQuery.Type, + reinterpret_cast(item)->EndQuery.Index + ); + + item += BufferOffsetForQueueItemType(); + break; + } + + case D3DQueueItemType::ResolveQueryData: + { + command_list->ResolveQueryData( + reinterpret_cast(item)->ResolveQueryData.pQueryHeap, + reinterpret_cast(item)->ResolveQueryData.Type, + reinterpret_cast(item)->ResolveQueryData.StartElement, + reinterpret_cast(item)->ResolveQueryData.ElementCount, + reinterpret_cast(item)->ResolveQueryData.pDestinationBuffer, + reinterpret_cast(item)->ResolveQueryData.AlignedDestinationBufferOffset + ); + + item += BufferOffsetForQueueItemType(); + break; + } + case D3DQueueItemType::CloseCommandList: { CheckHR(command_list->Close()); @@ -916,8 +955,14 @@ void STDMETHODCALLTYPE ID3D12QueuedCommandList::BeginQuery( _In_ UINT Index ) { - // Function not implemented yet. - DEBUGCHECK(0, "Function not implemented yet."); + reinterpret_cast(m_queue_array_back)->Type = D3DQueueItemType::BeginQuery; + reinterpret_cast(m_queue_array_back)->BeginQuery.pQueryHeap = pQueryHeap; + reinterpret_cast(m_queue_array_back)->BeginQuery.Type = Type; + reinterpret_cast(m_queue_array_back)->BeginQuery.Index = Index; + + m_queue_array_back += BufferOffsetForQueueItemType(); + + CheckForOverflow(); } void STDMETHODCALLTYPE ID3D12QueuedCommandList::EndQuery( @@ -926,8 +971,14 @@ void STDMETHODCALLTYPE ID3D12QueuedCommandList::EndQuery( _In_ UINT Index ) { - // Function not implemented yet. - DEBUGCHECK(0, "Function not implemented yet."); + reinterpret_cast(m_queue_array_back)->Type = D3DQueueItemType::EndQuery; + reinterpret_cast(m_queue_array_back)->EndQuery.pQueryHeap = pQueryHeap; + reinterpret_cast(m_queue_array_back)->EndQuery.Type = Type; + reinterpret_cast(m_queue_array_back)->EndQuery.Index = Index; + + m_queue_array_back += BufferOffsetForQueueItemType(); + + CheckForOverflow(); } void STDMETHODCALLTYPE ID3D12QueuedCommandList::ResolveQueryData( @@ -939,8 +990,17 @@ void STDMETHODCALLTYPE ID3D12QueuedCommandList::ResolveQueryData( _In_ UINT64 AlignedDestinationBufferOffset ) { - // Function not implemented yet. - DEBUGCHECK(0, "Function not implemented yet."); + reinterpret_cast(m_queue_array_back)->Type = D3DQueueItemType::ResolveQueryData; + reinterpret_cast(m_queue_array_back)->ResolveQueryData.pQueryHeap = pQueryHeap; + reinterpret_cast(m_queue_array_back)->ResolveQueryData.Type = Type; + reinterpret_cast(m_queue_array_back)->ResolveQueryData.StartElement = StartElement; + reinterpret_cast(m_queue_array_back)->ResolveQueryData.ElementCount = ElementCount; + reinterpret_cast(m_queue_array_back)->ResolveQueryData.pDestinationBuffer = pDestinationBuffer; + reinterpret_cast(m_queue_array_back)->ResolveQueryData.AlignedDestinationBufferOffset = AlignedDestinationBufferOffset; + + m_queue_array_back += BufferOffsetForQueueItemType(); + + CheckForOverflow(); } void STDMETHODCALLTYPE ID3D12QueuedCommandList::SetPredication( diff --git a/Source/Core/VideoBackends/D3D12/D3DQueuedCommandList.h b/Source/Core/VideoBackends/D3D12/D3DQueuedCommandList.h index 2933b2fc72..2c9d80febe 100644 --- a/Source/Core/VideoBackends/D3D12/D3DQueuedCommandList.h +++ b/Source/Core/VideoBackends/D3D12/D3DQueuedCommandList.h @@ -35,6 +35,9 @@ enum D3DQueueItemType SetDescriptorHeaps, ResourceBarrier, ResolveSubresource, + BeginQuery, + EndQuery, + ResolveQueryData, ExecuteCommandList, CloseCommandList, Present, @@ -170,6 +173,30 @@ struct ResolveSubresourceArguments DXGI_FORMAT Format; }; +struct BeginQueryArguments +{ + ID3D12QueryHeap* pQueryHeap; + D3D12_QUERY_TYPE Type; + UINT Index; +}; + +struct EndQueryArguments +{ + ID3D12QueryHeap* pQueryHeap; + D3D12_QUERY_TYPE Type; + UINT Index; +}; + +struct ResolveQueryDataArguments +{ + ID3D12QueryHeap* pQueryHeap; + D3D12_QUERY_TYPE Type; + UINT StartElement; + UINT ElementCount; + ID3D12Resource* pDestinationBuffer; + UINT64 AlignedDestinationBufferOffset; +}; + struct CloseCommandListArguments { }; @@ -239,6 +266,9 @@ struct D3DQueueItem SetDescriptorHeapsArguments SetDescriptorHeaps; ResourceBarrierArguments ResourceBarrier; ResolveSubresourceArguments ResolveSubresource; + BeginQueryArguments BeginQuery; + EndQueryArguments EndQuery; + ResolveQueryDataArguments ResolveQueryData; CloseCommandListArguments CloseCommandList; ExecuteCommandListArguments ExecuteCommandList; PresentArguments Present; diff --git a/Source/Core/VideoBackends/D3D12/PerfQuery.cpp b/Source/Core/VideoBackends/D3D12/PerfQuery.cpp index f5821ca1b7..e86f2e9396 100644 --- a/Source/Core/VideoBackends/D3D12/PerfQuery.cpp +++ b/Source/Core/VideoBackends/D3D12/PerfQuery.cpp @@ -2,68 +2,215 @@ // Licensed under GPLv2+ // Refer to the license.txt file included. +#include + #include "Common/CommonFuncs.h" #include "Common/CommonTypes.h" #include "Common/Logging/Log.h" #include "VideoBackends/D3D12/D3DBase.h" +#include "VideoBackends/D3D12/D3DCommandListManager.h" #include "VideoBackends/D3D12/PerfQuery.h" #include "VideoCommon/RenderBase.h" -//D3D12TODO: Implement PerfQuery class. - namespace DX12 { PerfQuery::PerfQuery() { - //D3D12TODO: Add implementation + D3D12_QUERY_HEAP_DESC desc = { D3D12_QUERY_HEAP_TYPE_OCCLUSION, PERF_QUERY_BUFFER_SIZE, 0 }; + CheckHR(D3D::device12->CreateQueryHeap(&desc, IID_PPV_ARGS(&m_query_heap))); + + CheckHR(D3D::device12->CreateCommittedResource( + &CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_READBACK), + D3D12_HEAP_FLAG_NONE, + &CD3DX12_RESOURCE_DESC::Buffer(QUERY_READBACK_BUFFER_SIZE), + D3D12_RESOURCE_STATE_COPY_DEST, + nullptr, + IID_PPV_ARGS(&m_query_readback_buffer))); + + m_tracking_fence = D3D::command_list_mgr->RegisterQueueFenceCallback(this, &PerfQuery::QueueFenceCallback); } PerfQuery::~PerfQuery() { - //D3D12TODO: Add implementation + D3D::command_list_mgr->RemoveQueueFenceCallback(this); + + SAFE_RELEASE(m_query_heap); + SAFE_RELEASE(m_query_readback_buffer); } void PerfQuery::EnableQuery(PerfQueryGroup type) { - //D3D12TODO: Add implementation + if (m_query_count > m_query_buffer.size() / 2) + WeakFlush(); + + // all queries already used? + if (m_query_buffer.size() == m_query_count) + { + FlushOne(); + //WARN_LOG(VIDEO, "Flushed query buffer early!"); + } + + if (type == PQG_ZCOMP_ZCOMPLOC || type == PQG_ZCOMP) + { + size_t index = (m_query_read_pos + m_query_count) % m_query_buffer.size(); + auto& entry = m_query_buffer[index]; + + D3D::current_command_list->BeginQuery(m_query_heap, D3D12_QUERY_TYPE_OCCLUSION, static_cast(index)); + entry.query_type = type; + entry.fence_value = -1; + + ++m_query_count; + } } void PerfQuery::DisableQuery(PerfQueryGroup type) { - //D3D12TODO: Add implementation + if (type == PQG_ZCOMP_ZCOMPLOC || type == PQG_ZCOMP) + { + size_t index = (m_query_read_pos + m_query_count + m_query_buffer.size() - 1) % m_query_buffer.size(); + auto& entry = m_query_buffer[index]; + + D3D::current_command_list->EndQuery(m_query_heap, D3D12_QUERY_TYPE_OCCLUSION, static_cast(index)); + D3D::current_command_list->ResolveQueryData(m_query_heap, D3D12_QUERY_TYPE_OCCLUSION, static_cast(index), 1, m_query_readback_buffer, index * sizeof(UINT64)); + entry.fence_value = m_next_fence_value; + } } void PerfQuery::ResetQuery() { - //D3D12TODO: Add implementation + m_query_count = 0; + std::fill_n(m_results, ArraySize(m_results), 0); } u32 PerfQuery::GetQueryResult(PerfQueryType type) { - //D3D12TODO: Add implementation - return 0; + u32 result = 0; + + if (type == PQ_ZCOMP_INPUT_ZCOMPLOC || type == PQ_ZCOMP_OUTPUT_ZCOMPLOC) + result = m_results[PQG_ZCOMP_ZCOMPLOC]; + else if (type == PQ_ZCOMP_INPUT || type == PQ_ZCOMP_OUTPUT) + result = m_results[PQG_ZCOMP]; + else if (type == PQ_BLEND_INPUT) + result = m_results[PQG_ZCOMP] + m_results[PQG_ZCOMP_ZCOMPLOC]; + else if (type == PQ_EFB_COPY_CLOCKS) + result = m_results[PQG_EFB_COPY_CLOCKS]; + + return result / 4; } void PerfQuery::FlushOne() { - //D3D12TODO: Add implementation + size_t index = m_query_read_pos; + ActiveQuery& entry = m_query_buffer[index]; + + // Has the command list been executed yet? + if (entry.fence_value == m_next_fence_value) + D3D::command_list_mgr->ExecuteQueuedWork(false); + + // Block until the fence is reached + D3D::command_list_mgr->WaitOnCPUForFence(m_tracking_fence, entry.fence_value); + + // Copy from readback buffer to local + void* readback_buffer_map; + D3D12_RANGE read_range = { sizeof(UINT64) * index, sizeof(UINT64) * (index + 1) }; + CheckHR(m_query_readback_buffer->Map(0, &read_range, &readback_buffer_map)); + + UINT64 result; + memcpy(&result, reinterpret_cast(readback_buffer_map) + sizeof(UINT64) * index, sizeof(UINT64)); + + D3D12_RANGE empty_range = {}; + m_query_readback_buffer->Unmap(0, &empty_range); + + // NOTE: Reported pixel metrics should be referenced to native resolution + m_results[entry.query_type] += (u32)(result * EFB_WIDTH / g_renderer->GetTargetWidth() * EFB_HEIGHT / g_renderer->GetTargetHeight()); + + m_query_read_pos = (m_query_read_pos + 1) % m_query_buffer.size(); + m_query_count--; +} + +UINT64 PerfQuery::FindLastPendingFenceValue() const +{ + UINT64 last_fence_value = 0; + u32 query_count = m_query_count; + u32 query_read_pos = m_query_read_pos; + while (query_count > 0) + { + const ActiveQuery& entry = m_query_buffer[query_read_pos]; + + last_fence_value = std::max(entry.fence_value, last_fence_value); + query_read_pos = (query_read_pos + 1) % m_query_buffer.size(); + query_count--; + } + + return last_fence_value; } void PerfQuery::FlushResults() { - //D3D12TODO: Add implementation + if (IsFlushed()) + return; + + // Find the fence value we have to wait for. + UINT64 last_fence_value = FindLastPendingFenceValue(); + if (last_fence_value == m_next_fence_value) + D3D::command_list_mgr->ExecuteQueuedWork(false); + + // Wait for all queries to be resolved. + D3D::command_list_mgr->WaitOnCPUForFence(m_tracking_fence, last_fence_value); + + // Map the whole readback buffer. Shouldn't have much overhead, and saves taking the wrapped-around cases into consideration. + void* readback_buffer_map; + D3D12_RANGE read_range = { 0, QUERY_READBACK_BUFFER_SIZE }; + CheckHR(m_query_readback_buffer->Map(0, &read_range, &readback_buffer_map)); + + // Read all pending queries. + while (m_query_count > 0) + { + ActiveQuery& entry = m_query_buffer[m_query_read_pos]; + + UINT64 result; + memcpy(&result, reinterpret_cast(readback_buffer_map) + sizeof(UINT64) * m_query_read_pos, sizeof(UINT64)); + + // NOTE: Reported pixel metrics should be referenced to native resolution + m_results[entry.query_type] += (u32)(result * EFB_WIDTH / g_renderer->GetTargetWidth() * EFB_HEIGHT / g_renderer->GetTargetHeight()); + + m_query_read_pos = (m_query_read_pos + 1) % m_query_buffer.size(); + m_query_count--; + } + + D3D12_RANGE write_range = {}; + m_query_readback_buffer->Unmap(0, &write_range); } void PerfQuery::WeakFlush() { - //D3D12TODO: Add implementation + UINT64 completed_fence = m_tracking_fence->GetCompletedValue(); + + while (!IsFlushed()) + { + ActiveQuery& entry = m_query_buffer[m_query_read_pos]; + if (entry.fence_value > completed_fence) + break; + + FlushOne(); + } } bool PerfQuery::IsFlushed() const { - //D3D12TODO: Add implementation - return true; + return m_query_count == 0; +} + +void PerfQuery::QueueFenceCallback(void* owning_object, UINT64 fence_value) +{ + PerfQuery* owning_perf_query = static_cast(owning_object); + owning_perf_query->QueueFence(fence_value); +} + +void PerfQuery::QueueFence(UINT64 fence_value) +{ + m_next_fence_value = fence_value + 1; } } // namespace diff --git a/Source/Core/VideoBackends/D3D12/PerfQuery.h b/Source/Core/VideoBackends/D3D12/PerfQuery.h index ddc7b134f0..98760910f3 100644 --- a/Source/Core/VideoBackends/D3D12/PerfQuery.h +++ b/Source/Core/VideoBackends/D3D12/PerfQuery.h @@ -5,6 +5,7 @@ #pragma once #include +#include #include "VideoCommon/PerfQueryBase.h" @@ -27,20 +28,33 @@ public: private: struct ActiveQuery { - //ID3D11Query* query; PerfQueryGroup query_type; + UINT64 fence_value; }; void WeakFlush(); + // Find the last fence value of all pending queries. + UINT64 FindLastPendingFenceValue() const; + // Only use when non-empty void FlushOne(); - // when testing in SMS: 64 was too small, 128 was ok - static const int s_perf_query_buffer_size = 512; + static void QueueFenceCallback(void* owning_object, UINT64 fence_value); + void QueueFence(UINT64 fence_value); - std::array m_query_buffer; + // when testing in SMS: 64 was too small, 128 was ok + static constexpr size_t PERF_QUERY_BUFFER_SIZE = 512; + static constexpr size_t QUERY_READBACK_BUFFER_SIZE = PERF_QUERY_BUFFER_SIZE * sizeof(UINT64); + + std::array m_query_buffer; int m_query_read_pos = 0; + + ID3D12QueryHeap* m_query_heap = nullptr; + ID3D12Resource* m_query_readback_buffer = nullptr; + + ID3D12Fence* m_tracking_fence = nullptr; + UINT64 m_next_fence_value = 0; }; } // namespace diff --git a/Source/Core/VideoBackends/D3D12/VertexManager.cpp b/Source/Core/VideoBackends/D3D12/VertexManager.cpp index 6300ce9fc9..9f22cc56eb 100644 --- a/Source/Core/VideoBackends/D3D12/VertexManager.cpp +++ b/Source/Core/VideoBackends/D3D12/VertexManager.cpp @@ -158,7 +158,11 @@ void VertexManager::vFlush(bool use_dst_alpha) // D3D12TODO: Decide right threshold for drawCountSinceAsyncFlush at runtime depending on // amount of stall measured in AccessEFB. - if (D3D::command_list_mgr->m_draws_since_last_execution > 100 && D3D::command_list_mgr->m_cpu_access_last_frame) + // We can't do this with perf queries enabled since it can leave queries open. + + if (D3D::command_list_mgr->m_cpu_access_last_frame && + D3D::command_list_mgr->m_draws_since_last_execution > 100 && + !PerfQueryBase::ShouldEmulate()) { D3D::command_list_mgr->m_draws_since_last_execution = 0;