From f9f62069298f201448291a6037396618294cfe62 Mon Sep 17 00:00:00 2001 From: Exzap <13877693+Exzap@users.noreply.github.com> Date: Wed, 27 Sep 2023 08:11:57 +0200 Subject: [PATCH] Vulkan: Add profiler for Vulkan API CPU cost Disabled by default. Set VULKAN_API_CPU_BENCHMARK to 1 to enable --- .../HW/Latte/Core/LatteTextureReadback.cpp | 12 ++- .../HW/Latte/Renderer/Vulkan/VulkanAPI.cpp | 82 ++++++++++++++++++- src/Cafe/HW/Latte/Renderer/Vulkan/VulkanAPI.h | 9 +- .../Latte/Renderer/Vulkan/VulkanRenderer.cpp | 5 ++ .../Renderer/Vulkan/VulkanRendererCore.cpp | 4 +- 5 files changed, 99 insertions(+), 13 deletions(-) diff --git a/src/Cafe/HW/Latte/Core/LatteTextureReadback.cpp b/src/Cafe/HW/Latte/Core/LatteTextureReadback.cpp index a6e865d8..8df5dcea 100644 --- a/src/Cafe/HW/Latte/Core/LatteTextureReadback.cpp +++ b/src/Cafe/HW/Latte/Core/LatteTextureReadback.cpp @@ -8,7 +8,7 @@ #include "Cafe/HW/Latte/Core/LatteTexture.h" #include "Cafe/HW/Latte/Renderer/OpenGL/LatteTextureViewGL.h" -//#define LOG_READBACK_TIME +#define LOG_READBACK_TIME struct LatteTextureReadbackQueueEntry { @@ -47,9 +47,7 @@ bool LatteTextureReadback_Update(bool forceStart) { #ifdef LOG_READBACK_TIME double elapsedSecondsSinceInitiate = HighResolutionTimer::getTimeDiff(entry.initiateTime, HighResolutionTimer().now().getTick()); - char initiateElapsedTimeStr[32]; - sprintf(initiateElapsedTimeStr, "%.4lfms", elapsedSecondsSinceInitiate); - cemuLog_log(LogType::TextureReadback, "[TextureReadback-Update] Starting transfer for {:08x} after {} elapsed drawcalls. Time since initiate: {} Force-start: {}", entry.textureView->baseTexture->physAddress, numElapsedDrawcalls, initiateElapsedTimeStr, forceStart?"yes":"no"); + cemuLog_log(LogType::TextureReadback, "[TextureReadback-Update] Starting transfer for {:08x} after {} elapsed drawcalls. Time since initiate: {:.4} Force-start: {}", entry.textureView->baseTexture->physAddress, numElapsedDrawcalls, elapsedSecondsSinceInitiate, forceStart?"yes":"no"); #endif LatteTextureReadback_StartTransfer(entry.textureView); // remove element @@ -83,7 +81,7 @@ void LatteTextureReadback_Initate(LatteTextureView* textureView) // currently we don't support readback for resized textures if (textureView->baseTexture->overwriteInfo.hasResolutionOverwrite) { - cemuLog_log(LogType::Force, "_initate(): Readback is not supported for textures with modified resolution"); + cemuLog_log(LogType::Force, "Texture readback is not supported for textures with modified resolution. Texture: {:08x} {}x{}", textureView->baseTexture->physAddress, textureView->baseTexture->width, textureView->baseTexture->height); return; } // check if texture isn't already queued for transfer @@ -124,7 +122,7 @@ void LatteTextureReadback_UpdateFinishedTransfers(bool forceFinish) if (cemuLog_isLoggingEnabled(LogType::TextureReadback)) { double elapsedSecondsTransfer = HighResolutionTimer::getTimeDiff(readbackInfo->transferStartTime, HighResolutionTimer().now().getTick()); - forceLog_printf("[Texture-Readback] Force-finish: %08x Res %4d/%4d TM %d FMT %04x Transfer time so far: %.4lfms", readbackInfo->hostTextureCopy.physAddress, readbackInfo->hostTextureCopy.width, readbackInfo->hostTextureCopy.height, readbackInfo->hostTextureCopy.tileMode, (uint32)readbackInfo->hostTextureCopy.format, elapsedSecondsTransfer * 1000.0); + cemuLog_log(LogType::TextureReadback, "[Texture-Readback] Force-finish: {:08x} Res {:}/{:} TM {:} FMT {:04x} Transfer time so far: {:.4}ms", readbackInfo->hostTextureCopy.physAddress, readbackInfo->hostTextureCopy.width, readbackInfo->hostTextureCopy.height, readbackInfo->hostTextureCopy.tileMode, (uint32)readbackInfo->hostTextureCopy.format, elapsedSecondsTransfer * 1000.0); } #endif readbackInfo->forceFinish = true; @@ -146,7 +144,7 @@ void LatteTextureReadback_UpdateFinishedTransfers(bool forceFinish) HRTick currentTick = HighResolutionTimer().now().getTick(); double elapsedSecondsTransfer = HighResolutionTimer::getTimeDiff(readbackInfo->transferStartTime, currentTick); double elapsedSecondsWaiting = HighResolutionTimer::getTimeDiff(readbackInfo->waitStartTime, currentTick); - forceLog_printf("[Texture-Readback] %08x Res %4d/%4d TM %d FMT %04x ReadbackLatency: %6.3lfms WaitTime: %6.3lfms ForcedWait %s", readbackInfo->hostTextureCopy.physAddress, readbackInfo->hostTextureCopy.width, readbackInfo->hostTextureCopy.height, readbackInfo->hostTextureCopy.tileMode, (uint32)readbackInfo->hostTextureCopy.format, elapsedSecondsTransfer * 1000.0, elapsedSecondsWaiting * 1000.0, readbackInfo->forceFinish ? "yes" : "no"); + cemuLog_log(LogType::TextureReadback, "[Texture-Readback] {:08x} Res {}/{} TM {} FMT {:04x} ReadbackLatency: {:6.3}ms WaitTime: {:6.3}ms ForcedWait {}", readbackInfo->hostTextureCopy.physAddress, readbackInfo->hostTextureCopy.width, readbackInfo->hostTextureCopy.height, readbackInfo->hostTextureCopy.tileMode, (uint32)readbackInfo->hostTextureCopy.format, elapsedSecondsTransfer * 1000.0, elapsedSecondsWaiting * 1000.0, readbackInfo->forceFinish ? "yes" : "no"); } #endif uint8* pixelData = readbackInfo->GetData(); diff --git a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanAPI.cpp b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanAPI.cpp index d7d139be..ad32b541 100644 --- a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanAPI.cpp +++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanAPI.cpp @@ -1,13 +1,81 @@ #include "Cafe/HW/Latte/Renderer/Vulkan/VulkanAPI.h" #define VKFUNC_DEFINE #include "Cafe/HW/Latte/Renderer/Vulkan/VulkanAPI.h" +#include // for std::iota #if BOOST_OS_LINUX || BOOST_OS_MACOS #include #endif +#define VULKAN_API_CPU_BENCHMARK 0 // if 1, Cemu will log the CPU time spent per Vulkan API function + bool g_vulkan_available = false; +#if VULKAN_API_CPU_BENCHMARK != 0 +uint64 s_vulkanBenchmarkLastResultsTime = 0; + +struct VulkanBenchmarkFuncInfo +{ + std::string funcName; + uint64 cycles; + uint32 numCalls; +}; + +std::vector s_vulkanBenchmarkFuncs; + +template +auto VkWrapperFuncGenTest(TRet (*func)(Args...), const char* name) +{ + static VulkanBenchmarkFuncInfo _FuncInfo; + static auto _FuncPtrCopy = func; + TRet (*newFunc)(Args...); + if constexpr(std::is_void_v) + { + newFunc = +[](Args... args) { uint64 t = __rdtsc(); _mm_mfence(); _FuncPtrCopy(args...); _mm_mfence(); _FuncInfo.cycles += (__rdtsc() - t); _FuncInfo.numCalls++; }; + } + else + newFunc = +[](Args... args) -> TRet { uint64 t = __rdtsc(); _mm_mfence(); TRet r = _FuncPtrCopy(args...); _mm_mfence(); _FuncInfo.cycles += (__rdtsc() - t); _FuncInfo.numCalls++; return r; }; + if(func && func != newFunc) + _FuncPtrCopy = func; + if(_FuncInfo.funcName.empty()) + { + _FuncInfo = {.funcName = name, .cycles = 0, .numCalls = 0}; + s_vulkanBenchmarkFuncs.emplace_back(&_FuncInfo); + } + return newFunc; +}; +#endif + +// called when a TV SwapBuffers is called +void VulkanBenchmarkPrintResults() +{ +#if VULKAN_API_CPU_BENCHMARK != 0 + // note: This could be done by hooking vk present functions + uint64 currentCycle = __rdtsc(); + uint64 elapsedCycles = currentCycle - s_vulkanBenchmarkLastResultsTime; + s_vulkanBenchmarkLastResultsTime = currentCycle; + double elapsedCyclesDbl = (double)elapsedCycles; + cemuLog_log(LogType::Force, "--- Vulkan API CPU benchmark ---"); + cemuLog_log(LogType::Force, "Elapsed cycles this frame: {:} | Current cycle {:} | NumFunc {:}", elapsedCycles, currentCycle, s_vulkanBenchmarkFuncs.size()); + + std::vector sortedIndices(s_vulkanBenchmarkFuncs.size()); + std::iota(sortedIndices.begin(), sortedIndices.end(), 0); + std::sort(sortedIndices.begin(), sortedIndices.end(), + [](int32_t a, int32_t b) { + return s_vulkanBenchmarkFuncs[a]->cycles > s_vulkanBenchmarkFuncs[b]->cycles; + }); + for (sint32 idx : sortedIndices) + { + auto& func = s_vulkanBenchmarkFuncs[idx]; + if(func->cycles == 0) + return; + cemuLog_log(LogType::Force, "{}: {} cycles ({:.4}%) {} calls", func->funcName.c_str(), func->cycles, ((double)func->cycles / elapsedCyclesDbl) * 100.0, func->numCalls); + func->cycles = 0; + func->numCalls = 0; + } +#endif +} + #if BOOST_OS_WINDOWS bool InitializeGlobalVulkan() @@ -57,7 +125,12 @@ bool InitializeDeviceVulkan(VkDevice device) #define VKFUNC_DEVICE_INIT #include "Cafe/HW/Latte/Renderer/Vulkan/VulkanAPI.h" - + +#if VULKAN_API_CPU_BENCHMARK != 0 + #define VKFUNC_DEFINE_CUSTOM(__func) __func = VkWrapperFuncGenTest(__func, #__func) + #include "Cafe/HW/Latte/Renderer/Vulkan/VulkanAPI.h" +#endif + return true; } @@ -121,7 +194,12 @@ bool InitializeDeviceVulkan(VkDevice device) #define VKFUNC_DEVICE_INIT #include "Cafe/HW/Latte/Renderer/Vulkan/VulkanAPI.h" - + +#if VULKAN_API_CPU_BENCHMARK != 0 + #define VKFUNC_DEFINE_CUSTOM(__func) __func = VkWrapperFuncGenTest(__func, #__func) + #include "Cafe/HW/Latte/Renderer/Vulkan/VulkanAPI.h" +#endif + return true; } diff --git a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanAPI.h b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanAPI.h index de4f1bb8..0489bb4e 100644 --- a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanAPI.h +++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanAPI.h @@ -14,7 +14,11 @@ extern bool g_vulkan_available; #endif -#ifdef VKFUNC_DEFINE +#ifdef VKFUNC_DEFINE_CUSTOM + #define VKFUNC(__FUNC__) VKFUNC_DEFINE_CUSTOM(__FUNC__) + #define VKFUNC_INSTANCE(__FUNC__) VKFUNC_DEFINE_CUSTOM(__FUNC__) + #define VKFUNC_DEVICE(__FUNC__) VKFUNC_DEFINE_CUSTOM(__FUNC__) +#elif defined(VKFUNC_DEFINE) #define VKFUNC(__FUNC__) NOEXPORT PFN_##__FUNC__ __FUNC__ = nullptr #define VKFUNC_INSTANCE(__FUNC__) NOEXPORT PFN_##__FUNC__ __FUNC__ = nullptr #define VKFUNC_DEVICE(__FUNC__) NOEXPORT PFN_##__FUNC__ __FUNC__ = nullptr @@ -238,4 +242,5 @@ VKFUNC_DEVICE(vkDestroyDescriptorSetLayout); #undef VKFUNC #undef VKFUNC_INSTANCE -#undef VKFUNC_DEVICE \ No newline at end of file +#undef VKFUNC_DEVICE +#undef VKFUNC_DEFINE_CUSTOM diff --git a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp index 2b8376d5..d084a399 100644 --- a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp @@ -2768,6 +2768,8 @@ void VulkanRenderer::NotifyLatteCommandProcessorIdle() SubmitCommandBuffer(); } +void VulkanBenchmarkPrintResults(); + void VulkanRenderer::SwapBuffers(bool swapTV, bool swapDRC) { SubmitCommandBuffer(); @@ -2777,6 +2779,9 @@ void VulkanRenderer::SwapBuffers(bool swapTV, bool swapDRC) if (swapDRC && IsSwapchainInfoValid(false)) SwapBuffer(false); + + if(swapTV) + VulkanBenchmarkPrintResults(); } void VulkanRenderer::ClearColorbuffer(bool padView) diff --git a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRendererCore.cpp b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRendererCore.cpp index 5bffcc68..8b0e3b63 100644 --- a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRendererCore.cpp +++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRendererCore.cpp @@ -1574,7 +1574,7 @@ void VulkanRenderer::draw_updateVertexBuffersDirectAccess() uint32 bufferSize = LatteGPUState.contextRegister[bufferBaseRegisterIndex + 1] + 1; uint32 bufferStride = (LatteGPUState.contextRegister[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF; - if (bufferAddress == MPTR_NULL) + if (bufferAddress == MPTR_NULL) [[unlikely]] { bufferAddress = 0x10000000; } @@ -1597,7 +1597,7 @@ void VulkanRenderer::draw_updateUniformBuffersDirectAccess(LatteDecompilerShader MPTR physicalAddr = LatteGPUState.contextRegister[uniformBufferRegOffset + i * 7 + 0]; uint32 uniformSize = LatteGPUState.contextRegister[uniformBufferRegOffset + i * 7 + 1] + 1; - if (physicalAddr == MPTR_NULL) + if (physicalAddr == MPTR_NULL) [[unlikely]] { cemu_assert_unimplemented(); continue;