diff --git a/app/src/main/cpp/skyline/gpu/interconnect/common/shader_cache.cpp b/app/src/main/cpp/skyline/gpu/interconnect/common/shader_cache.cpp new file mode 100644 index 00000000..2820c67f --- /dev/null +++ b/app/src/main/cpp/skyline/gpu/interconnect/common/shader_cache.cpp @@ -0,0 +1,119 @@ +// SPDX-License-Identifier: MPL-2.0 +// Copyright © 2022 Ryujinx Team and Contributors (https://github.com/Ryujinx/) +// Copyright © 2022 yuzu Team and Contributors (https://github.com/yuzu-emu/) +// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/) + +#include +#include +#include +#include +#include +#include "shader_cache.h" + +namespace skyline::gpu::interconnect { + /* Pipeline Stage */ + ShaderBinary ShaderCache::Lookup(InterconnectContext &ctx, u64 programBase, u32 programOffset) { + lastProgramBase = programBase; + lastProgramOffset = programOffset; + auto[blockMapping, blockOffset]{ctx.channelCtx.asCtx->gmmu.LookupBlock(programBase + programOffset)}; + + if (!trapExecutionLock) + trapExecutionLock.emplace(trapMutex); + + // Skip looking up the mirror if it is the same as the one used for the previous update + if (!mirrorBlock.valid() || !mirrorBlock.contains(blockMapping)) { + auto mirrorIt{mirrorMap.find(blockMapping.data())}; + if (mirrorIt == mirrorMap.end()) { + // Allocate a host mirror for the mapping and trap the guest region + auto newIt{mirrorMap.emplace(blockMapping.data(), std::make_unique(ctx.memory.CreateMirror(blockMapping)))}; + + // We need to create the trap after allocating the entry so that we have an `invalid` pointer we can pass in + auto trapHandle{ctx.nce.CreateTrap(blockMapping, [mutex = &trapMutex]() { + std::scoped_lock lock{*mutex}; + return; + }, []() { return true; }, [entry = newIt.first->second.get(), mutex = &trapMutex]() { + std::unique_lock lock{*mutex, std::try_to_lock}; + if (!lock) + return false; + + if (++entry->trapCount <= MirrorEntry::SkipTrapThreshold) + entry->dirty = true; + return true; + 
})}; + + // Write only trap + ctx.nce.TrapRegions(trapHandle, true); + + entry = newIt.first->second.get(); + entry->trap = trapHandle; + } else { + entry = mirrorIt->second.get(); + } + + mirrorBlock = blockMapping; + } + + if (entry->trapCount > MirrorEntry::SkipTrapThreshold && entry->channelSequenceNumber != ctx.channelCtx.channelSequenceNumber) { + entry->channelSequenceNumber = ctx.channelCtx.channelSequenceNumber; + entry->dirty = true; + } + + // If the mirror entry has been written to, clear its shader binary cache and retrap to catch any future writes + if (entry->dirty) { + entry->cache.clear(); + entry->dirty = false; + + if (entry->trapCount <= MirrorEntry::SkipTrapThreshold) + ctx.nce.TrapRegions(*entry->trap, true); + } else if (auto it{entry->cache.find(blockMapping.data() + blockOffset)}; it != entry->cache.end()) { + return it->second; + } + + // entry->mirror may not be a direct mirror of blockMapping and may just contain it as a subregion, so we need to explicitly calculate the offset + span blockMappingMirror{blockMapping.data() - mirrorBlock.data() + entry->mirror.data(), blockMapping.size()}; + + ShaderBinary binary{}; + // If nothing was in the cache then do a full shader parse + binary.binary = [](span mapping) { + // We attempt to find the shader size by looking for "BRA $" (Infinite Loop) which is used as padding at the end of the shader + // UAM Shader Compiler Reference: https://github.com/devkitPro/uam/blob/5a5afc2bae8b55409ab36ba45be63fcb73f68993/source/compiler_iface.cpp#L319-L351 + constexpr u64 BraSelf1{0xE2400FFFFF87000F}, BraSelf2{0xE2400FFFFF07000F}; + + span shaderInstructions{mapping.cast()}; + for (auto it{shaderInstructions.begin()}; it != shaderInstructions.end(); it++) { + auto instruction{*it}; + if (instruction == BraSelf1 || instruction == BraSelf2) [[unlikely]] + // It is far more likely that the instruction doesn't match so this is an unlikely case + return span{shaderInstructions.begin(), it}.cast(); + } + + return 
span{}; + }(blockMappingMirror.subspan(blockOffset)); + + binary.baseOffset = programOffset; + binary.hash = XXH64(binary.binary.data(), binary.binary.size_bytes(), 0); + + entry->cache.insert({blockMapping.data() + blockOffset, binary}); + + return binary; + } + + bool ShaderCache::Refresh(InterconnectContext &ctx, u64 programBase, u32 programOffset) { + if (!trapExecutionLock) + trapExecutionLock.emplace(trapMutex); + + if (programBase != lastProgramBase || programOffset != lastProgramOffset) + return true; + + if (entry && entry->trapCount > MirrorEntry::SkipTrapThreshold && entry->channelSequenceNumber != ctx.channelCtx.channelSequenceNumber) + return true; + else if (entry && entry->dirty) + return true; + + return false; + } + + void ShaderCache::PurgeCaches() { + trapExecutionLock.reset(); + } +} diff --git a/app/src/main/cpp/skyline/gpu/interconnect/common/shader_cache.h b/app/src/main/cpp/skyline/gpu/interconnect/common/shader_cache.h new file mode 100644 index 00000000..82f914e0 --- /dev/null +++ b/app/src/main/cpp/skyline/gpu/interconnect/common/shader_cache.h @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: MPL-2.0 +// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/) + +#pragma once + +#include "common.h" + +namespace skyline::gpu::interconnect { + class ShaderCache { + private: + /** + * @brief Holds mirror state for a single GPU mapped block + */ + struct MirrorEntry { + span mirror; + tsl::robin_map cache; + std::optional trap; + + static constexpr u32 SkipTrapThreshold{20}; //!< Threshold for the number of times a mirror trap needs to be hit before we fallback to always hashing + u32 trapCount{}; //!< The number of times the trap has been hit, used to avoid trapping in cases where the constant retraps would harm performance + size_t channelSequenceNumber{}; //!< For the case where `trapCount > SkipTrapThreshold`, the memory sequence number used to clear the cache after every access + bool dirty{}; //!< If the trap 
has been hit and the cache needs to be cleared + + MirrorEntry(span alignedMirror) : mirror{alignedMirror} {} + }; + + tsl::robin_map> mirrorMap; + std::mutex trapMutex; //!< Protects accesses from trap handlers to the mirror map + std::optional> trapExecutionLock; //!< Persistently held lock over an execution to avoid frequent relocking + MirrorEntry *entry{}; + span mirrorBlock{}; //!< Guest mapped memory block corresponding to `entry` + u64 lastProgramBase{}; + u32 lastProgramOffset{}; + + public: + ShaderBinary Lookup(InterconnectContext &ctx, u64 programBase, u32 programOffset); + + bool Refresh(InterconnectContext &ctx, u64 programBase, u32 programOffset); + + void PurgeCaches(); + }; +} diff --git a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_manager.h b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_manager.h index ba4cb634..8aaf81ea 100644 --- a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_manager.h +++ b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_manager.h @@ -17,11 +17,6 @@ namespace skyline::gpu { } namespace skyline::gpu::interconnect::maxwell3d { - struct ShaderBinary { - span binary; - u32 baseOffset; - }; - class Pipeline { public: struct ShaderStage { diff --git a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_state.cpp b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_state.cpp index a641b88d..5e3a3f81 100644 --- a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_state.cpp +++ b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_state.cpp @@ -255,112 +255,19 @@ namespace skyline::gpu::interconnect::maxwell3d { throw exception("Shader type mismatch: {} != {}!", engine->pipeline.shader.type, static_cast(shaderType)); if (!engine->pipeline.shader.enable && shaderType != engine::Pipeline::Shader::Type::Vertex) { - hash = 0; + binary.hash = 0; return; } - auto[blockMapping, 
blockOffset]{ctx.channelCtx.asCtx->gmmu.LookupBlock(engine->programRegion + engine->pipeline.programOffset)}; - - if (!trapExecutionLock) - trapExecutionLock.emplace(trapMutex); - - // Skip looking up the mirror if it is the same as the one used for the previous update - if (!mirrorBlock.valid() || !mirrorBlock.contains(blockMapping)) { - auto mirrorIt{mirrorMap.find(blockMapping.data())}; - if (mirrorIt == mirrorMap.end()) { - // Allocate a host mirror for the mapping and trap the guest region - auto newIt{mirrorMap.emplace(blockMapping.data(), std::make_unique(ctx.memory.CreateMirror(blockMapping)))}; - - // We need to create the trap after allocating the entry so that we have an `invalid` pointer we can pass in - auto trapHandle{ctx.nce.CreateTrap(blockMapping, [mutex = &trapMutex]() { - std::scoped_lock lock{*mutex}; - return; - }, []() { return true; }, [entry = newIt.first->second.get(), mutex = &trapMutex]() { - std::unique_lock lock{*mutex, std::try_to_lock}; - if (!lock) - return false; - - if (++entry->trapCount <= MirrorEntry::SkipTrapThreshold) - entry->dirty = true; - return true; - })}; - - // Write only trap - ctx.nce.TrapRegions(trapHandle, true); - - entry = newIt.first->second.get(); - entry->trap = trapHandle; - } else { - entry = mirrorIt->second.get(); - } - - mirrorBlock = blockMapping; - } - - if (entry->trapCount > MirrorEntry::SkipTrapThreshold && entry->channelSequenceNumber != ctx.channelCtx.channelSequenceNumber) { - entry->channelSequenceNumber = ctx.channelCtx.channelSequenceNumber; - entry->dirty = true; - } - - // If the mirror entry has been written to, clear its shader binary cache and retrap to catch any future writes - if (entry->dirty) { - entry->cache.clear(); - entry->dirty = false; - - if (entry->trapCount <= MirrorEntry::SkipTrapThreshold) - ctx.nce.TrapRegions(*entry->trap, true); - } else if (auto it{entry->cache.find(blockMapping.data() + blockOffset)}; it != entry->cache.end()) { - binary = it->second.binary; - hash = 
it->second.hash; - return; - } - - // entry->mirror may not be a direct mirror of blockMapping and may just contain it as a subregion, so we need to explicitly calculate the offset - span blockMappingMirror{blockMapping.data() - mirrorBlock.data() + entry->mirror.data(), blockMapping.size()}; - - // If nothing was in the cache then do a full shader parse - binary.binary = [](span mapping) { - // We attempt to find the shader size by looking for "BRA $" (Infinite Loop) which is used as padding at the end of the shader - // UAM Shader Compiler Reference: https://github.com/devkitPro/uam/blob/5a5afc2bae8b55409ab36ba45be63fcb73f68993/source/compiler_iface.cpp#L319-L351 - constexpr u64 BraSelf1{0xE2400FFFFF87000F}, BraSelf2{0xE2400FFFFF07000F}; - - span shaderInstructions{mapping.cast()}; - for (auto it{shaderInstructions.begin()}; it != shaderInstructions.end(); it++) { - auto instruction{*it}; - if (instruction == BraSelf1 || instruction == BraSelf2) [[unlikely]] - // It is far more likely that the instruction doesn't match so this is an unlikely case - return span{shaderInstructions.begin(), it}.cast(); - } - - return span{}; - }(blockMappingMirror.subspan(blockOffset)); - - binary.baseOffset = engine->pipeline.programOffset; - hash = XXH64(binary.binary.data(), binary.binary.size_bytes(), 0); - - entry->cache.insert({blockMapping.data() + blockOffset, CacheEntry{binary, hash}}); + binary = cache.Lookup(ctx, engine->programRegion, engine->pipeline.programOffset); } bool PipelineStageState::Refresh(InterconnectContext &ctx) { - if (!trapExecutionLock) - trapExecutionLock.emplace(trapMutex); - - if (entry && entry->trapCount > MirrorEntry::SkipTrapThreshold && entry->channelSequenceNumber != ctx.channelCtx.channelSequenceNumber) - return true; - else if (entry && entry->dirty) - return true; - - return false; + return cache.Refresh(ctx, engine->programRegion, engine->pipeline.programOffset); } void PipelineStageState::PurgeCaches() { - trapExecutionLock.reset(); - } - 
- PipelineStageState::~PipelineStageState() { - std::scoped_lock lock{trapMutex}; - //for (const auto &mirror : mirrorMap) - // ctx.nce.DestroyTrap(*mirror.second->trap); + cache.PurgeCaches(); } /* Vertex Input State */ @@ -584,7 +491,7 @@ namespace skyline::gpu::interconnect::maxwell3d { std::array shaderBinaries; for (size_t i{}; i < engine::PipelineCount; i++) { const auto &stage{pipelineStages[i].UpdateGet(ctx)}; - packedState.shaderHashes[i] = stage.hash; + packedState.shaderHashes[i] = stage.binary.hash; shaderBinaries[i] = stage.binary; } diff --git a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_state.h b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_state.h index d6ae11ab..07462f4f 100644 --- a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_state.h +++ b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_state.h @@ -5,6 +5,7 @@ #include #include +#include #include "common.h" #include "packed_pipeline_state.h" #include "pipeline_manager.h" @@ -67,46 +68,16 @@ namespace skyline::gpu::interconnect::maxwell3d { }; private: - struct CacheEntry { - ShaderBinary binary; - u64 hash; - - CacheEntry(ShaderBinary binary, u64 hash) : binary{binary}, hash{hash} {} - }; - - /** - * @brief Holds mirror state for a single GPU mapped block - */ - struct MirrorEntry { - span mirror; - tsl::robin_map cache; - std::optional trap; - - static constexpr u32 SkipTrapThreshold{20}; //!< Threshold for the number of times a mirror trap needs to be hit before we fallback to always hashing - u32 trapCount{}; //!< The number of times the trap has been hit, used to avoid trapping in cases where the constant retraps would harm performance - size_t channelSequenceNumber{}; //!< For the case where `trapCount > SkipTrapThreshold`, the memory sequence number number used to clear the cache after every access - bool dirty{}; //!< If the trap has been hit and the cache needs to be cleared - - MirrorEntry(span alignedMirror) : 
mirror{alignedMirror} {} - }; - dirty::BoundSubresource engine; engine::Pipeline::Shader::Type shaderType; - tsl::robin_map> mirrorMap; - std::mutex trapMutex; //!< Protects accesses from trap handlers to the mirror map - std::optional> trapExecutionLock; - MirrorEntry *entry{}; - span mirrorBlock{}; //!< Guest mapped memory block corresponding to `entry` + ShaderCache cache; public: ShaderBinary binary; - u64 hash; PipelineStageState(dirty::Handle dirtyHandle, DirtyManager &manager, const EngineRegisters &engine, u8 shaderType); - ~PipelineStageState(); - void Flush(InterconnectContext &ctx); bool Refresh(InterconnectContext &ctx);