Implement support for occlusion queries

These are mostly implemented how you would expect; however, as opposed to copying out query pool results immediately, doing so is delayed until the RP ends in order to avoid splits.
2025-01-09 00:10:40 +01:00 · 2023-03-21 22:36:41 +00:00 · 2023-03-21 22:36:41 +00:00 · a2798a9184
commit a2798a9184
parent 202c97a1eb
9 changed files with 325 additions and 3 deletions
--- a/app/CMakeLists.txt
+++ b/app/CMakeLists.txt
@ -205,6 +205,7 @@ add_library(skyline SHARED
        ${source_DIR}/skyline/gpu/interconnect/maxwell_3d/packed_pipeline_state.cpp
        ${source_DIR}/skyline/gpu/interconnect/maxwell_3d/pipeline_manager.cpp
        ${source_DIR}/skyline/gpu/interconnect/maxwell_3d/constant_buffers.cpp
+        ${source_DIR}/skyline/gpu/interconnect/maxwell_3d/queries.cpp
        ${source_DIR}/skyline/gpu/interconnect/maxwell_3d/maxwell_3d.cpp
        ${source_DIR}/skyline/gpu/interconnect/kepler_compute/pipeline_manager.cpp
        ${source_DIR}/skyline/gpu/interconnect/kepler_compute/pipeline_state.cpp
--- a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/maxwell_3d.cpp
+++ b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/maxwell_3d.cpp
@ -24,7 +24,8 @@ namespace skyline::gpu::interconnect::maxwell3d {
          samplers{manager, registerBundle.samplerPoolRegisters},
          samplerBinding{registerBundle.samplerBinding},
          textures{manager, registerBundle.texturePoolRegisters},
-          directState{activeState.directState} {
+          directState{activeState.directState},
+          queries{gpu} {
        ctx.executor.AddFlushCallback([this] {
            if (attachedDescriptorSets) {
                ctx.executor.AttachDependency(attachedDescriptorSets);
@ -38,6 +39,7 @@ namespace skyline::gpu::interconnect::maxwell3d {
            textures.MarkAllDirty();
            quadConversionBufferAttached = false;
            constantBuffers.DisableQuickBind();
+            queries.PurgeCaches(ctx);
        });

        ctx.executor.AddPipelineChangeCallback([this] {
@ -415,4 +417,26 @@ namespace skyline::gpu::interconnect::maxwell3d {
        }, scissor, activeDescriptorSetSampledImages, {}, activeState.GetColorAttachments(), activeState.GetDepthAttachment(), !ctx.gpu.traits.quirks.relaxedRenderPassCompatibility, srcStageMask, dstStageMask);
        ctx.executor.AddCheckpoint("After indirect draw");
    }
+
+    void Maxwell3D::Query(soc::gm20b::IOVA address, engine::SemaphoreInfo::CounterType type, std::optional<u64> timestamp) { // Forwards a guest counter report for `address` to the host query handler
+        if (type != engine::SemaphoreInfo::CounterType::SamplesPassed) { // Only SamplesPassed is handled, any other type is logged and dropped without a result being written
+            Logger::Error("Unsupported query type: {}", static_cast<u32>(type));
+            return;
+        }
+
+        queries.Query(ctx, address, Queries::CounterType::Occulusion, timestamp); // SamplesPassed is serviced by the occlusion counter
+    }
+
+    void Maxwell3D::ResetCounter(engine::ClearReportValue::Type type) { // Resets the host counter that corresponds to the guest report value `type`
+        if (type != engine::ClearReportValue::Type::ZPassPixelCount) { // Only ZPassPixelCount is handled, any other type is logged and ignored
+            Logger::Error("Unsupported query type: {}", static_cast<u32>(type)); // NOTE(review): message says "query type" although this is a counter reset type - confirm the wording is intended
+            return;
+        }
+
+        queries.ResetCounter(ctx, Queries::CounterType::Occulusion); // ZPassPixelCount is serviced by the occlusion counter
+    }
+
+    bool Maxwell3D::QueryPresentAtAddress(soc::gm20b::IOVA address) { // Thin wrapper exposing Queries::QueryPresentAtAddress
+        return queries.QueryPresentAtAddress(address);
+    }
 }
--- a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/maxwell_3d.h
+++ b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/maxwell_3d.h
@ -6,9 +6,11 @@
 #include <gpu/descriptor_allocator.h>
 #include <gpu/interconnect/common/samplers.h>
 #include <gpu/interconnect/common/textures.h>
+#include <soc/gm20b/gmmu.h>
 #include "common.h"
 #include "active_state.h"
 #include "constant_buffers.h"
+#include "queries.h"

 namespace skyline::gpu::interconnect::maxwell3d {
    /**
@ -50,6 +52,7 @@ namespace skyline::gpu::interconnect::maxwell3d {
        std::shared_ptr<memory::Buffer> quadConversionBuffer{};
        bool quadConversionBufferAttached{};
        BufferView indirectBufferView;
+        Queries queries;

        static constexpr size_t DescriptorBatchSize{0x100};
        std::shared_ptr<boost::container::static_vector<DescriptorAllocator::ActiveDescriptorSet, DescriptorBatchSize>> attachedDescriptorSets;
@ -105,5 +108,11 @@ namespace skyline::gpu::interconnect::maxwell3d {
        void Draw(engine::DrawTopology topology, bool transformFeedbackEnable, bool indexed, u32 count, u32 first, u32 instanceCount, u32 vertexOffset, u32 firstInstance);

        void DrawIndirect(engine::DrawTopology topology, bool transformFeedbackEnable, bool indexed, span<u8> indirectBuffer, u32 count, u32 stride);
+
+        void Query(soc::gm20b::IOVA address, engine::SemaphoreInfo::CounterType type, std::optional<u64> timestamp);
+
+        void ResetCounter(engine::ClearReportValue::Type type);
+
+        bool QueryPresentAtAddress(soc::gm20b::IOVA address);
    };
 }
--- a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/queries.cpp
+++ b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/queries.cpp
@ -0,0 +1,147 @@
+// SPDX-License-Identifier: MPL-2.0
+// Copyright © 2023 Skyline Team and Contributors (https://github.com/skyline-emu/)
+
+#include <gpu.h>
+#include <soc/gm20b/channel.h>
+#include <vulkan/vulkan.hpp>
+#include "queries.h"
+
+namespace skyline::gpu::interconnect::maxwell3d {
+    Queries::Counter::Counter(vk::raii::Device &device, vk::QueryType type) : pool{device, vk::QueryPoolCreateInfo{ // Creates the Vulkan query pool backing this counter
+        .queryType = type,
+        .queryCount = Counter::QueryPoolSize // The pool is created with a fixed number of queries
+    }} {}
+
+    std::function<void(vk::raii::CommandBuffer &, const std::shared_ptr<FenceCycle> &, GPU &)> Queries::Counter::Prepare(InterconnectContext &ctx) { // Sets up fresh tracking state when the execution tag or RP index changed and returns a command that resets the pool, returns an empty function otherwise
+        auto currentRenderPassIndex{*ctx.executor.GetRenderPassIndex()}; // NOTE(review): dereferenced unconditionally - presumably GetRenderPassIndex() always holds a value here, confirm
+        if (ctx.executor.executionTag != lastTag || lastRenderPassIndex != currentRenderPassIndex) {
+            lastTag = ctx.executor.executionTag;
+            lastRenderPassIndex = currentRenderPassIndex;
+
+            // Allocate per-RP memory for tracking queries
+            queries = ctx.executor.allocator->AllocateUntracked<Query>(Counter::QueryPoolSize);
+            usedQueryCount = ctx.executor.allocator->EmplaceUntracked<u32>();
+            queryActive = ctx.executor.allocator->EmplaceUntracked<bool>();
+            std::memset(queries.data(), 0, queries.size_bytes()); // Zeroed so that entries which are never reported presumably test as empty views when the copies are recorded
+
+            recordOnNextEnd = true; // The next Report() has to record the post-RP copy command for this RP
+
+            // Reset the query pool up to the final used query count before the current RP begins
+            return [this, usedQueryCountPtr = this->usedQueryCount](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &, GPU &) { // The count is read through the pointer when the lambda runs, not when it is created
+                commandBuffer.resetQueryPool(*pool, 0, *usedQueryCountPtr);
+            };
+        }
+
+        return {}; // Tracking state for this execution and RP already exists, no reset command is needed
+    }
+
+    // Begins a query using the next unused index of the pool, preparing per-RP state first if required. TODO call cmdbuf begin
+    void Queries::Counter::Begin(InterconnectContext &ctx, bool atExecutionStart) {
+        auto prepareFunc{Prepare(ctx)}; // Non-empty only when this is the first query of a new execution or RP
+
+        *queryActive = true;
+        (*usedQueryCount)++; // NOTE(review): no visible check against QueryPoolSize, more than QueryPoolSize queries in one RP would index past the pool and `queries` - confirm this cannot happen
+
+        // Begin the query with the current query count as index
+        auto func{[this, queryIndex = *this->usedQueryCount - 1](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &, GPU &) {
+            commandBuffer.beginQuery(*pool, queryIndex, vk::QueryControlFlagBits::ePrecise);
+        }};
+
+        if (atExecutionStart) {
+            ctx.executor.InsertPreExecuteCommand(std::move(func));
+
+            if (prepareFunc)
+                ctx.executor.InsertPreExecuteCommand(std::move(prepareFunc)); // Inserted after `func`; presumably pre-execute commands are prepended so the pool reset still runs before the begin - confirm
+        } else {
+            if (prepareFunc)
+                ctx.executor.InsertPreRpCommand(std::move(prepareFunc)); // The pool reset is placed before the RP while the begin goes at the current position
+
+            ctx.executor.AddCommand(std::move(func));
+        }
+    }
+
+    // Ends the current query and registers `view` (plus the optional timestamp) as the destination for its result, the actual copy is recorded to run after the RP. TODO must be called after begin in cmdbuf
+    void Queries::Counter::Report(InterconnectContext &ctx, BufferView view, std::optional<u64> timestamp) {
+        if (ctx.executor.executionTag != lastTag) // No query has been begun in this execution yet, so begin one at the start of the execution
+            Begin(ctx, true);
+
+        // End the query with the current query count as index
+        ctx.executor.AddCommand([=, this, queryIndex = *this->usedQueryCount - 1](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &, GPU &gpu) {
+            commandBuffer.endQuery(*pool, queryIndex);
+        });
+
+        *queryActive = false;
+
+        // Allocate memory for the timestamp in the megabuffer since updateBuffer can be expensive
+        BufferBinding timestampBuffer{timestamp ? ctx.gpu.megaBufferAllocator.Push(ctx.executor.cycle, span<u64>(*timestamp).cast<u8>()) : BufferBinding{}};
+        queries[*usedQueryCount - 1] = {view, timestampBuffer}; // Remember where the result of this query has to be written
+
+        if (recordOnNextEnd) { // Only the first report after Prepare() records the copy command, it then covers every query of the RP
+            ctx.executor.InsertPostRpCommand([this, queriesPtr = this->queries, usedQueryCountPtr = this->usedQueryCount, queryActivePtr = this->queryActive](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &, GPU &gpu) { // The span and pointers are captured by value so the command keeps using this RP's storage after the members are replaced
+                if (*queryActivePtr) // A query was still open when the RP ended, close it
+                    commandBuffer.endQuery(*pool, *usedQueryCountPtr - 1);
+
+                for (u32 i{}; i < *usedQueryCountPtr; i++) {
+                    if (!queriesPtr[i].view) // Presumably a query that was begun but never reported (entry still zeroed) - nothing to copy
+                        continue;
+
+                    auto dstBinding{queriesPtr[i].view.GetBinding(gpu)};
+                    auto timestampSrcBinding{queriesPtr[i].timestampBinding};
+
+                    commandBuffer.copyQueryPoolResults(*pool, i, 1, dstBinding.buffer, dstBinding.offset, 0, {}); // Result of query `i` goes to the start of the destination view, no result flags are set
+                    if (timestampSrcBinding)
+                        commandBuffer.copyBuffer(timestampSrcBinding.buffer, dstBinding.buffer, {vk::BufferCopy{
+                            .size = 8,
+                            .srcOffset = timestampSrcBinding.offset,
+                            .dstOffset = dstBinding.offset + 8 // The timestamp is written 8 bytes after the start of the query result
+                        }});
+                }
+            });
+            recordOnNextEnd = false;
+        }
+    }
+
+    // Ends the currently active query without reporting its result. TODO must be called after begin in cmdbuf
+    // TODO call at exec end
+    void Queries::Counter::End(InterconnectContext &ctx) {
+        if (ctx.executor.executionTag != lastTag  || !queryActive || !*queryActive) // Nothing to end if no query was begun in this execution or none is currently active
+            return;
+
+        // End the query with the current query count as index
+        ctx.executor.AddCommand([=, this, queryIndex = *this->usedQueryCount - 1](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &, GPU &gpu) {
+            commandBuffer.endQuery(*pool, queryIndex);
+        });
+
+        *queryActive = false;
+    }
+
+    Queries::Queries(GPU &gpu) : counters{{{gpu.vkDevice, vk::QueryType::eOcclusion}}} {} // Constructs the single counter, backed by an occlusion query pool
+
+    void Queries::Query(InterconnectContext &ctx, soc::gm20b::IOVA address, CounterType type, std::optional<u64> timestamp) { // Reports the counter for `type` to `address` and then begins a new query on it
+        view.Update(ctx, address, timestamp ? 16 : 4); // The destination view is 16 bytes when a timestamp accompanies the result and 4 bytes otherwise
+        usedQueryAddresses.emplace(u64{address}); // Recorded so QueryPresentAtAddress() can later find this address
+        ctx.executor.AttachBuffer(*view);
+
+        auto &counter{counters[static_cast<u32>(type)]}; // Counters are indexed by the enum value
+
+        view->GetBuffer()->MarkGpuDirty(ctx.executor.usageTracker);
+        counter.Report(ctx, *view, timestamp);
+        counter.Begin(ctx); // A new query is begun straight after the report
+    }
+
+    void Queries::ResetCounter(InterconnectContext &ctx, CounterType type) { // Ends any active query for `type` without reporting it, then begins a new one
+        auto &counter{counters[static_cast<u32>(type)]};
+        counter.End(ctx);
+        counter.Begin(ctx);
+    }
+
+    void Queries::PurgeCaches(InterconnectContext &ctx) { // Purges the cached buffer view and ends any active query on every counter
+        view.PurgeCaches();
+        for (u32 i{}; i < static_cast<u32>(CounterType::MaxValue); i++)
+            counters[i].End(ctx);
+    }
+
+    bool Queries::QueryPresentAtAddress(soc::gm20b::IOVA address) { // True if Query() has previously been called with this exact address
+        return usedQueryAddresses.contains(u64{address});
+    }
+}
--- a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/queries.h
+++ b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/queries.h
@ -0,0 +1,102 @@
+// SPDX-License-Identifier: MPL-2.0
+// Copyright © 2023 Skyline Team and Contributors (https://github.com/skyline-emu/)
+
+#pragma once
+
+#include <limits>
+#include <unordered_set>
+#include <soc/gm20b/gmmu.h>
+#include "common.h"
+#include "gpu/buffer.h"
+#include "gpu/interconnect/common/common.h"
+
+namespace skyline::gpu::interconnect::maxwell3d {
+    /**
+     * @brief Handles using host Vulkan queries
+     */
+    class Queries {
+      public:
+        enum class CounterType : u32 {
+            Occulusion = 0, // NOTE(review): misspelling of "Occlusion", renaming requires updating every user of the enumerator
+            MaxValue
+        };
+
+      private:
+        /**
+         * @brief Represents a single query counter type
+         */
+        class Counter {
+          private:
+            static constexpr size_t QueryPoolSize{0x1000}; //!< Size of the underlying VK query pool to use
+
+            /**
+             * @brief Information required to report a single query with an optional timestamp
+             */
+            struct Query {
+                BufferView view; //!< View to write the query result to
+                BufferBinding timestampBinding; //!< Binding to buffer containing timestamp to write out (optional)
+            };
+
+            vk::raii::QueryPool pool; //!< Host query pool backing this counter
+
+            ContextTag lastTag{}; //!< Execution tag at the last time a query was begun
+            u32 lastRenderPassIndex{}; //!< Renderpass index at the last time a query was begun
+            bool recordOnNextEnd{}; //!< If to record the query copying code upon ending the next query
+
+            // A note on the below variables: In Vulkan you can begin/end queries in an RP but you can't copy the results. Since some games perform hundreds of queries in a row it's not ideal to have to constantly end the RP. To work around this, queries are performed on a per-RP basis, with a reset of query 0->queryCount before the RP begins, and all the copies after the RP ends. Since per-RP storage is needed for this the below variables are linearly allocated and replaced upon new queries happening in a new RP.
+            span<Query> queries{}; //!< A list of query reports to perform at the end of the current RP, linearly allocated
+            u32 *usedQueryCount{}; //!< Number of queries used from the pool in the current RP, linearly allocated
+            bool *queryActive{}; //!< If a query is active in the current RP, this is used so that the RP end code knows whether it needs to end the final query
+
+            std::function<void(vk::raii::CommandBuffer &, const std::shared_ptr<FenceCycle> &, GPU &)> Prepare(InterconnectContext &ctx); //!< Allocates fresh per-RP state if the execution or RP changed and returns a command resetting the pool, returns an empty function otherwise
+
+          public:
+            Counter(vk::raii::Device &device, vk::QueryType type); //!< Creates a pool of QueryPoolSize queries of the supplied type
+
+            /**
+             * @brief Begins a query in the command stream
+             * @param atExecutionStart Whether to insert the query begin at the start of the current executor or at the current position
+             */
+            void Begin(InterconnectContext &ctx, bool atExecutionStart = false);
+
+            /**
+             * @brief Records a query end in the command stream, and schedules a copy of the result into the target buffer after the current RP
+             * @param view View to copy the query result into
+             * @param timestamp Optional timestamp to report along with the query
+             */
+            void Report(InterconnectContext &ctx, BufferView view, std::optional<u64> timestamp);
+
+            /**
+             * @brief Records a query end
+             */
+            void End(InterconnectContext &ctx);
+
+        };
+
+        std::array<Counter, static_cast<u32>(CounterType::MaxValue)> counters; //!< One counter per CounterType, indexed by the enum value
+
+        CachedMappedBufferView view; //!< Cached view for looking up query buffers from IOVAs
+
+        std::unordered_set<u64> usedQueryAddresses; //!< Addresses that queries have been reported to (NOTE(review): entries are only ever added in queries.cpp - confirm unbounded growth is acceptable)
+
+      public:
+        Queries(GPU &gpu); //!< Constructs the counters using the Vulkan device of `gpu`
+
+        /**
+         * @brief Records a query of the counter corresponding to `type` and writes the result to the supplied address
+         */
+        void Query(InterconnectContext &ctx, soc::gm20b::IOVA address, CounterType type, std::optional<u64> timestamp);
+
+        /**
+         * @brief Resets the counter value for `type` to the default
+         */
+        void ResetCounter(InterconnectContext &ctx, CounterType type);
+
+        void PurgeCaches(InterconnectContext &ctx); //!< Purges the cached buffer view and ends any active query on all counters
+
+        /**
+         * @return If a query has ever been reported to `address`
+         */
+        bool QueryPresentAtAddress(soc::gm20b::IOVA address);
+    };
+}
--- a/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell/initialization.cpp
+++ b/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell/initialization.cpp
@ -276,6 +276,7 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
        HandleMethod(0x86, 0x10);
        HandleMethod(0x4B7, 0x10);
        HandleMethod(0x365, 0x1);
+        HandleMethod(0x556, 0x1);
        HandleMethod(0x559, 0xFFF);
        HandleMethod(0x55F, 0xFFFFF);
        HandleMethod(0x584, 0x12);
--- a/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell/types.h
+++ b/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell/types.h
@ -753,6 +753,33 @@ namespace skyline::soc::gm20b::engine::maxwell3d::type {
    };
    static_assert(sizeof(ClearSurface) == sizeof(u32));

+    struct ClearReportValue { // Register layout selecting which report counter is to be cleared
+        enum class Type : u32 {
+            ZPassPixelCount = 0x01,
+            ZCullStats = 0x02,
+            StreamingPrimitvesNeededMinusSucceeded = 0x03, // NOTE(review): "Primitves" is misspelt, renaming requires updating every user of the enumerator
+            AlphaBetaClocks = 0x04,
+            StreamingPrimitivesSucceeded = 0x10,
+            StreamingPrimitivesNeeded = 0x11,
+            VerticesGenerated = 0x12,
+            PrimitivesGenerated = 0x13,
+            VertexShaderInvocations = 0x15,
+            TessellationInitInvocations = 0x16,
+            TessellationShaderInvocations = 0x17,
+            TessellationShaderPrimitivesGenerated = 0x18,
+            GeometryShaderInvocations = 0x1A,
+            GeometryShaderPrimitivesGenerated = 0x1B,
+            ClipperInvocations = 0x1C,
+            ClipperPrimitivesGenerated = 0x1D,
+            PixelShaderInvocations = 0x1E,
+            VtgPrimitivesOut = 0x1F
+        };
+
+        Type type : 5; // 5-bit field holding the counter to clear
+        u32 _pad_ : 27; // Pads the bitfield out to a full 32 bits
+    };
+    static_assert(sizeof(ClearReportValue) == sizeof(u32)); // The struct must occupy exactly one 32-bit register
+
    struct SemaphoreInfo {
        enum class Op : u8 {
            Release = 0,
--- a/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_3d.cpp
+++ b/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_3d.cpp
@ -216,6 +216,10 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
                i2m.LoadInlineData(*registers.i2m, loadInlineData);
            })

+            ENGINE_CASE(clearReportValue, {
+                interconnect.ResetCounter(clearReportValue.type);
+            })
+
            ENGINE_CASE(syncpointAction, {
                Logger::Debug("Increment syncpoint: {}", static_cast<u16>(syncpointAction.id));
                channelCtx.executor.Submit([=, syncpoints = &this->syncpoints, index = syncpointAction.id]() {
@ -360,11 +364,15 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
                    case type::SemaphoreInfo::Op::Counter: {
                        switch (info.counterType) {
                            case type::SemaphoreInfo::CounterType::Zero:
-                                WriteSemaphoreResult(*registers.semaphore, registers.semaphore->payload);
+                                channelCtx.executor.Submit([=, this, semaphore = *registers.semaphore]() {
+                                    WriteSemaphoreResult(semaphore, semaphore.payload);
+                                });
                                break;
                            case type::SemaphoreInfo::CounterType::SamplesPassed:
                                // Return a fake result for now
-                                WriteSemaphoreResult(*registers.semaphore, 0xffffff);
+                                interconnect.Query({registers.semaphore->address}, info.counterType,
+                                                   registers.semaphore->info.structureSize == type::SemaphoreInfo::StructureSize::FourWords ?
+                                                   GetGpuTimeTicks() : std::optional<u64>{});
                                break;

                            default:
--- a/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_3d.h
+++ b/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_3d.h
@ -260,6 +260,9 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
            Register<0x547, u32> zCullStatCountersEnable;
            Register<0x548, u32> pointSpriteEnable;
            Register<0x54A, u32> shaderExceptions;
+
+            Register<0x54C, type::ClearReportValue> clearReportValue;
+
            Register<0x54D, u32> multisampleEnable;
            Register<0x54E, type::ZtSelect> ztSelect;