Implement primitive Linear->Block Linear DMA engine copies

Slightly inaccurate and misses some features, but good enough for most games; should be revisited later.
2024-11-27 02:04:22 +01:00 · 2022-04-09 17:56:06 +01:00 · 2022-04-09 17:56:06 +01:00 · 3e4e8de1d2
commit 3e4e8de1d2
parent 3c26921d54
4 changed files with 126 additions and 13 deletions
--- a/app/src/main/cpp/skyline/gpu/texture/format.h
+++ b/app/src/main/cpp/skyline/gpu/texture/format.h
@ -150,4 +150,22 @@ namespace skyline::gpu::format {
    #undef FORMAT_NORM_INT_FLOAT
    // @fmt:on
    /**
     * @brief Maps a pixel size in bytes onto one of the Uint formats declared in this namespace
     * @param bytesPerPixel The size of a single pixel in bytes, the recognised values are 1, 2, 4, 8 and 16
     * @return The format selected for the supplied size, any unrecognised size is logged as an error and R8Uint is returned instead
     */
    inline const gpu::texture::FormatBase &GetFormatForBpp(u32 bytesPerPixel) {
        switch (bytesPerPixel) {
            case 1: return R8Uint;
            case 2: return R8G8Uint;
            case 4: return R8G8B8A8Uint;
            case 8: return R16G16B16A16Uint;
            case 16: return R32G32B32A32Uint;
            default: break;
        }

        // No format is registered for this size: report it and fall back to the same format used for single-byte pixels
        Logger::Error("Couldn't convert bytes per pixel: {}", bytesPerPixel);
        return R8Uint;
    }
 }
--- a/app/src/main/cpp/skyline/gpu/texture/layout.h
+++ b/app/src/main/cpp/skyline/gpu/texture/layout.h
@ -12,7 +12,7 @@ namespace skyline::gpu::texture {
    constexpr u8 GobWidth{64}; // The width of a GOB in bytes
    constexpr u8 GobHeight{8}; // The height of a GOB in lines
-    size_t GetBlockLinearLayerSize(const GuestTexture &guest) {
+    inline size_t GetBlockLinearLayerSize(const GuestTexture &guest) {
        u32 blockHeight{guest.tileConfig.blockHeight}; //!< The height of the blocks in GOBs
        u32 robHeight{GobHeight * blockHeight}; //!< The height of a single ROB (Row of Blocks) in lines
        u32 surfaceHeightLines{util::DivideCeil(guest.dimensions.height, u32{guest.format->blockHeight})}; //!< The height of the surface in lines
@ -27,7 +27,7 @@ namespace skyline::gpu::texture {
    /**
     * @brief Copies pixel data between a linear and blocklinear texture
     */
-     template <typename CopyFunction>
+    template<typename CopyFunction>
    void CopyBlockLinearInternal(const GuestTexture &guest, u8 *blockLinear, u8 *linear, CopyFunction copyFunction) {
        u32 blockHeight{guest.tileConfig.blockHeight};
        u32 robHeight{GobHeight * blockHeight};
@ -99,15 +99,15 @@ namespace skyline::gpu::texture {
    /**
     * @brief Copies the contents of a blocklinear guest texture to a linear output buffer
     */
-    void CopyBlockLinearToLinear(const GuestTexture &guest, u8 *guestInput, u8 *linearOutput) {
+    inline void CopyBlockLinearToLinear(const GuestTexture &guest, u8 *guestInput, u8 *linearOutput) {
        CopyBlockLinearInternal(guest, guestInput, linearOutput, std::memcpy);
    }
    /**
     * @brief Copies the contents of a linear input buffer to a blocklinear guest texture
     */
-    void CopyLinearToBlockLinear(const GuestTexture &guest, u8 *linearInput, u8 *guestOutput) {
+    inline void CopyLinearToBlockLinear(const GuestTexture &guest, u8 *linearInput, u8 *guestOutput) {
-        CopyBlockLinearInternal(guest, guestOutput, linearInput, [](u8* src, u8* dst, size_t size) {
+        CopyBlockLinearInternal(guest, guestOutput, linearInput, [](u8 *src, u8 *dst, size_t size) {
            std::memcpy(dst, src, size);
        });
    }
@ -115,7 +115,7 @@ namespace skyline::gpu::texture {
    /**
     * @brief Copies the contents of a pitch-linear guest texture to a linear output buffer
     */
-    void CopyPitchLinearToLinear(const GuestTexture &guest, u8 *guestInput, u8 *linearOutput) {
+    inline void CopyPitchLinearToLinear(const GuestTexture &guest, u8 *guestInput, u8 *linearOutput) {
        auto sizeLine{guest.format->GetSize(guest.dimensions.width, 1)}; //!< The size of a single line of pixel data
        auto sizeStride{guest.tileConfig.pitch}; //!< The size of a single stride of pixel data
@ -132,7 +132,7 @@ namespace skyline::gpu::texture {
    /**
     * @brief Copies the contents of a linear buffer to a pitch-linear guest texture
     */
-    void CopyLinearToPitchLinear(const GuestTexture &guest, u8 *linearInput, u8 *guestOutput) {
+    inline void CopyLinearToPitchLinear(const GuestTexture &guest, u8 *linearInput, u8 *guestOutput) {
        auto sizeLine{guest.format->GetSize(guest.dimensions.width, 1)}; //!< The size of a single line of pixel data
        auto sizeStride{guest.tileConfig.pitch}; //!< The size of a single stride of pixel data
--- a/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_dma.cpp
+++ b/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_dma.cpp
@ -1,6 +1,9 @@
 // SPDX-License-Identifier: MPL-2.0
 // Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
 // Copyright © 2022 yuzu Emulator Project (https://github.com/yuzu-emu/yuzu/)
 #include <gpu/texture/format.h>
 #include <gpu/texture/layout.h>
 #include <soc.h>
 #include <soc/gm20b/channel.h>
 #include <soc/gm20b/gmmu.h>
@ -27,9 +30,18 @@ namespace skyline::soc::gm20b::engine {
        if (*registers.lineLengthIn == 0)
            return; // Nothing to copy
        if (registers.launchDma->remapEnable) {
            Logger::Warn("DMA remapping is unimplemented!");
            return;
        }
        if (registers.launchDma->multiLineEnable) {
-            // 2D/3D copy
+            if (registers.launchDma->srcMemoryLayout == Registers::LaunchDma::MemoryLayout::Pitch &&
-            Logger::Warn("2D/3D DMA engine copies are unimplemented");
+                  registers.launchDma->dstMemoryLayout == Registers::LaunchDma::MemoryLayout::BlockLinear)
                CopyPitchToBlockLinear();
            else
                Logger::Warn("Unimplemented multi-line copy type: {} -> {}!",
                              static_cast<u8>(registers.launchDma->srcMemoryLayout), static_cast<u8>(registers.launchDma->dstMemoryLayout));
        } else {
            // 1D buffer copy
            // TODO: implement swizzled 1D copies based on VMM 'kind'
@ -38,6 +50,64 @@ namespace skyline::soc::gm20b::engine {
        }
    }
    /**
     * @brief Performs a 2D copy from a pitch-linear source into a block-linear destination using the current DMA registers
     * @note Only a subset of configurations is handled, every other configuration is logged as a warning and skipped without copying anything:
     *       3D destinations, block widths other than 1, source pitches that differ from the line size, non-zero destination origins,
     *       source/destination ranges that translate to more than one mapping and sources with fewer lines than the destination height
     */
    void MaxwellDma::CopyPitchToBlockLinear() {
        if (registers.dstSurface->blockSize.Depth() > 1 || registers.dstSurface->depth > 1) {
            Logger::Warn("3D DMA engine copies are unimplemented!");
            return;
        }
        if (registers.dstSurface->blockSize.Width() != 1) {
            Logger::Warn("DMA engine copies with block widths other than 1 are unimplemented!");
            return;
        }
        u32 bytesPerPixel{static_cast<u32>(registers.remapComponents->ComponentSize() * registers.remapComponents->NumSrcComponents())};
        // The source is treated as one tightly packed buffer below, so any padding between lines cannot be handled
        if (bytesPerPixel * *registers.lineLengthIn != *registers.pitchIn) {
            Logger::Warn("Non-linear DMA source textures are not implemented!");
            return;
        }
        if (registers.dstSurface->origin.x || registers.dstSurface->origin.y) {
            Logger::Warn("Non-zero origin DMA copies are not implemented!");
            return;
        }
        // The copy at the end of this function is described solely by the destination texture (width: lineLengthIn, height: dstSurface->height)
        // while the source range is only translated for lineCount lines, a taller destination would therefore read past the translated source range
        if (*registers.lineCount < registers.dstSurface->height) {
            Logger::Warn("DMA copies with fewer source lines than the destination height are unimplemented: src: {} dst: {}", *registers.lineCount, registers.dstSurface->height);
            return;
        }
        // The textures are created with an empty placeholder mapping which is filled in once the GPU virtual address has been translated
        gpu::GuestTexture srcTexture{span<u8>{},
                                     gpu::texture::Dimensions{*registers.lineLengthIn, *registers.lineCount, 1},
                                     gpu::format::GetFormatForBpp(bytesPerPixel),
                                     gpu::texture::TileConfig{ .mode = gpu::texture::TileMode::Linear },
                                     gpu::texture::TextureType::e2D};
        if (auto mappings{channelCtx.asCtx->gmmu.TranslateRange(*registers.offsetIn, srcTexture.GetLayerSize())}; mappings.size() == 1) {
            srcTexture.mappings[0] = mappings[0];
        } else {
            Logger::Warn("DMA for split textures is unimplemented!");
            return;
        }
        // A width mismatch is only reported, the copy still proceeds using the source line length as the destination width
        if (*registers.lineLengthIn != registers.dstSurface->width)
            Logger::Warn("DMA copy width mismatch: src: {} dst: {}", *registers.lineLengthIn, registers.dstSurface->width);
        // This represents a single layer view into a potentially multi-layer texture
        gpu::GuestTexture dstTexture{span<u8>{},
                                     gpu::texture::Dimensions{*registers.lineLengthIn, registers.dstSurface->height, 1},
                                     gpu::format::GetFormatForBpp(bytesPerPixel),
                                     gpu::texture::TileConfig{ .mode = gpu::texture::TileMode::Block, .blockHeight = registers.dstSurface->blockSize.Height(), .blockDepth = 1 },
                                     gpu::texture::TextureType::e2D};
        // Layers are addressed as consecutive regions of one layer size each, starting at the output offset
        u64 dstLayerAddress{*registers.offsetOut + dstTexture.GetLayerSize() * registers.dstSurface->layer};
        if (auto mappings{channelCtx.asCtx->gmmu.TranslateRange(dstLayerAddress, dstTexture.GetLayerSize())}; mappings.size() == 1) {
            dstTexture.mappings[0] = mappings[0];
        } else {
            Logger::Warn("DMA for split textures is unimplemented!");
            return;
        }
        Logger::Debug("{}x{}@0x{:X} -> {}x{}@0x{:X}", srcTexture.dimensions.width, srcTexture.dimensions.height, *registers.offsetIn, dstTexture.dimensions.width, dstTexture.dimensions.height, dstLayerAddress);
        gpu::texture::CopyLinearToBlockLinear(dstTexture, srcTexture.mappings.front().data(), dstTexture.mappings.front().data());
    }
    void MaxwellDma::CallMethodBatchNonInc(u32 method, span<u32> arguments) {
        for (u32 argument : arguments)
            HandleMethod(method, argument);
--- a/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_dma.h
+++ b/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_dma.h
@ -22,6 +22,8 @@ namespace skyline::soc::gm20b::engine {
        void LaunchDma();
        void CopyPitchToBlockLinear();
      public:
        /**
         * @url https://github.com/NVIDIA/open-gpu-doc/blob/master/classes/dma-copy/clb0b5.h
@ -187,19 +189,42 @@ namespace skyline::soc::gm20b::engine {
                u8 _pad5_ : 2;
                u8 numDstComponentsMinusOne : 2;
                u8 _pad6_ : 6;
                /**
                 * @return The value stored in componentSizeMinusOne incremented by one
                 */
                u8 ComponentSize() const {
                    return static_cast<u8>(componentSizeMinusOne + 1);
                }
                /**
                 * @return The value stored in numSrcComponentsMinusOne incremented by one
                 */
                u8 NumSrcComponents() const {
                    return static_cast<u8>(numSrcComponentsMinusOne + 1);
                }
                /**
                 * @return The value stored in numDstComponentsMinusOne (a 2-bit field) incremented by one
                 */
                u8 NumDstComponents() const {
                    return static_cast<u8>(numDstComponentsMinusOne + 1);
                }
            };
            static_assert(sizeof(RemapComponents) == 0xC);
            Register<0x1C2, RemapComponents> remapComponents;
            struct Surface {
                // Nvidia's docs here differ from other emus and deko3d so go with what they say
                struct {
-                    u8 width : 4;
+                    u8 widthLog2 : 4;
-                    u8 height : 4;
+                    u8 heightLog2 : 4;
-                    u8 depth : 4;
+                    u8 depthLog2 : 4;
                    u8 gobHeight : 4;
                    u16 _pad_;
                    /**
                     * @return 2 raised to the power of widthLog2, narrowed to 8 bits
                     * @note widthLog2 is a 4-bit field, for values above 7 the result does not fit into a u8 and the cast yields 0
                     */
                    u8 Width() const {
                        return static_cast<u8>(1 << widthLog2);
                    }
                    /**
                     * @return 2 raised to the power of heightLog2, narrowed to 8 bits
                     * @note heightLog2 is a 4-bit field, for values above 7 the result does not fit into a u8 and the cast yields 0
                     */
                    u8 Height() const {
                        return static_cast<u8>(1 << heightLog2);
                    }
                    /**
                     * @return 2 raised to the power of depthLog2, narrowed to 8 bits
                     * @note depthLog2 is a 4-bit field, for values above 7 the result does not fit into a u8 and the cast yields 0
                     */
                    u8 Depth() const {
                        return static_cast<u8>(1 << depthLog2);
                    }
                } blockSize;
                u32 width;
                u32 height;