From d1f600601d94889e36e218a16add20cd20e6ee17 Mon Sep 17 00:00:00 2001 From: GPUCode <47210458+GPUCode@users.noreply.github.com> Date: Mon, 28 Aug 2023 11:09:23 +0300 Subject: [PATCH] renderer_software: Multi-thread processing (#6698) * renderer_software: Multi-thread processing * Doubles the performance in most cases * renderer_software: Move memory access out of the raster loop * Profiling shows this has a significant impact --- .../renderer_software/sw_framebuffer.cpp | 48 ++- .../renderer_software/sw_framebuffer.h | 21 +- .../renderer_software/sw_rasterizer.cpp | 305 +++++++++--------- .../renderer_software/sw_rasterizer.h | 8 +- 4 files changed, 201 insertions(+), 181 deletions(-) diff --git a/src/video_core/renderer_software/sw_framebuffer.cpp b/src/video_core/renderer_software/sw_framebuffer.cpp index 3f4e47b01..0b0695ca5 100644 --- a/src/video_core/renderer_software/sw_framebuffer.cpp +++ b/src/video_core/renderer_software/sw_framebuffer.cpp @@ -41,10 +41,22 @@ Framebuffer::Framebuffer(Memory::MemorySystem& memory_, const Pica::FramebufferR Framebuffer::~Framebuffer() = default; -void Framebuffer::DrawPixel(int x, int y, const Common::Vec4& color) const { - const auto& framebuffer = regs.framebuffer; - const PAddr addr = framebuffer.GetColorBufferPhysicalAddress(); +void Framebuffer::Bind() { + PAddr addr = regs.framebuffer.GetColorBufferPhysicalAddress(); + if (color_addr != addr) [[unlikely]] { + color_addr = addr; + color_buffer = memory.GetPhysicalPointer(color_addr); + } + addr = regs.framebuffer.GetDepthBufferPhysicalAddress(); + if (depth_addr != addr) [[unlikely]] { + depth_addr = addr; + depth_buffer = memory.GetPhysicalPointer(depth_addr); + } +} + +void Framebuffer::DrawPixel(u32 x, u32 y, const Common::Vec4& color) const { + const auto& framebuffer = regs.framebuffer; // Similarly to textures, the render framebuffer is laid out from bottom to top, too. // NOTE: The framebuffer height register contains the actual FB height minus one. y = framebuffer.height - y; @@ -54,8 +66,7 @@ void Framebuffer::DrawPixel(int x, int y, const Common::Vec4& color) const { GPU::Regs::BytesPerPixel(GPU::Regs::PixelFormat(framebuffer.color_format.Value())); const u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * framebuffer.width * bytes_per_pixel; - u8* depth_buffer = memory.GetPhysicalPointer(addr); - u8* dst_pixel = depth_buffer + dst_offset; + u8* dst_pixel = color_buffer + dst_offset; switch (framebuffer.color_format) { case FramebufferRegs::ColorFormat::RGBA8: @@ -80,10 +91,8 @@ void Framebuffer::DrawPixel(int x, int y, const Common::Vec4& color) const { } } -const Common::Vec4 Framebuffer::GetPixel(int x, int y) const { +const Common::Vec4 Framebuffer::GetPixel(u32 x, u32 y) const { const auto& framebuffer = regs.framebuffer; - const PAddr addr = framebuffer.GetColorBufferPhysicalAddress(); - y = framebuffer.height - y; const u32 coarse_y = y & ~7; @@ -91,7 +100,6 @@ const Common::Vec4 Framebuffer::GetPixel(int x, int y) const { GPU::Regs::BytesPerPixel(GPU::Regs::PixelFormat(framebuffer.color_format.Value())); const u32 src_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * framebuffer.width * bytes_per_pixel; - const u8* color_buffer = memory.GetPhysicalPointer(addr); const u8* src_pixel = color_buffer + src_offset; switch (framebuffer.color_format) { @@ -114,10 +122,8 @@ const Common::Vec4 Framebuffer::GetPixel(int x, int y) const { return {0, 0, 0, 0}; } -u32 Framebuffer::GetDepth(int x, int y) const { +u32 Framebuffer::GetDepth(u32 x, u32 y) const { const auto& framebuffer = regs.framebuffer; - const PAddr addr = framebuffer.GetDepthBufferPhysicalAddress(); - y = framebuffer.height - y; const u32 coarse_y = y & ~7; @@ -125,7 +131,6 @@ u32 Framebuffer::GetDepth(int x, int y) const { const u32 stride = framebuffer.width * bytes_per_pixel; const u32 src_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * stride; - const u8* depth_buffer = memory.GetPhysicalPointer(addr); const u8* src_pixel = depth_buffer + src_offset; switch (framebuffer.depth_format) { @@ -143,10 +148,8 @@ u32 Framebuffer::GetDepth(int x, int y) const { } } -u8 Framebuffer::GetStencil(int x, int y) const { +u8 Framebuffer::GetStencil(u32 x, u32 y) const { const auto& framebuffer = regs.framebuffer; - const PAddr addr = framebuffer.GetDepthBufferPhysicalAddress(); - y = framebuffer.height - y; const u32 coarse_y = y & ~7; @@ -154,7 +157,6 @@ u8 Framebuffer::GetStencil(int x, int y) const { const u32 stride = framebuffer.width * bytes_per_pixel; const u32 src_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * stride; - const u8* depth_buffer = memory.GetPhysicalPointer(addr); const u8* src_pixel = depth_buffer + src_offset; switch (framebuffer.depth_format) { @@ -169,10 +171,8 @@ u8 Framebuffer::GetStencil(int x, int y) const { } } -void Framebuffer::SetDepth(int x, int y, u32 value) const { +void Framebuffer::SetDepth(u32 x, u32 y, u32 value) const { const auto& framebuffer = regs.framebuffer; - const PAddr addr = framebuffer.GetDepthBufferPhysicalAddress(); - y = framebuffer.height - y; const u32 coarse_y = y & ~7; @@ -180,7 +180,6 @@ void Framebuffer::SetDepth(int x, int y, u32 value) const { const u32 stride = framebuffer.width * bytes_per_pixel; const u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * stride; - u8* depth_buffer = memory.GetPhysicalPointer(addr); u8* dst_pixel = depth_buffer + dst_offset; switch (framebuffer.depth_format) { @@ -201,10 +200,8 @@ void Framebuffer::SetDepth(int x, int y, u32 value) const { } } -void Framebuffer::SetStencil(int x, int y, u8 value) const { +void Framebuffer::SetStencil(u32 x, u32 y, u8 value) const { const auto& framebuffer = regs.framebuffer; - const PAddr addr = framebuffer.GetDepthBufferPhysicalAddress(); - y = framebuffer.height - y; const u32 coarse_y = y & ~7; @@ -212,7 +209,6 @@ void Framebuffer::SetStencil(int x, int y, u8 value) const { const u32 stride = framebuffer.width * bytes_per_pixel; const u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * stride; - u8* depth_buffer = memory.GetPhysicalPointer(addr); u8* dst_pixel = depth_buffer + dst_offset; switch (framebuffer.depth_format) { @@ -231,7 +227,7 @@ void Framebuffer::SetStencil(int x, int y, u8 value) const { } } -void Framebuffer::DrawShadowMapPixel(int x, int y, u32 depth, u8 stencil) const { +void Framebuffer::DrawShadowMapPixel(u32 x, u32 y, u32 depth, u8 stencil) const { const auto& framebuffer = regs.framebuffer; const auto& shadow = regs.shadow; const PAddr addr = framebuffer.GetColorBufferPhysicalAddress(); diff --git a/src/video_core/renderer_software/sw_framebuffer.h b/src/video_core/renderer_software/sw_framebuffer.h index db0cb66d5..db5c8d259 100644 --- a/src/video_core/renderer_software/sw_framebuffer.h +++ b/src/video_core/renderer_software/sw_framebuffer.h @@ -23,30 +23,37 @@ public: explicit Framebuffer(Memory::MemorySystem& memory, const Pica::FramebufferRegs& framebuffer); ~Framebuffer(); + /// Updates the framebuffer addresses from the PICA registers. + void Bind(); + /// Draws a pixel at the specified coordinates. - void DrawPixel(int x, int y, const Common::Vec4& color) const; + void DrawPixel(u32 x, u32 y, const Common::Vec4& color) const; /// Returns the current color at the specified coordinates. - [[nodiscard]] const Common::Vec4 GetPixel(int x, int y) const; + [[nodiscard]] const Common::Vec4 GetPixel(u32 x, u32 y) const; /// Returns the depth value at the specified coordinates. - [[nodiscard]] u32 GetDepth(int x, int y) const; + [[nodiscard]] u32 GetDepth(u32 x, u32 y) const; /// Returns the stencil value at the specified coordinates. - [[nodiscard]] u8 GetStencil(int x, int y) const; + [[nodiscard]] u8 GetStencil(u32 x, u32 y) const; /// Stores the provided depth value at the specified coordinates. - void SetDepth(int x, int y, u32 value) const; + void SetDepth(u32 x, u32 y, u32 value) const; /// Stores the provided stencil value at the specified coordinates. - void SetStencil(int x, int y, u8 value) const; + void SetStencil(u32 x, u32 y, u8 value) const; /// Draws a pixel to the shadow buffer. - void DrawShadowMapPixel(int x, int y, u32 depth, u8 stencil) const; + void DrawShadowMapPixel(u32 x, u32 y, u32 depth, u8 stencil) const; private: Memory::MemorySystem& memory; const Pica::FramebufferRegs& regs; + PAddr color_addr; + u8* color_buffer{}; + PAddr depth_addr; + u8* depth_buffer{}; }; u8 PerformStencilAction(Pica::FramebufferRegs::StencilAction action, u8 old_stencil, u8 ref); diff --git a/src/video_core/renderer_software/sw_rasterizer.cpp b/src/video_core/renderer_software/sw_rasterizer.cpp index 8b3d6bfee..9720a4d49 100644 --- a/src/video_core/renderer_software/sw_rasterizer.cpp +++ b/src/video_core/renderer_software/sw_rasterizer.cpp @@ -96,7 +96,9 @@ private: } // Anonymous namespace RasterizerSoftware::RasterizerSoftware(Memory::MemorySystem& memory_) - : memory{memory_}, state{Pica::g_state}, regs{state.regs}, fb{memory, regs.framebuffer} {} + : memory{memory_}, state{Pica::g_state}, regs{state.regs}, + num_sw_threads{std::max(std::thread::hardware_concurrency(), 2U)}, + sw_workers{num_sw_threads, "SwRenderer workers"}, fb{memory, regs.framebuffer} {} void RasterizerSoftware::AddTriangle(const Pica::Shader::OutputVertex& v0, const Pica::Shader::OutputVertex& v1, @@ -289,167 +291,180 @@ void RasterizerSoftware::ProcessTriangle(const Vertex& v0, const Vertex& v1, con const auto w_inverse = Common::MakeVec(v0.pos.w, v1.pos.w, v2.pos.w); - auto textures = regs.texturing.GetTextures(); + const auto textures = regs.texturing.GetTextures(); const auto tev_stages = regs.texturing.GetTevStages(); + fb.Bind(); + // Enter rasterization loop, starting at the center of the topleft bounding box corner. // TODO: Not sure if looping through x first might be faster for (u16 y = min_y + 8; y < max_y; y += 0x10) { - for (u16 x = min_x + 8; x < max_x; x += 0x10) { - // Do not process the pixel if it's inside the scissor box and the scissor mode is set - // to Exclude. - if (regs.rasterizer.scissor_test.mode == RasterizerRegs::ScissorMode::Exclude) { - if (x >= scissor_x1 && x < scissor_x2 && y >= scissor_y1 && y < scissor_y2) { + const auto process_scanline = [&, y] { + for (u16 x = min_x + 8; x < max_x; x += 0x10) { + // Do not process the pixel if it's inside the scissor box and the scissor mode is + // set to Exclude. + if (regs.rasterizer.scissor_test.mode == RasterizerRegs::ScissorMode::Exclude) { + if (x >= scissor_x1 && x < scissor_x2 && y >= scissor_y1 && y < scissor_y2) { + continue; + } + } + + // Calculate the barycentric coordinates w0, w1 and w2 + const s32 w0 = bias0 + SignedArea(vtxpos[1].xy(), vtxpos[2].xy(), {x, y}); + const s32 w1 = bias1 + SignedArea(vtxpos[2].xy(), vtxpos[0].xy(), {x, y}); + const s32 w2 = bias2 + SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), {x, y}); + const s32 wsum = w0 + w1 + w2; + + // If current pixel is not covered by the current primitive + if (w0 < 0 || w1 < 0 || w2 < 0) { continue; } - } - // Calculate the barycentric coordinates w0, w1 and w2 - const s32 w0 = bias0 + SignedArea(vtxpos[1].xy(), vtxpos[2].xy(), {x, y}); - const s32 w1 = bias1 + SignedArea(vtxpos[2].xy(), vtxpos[0].xy(), {x, y}); - const s32 w2 = bias2 + SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), {x, y}); - const s32 wsum = w0 + w1 + w2; + const auto baricentric_coordinates = Common::MakeVec( + f24::FromFloat32(static_cast(w0)), f24::FromFloat32(static_cast(w1)), + f24::FromFloat32(static_cast(w2))); + const f24 interpolated_w_inverse = + f24::One() / Common::Dot(w_inverse, baricentric_coordinates); - // If current pixel is not covered by the current primitive - if (w0 < 0 || w1 < 0 || w2 < 0) { - continue; - } + // interpolated_z = z / w + const float interpolated_z_over_w = + (v0.screenpos[2].ToFloat32() * w0 + v1.screenpos[2].ToFloat32() * w1 + + v2.screenpos[2].ToFloat32() * w2) / + wsum; - const auto baricentric_coordinates = Common::MakeVec( - f24::FromFloat32(static_cast(w0)), f24::FromFloat32(static_cast(w1)), - f24::FromFloat32(static_cast(w2))); - const f24 interpolated_w_inverse = - f24::One() / Common::Dot(w_inverse, baricentric_coordinates); + // Not fully accurate. About 3 bits in precision are missing. + // Z-Buffer (z / w * scale + offset) + const float depth_scale = + f24::FromRaw(regs.rasterizer.viewport_depth_range).ToFloat32(); + const float depth_offset = + f24::FromRaw(regs.rasterizer.viewport_depth_near_plane).ToFloat32(); + float depth = interpolated_z_over_w * depth_scale + depth_offset; - // interpolated_z = z / w - const float interpolated_z_over_w = - (v0.screenpos[2].ToFloat32() * w0 + v1.screenpos[2].ToFloat32() * w1 + - v2.screenpos[2].ToFloat32() * w2) / - wsum; + // Potentially switch to W-Buffer + if (regs.rasterizer.depthmap_enable == + Pica::RasterizerRegs::DepthBuffering::WBuffering) { + // W-Buffer (z * scale + w * offset = (z / w * scale + offset) * w) + depth *= interpolated_w_inverse.ToFloat32() * wsum; + } - // Not fully accurate. About 3 bits in precision are missing. - // Z-Buffer (z / w * scale + offset) - const float depth_scale = - f24::FromRaw(regs.rasterizer.viewport_depth_range).ToFloat32(); - const float depth_offset = - f24::FromRaw(regs.rasterizer.viewport_depth_near_plane).ToFloat32(); - float depth = interpolated_z_over_w * depth_scale + depth_offset; + // Clamp the result + depth = std::clamp(depth, 0.0f, 1.0f); - // Potentially switch to W-Buffer - if (regs.rasterizer.depthmap_enable == - Pica::RasterizerRegs::DepthBuffering::WBuffering) { - // W-Buffer (z * scale + w * offset = (z / w * scale + offset) * w) - depth *= interpolated_w_inverse.ToFloat32() * wsum; - } - - // Clamp the result - depth = std::clamp(depth, 0.0f, 1.0f); - - /** - * Perspective correct attribute interpolation: - * Attribute values cannot be calculated by simple linear interpolation since - * they are not linear in screen space. For example, when interpolating a - * texture coordinate across two vertices, something simple like - * u = (u0*w0 + u1*w1)/(w0+w1) - * will not work. However, the attribute value divided by the - * clipspace w-coordinate (u/w) and and the inverse w-coordinate (1/w) are linear - * in screenspace. Hence, we can linearly interpolate these two independently and - * calculate the interpolated attribute by dividing the results. - * I.e. - * u_over_w = ((u0/v0.pos.w)*w0 + (u1/v1.pos.w)*w1)/(w0+w1) - * one_over_w = (( 1/v0.pos.w)*w0 + ( 1/v1.pos.w)*w1)/(w0+w1) - * u = u_over_w / one_over_w - * - * The generalization to three vertices is straightforward in baricentric coordinates. - **/ - const auto get_interpolated_attribute = [&](f24 attr0, f24 attr1, f24 attr2) { - auto attr_over_w = Common::MakeVec(attr0, attr1, attr2); - f24 interpolated_attr_over_w = Common::Dot(attr_over_w, baricentric_coordinates); - return interpolated_attr_over_w * interpolated_w_inverse; - }; - - const Common::Vec4 primary_color{ - static_cast( - round(get_interpolated_attribute(v0.color.r(), v1.color.r(), v2.color.r()) - .ToFloat32() * - 255)), - static_cast( - round(get_interpolated_attribute(v0.color.g(), v1.color.g(), v2.color.g()) - .ToFloat32() * - 255)), - static_cast( - round(get_interpolated_attribute(v0.color.b(), v1.color.b(), v2.color.b()) - .ToFloat32() * - 255)), - static_cast( - round(get_interpolated_attribute(v0.color.a(), v1.color.a(), v2.color.a()) - .ToFloat32() * - 255)), - }; - - std::array, 3> uv; - uv[0].u() = get_interpolated_attribute(v0.tc0.u(), v1.tc0.u(), v2.tc0.u()); - uv[0].v() = get_interpolated_attribute(v0.tc0.v(), v1.tc0.v(), v2.tc0.v()); - uv[1].u() = get_interpolated_attribute(v0.tc1.u(), v1.tc1.u(), v2.tc1.u()); - uv[1].v() = get_interpolated_attribute(v0.tc1.v(), v1.tc1.v(), v2.tc1.v()); - uv[2].u() = get_interpolated_attribute(v0.tc2.u(), v1.tc2.u(), v2.tc2.u()); - uv[2].v() = get_interpolated_attribute(v0.tc2.v(), v1.tc2.v(), v2.tc2.v()); - - // Sample bound texture units. - const f24 tc0_w = get_interpolated_attribute(v0.tc0_w, v1.tc0_w, v2.tc0_w); - const auto texture_color = TextureColor(uv, textures, tc0_w); - - Common::Vec4 primary_fragment_color = {0, 0, 0, 0}; - Common::Vec4 secondary_fragment_color = {0, 0, 0, 0}; - - if (!regs.lighting.disable) { - const auto normquat = - Common::Quaternion{ - {get_interpolated_attribute(v0.quat.x, v1.quat.x, v2.quat.x).ToFloat32(), - get_interpolated_attribute(v0.quat.y, v1.quat.y, v2.quat.y).ToFloat32(), - get_interpolated_attribute(v0.quat.z, v1.quat.z, v2.quat.z).ToFloat32()}, - get_interpolated_attribute(v0.quat.w, v1.quat.w, v2.quat.w).ToFloat32(), - } - .Normalized(); - - const Common::Vec3f view{ - get_interpolated_attribute(v0.view.x, v1.view.x, v2.view.x).ToFloat32(), - get_interpolated_attribute(v0.view.y, v1.view.y, v2.view.y).ToFloat32(), - get_interpolated_attribute(v0.view.z, v1.view.z, v2.view.z).ToFloat32(), + /** + * Perspective correct attribute interpolation: + * Attribute values cannot be calculated by simple linear interpolation since + * they are not linear in screen space. For example, when interpolating a + * texture coordinate across two vertices, something simple like + * u = (u0*w0 + u1*w1)/(w0+w1) + * will not work. However, the attribute value divided by the + * clipspace w-coordinate (u/w) and and the inverse w-coordinate (1/w) are linear + * in screenspace. Hence, we can linearly interpolate these two independently and + * calculate the interpolated attribute by dividing the results. + * I.e. + * u_over_w = ((u0/v0.pos.w)*w0 + (u1/v1.pos.w)*w1)/(w0+w1) + * one_over_w = (( 1/v0.pos.w)*w0 + ( 1/v1.pos.w)*w1)/(w0+w1) + * u = u_over_w / one_over_w + * + * The generalization to three vertices is straightforward in baricentric + *coordinates. + **/ + const auto get_interpolated_attribute = [&](f24 attr0, f24 attr1, f24 attr2) { + auto attr_over_w = Common::MakeVec(attr0, attr1, attr2); + f24 interpolated_attr_over_w = + Common::Dot(attr_over_w, baricentric_coordinates); + return interpolated_attr_over_w * interpolated_w_inverse; }; - std::tie(primary_fragment_color, secondary_fragment_color) = ComputeFragmentsColors( - regs.lighting, state.lighting, normquat, view, texture_color); - } - // Write the TEV stages. - auto combiner_output = WriteTevConfig(texture_color, tev_stages, primary_color, - primary_fragment_color, secondary_fragment_color); + const Common::Vec4 primary_color{ + static_cast( + round(get_interpolated_attribute(v0.color.r(), v1.color.r(), v2.color.r()) + .ToFloat32() * + 255)), + static_cast( + round(get_interpolated_attribute(v0.color.g(), v1.color.g(), v2.color.g()) + .ToFloat32() * + 255)), + static_cast( + round(get_interpolated_attribute(v0.color.b(), v1.color.b(), v2.color.b()) + .ToFloat32() * + 255)), + static_cast( + round(get_interpolated_attribute(v0.color.a(), v1.color.a(), v2.color.a()) + .ToFloat32() * + 255)), + }; - const auto& output_merger = regs.framebuffer.output_merger; - if (output_merger.fragment_operation_mode == - FramebufferRegs::FragmentOperationMode::Shadow) { - u32 depth_int = static_cast(depth * 0xFFFFFF); - // Use green color as the shadow intensity - u8 stencil = combiner_output.y; - fb.DrawShadowMapPixel(x >> 4, y >> 4, depth_int, stencil); - // Skip the normal output merger pipeline if it is in shadow mode - continue; - } + std::array, 3> uv; + uv[0].u() = get_interpolated_attribute(v0.tc0.u(), v1.tc0.u(), v2.tc0.u()); + uv[0].v() = get_interpolated_attribute(v0.tc0.v(), v1.tc0.v(), v2.tc0.v()); + uv[1].u() = get_interpolated_attribute(v0.tc1.u(), v1.tc1.u(), v2.tc1.u()); + uv[1].v() = get_interpolated_attribute(v0.tc1.v(), v1.tc1.v(), v2.tc1.v()); + uv[2].u() = get_interpolated_attribute(v0.tc2.u(), v1.tc2.u(), v2.tc2.u()); + uv[2].v() = get_interpolated_attribute(v0.tc2.v(), v1.tc2.v(), v2.tc2.v()); - // Does alpha testing happen before or after stencil? - if (!DoAlphaTest(combiner_output.a())) { - continue; + // Sample bound texture units. + const f24 tc0_w = get_interpolated_attribute(v0.tc0_w, v1.tc0_w, v2.tc0_w); + const auto texture_color = TextureColor(uv, textures, tc0_w); + + Common::Vec4 primary_fragment_color = {0, 0, 0, 0}; + Common::Vec4 secondary_fragment_color = {0, 0, 0, 0}; + + if (!regs.lighting.disable) { + const auto normquat = + Common::Quaternion{ + {get_interpolated_attribute(v0.quat.x, v1.quat.x, v2.quat.x) + .ToFloat32(), + get_interpolated_attribute(v0.quat.y, v1.quat.y, v2.quat.y) + .ToFloat32(), + get_interpolated_attribute(v0.quat.z, v1.quat.z, v2.quat.z) + .ToFloat32()}, + get_interpolated_attribute(v0.quat.w, v1.quat.w, v2.quat.w).ToFloat32(), + } + .Normalized(); + + const Common::Vec3f view{ + get_interpolated_attribute(v0.view.x, v1.view.x, v2.view.x).ToFloat32(), + get_interpolated_attribute(v0.view.y, v1.view.y, v2.view.y).ToFloat32(), + get_interpolated_attribute(v0.view.z, v1.view.z, v2.view.z).ToFloat32(), + }; + std::tie(primary_fragment_color, secondary_fragment_color) = + ComputeFragmentsColors(regs.lighting, state.lighting, normquat, view, + texture_color); + } + + // Write the TEV stages. + auto combiner_output = + WriteTevConfig(texture_color, tev_stages, primary_color, primary_fragment_color, + secondary_fragment_color); + + const auto& output_merger = regs.framebuffer.output_merger; + if (output_merger.fragment_operation_mode == + FramebufferRegs::FragmentOperationMode::Shadow) { + const u32 depth_int = static_cast(depth * 0xFFFFFF); + // Use green color as the shadow intensity + const u8 stencil = combiner_output.y; + fb.DrawShadowMapPixel(x >> 4, y >> 4, depth_int, stencil); + // Skip the normal output merger pipeline if it is in shadow mode + continue; + } + + // Does alpha testing happen before or after stencil? + if (!DoAlphaTest(combiner_output.a())) { + continue; + } + WriteFog(depth, combiner_output); + if (!DoDepthStencilTest(x, y, depth)) { + continue; + } + const auto result = PixelColor(x, y, combiner_output); + if (regs.framebuffer.framebuffer.allow_color_write != 0) { + fb.DrawPixel(x >> 4, y >> 4, result); + } } - WriteFog(combiner_output, depth); - if (!DoDepthStencilTest(x, y, depth)) { - continue; - } - const auto result = PixelColor(x, y, combiner_output); - if (regs.framebuffer.framebuffer.allow_color_write != 0) { - fb.DrawPixel(x >> 4, y >> 4, result); - } - } + }; + sw_workers.QueueWork(std::move(process_scanline)); } + sw_workers.WaitForRequests(); } std::array, 4> RasterizerSoftware::TextureColor( @@ -573,7 +588,7 @@ std::array, 4> RasterizerSoftware::TextureColor( } Common::Vec4 RasterizerSoftware::PixelColor(u16 x, u16 y, - Common::Vec4& combiner_output) const { + Common::Vec4 combiner_output) const { const auto dest = fb.GetPixel(x >> 4, y >> 4); Common::Vec4 blend_output = combiner_output; @@ -771,7 +786,7 @@ Common::Vec4 RasterizerSoftware::WriteTevConfig( return combiner_output; } -void RasterizerSoftware::WriteFog(Common::Vec4& combiner_output, float depth) const { +void RasterizerSoftware::WriteFog(float depth, Common::Vec4& combiner_output) const { /** * Apply fog combiner. Not fully accurate. We'd have to know what data type is used to * store the depth etc. Using float for now until we know more about Pica datatypes. diff --git a/src/video_core/renderer_software/sw_rasterizer.h b/src/video_core/renderer_software/sw_rasterizer.h index 40f7ce781..c2cec9e9d 100644 --- a/src/video_core/renderer_software/sw_rasterizer.h +++ b/src/video_core/renderer_software/sw_rasterizer.h @@ -5,7 +5,7 @@ #pragma once #include - +#include "common/thread_worker.h" #include "video_core/rasterizer_interface.h" #include "video_core/regs_texturing.h" #include "video_core/renderer_software/sw_clipper.h" @@ -52,7 +52,7 @@ private: std::span textures, f24 tc0_w) const; /// Returns the final pixel color with blending or logic ops applied. - Common::Vec4 PixelColor(u16 x, u16 y, Common::Vec4& combiner_output) const; + Common::Vec4 PixelColor(u16 x, u16 y, Common::Vec4 combiner_output) const; /// Emulates the TEV configuration and returns the combiner output. Common::Vec4 WriteTevConfig( @@ -62,7 +62,7 @@ private: Common::Vec4 secondary_fragment_color); /// Blends fog to the combiner output if enabled. - void WriteFog(Common::Vec4& combiner_output, float depth) const; + void WriteFog(float depth, Common::Vec4& combiner_output) const; /// Performs the alpha test. Returns false if the test failed. bool DoAlphaTest(u8 alpha) const; @@ -74,6 +74,8 @@ private: Memory::MemorySystem& memory; Pica::State& state; const Pica::Regs& regs; + size_t num_sw_threads; + Common::ThreadWorker sw_workers; Framebuffer fb; };