From 58e725bd030c4653590578064b0636fe30475a87 Mon Sep 17 00:00:00 2001 From: GaryOderNichts <12049776+GaryOderNichts@users.noreply.github.com> Date: Thu, 19 Aug 2021 21:15:56 +0200 Subject: [PATCH] wiiu: use paired singles to speed up matrix multiplication --- src/math/Matrix.cpp | 55 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/src/math/Matrix.cpp b/src/math/Matrix.cpp index c0d909cb..cbc0813c 100644 --- a/src/math/Matrix.cpp +++ b/src/math/Matrix.cpp @@ -1,5 +1,9 @@ #include "common.h" +#ifdef ESPRESSO +#include +#endif + CMatrix::CMatrix(void) { m_attachment = nil; @@ -434,6 +438,7 @@ operator*(const CMatrix &m1, const CMatrix &m2) { // TODO: VU0 code CMatrix out; +#ifndef ESPRESSO out.rx = m1.rx * m2.rx + m1.fx * m2.ry + m1.ux * m2.rz; out.ry = m1.ry * m2.rx + m1.fy * m2.ry + m1.uy * m2.rz; out.rz = m1.rz * m2.rx + m1.fz * m2.ry + m1.uz * m2.rz; @@ -446,6 +451,56 @@ operator*(const CMatrix &m1, const CMatrix &m2) out.px = m1.rx * m2.px + m1.fx * m2.py + m1.ux * m2.pz + m1.px; out.py = m1.ry * m2.px + m1.fy * m2.py + m1.uy * m2.pz + m1.py; out.pz = m1.rz * m2.px + m1.fz * m2.py + m1.uz * m2.pz + m1.pz; +#else + float A00_A01 = psq_l(0, &m2, 0, 0); + float B00_B01 = psq_l(0, &m1, 0, 0); + float D00_D01 = ps_muls0(B00_B01, A00_A01); + float B02_B03 = psq_l(8, &m1, 1, 0); + float D02_D03 = ps_muls0(B02_B03, A00_A01); + float A10_A11 = psq_l(16, &m2, 0, 0); + float D10_D11 = ps_muls0(B00_B01, A10_A11); + float D12_D13 = ps_muls0(B02_B03, A10_A11); + float A20_A21 = psq_l(32, &m2, 0, 0); + float D20_D21 = ps_muls0(B00_B01, A20_A21); + float D22_D23 = ps_muls0(B02_B03, A20_A21); + float A30_A31 = psq_l(48, &m2, 0, 0); + float B30_B31 = psq_l(48, &m1, 0, 0); + float D30_D31 = ps_madds0(B00_B01, A30_A31, B30_B31); + float B32_B33 = psq_l(56, &m1, 1, 0); + float D32_D33 = ps_madds0(B02_B03, A30_A31, B32_B33); + float B10_B11 = psq_l(16, &m1, 0, 0); + D00_D01 = ps_madds1(B10_B11, A00_A01, D00_D01); + float B12_B13 = psq_l(24, &m1, 1, 0); + D02_D03 = ps_madds1(B12_B13, A00_A01, D02_D03); + D10_D11 = ps_madds1(B10_B11, A10_A11, D10_D11); + D12_D13 = ps_madds1(B12_B13, A10_A11, D12_D13); + D20_D21 = ps_madds1(B10_B11, A20_A21, D20_D21); + D22_D23 = ps_madds1(B12_B13, A20_A21, D22_D23); + D30_D31 = ps_madds1(B10_B11, A30_A31, D30_D31); + D32_D33 = ps_madds1(B12_B13, A30_A31, D32_D33); + float B20_B21 = psq_l(32, &m1, 0, 0); + float A02_A03 = psq_l(8, &m2, 1, 0); + D00_D01 = ps_madds0(B20_B21, A02_A03, D00_D01); + float B22_B23 = psq_l(40, &m1, 1, 0); + D02_D03 = ps_madds0(B22_B23, A02_A03, D02_D03); + float A12_A13 = psq_l(24, &m2, 1, 0); + D10_D11 = ps_madds0(B20_B21, A12_A13, D10_D11); + D12_D13 = ps_madds0(B22_B23, A12_A13, D12_D13); + float A22_A23 = psq_l(40, &m2, 1, 0); + D20_D21 = ps_madds0(B20_B21, A22_A23, D20_D21); + D22_D23 = ps_madds0(B22_B23, A22_A23, D22_D23); + float A32_A33 = psq_l(56, &m2, 1, 0); + D30_D31 = ps_madds0(B20_B21, A32_A33, D30_D31); + D32_D33 = ps_madds0(B22_B23, A32_A33, D32_D33); + psq_st(D00_D01, 0, &out, 0, 0); + psq_st(D02_D03, 8, &out, 1, 0); + psq_st(D10_D11, 16, &out, 0, 0); + psq_st(D12_D13, 24, &out, 1, 0); + psq_st(D20_D21, 32, &out, 0, 0); + psq_st(D22_D23, 40, &out, 1, 0); + psq_st(D30_D31, 48, &out, 0, 0); + psq_st(D32_D33, 56, &out, 1, 0); +#endif return out; }