wiiu: use paired singles to speed up matrix multiplication

This commit is contained in:
GaryOderNichts 2021-08-19 21:15:56 +02:00
parent aa98030925
commit 58e725bd03

View File

@ -1,5 +1,9 @@
#include "common.h" #include "common.h"
#ifdef ESPRESSO
#include <paired.h>
#endif
CMatrix::CMatrix(void) CMatrix::CMatrix(void)
{ {
m_attachment = nil; m_attachment = nil;
@ -434,6 +438,7 @@ operator*(const CMatrix &m1, const CMatrix &m2)
{ {
// TODO: VU0 code // TODO: VU0 code
CMatrix out; CMatrix out;
#ifndef ESPRESSO
out.rx = m1.rx * m2.rx + m1.fx * m2.ry + m1.ux * m2.rz; out.rx = m1.rx * m2.rx + m1.fx * m2.ry + m1.ux * m2.rz;
out.ry = m1.ry * m2.rx + m1.fy * m2.ry + m1.uy * m2.rz; out.ry = m1.ry * m2.rx + m1.fy * m2.ry + m1.uy * m2.rz;
out.rz = m1.rz * m2.rx + m1.fz * m2.ry + m1.uz * m2.rz; out.rz = m1.rz * m2.rx + m1.fz * m2.ry + m1.uz * m2.rz;
@ -446,6 +451,56 @@ operator*(const CMatrix &m1, const CMatrix &m2)
out.px = m1.rx * m2.px + m1.fx * m2.py + m1.ux * m2.pz + m1.px; out.px = m1.rx * m2.px + m1.fx * m2.py + m1.ux * m2.pz + m1.px;
out.py = m1.ry * m2.px + m1.fy * m2.py + m1.uy * m2.pz + m1.py; out.py = m1.ry * m2.px + m1.fy * m2.py + m1.uy * m2.pz + m1.py;
out.pz = m1.rz * m2.px + m1.fz * m2.py + m1.uz * m2.pz + m1.pz; out.pz = m1.rz * m2.px + m1.fz * m2.py + m1.uz * m2.pz + m1.pz;
#else
float A00_A01 = psq_l(0, &m2, 0, 0);
float B00_B01 = psq_l(0, &m1, 0, 0);
float D00_D01 = ps_muls0(B00_B01, A00_A01);
float B02_B03 = psq_l(8, &m1, 1, 0);
float D02_D03 = ps_muls0(B02_B03, A00_A01);
float A10_A11 = psq_l(16, &m2, 0, 0);
float D10_D11 = ps_muls0(B00_B01, A10_A11);
float D12_D13 = ps_muls0(B02_B03, A10_A11);
float A20_A21 = psq_l(32, &m2, 0, 0);
float D20_D21 = ps_muls0(B00_B01, A20_A21);
float D22_D23 = ps_muls0(B02_B03, A20_A21);
float A30_A31 = psq_l(48, &m2, 0, 0);
float B30_B31 = psq_l(48, &m1, 0, 0);
float D30_D31 = ps_madds0(B00_B01, A30_A31, B30_B31);
float B32_B33 = psq_l(56, &m1, 1, 0);
float D32_D33 = ps_madds0(B02_B03, A30_A31, B32_B33);
float B10_B11 = psq_l(16, &m1, 0, 0);
D00_D01 = ps_madds1(B10_B11, A00_A01, D00_D01);
float B12_B13 = psq_l(24, &m1, 1, 0);
D02_D03 = ps_madds1(B12_B13, A00_A01, D02_D03);
D10_D11 = ps_madds1(B10_B11, A10_A11, D10_D11);
D12_D13 = ps_madds1(B12_B13, A10_A11, D12_D13);
D20_D21 = ps_madds1(B10_B11, A20_A21, D20_D21);
D22_D23 = ps_madds1(B12_B13, A20_A21, D22_D23);
D30_D31 = ps_madds1(B10_B11, A30_A31, D30_D31);
D32_D33 = ps_madds1(B12_B13, A30_A31, D32_D33);
float B20_B21 = psq_l(32, &m1, 0, 0);
float A02_A03 = psq_l(8, &m2, 1, 0);
D00_D01 = ps_madds0(B20_B21, A02_A03, D00_D01);
float B22_B23 = psq_l(40, &m1, 1, 0);
D02_D03 = ps_madds0(B22_B23, A02_A03, D02_D03);
float A12_A13 = psq_l(24, &m2, 1, 0);
D10_D11 = ps_madds0(B20_B21, A12_A13, D10_D11);
D12_D13 = ps_madds0(B22_B23, A12_A13, D12_D13);
float A22_A23 = psq_l(40, &m2, 1, 0);
D20_D21 = ps_madds0(B20_B21, A22_A23, D20_D21);
D22_D23 = ps_madds0(B22_B23, A22_A23, D22_D23);
float A32_A33 = psq_l(56, &m2, 1, 0);
D30_D31 = ps_madds0(B20_B21, A32_A33, D30_D31);
D32_D33 = ps_madds0(B22_B23, A32_A33, D32_D33);
psq_st(D00_D01, 0, &out, 0, 0);
psq_st(D02_D03, 8, &out, 1, 0);
psq_st(D10_D11, 16, &out, 0, 0);
psq_st(D12_D13, 24, &out, 1, 0);
psq_st(D20_D21, 32, &out, 0, 0);
psq_st(D22_D23, 40, &out, 1, 0);
psq_st(D30_D31, 48, &out, 0, 0);
psq_st(D32_D33, 56, &out, 1, 0);
#endif
return out; return out;
} }