wiiu: use paired singles to speed up matrix multiplication

2025-02-16 18:19:16 +01:00 · 2021-08-19 21:15:56 +02:00 · 2021-08-19 21:15:56 +02:00 · 58e725bd03
commit 58e725bd03
parent aa98030925
1 changed files with 55 additions and 0 deletions
--- a/src/math/Matrix.cpp
+++ b/src/math/Matrix.cpp
@@ -1,5 +1,9 @@
 #include "common.h"

+#ifdef ESPRESSO
+#include <paired.h>
+#endif
+
 CMatrix::CMatrix(void)
 {
 	m_attachment = nil;
@@ -434,6 +438,7 @@ operator*(const CMatrix &m1, const CMatrix &m2)
 {
 	// TODO: VU0 code
 	CMatrix out;
+#ifndef ESPRESSO
 	out.rx = m1.rx * m2.rx + m1.fx * m2.ry + m1.ux * m2.rz;
 	out.ry = m1.ry * m2.rx + m1.fy * m2.ry + m1.uy * m2.rz;
 	out.rz = m1.rz * m2.rx + m1.fz * m2.ry + m1.uz * m2.rz;
@@ -446,6 +451,56 @@ operator*(const CMatrix &m1, const CMatrix &m2)
 	out.px = m1.rx * m2.px + m1.fx * m2.py + m1.ux * m2.pz + m1.px;
 	out.py = m1.ry * m2.px + m1.fy * m2.py + m1.uy * m2.pz + m1.py;
 	out.pz = m1.rz * m2.px + m1.fz * m2.py + m1.uz * m2.pz + m1.pz;
+#else
+	float A00_A01 = psq_l(0, &m2, 0, 0);
+	float B00_B01 = psq_l(0, &m1, 0, 0);
+	float D00_D01 = ps_muls0(B00_B01, A00_A01);
+	float B02_B03 = psq_l(8, &m1, 1, 0);
+	float D02_D03 = ps_muls0(B02_B03, A00_A01);
+	float A10_A11 = psq_l(16, &m2, 0, 0);
+	float D10_D11 = ps_muls0(B00_B01, A10_A11);
+	float D12_D13 = ps_muls0(B02_B03, A10_A11);
+	float A20_A21 = psq_l(32, &m2, 0, 0);
+	float D20_D21 = ps_muls0(B00_B01, A20_A21);
+	float D22_D23 = ps_muls0(B02_B03, A20_A21);
+	float A30_A31 = psq_l(48, &m2, 0, 0);
+	float B30_B31 = psq_l(48, &m1, 0, 0);
+	float D30_D31 = ps_madds0(B00_B01, A30_A31, B30_B31);
+	float B32_B33 = psq_l(56, &m1, 1, 0);
+	float D32_D33 = ps_madds0(B02_B03, A30_A31, B32_B33);
+	float B10_B11 = psq_l(16, &m1, 0, 0);
+	D00_D01 = ps_madds1(B10_B11, A00_A01, D00_D01);
+	float B12_B13 = psq_l(24, &m1, 1, 0);
+	D02_D03 = ps_madds1(B12_B13, A00_A01, D02_D03);
+	D10_D11 = ps_madds1(B10_B11, A10_A11, D10_D11);
+	D12_D13 = ps_madds1(B12_B13, A10_A11, D12_D13);
+	D20_D21 = ps_madds1(B10_B11, A20_A21, D20_D21);
+	D22_D23 = ps_madds1(B12_B13, A20_A21, D22_D23);
+	D30_D31 = ps_madds1(B10_B11, A30_A31, D30_D31);
+	D32_D33 = ps_madds1(B12_B13, A30_A31, D32_D33);
+	float B20_B21 = psq_l(32, &m1, 0, 0);
+	float A02_A03 = psq_l(8, &m2, 1, 0);
+	D00_D01 = ps_madds0(B20_B21, A02_A03, D00_D01);
+	float B22_B23 = psq_l(40, &m1, 1, 0);
+	D02_D03 = ps_madds0(B22_B23, A02_A03, D02_D03);
+	float A12_A13 = psq_l(24, &m2, 1, 0);
+	D10_D11 = ps_madds0(B20_B21, A12_A13, D10_D11);
+	D12_D13 = ps_madds0(B22_B23, A12_A13, D12_D13);
+	float A22_A23 = psq_l(40, &m2, 1, 0);
+	D20_D21 = ps_madds0(B20_B21, A22_A23, D20_D21);
+	D22_D23 = ps_madds0(B22_B23, A22_A23, D22_D23);
+	float A32_A33 = psq_l(56, &m2, 1, 0);
+	D30_D31 = ps_madds0(B20_B21, A32_A33, D30_D31);
+	D32_D33 = ps_madds0(B22_B23, A32_A33, D32_D33);
+	psq_st(D00_D01, 0, &out, 0, 0);
+	psq_st(D02_D03, 8, &out, 1, 0);
+	psq_st(D10_D11, 16, &out, 0, 0);
+	psq_st(D12_D13, 24, &out, 1, 0);
+	psq_st(D20_D21, 32, &out, 0, 0);
+	psq_st(D22_D23, 40, &out, 1, 0);
+	psq_st(D30_D31, 48, &out, 0, 0);
+	psq_st(D32_D33, 56, &out, 1, 0);
+#endif
 	return out;
 }