From 5ea66369f81027edd7ae6019e6ecb7482453d608 Mon Sep 17 00:00:00 2001
From: Ryan Houdek <Sonicadvance1@gmail.com>
Date: Sat, 13 Jun 2015 23:16:56 -0500
Subject: [PATCH] [AArch64] Optimize paired quantized stores. Uses {S,U}QXTN to
 saturate the value to the maximum sizes instead of manually clamping using
 FMIN + FMAX.

---
 Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp | 85 +++-----------------
 1 file changed, 12 insertions(+), 73 deletions(-)

diff --git a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
index 2e39791e47..c53225ef57 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
@@ -99,14 +99,6 @@ void JitArm64AsmRoutineManager::Generate()
 	FlushIcache();
 }
 
-static float s_quantize_ranges[] =
-{
-	0.0f, 255.0f,        // U8
-	-128.0, 127.0f,      // S8
-	0.0f, 65535.0f,      // U16
-	-32768.0f, 32767.0f, // S16
-};
-
 void JitArm64AsmRoutineManager::GenerateCommon()
 {
 	// X0 is the scale
@@ -300,15 +292,9 @@ void JitArm64AsmRoutineManager::GenerateCommon()
 			float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
 			float_emit.FMUL(32, D0, D0, D1, 0);
 
-			// Have to clamp the result
-			MOVI2R(X2, (u64)&s_quantize_ranges[0]);
-			float_emit.LD2R(32, D1, X2);
-			float_emit.FMIN(32, D0, D0, D2);
-			float_emit.FMAX(32, D0, D0, D1);
-
 			float_emit.FCVTZU(32, D0, D0);
-			float_emit.XTN(16, D0, D0);
-			float_emit.XTN(8, D0, D0);
+			float_emit.UQXTN(16, D0, D0);
+			float_emit.UQXTN(8, D0, D0);
 		};
 
 		storePairedU8 = GetCodePtr();
@@ -334,15 +320,9 @@ void JitArm64AsmRoutineManager::GenerateCommon()
 			float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
 			float_emit.FMUL(32, D0, D0, D1, 0);
 
-			// Have to clamp the result
-			MOVI2R(X2, (u64)&s_quantize_ranges[1]);
-			float_emit.LD2R(32, D1, X2);
-			float_emit.FMIN(32, D0, D0, D2);
-			float_emit.FMAX(32, D0, D0, D1);
-
 			float_emit.FCVTZS(32, D0, D0);
-			float_emit.XTN(16, D0, D0);
-			float_emit.XTN(8, D0, D0);
+			float_emit.SQXTN(16, D0, D0);
+			float_emit.SQXTN(8, D0, D0);
 		};
 
 		storePairedS8 = GetCodePtr();
@@ -369,14 +349,8 @@ void JitArm64AsmRoutineManager::GenerateCommon()
 			float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
 			float_emit.FMUL(32, D0, D0, D1, 0);
 
-			// Have to clamp the result
-			MOVI2R(X2, (u64)&s_quantize_ranges[2]);
-			float_emit.LD2R(32, D1, X2);
-			float_emit.FMIN(32, D0, D0, D2);
-			float_emit.FMAX(32, D0, D0, D1);
-
 			float_emit.FCVTZU(32, D0, D0);
-			float_emit.XTN(16, D0, D0);
+			float_emit.UQXTN(16, D0, D0);
 			float_emit.REV16(8, D0, D0);
 		};
 
@@ -403,15 +377,8 @@ void JitArm64AsmRoutineManager::GenerateCommon()
 			float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
 			float_emit.FMUL(32, D0, D0, D1, 0);
 
-			// Have to clamp the result
-			MOVI2R(X2, (u64)&s_quantize_ranges[3]);
-			float_emit.LD2R(32, D1, X2);
-			float_emit.FMIN(32, D0, D0, D2);
-			float_emit.FMAX(32, D0, D0, D1);
-
-
 			float_emit.FCVTZS(32, D0, D0);
-			float_emit.XTN(16, D0, D0);
+			float_emit.SQXTN(16, D0, D0);
 			float_emit.REV16(8, D0, D0);
 		};
 
@@ -453,16 +420,9 @@ void JitArm64AsmRoutineManager::GenerateCommon()
 			float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
 			float_emit.FMUL(32, D0, D0, D1);
 
-			// Have to clamp the result
-			MOVI2R(X2, (u64)&s_quantize_ranges[0]);
-			float_emit.LDR(32, INDEX_UNSIGNED, S1, X2, 0);
-			float_emit.LDR(32, INDEX_UNSIGNED, S2, X2, 4);
-			float_emit.FMIN(S0, S0, S2);
-			float_emit.FMAX(S0, S0, S1);
-
 			float_emit.FCVTZU(32, D0, D0);
-			float_emit.XTN(16, D0, D0);
-			float_emit.XTN(8, D0, D0);
+			float_emit.UQXTN(16, D0, D0);
+			float_emit.UQXTN(8, D0, D0);
 		};
 
 		storeSingleU8 = GetCodePtr();
@@ -487,16 +447,9 @@ void JitArm64AsmRoutineManager::GenerateCommon()
 			float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
 			float_emit.FMUL(32, D0, D0, D1);
 
-			// Have to clamp the result
-			MOVI2R(X2, (u64)&s_quantize_ranges[1]);
-			float_emit.LDR(32, INDEX_UNSIGNED, S1, X2, 0);
-			float_emit.LDR(32, INDEX_UNSIGNED, S2, X2, 4);
-			float_emit.FMIN(S0, S0, S2);
-			float_emit.FMAX(S0, S0, S1);
-
 			float_emit.FCVTZS(32, D0, D0);
-			float_emit.XTN(16, D0, D0);
-			float_emit.XTN(8, D0, D0);
+			float_emit.SQXTN(16, D0, D0);
+			float_emit.SQXTN(8, D0, D0);
 		};
 
 		storeSingleS8 = GetCodePtr();
@@ -521,15 +474,8 @@ void JitArm64AsmRoutineManager::GenerateCommon()
 			float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
 			float_emit.FMUL(32, D0, D0, D1);
 
-			// Have to clamp the result
-			MOVI2R(X2, (u64)&s_quantize_ranges[2]);
-			float_emit.LDR(32, INDEX_UNSIGNED, S1, X2, 0);
-			float_emit.LDR(32, INDEX_UNSIGNED, S2, X2, 4);
-			float_emit.FMIN(S0, S0, S2);
-			float_emit.FMAX(S0, S0, S1);
-
 			float_emit.FCVTZU(32, D0, D0);
-			float_emit.XTN(16, D0, D0);
+			float_emit.UQXTN(16, D0, D0);
 		};
 
 		storeSingleU16 = GetCodePtr();
@@ -555,15 +501,8 @@ void JitArm64AsmRoutineManager::GenerateCommon()
 			float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
 			float_emit.FMUL(32, D0, D0, D1);
 
-			// Have to clamp the result
-			MOVI2R(X2, (u64)&s_quantize_ranges[3]);
-			float_emit.LDR(32, INDEX_UNSIGNED, S1, X2, 0);
-			float_emit.LDR(32, INDEX_UNSIGNED, S2, X2, 4);
-			float_emit.FMIN(S0, S0, S2);
-			float_emit.FMAX(S0, S0, S1);
-
 			float_emit.FCVTZS(32, D0, D0);
-			float_emit.XTN(16, D0, D0);
+			float_emit.SQXTN(16, D0, D0);
 		};
 
 		storeSingleS16 = GetCodePtr();