From 113c2dcd746d783c419b1767862376eb3d8e3ccf Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Tue, 9 Jun 2015 17:43:25 -0500 Subject: [PATCH] [AArch64] Clamp quantized store ranges. Fixes black dots in THP videos. Nintendo's THP video uses paired U8 stores to write their THP videos after decoding with floating point operations. Paired stores clamp the range to the minimum and maximum values (0 - 255 in this case). In some instances the resulting float will be larger than what a U8 can fit (typically white) and results in black dots due to how AArch64 handles quantizing. The clamp table is a flat array of {min, max} pairs, so the pair for the Nth format starts at index 2 * N. --- Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp | 69 ++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp index 16534edf5f..2e39791e47 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp @@ -99,6 +99,14 @@ void JitArm64AsmRoutineManager::Generate() FlushIcache(); } +static const float s_quantize_ranges[] = +{ + 0.0f, 255.0f, // U8: [0] = min, [1] = max + -128.0f, 127.0f, // S8: [2] = min, [3] = max + 0.0f, 65535.0f, // U16: [4] = min, [5] = max + -32768.0f, 32767.0f, // S16: [6] = min, [7] = max +}; + void JitArm64AsmRoutineManager::GenerateCommon() { // X0 is the scale @@ -291,6 +299,13 @@ void JitArm64AsmRoutineManager::GenerateCommon() ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3)); float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0); float_emit.FMUL(32, D0, D0, D1, 0); + + // Have to clamp the result (U8 pair starts at index 0) + MOVI2R(X2, (u64)&s_quantize_ranges[0]); + float_emit.LD2R(32, D1, X2); + float_emit.FMIN(32, D0, D0, D2); + float_emit.FMAX(32, D0, D0, D1); + + float_emit.FCVTZU(32, D0, D0); float_emit.XTN(16, D0, D0); float_emit.XTN(8, D0, D0); @@ -318,6 +333,13 @@ void JitArm64AsmRoutineManager::GenerateCommon() ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3)); float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0); float_emit.FMUL(32, D0, D0, D1, 0); + + // Have to clamp the result (S8 pair starts at index 2) + MOVI2R(X2, 
(u64)&s_quantize_ranges[2]); + float_emit.LD2R(32, D1, X2); + float_emit.FMIN(32, D0, D0, D2); + float_emit.FMAX(32, D0, D0, D1); + + float_emit.FCVTZS(32, D0, D0); float_emit.XTN(16, D0, D0); float_emit.XTN(8, D0, D0); @@ -346,6 +368,13 @@ void JitArm64AsmRoutineManager::GenerateCommon() ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3)); float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0); float_emit.FMUL(32, D0, D0, D1, 0); + + // Have to clamp the result (U16 pair starts at index 4) + MOVI2R(X2, (u64)&s_quantize_ranges[4]); + float_emit.LD2R(32, D1, X2); + float_emit.FMIN(32, D0, D0, D2); + float_emit.FMAX(32, D0, D0, D1); + + float_emit.FCVTZU(32, D0, D0); float_emit.XTN(16, D0, D0); float_emit.REV16(8, D0, D0); @@ -373,6 +402,14 @@ void JitArm64AsmRoutineManager::GenerateCommon() ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3)); float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0); float_emit.FMUL(32, D0, D0, D1, 0); + + // Have to clamp the result (S16 pair starts at index 6) + MOVI2R(X2, (u64)&s_quantize_ranges[6]); + float_emit.LD2R(32, D1, X2); + float_emit.FMIN(32, D0, D0, D2); + float_emit.FMAX(32, D0, D0, D1); + + float_emit.FCVTZS(32, D0, D0); float_emit.XTN(16, D0, D0); float_emit.REV16(8, D0, D0); @@ -415,6 +452,14 @@ void JitArm64AsmRoutineManager::GenerateCommon() ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3)); float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0); float_emit.FMUL(32, D0, D0, D1); + + // Have to clamp the result (U8 pair starts at index 0) + MOVI2R(X2, (u64)&s_quantize_ranges[0]); + float_emit.LDR(32, INDEX_UNSIGNED, S1, X2, 0); + float_emit.LDR(32, INDEX_UNSIGNED, S2, X2, 4); + float_emit.FMIN(S0, S0, S2); + float_emit.FMAX(S0, S0, S1); + float_emit.FCVTZU(32, D0, D0); float_emit.XTN(16, D0, D0); float_emit.XTN(8, D0, D0); @@ -441,6 +486,14 @@ void JitArm64AsmRoutineManager::GenerateCommon() ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3)); float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0); float_emit.FMUL(32, D0, D0, D1); + + // Have to clamp the 
result (S8 pair starts at index 2) + MOVI2R(X2, (u64)&s_quantize_ranges[2]); + float_emit.LDR(32, INDEX_UNSIGNED, S1, X2, 0); + float_emit.LDR(32, INDEX_UNSIGNED, S2, X2, 4); + float_emit.FMIN(S0, S0, S2); + float_emit.FMAX(S0, S0, S1); + float_emit.FCVTZS(32, D0, D0); float_emit.XTN(16, D0, D0); float_emit.XTN(8, D0, D0); @@ -467,6 +520,14 @@ void JitArm64AsmRoutineManager::GenerateCommon() ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3)); float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0); float_emit.FMUL(32, D0, D0, D1); + + // Have to clamp the result (U16 pair starts at index 4) + MOVI2R(X2, (u64)&s_quantize_ranges[4]); + float_emit.LDR(32, INDEX_UNSIGNED, S1, X2, 0); + float_emit.LDR(32, INDEX_UNSIGNED, S2, X2, 4); + float_emit.FMIN(S0, S0, S2); + float_emit.FMAX(S0, S0, S1); + float_emit.FCVTZU(32, D0, D0); float_emit.XTN(16, D0, D0); }; @@ -493,6 +554,14 @@ void JitArm64AsmRoutineManager::GenerateCommon() ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3)); float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0); float_emit.FMUL(32, D0, D0, D1); + + // Have to clamp the result (S16 pair starts at index 6) + MOVI2R(X2, (u64)&s_quantize_ranges[6]); + float_emit.LDR(32, INDEX_UNSIGNED, S1, X2, 0); + float_emit.LDR(32, INDEX_UNSIGNED, S2, X2, 4); + float_emit.FMIN(S0, S0, S2); + float_emit.FMAX(S0, S0, S1); + float_emit.FCVTZS(32, D0, D0); float_emit.XTN(16, D0, D0); };