diff --git a/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp b/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp index 18ed533c0b..0e0adfac28 100644 --- a/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp +++ b/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp @@ -238,6 +238,8 @@ void Jit64AsmRoutineManager::GenerateCommon() GenFres(); mfcr = AlignCode4(); GenMfcr(); + cdts = AlignCode4(); + GenConvertDoubleToSingle(); GenQuantizedLoads(); GenQuantizedSingleLoads(); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp index 33a9f41a65..2ce40f08c8 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp @@ -110,14 +110,15 @@ void Jit64::stfXXX(UGeckoInstruction inst) RCOpArg Rs = fpr.Use(s, RCMode::Read); RegCache::Realize(Rs); CVTSD2SS(XMM0, Rs); + MOVD_xmm(R(RSCRATCH), XMM0); } else { RCX64Reg Rs = fpr.Bind(s, RCMode::Read); RegCache::Realize(Rs); - ConvertDoubleToSingle(XMM0, Rs); + MOVAPD(XMM0, Rs); + CALL(asm_routines.cdts); } - MOVD_xmm(R(RSCRATCH), XMM0); } else { diff --git a/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp b/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp index 73986763cb..c7657c7af7 100644 --- a/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp +++ b/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp @@ -868,89 +868,9 @@ void EmuCodeBlock::Force25BitPrecision(X64Reg output, const OpArg& input, X64Reg } } -// Since the following float conversion functions are used in non-arithmetic PPC float -// instructions, they must convert floats bitexact and never flush denormals to zero or turn SNaNs -// into QNaNs. This means we can't use CVTSS2SD/CVTSD2SS. The x87 FPU doesn't even support -// flush-to-zero so we can use FLD+FSTP even on denormals. -// If the number is a NaN, make sure to set the QNaN bit back to its original value. 
- -// Another problem is that officially, converting doubles to single format results in undefined -// behavior. Relying on undefined behavior is a bug so no software should ever do this. -// Super Mario 64 (on Wii VC) accidentally relies on this behavior. See issue #11173 - -alignas(16) static const __m128i double_exponent = _mm_set_epi64x(0, 0x7ff0000000000000); -alignas(16) static const __m128i double_fraction = _mm_set_epi64x(0, 0x000fffffffffffff); -alignas(16) static const __m128i double_sign_bit = _mm_set_epi64x(0, 0x8000000000000000); -alignas(16) static const __m128i double_explicit_top_bit = _mm_set_epi64x(0, 0x0010000000000000); -alignas(16) static const __m128i double_top_two_bits = _mm_set_epi64x(0, 0xc000000000000000); -alignas(16) static const __m128i double_bottom_bits = _mm_set_epi64x(0, 0x07ffffffe0000000); alignas(16) static const __m128i double_qnan_bit = _mm_set_epi64x(0xffffffffffffffff, 0xfff7ffffffffffff); -// This is the same algorithm used in the interpreter (and actual hardware) -// The documentation states that the conversion of a double with an outside the -// valid range for a single (or a single denormal) is undefined. -// But testing on actual hardware shows it always picks bits 0..1 and 5..34 -// unless the exponent is in the range of 874 to 896. 
-void EmuCodeBlock::ConvertDoubleToSingle(X64Reg dst, X64Reg src) -{ - MOVAPD(XMM1, R(src)); - - // Grab Exponent - PAND(XMM1, MConst(double_exponent)); - PSRLQ(XMM1, 52); - MOVD_xmm(R(RSCRATCH), XMM1); - - // Check if the double is in the range of valid single subnormal - SUB(16, R(RSCRATCH), Imm16(874)); - CMP(16, R(RSCRATCH), Imm16(896 - 874)); - FixupBranch NoDenormalize = J_CC(CC_A); - - // Denormalise - - // shift = (905 - Exponent) plus the 21 bit double to single shift - MOV(16, R(RSCRATCH), Imm16(905 + 21)); - MOVD_xmm(XMM0, R(RSCRATCH)); - PSUBQ(XMM0, R(XMM1)); - - // xmm1 = fraction | 0x0010000000000000 - MOVAPD(XMM1, R(src)); - PAND(XMM1, MConst(double_fraction)); - POR(XMM1, MConst(double_explicit_top_bit)); - - // fraction >> shift - PSRLQ(XMM1, R(XMM0)); - - // OR the sign bit in. - MOVAPD(XMM0, R(src)); - PAND(XMM0, MConst(double_sign_bit)); - PSRLQ(XMM0, 32); - POR(XMM1, R(XMM0)); - - FixupBranch end = J(false); // Goto end - - SetJumpTarget(NoDenormalize); - - // Don't Denormalize - - // We want bits 0, 1 - MOVAPD(XMM1, R(src)); - PAND(XMM1, MConst(double_top_two_bits)); - PSRLQ(XMM1, 32); - - // And 5 through to 34 - MOVAPD(XMM0, R(src)); - PAND(XMM0, MConst(double_bottom_bits)); - PSRLQ(XMM0, 29); - - // OR them togther - POR(XMM1, R(XMM0)); - - // End - SetJumpTarget(end); - MOVDDUP(dst, R(XMM1)); -} - // Converting single->double is a bit easier because all single denormals are double normals. 
void EmuCodeBlock::ConvertSingleToDouble(X64Reg dst, X64Reg src, bool src_is_gpr) { diff --git a/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp b/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp index 569c28edb5..ab06681a88 100644 --- a/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp +++ b/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp @@ -9,6 +9,7 @@ #include "Common/CPUDetect.h" #include "Common/CommonTypes.h" #include "Common/FloatUtils.h" +#include "Common/Intrinsics.h" #include "Common/JitRegister.h" #include "Common/x64ABI.h" #include "Common/x64Emitter.h" @@ -25,6 +26,97 @@ using namespace Gen; +alignas(16) static const __m128i double_fraction = _mm_set_epi64x(0, 0x000fffffffffffff); +alignas(16) static const __m128i double_sign_bit = _mm_set_epi64x(0, 0x8000000000000000); +alignas(16) static const __m128i double_explicit_top_bit = _mm_set_epi64x(0, 0x0010000000000000); +alignas(16) static const __m128i double_top_two_bits = _mm_set_epi64x(0, 0xc000000000000000); +alignas(16) static const __m128i double_bottom_bits = _mm_set_epi64x(0, 0x07ffffffe0000000); + +// Since the following float conversion functions are used in non-arithmetic PPC float +// instructions, they must convert floats bitexact and never flush denormals to zero or turn SNaNs +// into QNaNs. This means we can't use CVTSS2SD/CVTSD2SS. The x87 FPU doesn't even support +// flush-to-zero so we can use FLD+FSTP even on denormals. +// If the number is a NaN, make sure to set the QNaN bit back to its original value. + +// Another problem is that officially, converting doubles to single format results in undefined +// behavior. Relying on undefined behavior is a bug so no software should ever do this. +// Super Mario 64 (on Wii VC) accidentally relies on this behavior. 
See issue #11173 + +// This is the same algorithm used in the interpreter (and actual hardware) +// The documentation states that the conversion of a double with an exponent outside the +// valid range for a single (or a single denormal) is undefined. +// But testing on actual hardware shows it always picks bits 0..1 and 5..34 +// unless the exponent is in the range of 874 to 896. + +void CommonAsmRoutines::GenConvertDoubleToSingle() +{ + // Input in XMM0, output to RSCRATCH + // Clobbers RSCRATCH/RSCRATCH2/XMM0/XMM1 + + const void* start = GetCodePtr(); + + // Grab Exponent + MOVQ_xmm(R(RSCRATCH), XMM0); + MOV(64, R(RSCRATCH2), R(RSCRATCH)); + SHR(64, R(RSCRATCH), Imm8(52)); + AND(16, R(RSCRATCH), Imm16(0x7ff)); + + // Check if the double is in the range of valid single subnormal + SUB(16, R(RSCRATCH), Imm16(874)); + CMP(16, R(RSCRATCH), Imm16(896 - 874)); + FixupBranch Denormalize = J_CC(CC_NA); + + // Don't Denormalize + + if (cpu_info.bBMI2) + { + // Extract bits 0-1 and 5-34 + MOV(64, R(RSCRATCH), Imm64(0xc7ffffffe0000000)); + PEXT(64, RSCRATCH, RSCRATCH2, R(RSCRATCH)); + } + else + { + // We want bits 0, 1 + MOVAPD(XMM1, R(XMM0)); + PAND(XMM1, MConst(double_top_two_bits)); + PSRLQ(XMM1, 32); + + // And 5 through to 34 + PAND(XMM0, MConst(double_bottom_bits)); + PSRLQ(XMM0, 29); + + // OR them together + POR(XMM0, R(XMM1)); + MOVD_xmm(R(RSCRATCH), XMM0); + } + RET(); + + // Denormalise + SetJumpTarget(Denormalize); + + // shift = (905 - Exponent) plus the 21 bit double to single shift + NEG(16, R(RSCRATCH)); + ADD(16, R(RSCRATCH), Imm16((905 + 21) - 874)); + MOVQ_xmm(XMM1, R(RSCRATCH)); + + // XMM0 = fraction | 0x0010000000000000 + PAND(XMM0, MConst(double_fraction)); + POR(XMM0, MConst(double_explicit_top_bit)); + + // fraction >> shift + PSRLQ(XMM0, R(XMM1)); + MOVD_xmm(R(RSCRATCH), XMM0); + + // OR the sign bit in. 
+ SHR(64, R(RSCRATCH2), Imm8(32)); + AND(32, R(RSCRATCH2), Imm32(0x80000000)); + + OR(32, R(RSCRATCH), R(RSCRATCH2)); + RET(); + + JitRegister::Register(start, GetCodePtr(), "JIT_cdts"); +} + void CommonAsmRoutines::GenFrsqrte() { const void* start = GetCodePtr(); diff --git a/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.h b/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.h index 306b5e311e..d6fae8c184 100644 --- a/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.h +++ b/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.h @@ -31,6 +31,7 @@ public: void GenMfcr(); protected: + void GenConvertDoubleToSingle(); const u8* GenQuantizedLoadRuntime(bool single, EQuantizeType type); const u8* GenQuantizedStoreRuntime(bool single, EQuantizeType type); void GenQuantizedLoads(); diff --git a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h index 7463f8e9fa..d8e22a0a3a 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h +++ b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h @@ -25,6 +25,7 @@ struct CommonAsmRoutinesBase const u8* frsqrte; const u8* fres; const u8* mfcr; + const u8* cdts; // In: array index: GQR to use. // In: ECX: Address to read from. 
diff --git a/Source/UnitTests/Core/CMakeLists.txt b/Source/UnitTests/Core/CMakeLists.txt index 894310e126..1588afefe9 100644 --- a/Source/UnitTests/Core/CMakeLists.txt +++ b/Source/UnitTests/Core/CMakeLists.txt @@ -15,5 +15,8 @@ add_dolphin_test(ESFormatsTest IOS/ES/FormatsTest.cpp IOS/ES/TestBinaryData.cpp) add_dolphin_test(FileSystemTest IOS/FS/FileSystemTest.cpp) if(_M_X86) - add_dolphin_test(PowerPCTest PowerPC/Jit64Common/Frsqrte.cpp) + add_dolphin_test(PowerPCTest + PowerPC/Jit64Common/ConvertDoubleToSingle.cpp + PowerPC/Jit64Common/Frsqrte.cpp + ) endif() diff --git a/Source/UnitTests/Core/PowerPC/Jit64Common/ConvertDoubleToSingle.cpp b/Source/UnitTests/Core/PowerPC/Jit64Common/ConvertDoubleToSingle.cpp new file mode 100644 index 0000000000..41ca31b273 --- /dev/null +++ b/Source/UnitTests/Core/PowerPC/Jit64Common/ConvertDoubleToSingle.cpp @@ -0,0 +1,116 @@ +// Copyright 2019 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license.txt file included. + +#include +#include +#include + +#include "Common/CommonTypes.h" +#include "Common/x64ABI.h" +#include "Core/PowerPC/Gekko.h" +#include "Core/PowerPC/Interpreter/Interpreter_FPUtils.h" +#include "Core/PowerPC/Jit64/Jit.h" +#include "Core/PowerPC/Jit64Common/Jit64AsmCommon.h" +#include "Core/PowerPC/Jit64Common/Jit64PowerPCState.h" + +#include + +namespace +{ +class TestCommonAsmRoutines : public CommonAsmRoutines +{ +public: + TestCommonAsmRoutines() : CommonAsmRoutines(jit) + { + using namespace Gen; + + AllocCodeSpace(4096); + m_const_pool.Init(AllocChildCodeSpace(1024), 1024); + + const auto raw_cdts = reinterpret_cast(AlignCode4()); + GenConvertDoubleToSingle(); + + wrapped_cdts = reinterpret_cast(AlignCode4()); + ABI_PushRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8, 16); + + // Call + MOVQ_xmm(XMM0, R(ABI_PARAM1)); + ABI_CallFunction(raw_cdts); + MOV(32, R(ABI_RETURN), R(RSCRATCH)); + + ABI_PopRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8, 16); + RET(); + } + + u32 
(*wrapped_cdts)(u64); + Jit64 jit; +}; +} // namespace + +TEST(Jit64, ConvertDoubleToSingle) +{ + TestCommonAsmRoutines routines; + + const std::vector input_values{ + // Special values + 0x0000'0000'0000'0000, // positive zero + 0x0000'0000'0000'0001, // smallest positive denormal + 0x0000'0000'0100'0000, + 0x000F'FFFF'FFFF'FFFF, // largest positive denormal + 0x0010'0000'0000'0000, // smallest positive normal + 0x0010'0000'0000'0002, + 0x3FF0'0000'0000'0000, // 1.0 + 0x7FEF'FFFF'FFFF'FFFF, // largest positive normal + 0x7FF0'0000'0000'0000, // positive infinity + 0x7FF0'0000'0000'0001, // first positive SNaN + 0x7FF7'FFFF'FFFF'FFFF, // last positive SNaN + 0x7FF8'0000'0000'0000, // first positive QNaN + 0x7FFF'FFFF'FFFF'FFFF, // last positive QNaN + 0x8000'0000'0000'0000, // negative zero + 0x8000'0000'0000'0001, // smallest negative denormal + 0x8000'0000'0100'0000, + 0x800F'FFFF'FFFF'FFFF, // largest negative denormal + 0x8010'0000'0000'0000, // smallest negative normal + 0x8010'0000'0000'0002, + 0xBFF0'0000'0000'0000, // -1.0 + 0xFFEF'FFFF'FFFF'FFFF, // largest negative normal + 0xFFF0'0000'0000'0000, // negative infinity + 0xFFF0'0000'0000'0001, // first negative SNaN + 0xFFF7'FFFF'FFFF'FFFF, // last negative SNaN + 0xFFF8'0000'0000'0000, // first negative QNaN + 0xFFFF'FFFF'FFFF'FFFF, // last negative QNaN + + // (exp > 896) Boundary Case + 0x3800'0000'0000'0000, // 2^(-127) = Denormal in single-prec + 0x3810'0000'0000'0000, // 2^(-126) = Smallest single-prec normal + 0xB800'0000'0000'0000, // -2^(-127) = Denormal in single-prec + 0xB810'0000'0000'0000, // -2^(-126) = Smallest single-prec normal + 0x3800'1234'5678'9ABC, 0x3810'1234'5678'9ABC, 0xB800'1234'5678'9ABC, 0xB810'1234'5678'9ABC, + + // (exp >= 874) Boundary Case + 0x3680'0000'0000'0000, // 2^(-150) = Unrepresentable in single-prec + 0x36A0'0000'0000'0000, // 2^(-149) = Smallest single-prec denormal + 0x36B0'0000'0000'0000, // 2^(-148) = Single-prec denormal + 0xB680'0000'0000'0000, // -2^(-150) = 
Unrepresentable in single-prec + 0xB6A0'0000'0000'0000, // -2^(-149) = Smallest single-prec denormal + 0xB6B0'0000'0000'0000, // -2^(-148) = Single-prec denormal + 0x3680'1234'5678'9ABC, 0x36A0'1234'5678'9ABC, 0x36B0'1234'5678'9ABC, 0xB680'1234'5678'9ABC, + 0xB6A0'1234'5678'9ABC, 0xB6B0'1234'5678'9ABC, + + // Some typical numbers + 0x3FF8'0000'0000'0000, // 1.5 + 0x408F'4000'0000'0000, // 1000 + 0xC008'0000'0000'0000, // -3 + }; + + for (const u64 input : input_values) + { + const u32 expected = ConvertToSingle(input); + const u32 actual = routines.wrapped_cdts(input); + + printf("%016llx -> %08x == %08x\n", input, actual, expected); + + EXPECT_EQ(expected, actual); + } +} diff --git a/Source/UnitTests/Core/PowerPC/Jit64Common/Frsqrte.cpp b/Source/UnitTests/Core/PowerPC/Jit64Common/Frsqrte.cpp index 6577229382..18e0ee8044 100644 --- a/Source/UnitTests/Core/PowerPC/Jit64Common/Frsqrte.cpp +++ b/Source/UnitTests/Core/PowerPC/Jit64Common/Frsqrte.cpp @@ -16,6 +16,8 @@ #include +namespace +{ class TestCommonAsmRoutines : public CommonAsmRoutines { public: @@ -51,6 +53,7 @@ public: u64 (*wrapped_frsqrte)(u64, UReg_FPSCR&); Jit64 jit; }; +} // namespace TEST(Jit64, Frsqrte) {