Merge pull request #8120 from MerryMage/cdts

Jit64: Make DoubleToSingle a common asm routine
2025-04-12 09:51:24 +02:00 · 2020-01-25 19:10:37 +00:00 · 2020-01-25 19:10:37 +00:00 · 709862b818
commit 709862b818
parent e3a7922e12 f6afce781f
9 changed files with 222 additions and 83 deletions
--- a/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp
@ -238,6 +238,8 @@ void Jit64AsmRoutineManager::GenerateCommon()
  GenFres();
  mfcr = AlignCode4();
  GenMfcr();
+  cdts = AlignCode4();
+  GenConvertDoubleToSingle();

  GenQuantizedLoads();
  GenQuantizedSingleLoads();
--- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp
@ -110,14 +110,15 @@ void Jit64::stfXXX(UGeckoInstruction inst)
      RCOpArg Rs = fpr.Use(s, RCMode::Read);
      RegCache::Realize(Rs);
      CVTSD2SS(XMM0, Rs);
+      MOVD_xmm(R(RSCRATCH), XMM0);
    }
    else
    {
      RCX64Reg Rs = fpr.Bind(s, RCMode::Read);
      RegCache::Realize(Rs);
-      ConvertDoubleToSingle(XMM0, Rs);
+      MOVAPD(XMM0, Rs);
+      CALL(asm_routines.cdts);
    }
-    MOVD_xmm(R(RSCRATCH), XMM0);
  }
  else
  {
--- a/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp
+++ b/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp
@ -868,89 +868,9 @@ void EmuCodeBlock::Force25BitPrecision(X64Reg output, const OpArg& input, X64Reg
  }
 }

-// Since the following float conversion functions are used in non-arithmetic PPC float
-// instructions, they must convert floats bitexact and never flush denormals to zero or turn SNaNs
-// into QNaNs. This means we can't use CVTSS2SD/CVTSD2SS. The x87 FPU doesn't even support
-// flush-to-zero so we can use FLD+FSTP even on denormals.
-// If the number is a NaN, make sure to set the QNaN bit back to its original value.
-
-// Another problem is that officially, converting doubles to single format results in undefined
-// behavior.  Relying on undefined behavior is a bug so no software should ever do this.
-// Super Mario 64 (on Wii VC) accidentally relies on this behavior.  See issue #11173
-
-alignas(16) static const __m128i double_exponent = _mm_set_epi64x(0, 0x7ff0000000000000);
-alignas(16) static const __m128i double_fraction = _mm_set_epi64x(0, 0x000fffffffffffff);
-alignas(16) static const __m128i double_sign_bit = _mm_set_epi64x(0, 0x8000000000000000);
-alignas(16) static const __m128i double_explicit_top_bit = _mm_set_epi64x(0, 0x0010000000000000);
-alignas(16) static const __m128i double_top_two_bits = _mm_set_epi64x(0, 0xc000000000000000);
-alignas(16) static const __m128i double_bottom_bits = _mm_set_epi64x(0, 0x07ffffffe0000000);
 alignas(16) static const __m128i double_qnan_bit = _mm_set_epi64x(0xffffffffffffffff,
                                                                  0xfff7ffffffffffff);

-// This is the same algorithm used in the interpreter (and actual hardware)
-// The documentation states that the conversion of a double with an outside the
-// valid range for a single (or a single denormal) is undefined.
-// But testing on actual hardware shows it always picks bits 0..1 and 5..34
-// unless the exponent is in the range of 874 to 896.
-void EmuCodeBlock::ConvertDoubleToSingle(X64Reg dst, X64Reg src)
-{
-  MOVAPD(XMM1, R(src));
-
-  // Grab Exponent
-  PAND(XMM1, MConst(double_exponent));
-  PSRLQ(XMM1, 52);
-  MOVD_xmm(R(RSCRATCH), XMM1);
-
-  // Check if the double is in the range of valid single subnormal
-  SUB(16, R(RSCRATCH), Imm16(874));
-  CMP(16, R(RSCRATCH), Imm16(896 - 874));
-  FixupBranch NoDenormalize = J_CC(CC_A);
-
-  // Denormalise
-
-  // shift = (905 - Exponent) plus the 21 bit double to single shift
-  MOV(16, R(RSCRATCH), Imm16(905 + 21));
-  MOVD_xmm(XMM0, R(RSCRATCH));
-  PSUBQ(XMM0, R(XMM1));
-
-  // xmm1 = fraction | 0x0010000000000000
-  MOVAPD(XMM1, R(src));
-  PAND(XMM1, MConst(double_fraction));
-  POR(XMM1, MConst(double_explicit_top_bit));
-
-  // fraction >> shift
-  PSRLQ(XMM1, R(XMM0));
-
-  // OR the sign bit in.
-  MOVAPD(XMM0, R(src));
-  PAND(XMM0, MConst(double_sign_bit));
-  PSRLQ(XMM0, 32);
-  POR(XMM1, R(XMM0));
-
-  FixupBranch end = J(false);  // Goto end
-
-  SetJumpTarget(NoDenormalize);
-
-  // Don't Denormalize
-
-  // We want bits 0, 1
-  MOVAPD(XMM1, R(src));
-  PAND(XMM1, MConst(double_top_two_bits));
-  PSRLQ(XMM1, 32);
-
-  // And 5 through to 34
-  MOVAPD(XMM0, R(src));
-  PAND(XMM0, MConst(double_bottom_bits));
-  PSRLQ(XMM0, 29);
-
-  // OR them togther
-  POR(XMM1, R(XMM0));
-
-  // End
-  SetJumpTarget(end);
-  MOVDDUP(dst, R(XMM1));
-}
-
 // Converting single->double is a bit easier because all single denormals are double normals.
 void EmuCodeBlock::ConvertSingleToDouble(X64Reg dst, X64Reg src, bool src_is_gpr)
 {
--- a/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp
+++ b/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp
@ -9,6 +9,7 @@
 #include "Common/CPUDetect.h"
 #include "Common/CommonTypes.h"
 #include "Common/FloatUtils.h"
+#include "Common/Intrinsics.h"
 #include "Common/JitRegister.h"
 #include "Common/x64ABI.h"
 #include "Common/x64Emitter.h"
@ -25,6 +26,97 @@

 using namespace Gen;

+alignas(16) static const __m128i double_fraction = _mm_set_epi64x(0, 0x000fffffffffffff);
+alignas(16) static const __m128i double_sign_bit = _mm_set_epi64x(0, 0x8000000000000000);
+alignas(16) static const __m128i double_explicit_top_bit = _mm_set_epi64x(0, 0x0010000000000000);
+alignas(16) static const __m128i double_top_two_bits = _mm_set_epi64x(0, 0xc000000000000000);
+alignas(16) static const __m128i double_bottom_bits = _mm_set_epi64x(0, 0x07ffffffe0000000);
+
+// Since the following float conversion functions are used in non-arithmetic PPC float
+// instructions, they must convert floats bitexact and never flush denormals to zero or turn SNaNs
+// into QNaNs. This means we can't use CVTSS2SD/CVTSD2SS. The x87 FPU doesn't even support
+// flush-to-zero so we can use FLD+FSTP even on denormals.
+// If the number is a NaN, make sure to set the QNaN bit back to its original value.
+
+// Another problem is that officially, converting doubles to single format results in undefined
+// behavior.  Relying on undefined behavior is a bug so no software should ever do this.
+// Super Mario 64 (on Wii VC) accidentally relies on this behavior.  See issue #11173
+
+// This is the same algorithm used in the interpreter (and actual hardware)
+// The documentation states that the conversion of a double with an exponent outside
+// the valid range for a single (or a single denormal) is undefined.
+// But testing on actual hardware shows it always picks bits 0..1 and 5..34
+// unless the exponent is in the range of 874 to 896.
+
+void CommonAsmRoutines::GenConvertDoubleToSingle()
+{
+  // Input in XMM0, output to RSCRATCH
+  // Clobbers RSCRATCH/RSCRATCH2/XMM0/XMM1
+
+  const void* start = GetCodePtr();
+
+  // Grab Exponent
+  MOVQ_xmm(R(RSCRATCH), XMM0);
+  MOV(64, R(RSCRATCH2), R(RSCRATCH));
+  SHR(64, R(RSCRATCH), Imm8(52));
+  AND(16, R(RSCRATCH), Imm16(0x7ff));
+
+  // Check if the double is in the range of valid single subnormal
+  SUB(16, R(RSCRATCH), Imm16(874));
+  CMP(16, R(RSCRATCH), Imm16(896 - 874));
+  FixupBranch Denormalize = J_CC(CC_NA);
+
+  // Don't Denormalize
+
+  if (cpu_info.bBMI2)
+  {
+    // Extract bits 0-1 and 5-34
+    MOV(64, R(RSCRATCH), Imm64(0xc7ffffffe0000000));
+    PEXT(64, RSCRATCH, RSCRATCH2, R(RSCRATCH));
+  }
+  else
+  {
+    // We want bits 0, 1
+    MOVAPD(XMM1, R(XMM0));
+    PAND(XMM1, MConst(double_top_two_bits));
+    PSRLQ(XMM1, 32);
+
+    // And 5 through to 34
+    PAND(XMM0, MConst(double_bottom_bits));
+    PSRLQ(XMM0, 29);
+
+    // OR them together
+    POR(XMM0, R(XMM1));
+    MOVD_xmm(R(RSCRATCH), XMM0);
+  }
+  RET();
+
+  // Denormalise
+  SetJumpTarget(Denormalize);
+
+  // shift = (905 - Exponent) plus the 21 bit double to single shift
+  NEG(16, R(RSCRATCH));
+  ADD(16, R(RSCRATCH), Imm16((905 + 21) - 874));
+  MOVQ_xmm(XMM1, R(RSCRATCH));
+
+  // XMM0 = fraction | 0x0010000000000000
+  PAND(XMM0, MConst(double_fraction));
+  POR(XMM0, MConst(double_explicit_top_bit));
+
+  // fraction >> shift
+  PSRLQ(XMM0, R(XMM1));
+  MOVD_xmm(R(RSCRATCH), XMM0);
+
+  // OR the sign bit in.
+  SHR(64, R(RSCRATCH2), Imm8(32));
+  AND(32, R(RSCRATCH2), Imm32(0x80000000));
+
+  OR(32, R(RSCRATCH), R(RSCRATCH2));
+  RET();
+
+  JitRegister::Register(start, GetCodePtr(), "JIT_cdts");
+}
+
 void CommonAsmRoutines::GenFrsqrte()
 {
  const void* start = GetCodePtr();
--- a/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.h
+++ b/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.h
@ -31,6 +31,7 @@ public:
  void GenMfcr();

 protected:
+  void GenConvertDoubleToSingle();
  const u8* GenQuantizedLoadRuntime(bool single, EQuantizeType type);
  const u8* GenQuantizedStoreRuntime(bool single, EQuantizeType type);
  void GenQuantizedLoads();
--- a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h
+++ b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h
@ -25,6 +25,7 @@ struct CommonAsmRoutinesBase
  const u8* frsqrte;
  const u8* fres;
  const u8* mfcr;
+  const u8* cdts;

  // In: array index: GQR to use.
  // In: ECX: Address to read from.
--- a/Source/UnitTests/Core/CMakeLists.txt
+++ b/Source/UnitTests/Core/CMakeLists.txt
@ -15,5 +15,8 @@ add_dolphin_test(ESFormatsTest IOS/ES/FormatsTest.cpp IOS/ES/TestBinaryData.cpp)
 add_dolphin_test(FileSystemTest IOS/FS/FileSystemTest.cpp)

 if(_M_X86)
-  add_dolphin_test(PowerPCTest PowerPC/Jit64Common/Frsqrte.cpp)
+  add_dolphin_test(PowerPCTest
+    PowerPC/Jit64Common/ConvertDoubleToSingle.cpp
+    PowerPC/Jit64Common/Frsqrte.cpp
+  )
 endif()
--- a/Source/UnitTests/Core/PowerPC/Jit64Common/ConvertDoubleToSingle.cpp
+++ b/Source/UnitTests/Core/PowerPC/Jit64Common/ConvertDoubleToSingle.cpp
@ -0,0 +1,116 @@
+// Copyright 2019 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license.txt file included.
+
+#include <cstring>
+#include <tuple>
+#include <vector>
+
+#include "Common/CommonTypes.h"
+#include "Common/x64ABI.h"
+#include "Core/PowerPC/Gekko.h"
+#include "Core/PowerPC/Interpreter/Interpreter_FPUtils.h"
+#include "Core/PowerPC/Jit64/Jit.h"
+#include "Core/PowerPC/Jit64Common/Jit64AsmCommon.h"
+#include "Core/PowerPC/Jit64Common/Jit64PowerPCState.h"
+
+#include <gtest/gtest.h>
+
+namespace
+{
+class TestCommonAsmRoutines : public CommonAsmRoutines
+{
+public:
+  TestCommonAsmRoutines() : CommonAsmRoutines(jit)
+  {
+    using namespace Gen;
+
+    AllocCodeSpace(4096);
+    m_const_pool.Init(AllocChildCodeSpace(1024), 1024);
+
+    const auto raw_cdts = reinterpret_cast<double (*)(double)>(AlignCode4());
+    GenConvertDoubleToSingle();
+
+    wrapped_cdts = reinterpret_cast<u32 (*)(u64)>(AlignCode4());
+    ABI_PushRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8, 16);
+
+    // Call
+    MOVQ_xmm(XMM0, R(ABI_PARAM1));
+    ABI_CallFunction(raw_cdts);
+    MOV(32, R(ABI_RETURN), R(RSCRATCH));
+
+    ABI_PopRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8, 16);
+    RET();
+  }
+
+  u32 (*wrapped_cdts)(u64);
+  Jit64 jit;
+};
+}  // namespace
+
+TEST(Jit64, ConvertDoubleToSingle)
+{
+  TestCommonAsmRoutines routines;
+
+  const std::vector<u64> input_values{
+      // Special values
+      0x0000'0000'0000'0000,  // positive zero
+      0x0000'0000'0000'0001,  // smallest positive denormal
+      0x0000'0000'0100'0000,
+      0x000F'FFFF'FFFF'FFFF,  // largest positive denormal
+      0x0010'0000'0000'0000,  // smallest positive normal
+      0x0010'0000'0000'0002,
+      0x3FF0'0000'0000'0000,  // 1.0
+      0x7FEF'FFFF'FFFF'FFFF,  // largest positive normal
+      0x7FF0'0000'0000'0000,  // positive infinity
+      0x7FF0'0000'0000'0001,  // first positive SNaN
+      0x7FF7'FFFF'FFFF'FFFF,  // last positive SNaN
+      0x7FF8'0000'0000'0000,  // first positive QNaN
+      0x7FFF'FFFF'FFFF'FFFF,  // last positive QNaN
+      0x8000'0000'0000'0000,  // negative zero
+      0x8000'0000'0000'0001,  // smallest negative denormal
+      0x8000'0000'0100'0000,
+      0x800F'FFFF'FFFF'FFFF,  // largest negative denormal
+      0x8010'0000'0000'0000,  // smallest negative normal
+      0x8010'0000'0000'0002,
+      0xBFF0'0000'0000'0000,  // -1.0
+      0xFFEF'FFFF'FFFF'FFFF,  // largest negative normal
+      0xFFF0'0000'0000'0000,  // negative infinity
+      0xFFF0'0000'0000'0001,  // first negative SNaN
+      0xFFF7'FFFF'FFFF'FFFF,  // last negative SNaN
+      0xFFF8'0000'0000'0000,  // first negative QNaN
+      0xFFFF'FFFF'FFFF'FFFF,  // last negative QNaN
+
+      // (exp > 896) Boundary Case
+      0x3800'0000'0000'0000,  // 2^(-127) = Denormal in single-prec
+      0x3810'0000'0000'0000,  // 2^(-126) = Smallest single-prec normal
+      0xB800'0000'0000'0000,  // -2^(-127) = Denormal in single-prec
+      0xB810'0000'0000'0000,  // -2^(-126) = Smallest single-prec normal
+      0x3800'1234'5678'9ABC, 0x3810'1234'5678'9ABC, 0xB800'1234'5678'9ABC, 0xB810'1234'5678'9ABC,
+
+      // (exp >= 874) Boundary Case
+      0x3680'0000'0000'0000,  // 2^(-151) = Unrepresentable in single-prec
+      0x36A0'0000'0000'0000,  // 2^(-149) = Smallest single-prec denormal
+      0x36B0'0000'0000'0000,  // 2^(-148) = Single-prec denormal
+      0xB680'0000'0000'0000,  // -2^(-151) = Unrepresentable in single-prec
+      0xB6A0'0000'0000'0000,  // -2^(-149) = Smallest single-prec denormal
+      0xB6B0'0000'0000'0000,  // -2^(-148) = Single-prec denormal
+      0x3680'1234'5678'9ABC, 0x36A0'1234'5678'9ABC, 0x36B0'1234'5678'9ABC, 0xB680'1234'5678'9ABC,
+      0xB6A0'1234'5678'9ABC, 0xB6B0'1234'5678'9ABC,
+
+      // Some typical numbers
+      0x3FF8'0000'0000'0000,  // 1.5
+      0x408F'4000'0000'0000,  // 1000
+      0xC008'0000'0000'0000,  // -3
+  };
+
+  for (const u64 input : input_values)
+  {
+    const u32 expected = ConvertToSingle(input);
+    const u32 actual = routines.wrapped_cdts(input);
+
+    printf("%016llx -> %08x == %08x\n", input, actual, expected);
+
+    EXPECT_EQ(expected, actual);
+  }
+}
--- a/Source/UnitTests/Core/PowerPC/Jit64Common/Frsqrte.cpp
+++ b/Source/UnitTests/Core/PowerPC/Jit64Common/Frsqrte.cpp
@ -16,6 +16,8 @@

 #include <gtest/gtest.h>

+namespace
+{
 class TestCommonAsmRoutines : public CommonAsmRoutines
 {
 public:
@ -51,6 +53,7 @@ public:
  u64 (*wrapped_frsqrte)(u64, UReg_FPSCR&);
  Jit64 jit;
 };
+}  // namespace

 TEST(Jit64, Frsqrte)
 {