diff --git a/Source/Core/Common/MathUtil.cpp b/Source/Core/Common/MathUtil.cpp
index 36b1b85eaa..fb48e15df6 100644
--- a/Source/Core/Common/MathUtil.cpp
+++ b/Source/Core/Common/MathUtil.cpp
@@ -169,6 +169,84 @@ double ApproximateReciprocalSquareRoot(double val)
 	return valf;
 }
 
+// Lookup tables for ApproximateReciprocal (fres and ps_res). The top five
+// mantissa bits pick an entry: `base` is the 23-bit mantissa estimate at the
+// start of that interval, `dec` the slope applied to the next ten bits.
+const int fres_expected_base[] =
+{
+	0x7ff800, 0x783800, 0x70ea00, 0x6a0800,
+	0x638800, 0x5d6200, 0x579000, 0x520800,
+	0x4cc800, 0x47ca00, 0x430800, 0x3e8000,
+	0x3a2c00, 0x360800, 0x321400, 0x2e4a00,
+	0x2aa800, 0x272c00, 0x23d600, 0x209e00,
+	0x1d8800, 0x1a9000, 0x17ae00, 0x14f800,
+	0x124400, 0x0fbe00, 0x0d3800, 0x0ade00,
+	0x088400, 0x065000, 0x041c00, 0x020c00,
+};
+const int fres_expected_dec[] =
+{
+	0x3e1, 0x3a7, 0x371, 0x340,
+	0x313, 0x2ea, 0x2c4, 0x2a0,
+	0x27f, 0x261, 0x245, 0x22a,
+	0x212, 0x1fb, 0x1e5, 0x1d1,
+	0x1be, 0x1ac, 0x19b, 0x18b,
+	0x17c, 0x16e, 0x15b, 0x15b,
+	0x143, 0x143, 0x12d, 0x12d,
+	0x11a, 0x11a, 0x108, 0x106,
+};
+
+// Used by fres and ps_res.
+// Computes the PowerPC reciprocal estimate of val from its bit pattern:
+//   +/-0                     -> +/-infinity
+//   +/-infinity              -> +/-0
+//   NaN                      -> val + 0.0
+//   biased exponent <  895   -> +/-largest finite float
+//   biased exponent >= 1149  -> +/-0
+//   otherwise                -> table-driven estimate
+double ApproximateReciprocal(double val)
+{
+	union
+	{
+		double valf;
+		s64 vali;
+	};
+
+	valf = val;
+	s64 mantissa = vali & ((1LL << 52) - 1);
+	s64 sign = vali & (1ULL << 63);
+	s64 exponent = vali & (0x7FFLL << 52);
+
+	// Special case 0
+	if (mantissa == 0 && exponent == 0)
+		return sign ? -std::numeric_limits<double>::infinity() : std::numeric_limits<double>::infinity();
+
+	// Special case NaN-ish numbers
+	if (exponent == (0x7FFLL << 52))
+	{
+		if (mantissa == 0)
+			return sign ? -0.0 : 0.0;
+		return 0.0 + valf;
+	}
+
+	// Special case small inputs
+	if (exponent < (895LL << 52))
+		return sign ? -std::numeric_limits<float>::max() : std::numeric_limits<float>::max();
+
+	// Special case large inputs
+	if (exponent >= (1149LL << 52))
+		return sign ? -0.0f : 0.0f;
+
+	// Mirror the biased exponent: e becomes 0x7FD - e.
+	exponent = (0x7FDLL << 52) - exponent;
+
+	// i holds the top 15 mantissa bits: i / 1024 indexes the tables and
+	// i % 1024 is the position inside that table interval.
+	int i = (int)(mantissa >> 37);
+	vali = sign | exponent;
+	vali |= (s64)(fres_expected_base[i / 1024] - (fres_expected_dec[i / 1024] * (i % 1024) + 1) / 2) << 29;
+	return valf;
+}
+
 }  // namespace
 
 inline void MatrixMul(int n, const float *a, const float *b, float *result)
diff --git a/Source/Core/Common/MathUtil.h b/Source/Core/Common/MathUtil.h
index bbdeb0fac8..b87a2650cb 100644
--- a/Source/Core/Common/MathUtil.h
+++ b/Source/Core/Common/MathUtil.h
@@ -125,9 +125,12 @@ u32 ClassifyFloat(float fvalue);
 
 extern const int frsqrte_expected_base[];
 extern const int frsqrte_expected_dec[];
+extern const int fres_expected_base[];
+extern const int fres_expected_dec[];
 
-// The PowerPC approximate square root algorithm
+// PowerPC approximation algorithms
 double ApproximateReciprocalSquareRoot(double val);
+double ApproximateReciprocal(double val);
 
 template<class T>
 struct Rectangle
diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h
index 91b7300287..d98951b6a0 100644
--- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h
+++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h
@@ -260,68 +260,3 @@ inline u64 ConvertToDouble(u32 _x)
 	}
 }
 
-// Used by fres and ps_res.
-inline double ApproximateReciprocal(double val)
-{
-	static const int expected_base[] = {
-		0x7ff800, 0x783800, 0x70ea00, 0x6a0800,
-		0x638800, 0x5d6200, 0x579000, 0x520800,
-		0x4cc800, 0x47ca00, 0x430800, 0x3e8000,
-		0x3a2c00, 0x360800, 0x321400, 0x2e4a00,
-		0x2aa800, 0x272c00, 0x23d600, 0x209e00,
-		0x1d8800, 0x1a9000, 0x17ae00, 0x14f800,
-		0x124400, 0x0fbe00, 0x0d3800, 0x0ade00,
-		0x088400, 0x065000, 0x041c00, 0x020c00,
-	};
-	static const int expected_dec[] = {
-		0x3e1, 0x3a7, 0x371, 0x340,
-		0x313, 0x2ea, 0x2c4, 0x2a0,
-		0x27f, 0x261, 0x245, 0x22a,
-		0x212, 0x1fb, 0x1e5, 0x1d1,
-		0x1be, 0x1ac, 0x19b, 0x18b,
-		0x17c, 0x16e, 0x15b, 0x15b,
-		0x143, 0x143, 0x12d, 0x12d,
-		0x11a, 0x11a, 0x108, 0x106,
-	};
-
-	union
-	{
-		double valf;
-		s64 vali;
-	};
-
-	valf = val;
-	s64 mantissa = vali & ((1LL << 52) - 1);
-	s64 sign = vali & (1ULL << 63);
-	s64 exponent = vali & (0x7FFLL << 52);
-
-	// Special case 0
-	if (mantissa == 0 && exponent == 0)
-		return sign ? -std::numeric_limits<double>::infinity() :
-		               std::numeric_limits<double>::infinity();
-
-	// Special case NaN-ish numbers
-	if (exponent == (0x7FFLL << 52))
-	{
-		if (mantissa == 0)
-			return sign ? -0.0 : 0.0;
-		return 0.0 + valf;
-	}
-
-	// Special case small inputs
-	if (exponent < (895LL << 52))
-		return sign ? -std::numeric_limits<float>::max() :
-		               std::numeric_limits<float>::max();
-
-	// Special case large inputs
-	if (exponent >= (1149LL << 52))
-		return sign ? -0.0f : 0.0f;
-
-	exponent = (0x7FDLL << 52) - exponent;
-
-	int i = (int)(mantissa >> 37);
-	vali = sign | exponent;
-	vali |= (s64)(expected_base[i / 1024] - (expected_dec[i / 1024] * (i % 1024) + 1) / 2) << 29;
-	return valf;
-}
-
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h
index 2d6c98534c..c0b5c73260 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit.h
+++ b/Source/Core/Core/PowerPC/Jit64/Jit.h
@@ -190,6 +190,7 @@ public:
 	void fmrx(UGeckoInstruction inst);
 	void frspx(UGeckoInstruction inst);
 	void frsqrtex(UGeckoInstruction inst);
+	void fresx(UGeckoInstruction inst);
 
 	void cmpXX(UGeckoInstruction inst);
 
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp
index 52743a4b40..fa7c19aec8 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp
@@ -324,7 +324,7 @@ static GekkoOPTemplate table59[] =
 	{20, &Jit64::fp_arith}, //"fsubsx", OPTYPE_FPU, FL_RC_BIT_F}},
 	{21, &Jit64::fp_arith}, //"faddsx", OPTYPE_FPU, FL_RC_BIT_F}},
 //	{22, &Jit64::FallBackToInterpreter}, //"fsqrtsx", OPTYPE_FPU, FL_RC_BIT_F}}, // Not implemented on gekko
-	{24, &Jit64::FallBackToInterpreter}, //"fresx", OPTYPE_FPU, FL_RC_BIT_F}},
+	{24, &Jit64::fresx}, //"fresx", OPTYPE_FPU, FL_RC_BIT_F}},
 	{25, &Jit64::fp_arith}, //"fmulsx", OPTYPE_FPU, FL_RC_BIT_F}},
 	{28, &Jit64::fmaddXX}, //"fmsubsx", OPTYPE_FPU, FL_RC_BIT_F}},
 	{29, &Jit64::fmaddXX}, //"fmaddsx", OPTYPE_FPU, FL_RC_BIT_F}},
diff --git a/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp b/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp
index ae5525beaa..2682ea80f2 100644
--- a/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp
@@ -151,6 +151,8 @@ void Jit64AsmRoutineManager::GenerateCommon()
 	GenFifoFloatWrite();
 	frsqrte = AlignCode4();
 	GenFrsqrte();
+	fres = AlignCode4();
+	GenFres();
 
 	GenQuantizedLoads();
 	GenQuantizedStores();
diff --git
a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
index eb9cee7b10..3c7a8d3d0a 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
@@ -386,3 +386,26 @@ void Jit64::frsqrtex(UGeckoInstruction inst)
 	fpr.UnlockAll();
 	gpr.UnlockAllX();
 }
+
+// fres: writes the reciprocal estimate of FB to FD using the shared asm routine.
+// Forms with Rc set go through FALLBACK_IF instead of being compiled here.
+void Jit64::fresx(UGeckoInstruction inst)
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITFloatingPointOff);
+	FALLBACK_IF(inst.Rc);
+	int b = inst.FB;
+	int d = inst.FD;
+
+	// The fres routine takes its operand and returns its result in XMM0 and
+	// clobbers EAX, ECX and EDX, so it requires ECX and EDX free.
+	gpr.FlushLockX(ECX, EDX);
+	fpr.Lock(b, d);
+	fpr.BindToRegister(d, d == b);
+	MOVSD(XMM0, fpr.R(b));
+	CALL((void *)asm_routines.fres);
+	MOVSD(fpr.R(d), XMM0);
+	SetFPRFIfNeeded(inst, fpr.RX(d));
+	fpr.UnlockAll();
+	gpr.UnlockAllX();
+}
diff --git a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp
index fdc1d6fdc6..7ab095bf36 100644
--- a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp
+++ b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp
@@ -117,6 +117,74 @@ void CommonAsmRoutines::GenFrsqrte()
 	RET();
 }
 
+// Emits the shared fres routine.
+// In/out: XMM0 holds the operand on entry and the estimate on return.
+// Normal-range inputs are handled inline with the fres tables; zero, tiny,
+// huge, infinite and NaN inputs call MathUtil::ApproximateReciprocal.
+void CommonAsmRoutines::GenFres()
+{
+	// Assume input in XMM0.
+	// This function clobbers EAX, ECX, and EDX.
+	MOVQ_xmm(R(RAX), XMM0);
+
+	// Zero inputs set an exception and take the complex path.
+	// Shift the sign bit out first so that -0.0 is caught as well as +0.0.
+	MOV(64, R(RCX), R(RAX));
+	SHL(64, R(RCX), Imm8(1));
+	FixupBranch zero = J_CC(CC_Z);
+
+	MOV(64, R(RCX), R(RAX));
+	SHR(64, R(RCX), Imm8(52));
+	MOV(32, R(EDX), R(ECX));
+	AND(32, R(ECX), Imm32(0x7FF)); // exp
+	AND(32, R(EDX), Imm32(0x800)); // sign
+	CMP(32, R(ECX), Imm32(895));
+	// Take the complex path for very large/small exponents.
+	FixupBranch complex1 = J_CC(CC_L);
+	CMP(32, R(ECX), Imm32(1149));
+	FixupBranch complex2 = J_CC(CC_GE);
+
+	SUB(32, R(ECX), Imm32(0x7FD));
+	NEG(32, R(ECX));
+	OR(32, R(ECX), R(EDX));
+	SHL(64, R(RCX), Imm8(52)); // vali = sign | exponent
+
+	MOV(64, R(RDX), R(RAX));
+	SHR(64, R(RAX), Imm8(37));
+	SHR(64, R(RDX), Imm8(47));
+	AND(32, R(EAX), Imm32(0x3FF)); // i % 1024
+	AND(32, R(EDX), Imm8(0x1F)); // i / 1024
+
+	// NOTE(review): the table addresses are truncated to 32 bits for the
+	// displacement, so this assumes the tables are mapped that low.
+	IMUL(32, EAX, MScaled(RDX, SCALE_4, (u32)(u64)MathUtil::fres_expected_dec));
+	ADD(32, R(EAX), Imm8(1));
+	SHR(32, R(EAX), Imm8(1));
+
+	MOV(32, R(EDX), MScaled(RDX, SCALE_4, (u32)(u64)MathUtil::fres_expected_base));
+	SUB(32, R(EDX), R(EAX));
+	SHL(64, R(RDX), Imm8(29));
+	OR(64, R(RDX), R(RCX)); // vali |= (s64)(fres_expected_base[i / 1024] - (fres_expected_dec[i / 1024] * (i % 1024) + 1) / 2) << 29
+	MOVQ_xmm(XMM0, R(RDX));
+	RET();
+
+	// Exception flags for zero input.
+	SetJumpTarget(zero);
+	TEST(32, M(&FPSCR), Imm32(FPSCR_ZX));
+	FixupBranch skip_set_fx1 = J_CC(CC_NZ);
+	OR(32, M(&FPSCR), Imm32(FPSCR_FX));
+	SetJumpTarget(skip_set_fx1);
+	OR(32, M(&FPSCR), Imm32(FPSCR_ZX));
+
+	// Slow path: XMM0 still holds the untouched operand for the C++ helper.
+	SetJumpTarget(complex1);
+	SetJumpTarget(complex2);
+	ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, false);
+	ABI_CallFunction((void *)&MathUtil::ApproximateReciprocal);
+	ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, false);
+	RET();
+}
+
 // Safe + Fast Quantizers, originally from JITIL by magumagu
 
 static const u8 GC_ALIGNED16(pbswapShuffle1x4[16]) = {3, 2, 1, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
diff --git a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h
index 938a14d9fd..1ae548bce1 100644
--- a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h
+++ b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h
@@ -25,6 +25,7 @@ public:
 	const u8 *doTiming;
 
 	const u8 *frsqrte;
+	const u8 *fres;
 
 	// In: array index: GQR to use.
 	// In: ECX: Address to read from.
@@ -59,5 +60,5 @@ public: void GenFifoXmm64Write(); void GenFifoFloatWrite(); void GenFrsqrte(); - + void GenFres(); };