diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStorePaired.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStorePaired.cpp
index e4c8c6afd8..b5f1e19a61 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStorePaired.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStorePaired.cpp
@@ -20,30 +20,31 @@ void Jit64::psq_st(UGeckoInstruction inst)
 {
 	INSTRUCTION_START
 	JITDISABLE(bJITLoadStorePairedOff);
-	FALLBACK_IF(js.memcheck || !inst.RA);
+	FALLBACK_IF(!inst.RA);
 	bool update = inst.OPCD == 61;
 	int offset = inst.SIMM_12;
 	int a = inst.RA;
-	int s = inst.RS; // Fp numbers
+	int s = inst.RS;
-	gpr.FlushLockX(RSCRATCH, RSCRATCH_EXTRA);
+	gpr.FlushLockX(RSCRATCH_EXTRA);
 	if (update)
-		gpr.BindToRegister(inst.RA, true, true);
-	fpr.BindToRegister(inst.RS, true, false);
-	MOV(32, R(RSCRATCH_EXTRA), gpr.R(inst.RA));
+		gpr.BindToRegister(a, true, true);
+	fpr.BindToRegister(s, true, false);
+	MOV(32, R(RSCRATCH_EXTRA), gpr.R(a));
 	if (offset)
 		ADD(32, R(RSCRATCH_EXTRA), Imm32((u32)offset));
-	if (update && offset)
+	// In memcheck mode, don't update the address until the exception check
+	if (update && offset && !js.memcheck)
 		MOV(32, gpr.R(a), R(RSCRATCH_EXTRA));
 	// Some games (e.g. Dirt 2) incorrectly set the unused bits which breaks the lookup table code.
 	// Hence, we need to mask out the unused bits. The layout of the GQR register is
 	// UU[SCALE]UUUUU[TYPE] where SCALE is 6 bits and TYPE is 3 bits, so we have to AND with
 	// 0b0011111100000111, or 0x3F07.
-	MOV(32, R(RSCRATCH), Imm32(0x3F07));
-	AND(32, R(RSCRATCH), PPCSTATE(spr[SPR_GQR0 + inst.I]));
-	MOVZX(32, 8, RSCRATCH2, R(RSCRATCH));
+	MOV(32, R(RSCRATCH2), Imm32(0x3F07));
+	AND(32, R(RSCRATCH2), PPCSTATE(spr[SPR_GQR0 + inst.I]));
+	MOVZX(32, 8, RSCRATCH, R(RSCRATCH2));
 	// FIXME: Fix ModR/M encoding to allow [RSCRATCH2*4+disp32] without a base register!
 	if (inst.W)
@@ -51,13 +52,20 @@ void Jit64::psq_st(UGeckoInstruction inst)
 	{
 		// One value
 		PXOR(XMM0, R(XMM0)); // TODO: See if we can get rid of this cheaply by tweaking the code in the singleStore* functions.
 		CVTSD2SS(XMM0, fpr.R(s));
-		CALLptr(MScaled(RSCRATCH2, SCALE_8, (u32)(u64)asm_routines.singleStoreQuantized));
+		CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)asm_routines.singleStoreQuantized));
 	}
 	else
 	{
 		// Pair of values
 		CVTPD2PS(XMM0, fpr.R(s));
-		CALLptr(MScaled(RSCRATCH2, SCALE_8, (u32)(u64)asm_routines.pairedStoreQuantized));
+		CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)asm_routines.pairedStoreQuantized));
+	}
+
+	if (update && offset && js.memcheck)
+	{
+		MEMCHECK_START
+		ADD(32, gpr.R(a), Imm32((u32)offset));
+		MEMCHECK_END
 	}
 	gpr.UnlockAll();
 	gpr.UnlockAllX();
@@ -67,33 +75,38 @@ void Jit64::psq_l(UGeckoInstruction inst)
 {
 	INSTRUCTION_START
 	JITDISABLE(bJITLoadStorePairedOff);
-	FALLBACK_IF(js.memcheck || !inst.RA);
+	FALLBACK_IF(!inst.RA);
 	bool update = inst.OPCD == 57;
 	int offset = inst.SIMM_12;
+	int a = inst.RA;
+	int s = inst.RS;
-	gpr.FlushLockX(RSCRATCH, RSCRATCH_EXTRA);
-	gpr.BindToRegister(inst.RA, true, update && offset);
-	fpr.BindToRegister(inst.RS, false, true);
+	gpr.FlushLockX(RSCRATCH_EXTRA);
+	gpr.BindToRegister(a, true, update && offset);
+	fpr.BindToRegister(s, false, true);
 	if (offset)
-		LEA(32, RSCRATCH_EXTRA, MDisp(gpr.RX(inst.RA), offset));
+		LEA(32, RSCRATCH_EXTRA, MDisp(gpr.RX(a), offset));
 	else
-		MOV(32, R(RSCRATCH_EXTRA), gpr.R(inst.RA));
-	if (update && offset)
-		MOV(32, gpr.R(inst.RA), R(RSCRATCH_EXTRA));
-	MOV(32, R(RSCRATCH), Imm32(0x3F07));
-	AND(32, R(RSCRATCH), M(((char *)&GQR(inst.I)) + 2));
-	MOVZX(32, 8, RSCRATCH2, R(RSCRATCH));
+		MOV(32, R(RSCRATCH_EXTRA), gpr.R(a));
+	// In memcheck mode, don't update the address until the exception check
+	if (update && offset && !js.memcheck)
+		MOV(32, gpr.R(a), R(RSCRATCH_EXTRA));
+	MOV(32, R(RSCRATCH2), Imm32(0x3F07));
+	AND(32, R(RSCRATCH2), M(((char *)&GQR(inst.I)) + 2));
+	MOVZX(32, 8, RSCRATCH, R(RSCRATCH2));
 	if (inst.W)
-		OR(32, R(RSCRATCH2), Imm8(8));
+		OR(32, R(RSCRATCH), Imm8(8));
-	CALLptr(MScaled(RSCRATCH2, SCALE_8, (u32)(u64)asm_routines.pairedLoadQuantized));
+	CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)asm_routines.pairedLoadQuantized));
-	// MEMCHECK_START // FIXME: MMU does not work here because of unsafe memory access
-
-	CVTPS2PD(fpr.RX(inst.RS), R(XMM0));
-
-	// MEMCHECK_END
+	MEMCHECK_START
+	CVTPS2PD(fpr.RX(s), R(XMM0));
+	if (update && offset && js.memcheck)
+	{
+		ADD(32, gpr.R(a), Imm32((u32)offset));
+	}
+	MEMCHECK_END
 	gpr.UnlockAll();
 	gpr.UnlockAllX();
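Note on the psq_st/psq_l changes above: with memchecks enabled, the rA writeback is deferred into a MEMCHECK_START/MEMCHECK_END region, which only executes when the access did not raise a DSI, so a faulting psq_stu/psq_lu leaves rA unmodified and can be retried. A rough C++ sketch of the intended ordering (DidMemoryException() is a made-up stand-in for whatever the MEMCHECK macros actually test, not a Dolphin API):

    u32 ea = gpr[a] + offset;
    StoreQuantizedPair(ea, ps[s]);                 // may fault when js.memcheck is set
    if (update && offset && !DidMemoryException())
        gpr[a] = ea;                               // commit rA only once the access is known good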
diff --git a/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp b/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp
index 695dab795e..8f6ce58408 100644
--- a/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp
+++ b/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp
@@ -1590,13 +1590,13 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress)
 			// Hence, we need to mask out the unused bits. The layout of the GQR register is
 			// UU[SCALE]UUUUU[TYPE] where SCALE is 6 bits and TYPE is 3 bits, so we have to AND with
 			// 0b0011111100000111, or 0x3F07.
-			Jit->MOV(32, R(RSCRATCH), Imm32(0x3F07));
-			Jit->AND(32, R(RSCRATCH), M(((char *)&GQR(quantreg)) + 2));
-			Jit->MOVZX(32, 8, RSCRATCH2, R(RSCRATCH));
-			Jit->OR(32, R(RSCRATCH2), Imm8(w << 3));
+			Jit->MOV(32, R(RSCRATCH2), Imm32(0x3F07));
+			Jit->AND(32, R(RSCRATCH2), M(((char *)&GQR(quantreg)) + 2));
+			Jit->MOVZX(32, 8, RSCRATCH, R(RSCRATCH2));
+			Jit->OR(32, R(RSCRATCH), Imm8(w << 3));
 			Jit->MOV(32, R(RSCRATCH_EXTRA), regLocForInst(RI, getOp1(I)));
-			Jit->CALLptr(MScaled(RSCRATCH2, SCALE_8, (u32)(u64)(((JitIL *)jit)->asm_routines.pairedLoadQuantized)));
+			Jit->CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)(((JitIL *)jit)->asm_routines.pairedLoadQuantized)));
 			Jit->MOVAPD(reg, R(XMM0));
 			RI.fregs[reg] = I;
 			regNormalRegClear(RI, I);
@@ -1641,13 +1641,13 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress)
 			regSpill(RI, RSCRATCH);
 			regSpill(RI, RSCRATCH2);
 			u32 quantreg = *I >> 24;
-			Jit->MOV(32, R(RSCRATCH), Imm32(0x3F07));
-			Jit->AND(32, R(RSCRATCH), PPCSTATE(spr[SPR_GQR0 + quantreg]));
-			Jit->MOVZX(32, 8, RSCRATCH2, R(RSCRATCH));
+			Jit->MOV(32, R(RSCRATCH2), Imm32(0x3F07));
+			Jit->AND(32, R(RSCRATCH2), PPCSTATE(spr[SPR_GQR0 + quantreg]));
+			Jit->MOVZX(32, 8, RSCRATCH, R(RSCRATCH2));
 			Jit->MOV(32, R(RSCRATCH_EXTRA), regLocForInst(RI, getOp2(I)));
 			Jit->MOVAPD(XMM0, fregLocForInst(RI, getOp1(I)));
-			Jit->CALLptr(MScaled(RSCRATCH2, SCALE_8, (u32)(u64)(((JitIL *)jit)->asm_routines.pairedStoreQuantized)));
+			Jit->CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)(((JitIL *)jit)->asm_routines.pairedStoreQuantized)));
 			if (RI.IInfo[I - RI.FirstI] & 4)
 				fregClearInst(RI, getOp1(I));
 			if (RI.IInfo[I - RI.FirstI] & 8)
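Both JITs now build the dispatch index the same way, with RSCRATCH and RSCRATCH2 swapped so the full masked GQR survives in RSCRATCH2 for the called routine. A rough C++ equivalent of the sequence (variable names are descriptive only, not Dolphin identifiers):

    u32 gqr   = GQR(i) & 0x3F07;   // RSCRATCH2: SCALE in bits 8..13, TYPE in bits 0..2
    u32 index = gqr & 0xFF;        // RSCRATCH: the low byte, i.e. the 3-bit type
    if (w)
        index |= 8;                // the single-value routines sit eight slots later
    pairedLoadQuantized[index]();  // CALLptr with SCALE_8: each table entry is an 8-byte pointer
    u32 scale_offset = gqr >> 6;   // inside the routine: scale * 4, a byte offset into m_(de)quantizeTableS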
diff --git a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp
index dbce5dfb85..8a5e7dcfe5 100644
--- a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp
+++ b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp
@@ -17,6 +17,8 @@
 	(1 << (XMM0+16)) | \
 	(1 << (XMM1+16))))
+#define QUANTIZED_REGS_TO_SAVE_LOAD (QUANTIZED_REGS_TO_SAVE | (1 << RSCRATCH2))
+
 using namespace Gen;
 static int temp32;
@@ -250,24 +252,29 @@ void CommonAsmRoutines::GenQuantizedStores()
 	UD2();
 	const u8* storePairedFloat = AlignCode4();
+	FixupBranch skip_complex, too_complex;
 	SHUFPS(XMM0, R(XMM0), 1);
 	MOVQ_xmm(M(&psTemp[0]), XMM0);
-	TEST(32, R(RSCRATCH_EXTRA), Imm32(0x0C000000));
-	FixupBranch too_complex = J_CC(CC_NZ, true);
-	MOV(64, R(RSCRATCH), M(&psTemp[0]));
-	SwapAndStore(64, MComplex(RMEM, RSCRATCH_EXTRA, SCALE_1, 0), RSCRATCH);
-	FixupBranch skip_complex = J(true);
-	SetJumpTarget(too_complex);
+	if (!jit->js.memcheck)
+	{
+		TEST(32, R(RSCRATCH_EXTRA), Imm32(0x0C000000));
+		too_complex = J_CC(CC_NZ, true);
+		MOV(64, R(RSCRATCH), M(&psTemp[0]));
+		SwapAndStore(64, MComplex(RMEM, RSCRATCH_EXTRA, SCALE_1, 0), RSCRATCH);
+		skip_complex = J(true);
+		SetJumpTarget(too_complex);
+	}
 	// RSP alignment here is 8 due to the call.
 	ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
 	ABI_CallFunctionR((void *)&WriteDual32, RSCRATCH_EXTRA);
 	ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
-	SetJumpTarget(skip_complex);
+	if (!jit->js.memcheck)
+		SetJumpTarget(skip_complex);
 	RET();
 	const u8* storePairedU8 = AlignCode4();
-	SHR(32, R(RSCRATCH), Imm8(6));
-	MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_quantizeTableS));
+	SHR(32, R(RSCRATCH2), Imm8(6));
+	MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
 	PUNPCKLDQ(XMM1, R(XMM1));
 	MULPS(XMM0, R(XMM1));
 #ifdef QUANTIZE_OVERFLOW_SAFE
@@ -284,8 +291,8 @@ void CommonAsmRoutines::GenQuantizedStores()
 	RET();
 	const u8* storePairedS8 = AlignCode4();
-	SHR(32, R(RSCRATCH), Imm8(6));
-	MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_quantizeTableS));
+	SHR(32, R(RSCRATCH2), Imm8(6));
+	MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
 	PUNPCKLDQ(XMM1, R(XMM1));
 	MULPS(XMM0, R(XMM1));
 #ifdef QUANTIZE_OVERFLOW_SAFE
@@ -303,8 +310,8 @@ void CommonAsmRoutines::GenQuantizedStores()
 	RET();
 	const u8* storePairedU16 = AlignCode4();
-	SHR(32, R(RSCRATCH), Imm8(6));
-	MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_quantizeTableS));
+	SHR(32, R(RSCRATCH2), Imm8(6));
+	MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
 	PUNPCKLDQ(XMM1, R(XMM1));
 	MULPS(XMM0, R(XMM1));
@@ -329,8 +336,8 @@ void CommonAsmRoutines::GenQuantizedStores()
 	RET();
 	const u8* storePairedS16 = AlignCode4();
-	SHR(32, R(RSCRATCH), Imm8(6));
-	MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_quantizeTableS));
+	SHR(32, R(RSCRATCH2), Imm8(6));
+	MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
 	// SHUFPS or UNPCKLPS might be a better choice here. The last one might just be an alias though.
 	PUNPCKLDQ(XMM1, R(XMM1));
 	MULPS(XMM0, R(XMM1));
@@ -388,8 +395,8 @@ void CommonAsmRoutines::GenQuantizedSingleStores()
 	}*/
 	const u8* storeSingleU8 = AlignCode4(); // Used by MKWii
-	SHR(32, R(RSCRATCH), Imm8(6));
-	MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_quantizeTableS));
+	SHR(32, R(RSCRATCH2), Imm8(6));
+	MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
 	MULSS(XMM0, R(XMM1));
 	PXOR(XMM1, R(XMM1));
 	MAXSS(XMM0, R(XMM1));
@@ -399,8 +406,8 @@ void CommonAsmRoutines::GenQuantizedSingleStores()
 	RET();
 	const u8* storeSingleS8 = AlignCode4();
-	SHR(32, R(RSCRATCH), Imm8(6));
-	MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_quantizeTableS));
+	SHR(32, R(RSCRATCH2), Imm8(6));
+	MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
 	MULSS(XMM0, R(XMM1));
 	MAXSS(XMM0, M((void *)&m_m128));
 	MINSS(XMM0, M((void *)&m_127));
@@ -409,8 +416,8 @@ void CommonAsmRoutines::GenQuantizedSingleStores()
 	RET();
 	const u8* storeSingleU16 = AlignCode4(); // Used by MKWii
-	SHR(32, R(RSCRATCH), Imm8(6));
-	MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_quantizeTableS));
+	SHR(32, R(RSCRATCH2), Imm8(6));
+	MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
 	MULSS(XMM0, R(XMM1));
 	PXOR(XMM1, R(XMM1));
 	MAXSS(XMM0, R(XMM1));
@@ -420,8 +427,8 @@ void CommonAsmRoutines::GenQuantizedSingleStores()
 	RET();
 	const u8* storeSingleS16 = AlignCode4();
-	SHR(32, R(RSCRATCH), Imm8(6));
-	MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_quantizeTableS));
+	SHR(32, R(RSCRATCH2), Imm8(6));
+	MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
 	MULSS(XMM0, R(XMM1));
 	MAXSS(XMM0, M((void *)&m_m32768));
 	MINSS(XMM0, M((void *)&m_32767));
@@ -448,7 +455,13 @@ void CommonAsmRoutines::GenQuantizedLoads()
 	UD2();
 	const u8* loadPairedFloatTwo = AlignCode4();
-	if (cpu_info.bSSSE3)
+	if (jit->js.memcheck)
+	{
+		SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 64, 0, QUANTIZED_REGS_TO_SAVE, false, SAFE_LOADSTORE_NO_PROLOG);
+		ROL(64, R(RSCRATCH_EXTRA), Imm8(32));
+		MOVQ_xmm(XMM0, R(RSCRATCH_EXTRA));
+	}
+	else if (cpu_info.bSSSE3)
 	{
 		MOVQ_xmm(XMM0, MComplex(RMEM, RSCRATCH_EXTRA, 1, 0));
 		PSHUFB(XMM0, M((void *)pbswapShuffle2x4));
@@ -462,7 +475,13 @@ void CommonAsmRoutines::GenQuantizedLoads()
 	RET();
 	const u8* loadPairedFloatOne = AlignCode4();
-	if (cpu_info.bSSSE3)
+	if (jit->js.memcheck)
+	{
+		SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE, false, SAFE_LOADSTORE_NO_PROLOG);
+		MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
+		UNPCKLPS(XMM0, M((void*)m_one));
+	}
+	else if (cpu_info.bSSSE3)
 	{
 		MOVD_xmm(XMM0, MComplex(RMEM, RSCRATCH_EXTRA, 1, 0));
 		PSHUFB(XMM0, M((void *)pbswapShuffle1x4));
@@ -477,99 +496,130 @@ void CommonAsmRoutines::GenQuantizedLoads()
 	RET();
 	const u8* loadPairedU8Two = AlignCode4();
-	UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0);
+	if (jit->js.memcheck)
+	{
+		// TODO: Support not swapping in safeLoadToReg to avoid bswapping twice
+		SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_PROLOG);
+		ROR(16, R(RSCRATCH_EXTRA), Imm8(8));
+	}
+	else
+	{
+		UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0);
+	}
 	MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
 	PXOR(XMM1, R(XMM1));
 	PUNPCKLBW(XMM0, R(XMM1));
 	PUNPCKLWD(XMM0, R(XMM1));
 	CVTDQ2PS(XMM0, R(XMM0));
-	SHR(32, R(RSCRATCH), Imm8(6));
-	MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_dequantizeTableS));
+	SHR(32, R(RSCRATCH2), Imm8(6));
+	MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
 	PUNPCKLDQ(XMM1, R(XMM1));
 	MULPS(XMM0, R(XMM1));
 	RET();
 	const u8* loadPairedU8One = AlignCode4();
-	UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0); // RSCRATCH_EXTRA = 0x000000xx
+	if (jit->js.memcheck)
+		SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 8, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_PROLOG);
+	else
+		UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0); // RSCRATCH_EXTRA = 0x000000xx
 	MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
 	CVTDQ2PS(XMM0, R(XMM0)); // Is CVTSI2SS better?
-	SHR(32, R(RSCRATCH), Imm8(6));
-	MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_dequantizeTableS));
+	SHR(32, R(RSCRATCH2), Imm8(6));
+	MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
 	MULSS(XMM0, R(XMM1));
 	UNPCKLPS(XMM0, M((void*)m_one));
 	RET();
 	const u8* loadPairedS8Two = AlignCode4();
-	UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0);
+	if (jit->js.memcheck)
+	{
+		// TODO: Support not swapping in safeLoadToReg to avoid bswapping twice
+		SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_PROLOG);
+		ROR(16, R(RSCRATCH_EXTRA), Imm8(8));
+	}
+	else
+	{
+		UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0);
+	}
 	MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
 	PUNPCKLBW(XMM0, R(XMM0));
 	PUNPCKLWD(XMM0, R(XMM0));
 	PSRAD(XMM0, 24);
 	CVTDQ2PS(XMM0, R(XMM0));
-	SHR(32, R(RSCRATCH), Imm8(6));
-	MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_dequantizeTableS));
+	SHR(32, R(RSCRATCH2), Imm8(6));
+	MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
 	PUNPCKLDQ(XMM1, R(XMM1));
 	MULPS(XMM0, R(XMM1));
 	RET();
 	const u8* loadPairedS8One = AlignCode4();
-	UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0);
-	SHL(32, R(RSCRATCH_EXTRA), Imm8(24));
-	SAR(32, R(RSCRATCH_EXTRA), Imm8(24));
+	if (jit->js.memcheck)
+		SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 8, 0, QUANTIZED_REGS_TO_SAVE_LOAD, true, SAFE_LOADSTORE_NO_PROLOG);
+	else
+		UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0, true);
 	MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
 	CVTDQ2PS(XMM0, R(XMM0));
-	SHR(32, R(RSCRATCH), Imm8(6));
-	MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_dequantizeTableS));
+	SHR(32, R(RSCRATCH2), Imm8(6));
+	MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
 	MULSS(XMM0, R(XMM1));
 	UNPCKLPS(XMM0, M((void*)m_one));
 	RET();
 	const u8* loadPairedU16Two = AlignCode4();
-	UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false);
+	// TODO: Support not swapping in (un)safeLoadToReg to avoid bswapping twice
+	if (jit->js.memcheck)
+		SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_PROLOG);
+	else
+		UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false);
 	ROL(32, R(RSCRATCH_EXTRA), Imm8(16));
 	MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
 	PXOR(XMM1, R(XMM1));
 	PUNPCKLWD(XMM0, R(XMM1));
 	CVTDQ2PS(XMM0, R(XMM0));
-	SHR(32, R(RSCRATCH), Imm8(6));
-	MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_dequantizeTableS));
+	SHR(32, R(RSCRATCH2), Imm8(6));
+	MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
 	PUNPCKLDQ(XMM1, R(XMM1));
 	MULPS(XMM0, R(XMM1));
 	RET();
 	const u8* loadPairedU16One = AlignCode4();
-	UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false);
-	SHR(32, R(RSCRATCH_EXTRA), Imm8(16));
+	if (jit->js.memcheck)
+		SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_PROLOG);
+	else
+		UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, false);
 	MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
 	CVTDQ2PS(XMM0, R(XMM0));
-	SHR(32, R(RSCRATCH), Imm8(6));
-	MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_dequantizeTableS));
+	SHR(32, R(RSCRATCH2), Imm8(6));
+	MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
 	MULSS(XMM0, R(XMM1));
 	UNPCKLPS(XMM0, M((void*)m_one));
 	RET();
 	const u8* loadPairedS16Two = AlignCode4();
-	UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false);
+	if (jit->js.memcheck)
+		SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_PROLOG);
+	else
+		UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false);
 	ROL(32, R(RSCRATCH_EXTRA), Imm8(16));
 	MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
 	PUNPCKLWD(XMM0, R(XMM0));
 	PSRAD(XMM0, 16);
 	CVTDQ2PS(XMM0, R(XMM0));
-	SHR(32, R(RSCRATCH), Imm8(6));
-	AND(32, R(RSCRATCH), Imm32(0xFC));
-	MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_dequantizeTableS));
+	SHR(32, R(RSCRATCH2), Imm8(6));
+	MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
 	PUNPCKLDQ(XMM1, R(XMM1));
 	MULPS(XMM0, R(XMM1));
 	RET();
 	const u8* loadPairedS16One = AlignCode4();
-	UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false);
-	SAR(32, R(RSCRATCH_EXTRA), Imm8(16));
+	if (jit->js.memcheck)
+		SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, true, SAFE_LOADSTORE_NO_PROLOG);
+	else
+		UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, true);
 	MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
 	CVTDQ2PS(XMM0, R(XMM0));
-	SHR(32, R(RSCRATCH), Imm8(6));
-	AND(32, R(RSCRATCH), Imm32(0xFC));
-	MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_dequantizeTableS));
+	SHR(32, R(RSCRATCH2), Imm8(6));
+	MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
 	MULSS(XMM0, R(XMM1));
 	UNPCKLPS(XMM0, M((void*)m_one));
 	RET();
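Two details of the quantized-load changes above are easy to miss. QUANTIZED_REGS_TO_SAVE_LOAD adds RSCRATCH2 to the save set because the integer variants still need the masked GQR after the SafeLoadToReg call-out, for the dequantize-table lookup. And since SafeLoadToReg hands back a value already swapped to host order while the SIMD unpack code that follows expects raw guest byte order, the memcheck paths rotate the value back (the ROR(16, 8) and ROL(64, 32) above, and the TODO about bswapping twice). An illustrative sketch with made-up helper names:

    u64 v = ReadBE64(addr);      // roughly what SafeLoadToReg(..., 64, ...) leaves behind:
                                 // ps0 in the upper 32 bits, ps1 in the lower 32 bits
    v = (v << 32) | (v >> 32);   // ROL(64, 32): ps0 back in the low half, so MOVQ_xmm
                                 // puts it in lane 0 just like the SSSE3/unsafe path
    // the 16-bit pair loads undo the swap the same way with ROR(16, 8) before unpacking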
diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
index c12b9fedcf..ae445d8575 100644
--- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
+++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
@@ -61,9 +61,12 @@ void EmuCodeBlock::UnsafeLoadRegToReg(X64Reg reg_addr, X64Reg reg_value, int acc
 	}
 }
-void EmuCodeBlock::UnsafeLoadRegToRegNoSwap(X64Reg reg_addr, X64Reg reg_value, int accessSize, s32 offset)
+void EmuCodeBlock::UnsafeLoadRegToRegNoSwap(X64Reg reg_addr, X64Reg reg_value, int accessSize, s32 offset, bool signExtend)
 {
-	MOVZX(32, accessSize, reg_value, MComplex(RMEM, reg_addr, SCALE_1, offset));
+	if (signExtend)
+		MOVSX(32, accessSize, reg_value, MComplex(RMEM, reg_addr, SCALE_1, offset));
+	else
+		MOVZX(32, accessSize, reg_value, MComplex(RMEM, reg_addr, SCALE_1, offset));
 }
 u8 *EmuCodeBlock::UnsafeLoadToReg(X64Reg reg_value, OpArg opAddress, int accessSize, s32 offset, bool signExtend)
@@ -350,7 +353,8 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress,
 		FixupBranch fast = J_CC(CC_Z, true);
-		ABI_PushRegistersAndAdjustStack(registersInUse, 0);
+		size_t rsp_alignment = (flags & SAFE_LOADSTORE_NO_PROLOG) ? 8 : 0;
+		ABI_PushRegistersAndAdjustStack(registersInUse, rsp_alignment);
 		switch (accessSize)
 		{
 		case 64:
@@ -366,7 +370,7 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress,
 			ABI_CallFunctionA((void *)&Memory::Read_U8_ZX, addr_loc);
 			break;
 		}
-		ABI_PopRegistersAndAdjustStack(registersInUse, 0);
+		ABI_PopRegistersAndAdjustStack(registersInUse, rsp_alignment);
 		MEMCHECK_START
diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
index e50eedf08f..164ef03f0f 100644
--- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
+++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
@@ -40,7 +40,7 @@ public:
 	void SwapAndStore(int size, const Gen::OpArg& dst, Gen::X64Reg src);
 	void UnsafeLoadRegToReg(Gen::X64Reg reg_addr, Gen::X64Reg reg_value, int accessSize, s32 offset = 0, bool signExtend = false);
-	void UnsafeLoadRegToRegNoSwap(Gen::X64Reg reg_addr, Gen::X64Reg reg_value, int accessSize, s32 offset);
+	void UnsafeLoadRegToRegNoSwap(Gen::X64Reg reg_addr, Gen::X64Reg reg_value, int accessSize, s32 offset, bool signExtend = false);
 	// these return the address of the MOV, for backpatching
 	u8 *UnsafeWriteRegToReg(Gen::X64Reg reg_value, Gen::X64Reg reg_addr, int accessSize, s32 offset = 0, bool swap = true);
 	u8 *UnsafeLoadToReg(Gen::X64Reg reg_value, Gen::OpArg opAddress, int accessSize, s32 offset, bool signExtend);
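The rsp_alignment change exists because SafeLoadToReg's slow path is now reached in two stack states: emitted inline in a JIT block, where RSP is still 16-byte aligned, or inside one of the quantized-load routines, which were entered via CALLptr and therefore sit at RSP % 16 == 8 (the "RSP alignment here is 8 due to the call" comment in JitAsmCommon.cpp). Passing SAFE_LOADSTORE_NO_PROLOG tells the helper about the extra 8 bytes so the C++ read function is still called on an aligned stack. A hedged sketch of the resulting pattern:

    size_t rsp_alignment = (flags & SAFE_LOADSTORE_NO_PROLOG) ? 8 : 0;
    ABI_PushRegistersAndAdjustStack(registersInUse, rsp_alignment);  // pad so the callee sees RSP % 16 == 0
    ABI_CallFunctionA((void *)&Memory::Read_U32, addr_loc);          // example slow-path read
    ABI_PopRegistersAndAdjustStack(registersInUse, rsp_alignment);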