diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/PowerPC/Jit64/Jit.cpp index 7412489948..b243e6bb90 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp @@ -132,8 +132,6 @@ ps_adds1 */ -static int CODE_SIZE = 1024*1024*32; - void Jit64::Init() { jo.optimizeStack = true; @@ -169,10 +167,13 @@ void Jit64::Init() trampolines.Init(); AllocCodeSpace(CODE_SIZE); - blocks.Init(); asm_routines.Init(); + // important: do this *after* generating the global asm routines, because we can't use farcode in them. + // it'll crash because the farcode functions get cleared on JIT clears. + farcode.Init(js.memcheck ? FARCODE_SIZE_MMU : FARCODE_SIZE); + code_block.m_stats = &js.st; code_block.m_gpa = &js.gpa; code_block.m_fpa = &js.fpa; @@ -183,6 +184,7 @@ void Jit64::ClearCache() { blocks.Clear(); trampolines.ClearCodeSpace(); + farcode.ClearCodeSpace(); ClearCodeSpace(); } @@ -193,6 +195,7 @@ void Jit64::Shutdown() blocks.Shutdown(); trampolines.Shutdown(); asm_routines.Shutdown(); + farcode.Shutdown(); } // This is only called by FallBackToInterpreter() in this file. It will execute an instruction with the interpreter functions. @@ -372,7 +375,8 @@ void Jit64::Trace() void STACKALIGN Jit64::Jit(u32 em_address) { - if (GetSpaceLeft() < 0x10000 || blocks.IsFull() || SConfig::GetInstance().m_LocalCoreStartupParameter.bJITNoBlockCache) + if (GetSpaceLeft() < 0x10000 || farcode.GetSpaceLeft() < 0x10000 || blocks.IsFull() || + SConfig::GetInstance().m_LocalCoreStartupParameter.bJITNoBlockCache) { ClearCache(); } @@ -525,12 +529,13 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc { if ((opinfo->flags & FL_USE_FPU) && !js.firstFPInstructionFound) { - gpr.Flush(); - fpr.Flush(); - //This instruction uses FPU - needs to add FP exception bailout TEST(32, PPCSTATE(msr), Imm32(1 << 13)); // Test FP enabled bit - FixupBranch b1 = J_CC(CC_NZ, true); + FixupBranch b1 = J_CC(CC_Z, true); + SwitchToFarCode(); + SetJumpTarget(b1); + gpr.Flush(FLUSH_MAINTAIN_STATE); + fpr.Flush(FLUSH_MAINTAIN_STATE); // If a FPU exception occurs, the exception handler will read // from PC. Update PC with the latest value in case that happens. @@ -538,32 +543,34 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc OR(32, PPCSTATE(Exceptions), Imm32(EXCEPTION_FPU_UNAVAILABLE)); WriteExceptionExit(); - SetJumpTarget(b1); - + SwitchToNearCode(); js.firstFPInstructionFound = true; } // Add an external exception check if the instruction writes to the FIFO. 
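The FPU-availability guard that now lives in the far cache behaves roughly like the following illustrative sketch (a plain C++ behavioral model, not part of the patch and not the emitted x86; apart from the 1 << 13 MSR test, the names and constants are stand-ins):

#include <cstdint>

// Stand-in constants; only the MSR FP bit (1 << 13) mirrors the TEST emitted above.
constexpr uint32_t MSR_FP = 1u << 13;
constexpr uint32_t EXC_FPU_UNAVAILABLE = 0x800;  // placeholder for EXCEPTION_FPU_UNAVAILABLE

struct CpuState { uint32_t msr, pc, exceptions; };

// Returns true if the block may keep executing FP code; false if it must exit to the handler.
// The emitted version keeps the "true" case as straight-line near code and places the
// exception case (flush registers with FLUSH_MAINTAIN_STATE, write PC, exit) in farcode.
bool CheckFpuAvailable(CpuState& cpu, uint32_t instr_address)
{
    if (cpu.msr & MSR_FP)
        return true;
    cpu.pc = instr_address;                 // the exception handler reads PC
    cpu.exceptions |= EXC_FPU_UNAVAILABLE;  // request the FPU-unavailable exception
    return false;
}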
if (jit->js.fifoWriteAddresses.find(ops[i].address) != jit->js.fifoWriteAddresses.end()) { - gpr.Flush(); - fpr.Flush(); - TEST(32, PPCSTATE(Exceptions), Imm32(EXCEPTION_ISI | EXCEPTION_PROGRAM | EXCEPTION_SYSCALL | EXCEPTION_FPU_UNAVAILABLE | EXCEPTION_DSI | EXCEPTION_ALIGNMENT)); - FixupBranch clearInt = J_CC(CC_NZ, true); + FixupBranch clearInt = J_CC(CC_NZ); TEST(32, PPCSTATE(Exceptions), Imm32(EXCEPTION_EXTERNAL_INT)); - FixupBranch noExtException = J_CC(CC_Z, true); + FixupBranch extException = J_CC(CC_NZ, true); + SwitchToFarCode(); + SetJumpTarget(extException); TEST(32, PPCSTATE(msr), Imm32(0x0008000)); FixupBranch noExtIntEnable = J_CC(CC_Z, true); TEST(32, M((void *)&ProcessorInterface::m_InterruptCause), Imm32(ProcessorInterface::INT_CAUSE_CP | ProcessorInterface::INT_CAUSE_PE_TOKEN | ProcessorInterface::INT_CAUSE_PE_FINISH)); FixupBranch noCPInt = J_CC(CC_Z, true); + gpr.Flush(FLUSH_MAINTAIN_STATE); + fpr.Flush(FLUSH_MAINTAIN_STATE); + MOV(32, PPCSTATE(pc), Imm32(ops[i].address)); WriteExternalExceptionExit(); + SwitchToNearCode(); + SetJumpTarget(noCPInt); SetJumpTarget(noExtIntEnable); - SetJumpTarget(noExtException); SetJumpTarget(clearInt); } @@ -585,18 +592,20 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc if (js.memcheck && (opinfo->flags & FL_LOADSTORE)) { - // In case we are about to jump to the dispatcher, flush regs - gpr.Flush(); - fpr.Flush(); - TEST(32, PPCSTATE(Exceptions), Imm32(EXCEPTION_DSI)); - FixupBranch noMemException = J_CC(CC_Z, true); + FixupBranch memException = J_CC(CC_NZ, true); + + SwitchToFarCode(); + SetJumpTarget(memException); + + gpr.Flush(FLUSH_MAINTAIN_STATE); + fpr.Flush(FLUSH_MAINTAIN_STATE); // If a memory exception occurs, the exception handler will read // from PC. Update PC with the latest value in case that happens. MOV(32, PPCSTATE(pc), Imm32(ops[i].address)); WriteExceptionExit(); - SetJumpTarget(noMemException); + SwitchToNearCode(); } if (opinfo->flags & FL_LOADSTORE) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp index a31da48fa9..bf44c1ecbc 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp @@ -238,7 +238,7 @@ void Jit64::lXXx(UGeckoInstruction inst) if (update && storeAddress) { gpr.BindToRegister(a, true, true); - MEMCHECK_START + MEMCHECK_START(false) MOV(32, gpr.R(a), opAddress); MEMCHECK_END } @@ -279,18 +279,20 @@ void Jit64::dcbz(UGeckoInstruction inst) ADD(32, R(RSCRATCH), gpr.R(a)); AND(32, R(RSCRATCH), Imm32(~31)); TEST(32, R(RSCRATCH), Imm32(mem_mask)); - FixupBranch fast = J_CC(CC_Z, true); + FixupBranch slow = J_CC(CC_NZ, true); // Should this code ever run? I can't find any games that use DCBZ on non-physical addresses, but // supposedly there are, at least for some MMU titles. Let's be careful and support it to be sure. 
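The external-interrupt probe emitted at FIFO writes reduces to the predicate below (illustrative sketch, not part of the patch; the 0x8000 MSR[EE] test mirrors the emitted code, while the mask parameters stand in for the exception and interrupt-cause constants used above):

#include <cstdint>

struct IntState
{
    uint32_t exceptions;       // pending PowerPC exceptions
    uint32_t msr;              // machine state register
    uint32_t interrupt_cause;  // ProcessorInterface cause bits
};

// Returns true if the block should exit through the external-exception path.
bool ShouldExitForExternalInterrupt(const IntState& s,
                                    uint32_t other_pending_mask,  // ISI|PROGRAM|SYSCALL|FPU|DSI|ALIGNMENT
                                    uint32_t external_int_bit,    // EXCEPTION_EXTERNAL_INT
                                    uint32_t gpu_cause_mask)      // INT_CAUSE_CP|PE_TOKEN|PE_FINISH
{
    if (s.exceptions & other_pending_mask)
        return false;                                  // the clearInt branch: another exception is already pending
    if (!(s.exceptions & external_int_bit))
        return false;                                  // nothing external pending: fall through in near code
    if (!(s.msr & 0x8000))
        return false;                                  // MSR[EE] clear: external interrupts disabled
    return (s.interrupt_cause & gpu_cause_mask) != 0;  // only exit for the GPU-related causes tested above
}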
+ SwitchToFarCode(); + SetJumpTarget(slow); MOV(32, M(&PC), Imm32(jit->js.compilerPC)); u32 registersInUse = CallerSavedRegistersInUse(); ABI_PushRegistersAndAdjustStack(registersInUse, 0); ABI_CallFunctionR((void *)&Memory::ClearCacheLine, RSCRATCH); ABI_PopRegistersAndAdjustStack(registersInUse, 0); + FixupBranch exit = J(true); - FixupBranch exit = J(); - SetJumpTarget(fast); + SwitchToNearCode(); PXOR(XMM0, R(XMM0)); MOVAPS(MComplex(RMEM, RSCRATCH, SCALE_1, 0), XMM0); MOVAPS(MComplex(RMEM, RSCRATCH, SCALE_1, 16), XMM0); @@ -411,7 +413,7 @@ void Jit64::stX(UGeckoInstruction inst) if (update && offset) { - MEMCHECK_START + MEMCHECK_START(false) gpr.KillImmediate(a, true, true); ADD(32, gpr.R(a), Imm32((u32)offset)); @@ -433,10 +435,11 @@ void Jit64::stXx(UGeckoInstruction inst) int a = inst.RA, b = inst.RB, s = inst.RS; FALLBACK_IF(!a || a == s || a == b); + bool update = !!(inst.SUBOP10 & 32); gpr.Lock(a, b, s); - if (inst.SUBOP10 & 32) + if (update) { gpr.BindToRegister(a, true, true); ADD(32, gpr.R(a), gpr.R(b)); @@ -483,6 +486,14 @@ void Jit64::stXx(UGeckoInstruction inst) } SafeWriteRegToReg(reg_value, RSCRATCH2, accessSize, 0, CallerSavedRegistersInUse()); + if (update && js.memcheck) + { + // revert the address change if an exception occurred + MEMCHECK_START(true) + SUB(32, gpr.R(a), gpr.R(b)); + MEMCHECK_END; + } + gpr.UnlockAll(); gpr.UnlockAllX(); } diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp index 687327bb37..4e35c13caa 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp @@ -27,13 +27,18 @@ void Jit64::lfXXX(UGeckoInstruction inst) int a = inst.RA; int b = inst.RB; - FALLBACK_IF(!indexed && !a); + FALLBACK_IF((!indexed && !a) || (update && a == d)); if (update) gpr.BindToRegister(a, true, true); s32 offset = 0; OpArg addr = gpr.R(a); + if (update && js.memcheck) + { + addr = R(RSCRATCH2); + MOV(32, addr, gpr.R(a)); + } if (indexed) { if (update) @@ -58,14 +63,17 @@ void Jit64::lfXXX(UGeckoInstruction inst) if (update) ADD(32, addr, Imm32((s32)(s16)inst.SIMM_16)); else - offset = (s32)(s16)inst.SIMM_16; + offset = (s16)inst.SIMM_16; } - SafeLoadToReg(RSCRATCH, addr, single ? 32 : 64, offset, CallerSavedRegistersInUse(), false); + u32 registersInUse = CallerSavedRegistersInUse(); + if (update && js.memcheck) + registersInUse |= (1 << RSCRATCH2); + SafeLoadToReg(RSCRATCH, addr, single ? 
32 : 64, offset, registersInUse, false); fpr.Lock(d); fpr.BindToRegister(d, js.memcheck || !single); - MEMCHECK_START + MEMCHECK_START(false) if (single) { ConvertSingleToDouble(fpr.RX(d), RSCRATCH, true); @@ -75,6 +83,8 @@ void Jit64::lfXXX(UGeckoInstruction inst) MOVQ_xmm(XMM0, R(RSCRATCH)); MOVSD(fpr.RX(d), R(XMM0)); } + if (update && js.memcheck) + MOV(32, gpr.R(a), addr); MEMCHECK_END fpr.UnlockAll(); gpr.UnlockAll(); @@ -93,9 +103,10 @@ void Jit64::stfXXX(UGeckoInstruction inst) int a = inst.RA; int b = inst.RB; - FALLBACK_IF(!indexed && !a); + FALLBACK_IF((!indexed && !a) || (update && (a == s || a == b))); s32 offset = 0; + s32 imm = (s16)inst.SIMM_16; if (indexed) { if (update) @@ -121,11 +132,11 @@ void Jit64::stfXXX(UGeckoInstruction inst) if (update) { gpr.BindToRegister(a, true, true); - ADD(32, gpr.R(a), Imm32((s32)(s16)inst.SIMM_16)); + ADD(32, gpr.R(a), Imm32(imm)); } else { - offset = (s32)(s16)inst.SIMM_16; + offset = imm; } MOV(32, R(RSCRATCH2), gpr.R(a)); } @@ -145,6 +156,13 @@ void Jit64::stfXXX(UGeckoInstruction inst) MOV(64, R(RSCRATCH), fpr.R(s)); SafeWriteRegToReg(RSCRATCH, RSCRATCH2, 64, offset, CallerSavedRegistersInUse()); } + if (js.memcheck && update) + { + // revert the address change if an exception occurred + MEMCHECK_START(true) + SUB(32, gpr.R(a), indexed ? gpr.R(b) : Imm32(imm)); + MEMCHECK_END + } gpr.UnlockAll(); gpr.UnlockAllX(); } diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStorePaired.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStorePaired.cpp index e4c8c6afd8..5f87d22ecb 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStorePaired.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStorePaired.cpp @@ -20,30 +20,31 @@ void Jit64::psq_st(UGeckoInstruction inst) { INSTRUCTION_START JITDISABLE(bJITLoadStorePairedOff); - FALLBACK_IF(js.memcheck || !inst.RA); + FALLBACK_IF(!inst.RA); bool update = inst.OPCD == 61; int offset = inst.SIMM_12; int a = inst.RA; - int s = inst.RS; // Fp numbers + int s = inst.RS; - gpr.FlushLockX(RSCRATCH, RSCRATCH_EXTRA); + gpr.FlushLockX(RSCRATCH_EXTRA); if (update) - gpr.BindToRegister(inst.RA, true, true); - fpr.BindToRegister(inst.RS, true, false); - MOV(32, R(RSCRATCH_EXTRA), gpr.R(inst.RA)); + gpr.BindToRegister(a, true, true); + fpr.BindToRegister(s, true, false); + MOV(32, R(RSCRATCH_EXTRA), gpr.R(a)); if (offset) ADD(32, R(RSCRATCH_EXTRA), Imm32((u32)offset)); - if (update && offset) + // In memcheck mode, don't update the address until the exception check + if (update && offset && !js.memcheck) MOV(32, gpr.R(a), R(RSCRATCH_EXTRA)); // Some games (e.g. Dirt 2) incorrectly set the unused bits which breaks the lookup table code. // Hence, we need to mask out the unused bits. The layout of the GQR register is // UU[SCALE]UUUUU[TYPE] where SCALE is 6 bits and TYPE is 3 bits, so we have to AND with // 0b0011111100000111, or 0x3F07. - MOV(32, R(RSCRATCH), Imm32(0x3F07)); - AND(32, R(RSCRATCH), PPCSTATE(spr[SPR_GQR0 + inst.I])); - MOVZX(32, 8, RSCRATCH2, R(RSCRATCH)); + MOV(32, R(RSCRATCH2), Imm32(0x3F07)); + AND(32, R(RSCRATCH2), PPCSTATE(spr[SPR_GQR0 + inst.I])); + MOVZX(32, 8, RSCRATCH, R(RSCRATCH2)); // FIXME: Fix ModR/M encoding to allow [RSCRATCH2*4+disp32] without a base register! if (inst.W) @@ -51,13 +52,20 @@ void Jit64::psq_st(UGeckoInstruction inst) // One value PXOR(XMM0, R(XMM0)); // TODO: See if we can get rid of this cheaply by tweaking the code in the singleStore* functions. 
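The update-form handling added in the hunks above can be summarized by this behavioral sketch (illustrative, not part of the patch; the memory and fault plumbing is a toy stand-in, and the immediate form is shown, with the indexed form adding rB instead of the displacement):

#include <cstdint>

struct Ctx { uint32_t gpr[32] = {}; bool dsi = false; };

// Toy memory: accesses at or above 24 MiB "fault", standing in for a DSI memcheck hit.
static bool Access(Ctx& c, uint32_t ea) { c.dsi = ea >= 0x01800000; return !c.dsi; }

// lfd/lfs with update under memcheck: build the effective address in a scratch register
// (RSCRATCH2 above) and commit rA only if the load did not fault (MEMCHECK_START(false)).
void LoadFpUpdate(Ctx& c, int a, int32_t displacement)
{
    uint32_t ea = c.gpr[a] + displacement;
    Access(c, ea);                 // the SafeLoadToReg call
    if (!c.dsi)
        c.gpr[a] = ea;             // rA is updated only on success
}

// stfd/stfs with update under memcheck: rA is pre-incremented (the existing fast path
// wants the updated value) and the increment is reverted on a fault (MEMCHECK_START(true)),
// so a faulting store leaves rA architecturally unchanged.
void StoreFpUpdate(Ctx& c, int a, int32_t displacement)
{
    c.gpr[a] += displacement;
    if (!Access(c, c.gpr[a]))
        c.gpr[a] -= displacement;
}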
CVTSD2SS(XMM0, fpr.R(s)); - CALLptr(MScaled(RSCRATCH2, SCALE_8, (u32)(u64)asm_routines.singleStoreQuantized)); + CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)asm_routines.singleStoreQuantized)); } else { // Pair of values CVTPD2PS(XMM0, fpr.R(s)); - CALLptr(MScaled(RSCRATCH2, SCALE_8, (u32)(u64)asm_routines.pairedStoreQuantized)); + CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)asm_routines.pairedStoreQuantized)); + } + + if (update && offset && js.memcheck) + { + MEMCHECK_START(false) + ADD(32, gpr.R(a), Imm32((u32)offset)); + MEMCHECK_END } gpr.UnlockAll(); gpr.UnlockAllX(); @@ -67,33 +75,38 @@ void Jit64::psq_l(UGeckoInstruction inst) { INSTRUCTION_START JITDISABLE(bJITLoadStorePairedOff); - FALLBACK_IF(js.memcheck || !inst.RA); + FALLBACK_IF(!inst.RA); bool update = inst.OPCD == 57; int offset = inst.SIMM_12; + int a = inst.RA; + int s = inst.RS; - gpr.FlushLockX(RSCRATCH, RSCRATCH_EXTRA); - gpr.BindToRegister(inst.RA, true, update && offset); - fpr.BindToRegister(inst.RS, false, true); + gpr.FlushLockX(RSCRATCH_EXTRA); + gpr.BindToRegister(a, true, update && offset); + fpr.BindToRegister(s, false, true); if (offset) - LEA(32, RSCRATCH_EXTRA, MDisp(gpr.RX(inst.RA), offset)); + LEA(32, RSCRATCH_EXTRA, MDisp(gpr.RX(a), offset)); else - MOV(32, R(RSCRATCH_EXTRA), gpr.R(inst.RA)); - if (update && offset) - MOV(32, gpr.R(inst.RA), R(RSCRATCH_EXTRA)); - MOV(32, R(RSCRATCH), Imm32(0x3F07)); - AND(32, R(RSCRATCH), M(((char *)&GQR(inst.I)) + 2)); - MOVZX(32, 8, RSCRATCH2, R(RSCRATCH)); + MOV(32, R(RSCRATCH_EXTRA), gpr.R(a)); + // In memcheck mode, don't update the address until the exception check + if (update && offset && !js.memcheck) + MOV(32, gpr.R(a), R(RSCRATCH_EXTRA)); + MOV(32, R(RSCRATCH2), Imm32(0x3F07)); + AND(32, R(RSCRATCH2), M(((char *)&GQR(inst.I)) + 2)); + MOVZX(32, 8, RSCRATCH, R(RSCRATCH2)); if (inst.W) - OR(32, R(RSCRATCH2), Imm8(8)); + OR(32, R(RSCRATCH), Imm8(8)); - CALLptr(MScaled(RSCRATCH2, SCALE_8, (u32)(u64)asm_routines.pairedLoadQuantized)); + CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)asm_routines.pairedLoadQuantized)); - // MEMCHECK_START // FIXME: MMU does not work here because of unsafe memory access - - CVTPS2PD(fpr.RX(inst.RS), R(XMM0)); - - // MEMCHECK_END + MEMCHECK_START(false) + CVTPS2PD(fpr.RX(s), R(XMM0)); + if (update && offset && js.memcheck) + { + ADD(32, gpr.R(a), Imm32((u32)offset)); + } + MEMCHECK_END gpr.UnlockAll(); gpr.UnlockAllX(); diff --git a/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp b/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp index 4a5a43313b..8f920efda0 100644 --- a/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp +++ b/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp @@ -1590,13 +1590,13 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress) // Hence, we need to mask out the unused bits. The layout of the GQR register is // UU[SCALE]UUUUU[TYPE] where SCALE is 6 bits and TYPE is 3 bits, so we have to AND with // 0b0011111100000111, or 0x3F07. 
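The 0x3F07 mask mentioned in the comment above does double duty; the arithmetic it enables is easier to see in plain C++ (illustrative sketch, not part of the patch):

#include <cstdint>

// After masking, the low byte is TYPE (the index into the quantized load/store jump table,
// extracted with MOVZX above) and value >> 6 is SCALE * 4, i.e. the byte offset of the
// float entry in m_quantizeTableS / m_dequantizeTableS (the SHR by 6 in the asm routines).
// Masking first guarantees that stray bits games leave in the unused fields cannot
// corrupt either index.
struct GqrHalf { uint32_t type; uint32_t table_byte_offset; };

GqrHalf DecodeGqrHalf(uint32_t gqr_half)  // low 16 bits for stores, high 16 bits for loads
{
    uint32_t masked = gqr_half & 0x3F07;  // keep SCALE (bits 8..13) and TYPE (bits 0..2)
    return { masked & 0xFF, masked >> 6 };
}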
- Jit->MOV(32, R(RSCRATCH), Imm32(0x3F07)); - Jit->AND(32, R(RSCRATCH), M(((char *)&GQR(quantreg)) + 2)); - Jit->MOVZX(32, 8, RSCRATCH2, R(RSCRATCH)); - Jit->OR(32, R(RSCRATCH2), Imm8(w << 3)); + Jit->MOV(32, R(RSCRATCH2), Imm32(0x3F07)); + Jit->AND(32, R(RSCRATCH2), M(((char *)&GQR(quantreg)) + 2)); + Jit->MOVZX(32, 8, RSCRATCH, R(RSCRATCH2)); + Jit->OR(32, R(RSCRATCH), Imm8(w << 3)); Jit->MOV(32, R(RSCRATCH_EXTRA), regLocForInst(RI, getOp1(I))); - Jit->CALLptr(MScaled(RSCRATCH2, SCALE_8, (u32)(u64)(((JitIL *)jit)->asm_routines.pairedLoadQuantized))); + Jit->CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)(((JitIL *)jit)->asm_routines.pairedLoadQuantized))); Jit->MOVAPD(reg, R(XMM0)); RI.fregs[reg] = I; regNormalRegClear(RI, I); @@ -1641,13 +1641,13 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress) regSpill(RI, RSCRATCH); regSpill(RI, RSCRATCH2); u32 quantreg = *I >> 24; - Jit->MOV(32, R(RSCRATCH), Imm32(0x3F07)); - Jit->AND(32, R(RSCRATCH), PPCSTATE(spr[SPR_GQR0 + quantreg])); - Jit->MOVZX(32, 8, RSCRATCH2, R(RSCRATCH)); + Jit->MOV(32, R(RSCRATCH2), Imm32(0x3F07)); + Jit->AND(32, R(RSCRATCH2), PPCSTATE(spr[SPR_GQR0 + quantreg])); + Jit->MOVZX(32, 8, RSCRATCH, R(RSCRATCH2)); Jit->MOV(32, R(RSCRATCH_EXTRA), regLocForInst(RI, getOp2(I))); Jit->MOVAPD(XMM0, fregLocForInst(RI, getOp1(I))); - Jit->CALLptr(MScaled(RSCRATCH2, SCALE_8, (u32)(u64)(((JitIL *)jit)->asm_routines.pairedStoreQuantized))); + Jit->CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)(((JitIL *)jit)->asm_routines.pairedStoreQuantized))); if (RI.IInfo[I - RI.FirstI] & 4) fregClearInst(RI, getOp1(I)); if (RI.IInfo[I - RI.FirstI] & 8) diff --git a/Source/Core/Core/PowerPC/Jit64IL/JitIL.cpp b/Source/Core/Core/PowerPC/Jit64IL/JitIL.cpp index 077f4b7a1f..81260249c7 100644 --- a/Source/Core/Core/PowerPC/Jit64IL/JitIL.cpp +++ b/Source/Core/Core/PowerPC/Jit64IL/JitIL.cpp @@ -241,8 +241,6 @@ namespace JitILProfiler } }; -static int CODE_SIZE = 1024*1024*32; - void JitIL::Init() { jo.optimizeStack = true; @@ -273,10 +271,11 @@ void JitIL::Init() trampolines.Init(); AllocCodeSpace(CODE_SIZE); - blocks.Init(); asm_routines.Init(); + farcode.Init(js.memcheck ? 
FARCODE_SIZE_MMU : FARCODE_SIZE); + code_block.m_stats = &js.st; code_block.m_gpa = &js.gpa; code_block.m_fpa = &js.fpa; @@ -306,6 +305,7 @@ void JitIL::Shutdown() blocks.Shutdown(); trampolines.Shutdown(); asm_routines.Shutdown(); + farcode.Shutdown(); } @@ -504,7 +504,8 @@ void JitIL::Trace() void STACKALIGN JitIL::Jit(u32 em_address) { - if (GetSpaceLeft() < 0x10000 || blocks.IsFull() || SConfig::GetInstance().m_LocalCoreStartupParameter.bJITNoBlockCache) + if (GetSpaceLeft() < 0x10000 || farcode.GetSpaceLeft() < 0x10000 || blocks.IsFull() || + SConfig::GetInstance().m_LocalCoreStartupParameter.bJITNoBlockCache) { ClearCache(); } diff --git a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp index dbce5dfb85..8a5e7dcfe5 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp @@ -17,6 +17,8 @@ (1 << (XMM0+16)) | \ (1 << (XMM1+16)))) +#define QUANTIZED_REGS_TO_SAVE_LOAD (QUANTIZED_REGS_TO_SAVE | (1 << RSCRATCH2)) + using namespace Gen; static int temp32; @@ -250,24 +252,29 @@ void CommonAsmRoutines::GenQuantizedStores() UD2(); const u8* storePairedFloat = AlignCode4(); + FixupBranch skip_complex, too_complex; SHUFPS(XMM0, R(XMM0), 1); MOVQ_xmm(M(&psTemp[0]), XMM0); - TEST(32, R(RSCRATCH_EXTRA), Imm32(0x0C000000)); - FixupBranch too_complex = J_CC(CC_NZ, true); - MOV(64, R(RSCRATCH), M(&psTemp[0])); - SwapAndStore(64, MComplex(RMEM, RSCRATCH_EXTRA, SCALE_1, 0), RSCRATCH); - FixupBranch skip_complex = J(true); - SetJumpTarget(too_complex); + if (!jit->js.memcheck) + { + TEST(32, R(RSCRATCH_EXTRA), Imm32(0x0C000000)); + too_complex = J_CC(CC_NZ, true); + MOV(64, R(RSCRATCH), M(&psTemp[0])); + SwapAndStore(64, MComplex(RMEM, RSCRATCH_EXTRA, SCALE_1, 0), RSCRATCH); + skip_complex = J(true); + SetJumpTarget(too_complex); + } // RSP alignment here is 8 due to the call. 
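The storePairedFloat change above boils down to a dispatch policy: with memcheck on, never take the inlined store, since only the C++ WriteDual32 path can raise and record a DSI. A hedged sketch of that decision (illustrative names, not part of the patch):

#include <cstdint>

bool UsesCppWritePath(uint32_t effective_address, bool memcheck)
{
    if (memcheck)
        return true;                                 // exceptions/watchpoints must be observable
    return (effective_address & 0x0C000000) != 0;    // "too complex" addresses skip the inline store
}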
ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8); ABI_CallFunctionR((void *)&WriteDual32, RSCRATCH_EXTRA); ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8); - SetJumpTarget(skip_complex); + if (!jit->js.memcheck) + SetJumpTarget(skip_complex); RET(); const u8* storePairedU8 = AlignCode4(); - SHR(32, R(RSCRATCH), Imm8(6)); - MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_quantizeTableS)); + SHR(32, R(RSCRATCH2), Imm8(6)); + MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); PUNPCKLDQ(XMM1, R(XMM1)); MULPS(XMM0, R(XMM1)); #ifdef QUANTIZE_OVERFLOW_SAFE @@ -284,8 +291,8 @@ void CommonAsmRoutines::GenQuantizedStores() RET(); const u8* storePairedS8 = AlignCode4(); - SHR(32, R(RSCRATCH), Imm8(6)); - MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_quantizeTableS)); + SHR(32, R(RSCRATCH2), Imm8(6)); + MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); PUNPCKLDQ(XMM1, R(XMM1)); MULPS(XMM0, R(XMM1)); #ifdef QUANTIZE_OVERFLOW_SAFE @@ -303,8 +310,8 @@ void CommonAsmRoutines::GenQuantizedStores() RET(); const u8* storePairedU16 = AlignCode4(); - SHR(32, R(RSCRATCH), Imm8(6)); - MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_quantizeTableS)); + SHR(32, R(RSCRATCH2), Imm8(6)); + MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); PUNPCKLDQ(XMM1, R(XMM1)); MULPS(XMM0, R(XMM1)); @@ -329,8 +336,8 @@ void CommonAsmRoutines::GenQuantizedStores() RET(); const u8* storePairedS16 = AlignCode4(); - SHR(32, R(RSCRATCH), Imm8(6)); - MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_quantizeTableS)); + SHR(32, R(RSCRATCH2), Imm8(6)); + MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); // SHUFPS or UNPCKLPS might be a better choice here. The last one might just be an alias though. PUNPCKLDQ(XMM1, R(XMM1)); MULPS(XMM0, R(XMM1)); @@ -388,8 +395,8 @@ void CommonAsmRoutines::GenQuantizedSingleStores() }*/ const u8* storeSingleU8 = AlignCode4(); // Used by MKWii - SHR(32, R(RSCRATCH), Imm8(6)); - MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_quantizeTableS)); + SHR(32, R(RSCRATCH2), Imm8(6)); + MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); MULSS(XMM0, R(XMM1)); PXOR(XMM1, R(XMM1)); MAXSS(XMM0, R(XMM1)); @@ -399,8 +406,8 @@ void CommonAsmRoutines::GenQuantizedSingleStores() RET(); const u8* storeSingleS8 = AlignCode4(); - SHR(32, R(RSCRATCH), Imm8(6)); - MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_quantizeTableS)); + SHR(32, R(RSCRATCH2), Imm8(6)); + MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); MULSS(XMM0, R(XMM1)); MAXSS(XMM0, M((void *)&m_m128)); MINSS(XMM0, M((void *)&m_127)); @@ -409,8 +416,8 @@ void CommonAsmRoutines::GenQuantizedSingleStores() RET(); const u8* storeSingleU16 = AlignCode4(); // Used by MKWii - SHR(32, R(RSCRATCH), Imm8(6)); - MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_quantizeTableS)); + SHR(32, R(RSCRATCH2), Imm8(6)); + MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); MULSS(XMM0, R(XMM1)); PXOR(XMM1, R(XMM1)); MAXSS(XMM0, R(XMM1)); @@ -420,8 +427,8 @@ void CommonAsmRoutines::GenQuantizedSingleStores() RET(); const u8* storeSingleS16 = AlignCode4(); - SHR(32, R(RSCRATCH), Imm8(6)); - MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_quantizeTableS)); + SHR(32, R(RSCRATCH2), Imm8(6)); + MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS)); MULSS(XMM0, R(XMM1)); MAXSS(XMM0, M((void *)&m_m32768)); MINSS(XMM0, M((void *)&m_32767)); @@ -448,7 +455,13 @@ void CommonAsmRoutines::GenQuantizedLoads() UD2(); const u8* loadPairedFloatTwo = AlignCode4(); - if (cpu_info.bSSSE3) + if (jit->js.memcheck) + { + SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 64, 0, 
QUANTIZED_REGS_TO_SAVE, false, SAFE_LOADSTORE_NO_PROLOG); + ROL(64, R(RSCRATCH_EXTRA), Imm8(32)); + MOVQ_xmm(XMM0, R(RSCRATCH_EXTRA)); + } + else if (cpu_info.bSSSE3) { MOVQ_xmm(XMM0, MComplex(RMEM, RSCRATCH_EXTRA, 1, 0)); PSHUFB(XMM0, M((void *)pbswapShuffle2x4)); @@ -462,7 +475,13 @@ void CommonAsmRoutines::GenQuantizedLoads() RET(); const u8* loadPairedFloatOne = AlignCode4(); - if (cpu_info.bSSSE3) + if (jit->js.memcheck) + { + SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE, false, SAFE_LOADSTORE_NO_PROLOG); + MOVD_xmm(XMM0, R(RSCRATCH_EXTRA)); + UNPCKLPS(XMM0, M((void*)m_one)); + } + else if (cpu_info.bSSSE3) { MOVD_xmm(XMM0, MComplex(RMEM, RSCRATCH_EXTRA, 1, 0)); PSHUFB(XMM0, M((void *)pbswapShuffle1x4)); @@ -477,99 +496,130 @@ void CommonAsmRoutines::GenQuantizedLoads() RET(); const u8* loadPairedU8Two = AlignCode4(); - UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0); + if (jit->js.memcheck) + { + // TODO: Support not swapping in safeLoadToReg to avoid bswapping twice + SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_PROLOG); + ROR(16, R(RSCRATCH_EXTRA), Imm8(8)); + } + else + { + UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0); + } MOVD_xmm(XMM0, R(RSCRATCH_EXTRA)); PXOR(XMM1, R(XMM1)); PUNPCKLBW(XMM0, R(XMM1)); PUNPCKLWD(XMM0, R(XMM1)); CVTDQ2PS(XMM0, R(XMM0)); - SHR(32, R(RSCRATCH), Imm8(6)); - MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_dequantizeTableS)); + SHR(32, R(RSCRATCH2), Imm8(6)); + MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); PUNPCKLDQ(XMM1, R(XMM1)); MULPS(XMM0, R(XMM1)); RET(); const u8* loadPairedU8One = AlignCode4(); - UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0); // RSCRATCH_EXTRA = 0x000000xx + if (jit->js.memcheck) + SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 8, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_PROLOG); + else + UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0); // RSCRATCH_EXTRA = 0x000000xx MOVD_xmm(XMM0, R(RSCRATCH_EXTRA)); CVTDQ2PS(XMM0, R(XMM0)); // Is CVTSI2SS better? 
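The ROR(16, R(RSCRATCH_EXTRA), Imm8(8)) fixups added above compensate for SafeLoadToReg byteswapping a value that the dequantize code wants in raw memory order; for 16 bits, a byteswap followed by a rotate by 8 is the identity, which is what the TODO about bswapping twice refers to. A self-checking sketch (illustrative, not part of the patch):

#include <cassert>
#include <cstdint>

static uint16_t bswap16(uint16_t v)    { return static_cast<uint16_t>((v << 8) | (v >> 8)); }
static uint16_t ror16_by_8(uint16_t v) { return static_cast<uint16_t>((v >> 8) | (v << 8)); }

int main()
{
    // Swap-to-host-order (SafeLoadToReg) followed by ROR 8 restores the raw byte order
    // that UnsafeLoadRegToRegNoSwap would have produced.
    for (uint32_t raw = 0; raw <= 0xFFFF; ++raw)
        assert(ror16_by_8(bswap16(static_cast<uint16_t>(raw))) == static_cast<uint16_t>(raw));
    return 0;
}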
- SHR(32, R(RSCRATCH), Imm8(6)); - MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_dequantizeTableS)); + SHR(32, R(RSCRATCH2), Imm8(6)); + MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); MULSS(XMM0, R(XMM1)); UNPCKLPS(XMM0, M((void*)m_one)); RET(); const u8* loadPairedS8Two = AlignCode4(); - UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0); + if (jit->js.memcheck) + { + // TODO: Support not swapping in safeLoadToReg to avoid bswapping twice + SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_PROLOG); + ROR(16, R(RSCRATCH_EXTRA), Imm8(8)); + } + else + { + UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0); + } MOVD_xmm(XMM0, R(RSCRATCH_EXTRA)); PUNPCKLBW(XMM0, R(XMM0)); PUNPCKLWD(XMM0, R(XMM0)); PSRAD(XMM0, 24); CVTDQ2PS(XMM0, R(XMM0)); - SHR(32, R(RSCRATCH), Imm8(6)); - MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_dequantizeTableS)); + SHR(32, R(RSCRATCH2), Imm8(6)); + MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); PUNPCKLDQ(XMM1, R(XMM1)); MULPS(XMM0, R(XMM1)); RET(); const u8* loadPairedS8One = AlignCode4(); - UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0); - SHL(32, R(RSCRATCH_EXTRA), Imm8(24)); - SAR(32, R(RSCRATCH_EXTRA), Imm8(24)); + if (jit->js.memcheck) + SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 8, 0, QUANTIZED_REGS_TO_SAVE_LOAD, true, SAFE_LOADSTORE_NO_PROLOG); + else + UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0, true); MOVD_xmm(XMM0, R(RSCRATCH_EXTRA)); CVTDQ2PS(XMM0, R(XMM0)); - SHR(32, R(RSCRATCH), Imm8(6)); - MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_dequantizeTableS)); + SHR(32, R(RSCRATCH2), Imm8(6)); + MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); MULSS(XMM0, R(XMM1)); UNPCKLPS(XMM0, M((void*)m_one)); RET(); const u8* loadPairedU16Two = AlignCode4(); - UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false); + // TODO: Support not swapping in (un)safeLoadToReg to avoid bswapping twice + if (jit->js.memcheck) + SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_PROLOG); + else + UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false); ROL(32, R(RSCRATCH_EXTRA), Imm8(16)); MOVD_xmm(XMM0, R(RSCRATCH_EXTRA)); PXOR(XMM1, R(XMM1)); PUNPCKLWD(XMM0, R(XMM1)); CVTDQ2PS(XMM0, R(XMM0)); - SHR(32, R(RSCRATCH), Imm8(6)); - MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_dequantizeTableS)); + SHR(32, R(RSCRATCH2), Imm8(6)); + MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); PUNPCKLDQ(XMM1, R(XMM1)); MULPS(XMM0, R(XMM1)); RET(); const u8* loadPairedU16One = AlignCode4(); - UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false); - SHR(32, R(RSCRATCH_EXTRA), Imm8(16)); + if (jit->js.memcheck) + SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_PROLOG); + else + UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, false); MOVD_xmm(XMM0, R(RSCRATCH_EXTRA)); CVTDQ2PS(XMM0, R(XMM0)); - SHR(32, R(RSCRATCH), Imm8(6)); - MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_dequantizeTableS)); + SHR(32, R(RSCRATCH2), Imm8(6)); + MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); MULSS(XMM0, R(XMM1)); UNPCKLPS(XMM0, M((void*)m_one)); RET(); const u8* loadPairedS16Two = AlignCode4(); - UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false); + if (jit->js.memcheck) + SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_PROLOG); + 
else + UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false); ROL(32, R(RSCRATCH_EXTRA), Imm8(16)); MOVD_xmm(XMM0, R(RSCRATCH_EXTRA)); PUNPCKLWD(XMM0, R(XMM0)); PSRAD(XMM0, 16); CVTDQ2PS(XMM0, R(XMM0)); - SHR(32, R(RSCRATCH), Imm8(6)); - AND(32, R(RSCRATCH), Imm32(0xFC)); - MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_dequantizeTableS)); + SHR(32, R(RSCRATCH2), Imm8(6)); + MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); PUNPCKLDQ(XMM1, R(XMM1)); MULPS(XMM0, R(XMM1)); RET(); const u8* loadPairedS16One = AlignCode4(); - UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false); - SAR(32, R(RSCRATCH_EXTRA), Imm8(16)); + if (jit->js.memcheck) + SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, true, SAFE_LOADSTORE_NO_PROLOG); + else + UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, true); MOVD_xmm(XMM0, R(RSCRATCH_EXTRA)); CVTDQ2PS(XMM0, R(XMM0)); - SHR(32, R(RSCRATCH), Imm8(6)); - AND(32, R(RSCRATCH), Imm32(0xFC)); - MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_dequantizeTableS)); + SHR(32, R(RSCRATCH2), Imm8(6)); + MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS)); MULSS(XMM0, R(XMM1)); UNPCKLPS(XMM0, M((void*)m_one)); RET(); diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp index c12b9fedcf..55b7382408 100644 --- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp @@ -61,9 +61,12 @@ void EmuCodeBlock::UnsafeLoadRegToReg(X64Reg reg_addr, X64Reg reg_value, int acc } } -void EmuCodeBlock::UnsafeLoadRegToRegNoSwap(X64Reg reg_addr, X64Reg reg_value, int accessSize, s32 offset) +void EmuCodeBlock::UnsafeLoadRegToRegNoSwap(X64Reg reg_addr, X64Reg reg_value, int accessSize, s32 offset, bool signExtend) { - MOVZX(32, accessSize, reg_value, MComplex(RMEM, reg_addr, SCALE_1, offset)); + if (signExtend) + MOVSX(32, accessSize, reg_value, MComplex(RMEM, reg_addr, SCALE_1, offset)); + else + MOVZX(32, accessSize, reg_value, MComplex(RMEM, reg_addr, SCALE_1, offset)); } u8 *EmuCodeBlock::UnsafeLoadToReg(X64Reg reg_value, OpArg opAddress, int accessSize, s32 offset, bool signExtend) @@ -315,8 +318,7 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress, } ABI_PopRegistersAndAdjustStack(registersInUse, 0); - MEMCHECK_START - + MEMCHECK_START(false) if (signExtend && accessSize < 32) { // Need to sign extend values coming from the Read_U* functions. @@ -326,7 +328,6 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress, { MOVZX(64, accessSize, reg_value, R(ABI_RETURN)); } - MEMCHECK_END } } @@ -348,9 +349,17 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress, } TEST(32, addr_loc, Imm32(mem_mask)); - FixupBranch fast = J_CC(CC_Z, true); + FixupBranch slow, exit; + slow = J_CC(CC_NZ, farcode.Enabled()); + UnsafeLoadToReg(reg_value, addr_loc, accessSize, 0, signExtend); + if (farcode.Enabled()) + SwitchToFarCode(); + else + exit = J(true); + SetJumpTarget(slow); - ABI_PushRegistersAndAdjustStack(registersInUse, 0); + size_t rsp_alignment = (flags & SAFE_LOADSTORE_NO_PROLOG) ? 
8 : 0; + ABI_PushRegistersAndAdjustStack(registersInUse, rsp_alignment); switch (accessSize) { case 64: @@ -366,10 +375,9 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress, ABI_CallFunctionA((void *)&Memory::Read_U8_ZX, addr_loc); break; } - ABI_PopRegistersAndAdjustStack(registersInUse, 0); - - MEMCHECK_START + ABI_PopRegistersAndAdjustStack(registersInUse, rsp_alignment); + MEMCHECK_START(false) if (signExtend && accessSize < 32) { // Need to sign extend values coming from the Read_U* functions. @@ -379,12 +387,13 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress, { MOVZX(64, accessSize, reg_value, R(ABI_RETURN)); } - MEMCHECK_END - FixupBranch exit = J(); - SetJumpTarget(fast); - UnsafeLoadToReg(reg_value, addr_loc, accessSize, 0, signExtend); + if (farcode.Enabled()) + { + exit = J(true); + SwitchToNearCode(); + } SetJumpTarget(exit); } } @@ -466,12 +475,21 @@ void EmuCodeBlock::SafeWriteRegToReg(X64Reg reg_value, X64Reg reg_addr, int acce } #endif + bool swap = !(flags & SAFE_LOADSTORE_NO_SWAP); + + FixupBranch slow, exit; TEST(32, R(reg_addr), Imm32(mem_mask)); - FixupBranch fast = J_CC(CC_Z, true); + slow = J_CC(CC_NZ, farcode.Enabled()); + UnsafeWriteRegToReg(reg_value, reg_addr, accessSize, 0, swap); + if (farcode.Enabled()) + SwitchToFarCode(); + else + exit = J(true); + SetJumpTarget(slow); // PC is used by memory watchpoints (if enabled) or to print accurate PC locations in debug logs MOV(32, PPCSTATE(pc), Imm32(jit->js.compilerPC)); + size_t rsp_alignment = (flags & SAFE_LOADSTORE_NO_PROLOG) ? 8 : 0; - bool swap = !(flags & SAFE_LOADSTORE_NO_SWAP); ABI_PushRegistersAndAdjustStack(registersInUse, rsp_alignment); switch (accessSize) { @@ -489,9 +507,11 @@ void EmuCodeBlock::SafeWriteRegToReg(X64Reg reg_value, X64Reg reg_addr, int acce break; } ABI_PopRegistersAndAdjustStack(registersInUse, rsp_alignment); - FixupBranch exit = J(); - SetJumpTarget(fast); - UnsafeWriteRegToReg(reg_value, reg_addr, accessSize, 0, swap); + if (farcode.Enabled()) + { + exit = J(true); + SwitchToNearCode(); + } SetJumpTarget(exit); } @@ -655,15 +675,17 @@ void EmuCodeBlock::ConvertDoubleToSingle(X64Reg dst, X64Reg src) // to save an instruction, since diverting a few more floats to the slow path can't hurt much. SUB(8, R(RSCRATCH), Imm8(0x6D)); CMP(8, R(RSCRATCH), Imm8(0x3)); - FixupBranch x87Conversion = J_CC(CC_BE); + FixupBranch x87Conversion = J_CC(CC_BE, true); CVTSD2SS(dst, R(src)); - FixupBranch continue1 = J(); + SwitchToFarCode(); SetJumpTarget(x87Conversion); MOVSD(M(&temp64), src); FLD(64, M(&temp64)); FSTP(32, M(&temp32)); MOVSS(dst, M(&temp32)); + FixupBranch continue1 = J(true); + SwitchToNearCode(); SetJumpTarget(continue1); // We'd normally need to MOVDDUP here to put the single in the top half of the output register too, but @@ -692,16 +714,17 @@ void EmuCodeBlock::ConvertSingleToDouble(X64Reg dst, X64Reg src, bool src_is_gpr // through the slow path (0x00800000), but the performance effects of that should be negligible. 
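The reworked SafeLoadToReg/SafeWriteRegToReg layout is easiest to read as a branch over two handlers: the likely case falls through in near code, and the unlikely case either jumps to the far cache (when it exists) or is jumped over inline. A structural sketch with stand-in names (not Dolphin's API, not part of the patch):

#include <cstdint>

uint32_t SafeRead32(uint32_t addr, uint32_t mem_mask,
                    uint32_t (*fast_read)(uint32_t),   // plays the role of UnsafeLoadToReg
                    uint32_t (*slow_read)(uint32_t))   // plays the role of Memory::Read_U32
{
    if (addr & mem_mask)         // unlikely: MMIO/MMU/memcheck handling required
        return slow_read(addr);  // this branch is what now gets emitted into farcode
    return fast_read(addr);      // likely: straight-line access in the block itself
}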
SUB(32, R(gprsrc), Imm8(1)); TEST(32, R(gprsrc), Imm32(0x7f800000)); - - FixupBranch x87Conversion = J_CC(CC_Z); + FixupBranch x87Conversion = J_CC(CC_Z, true); CVTSS2SD(dst, R(dst)); - FixupBranch continue1 = J(); + SwitchToFarCode(); SetJumpTarget(x87Conversion); MOVSS(M(&temp32), dst); FLD(32, M(&temp32)); FSTP(64, M(&temp64)); MOVSD(dst, M(&temp64)); + FixupBranch continue1 = J(true); + SwitchToNearCode(); SetJumpTarget(continue1); MOVDDUP(dst, R(dst)); diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h index e50eedf08f..e004df69ce 100644 --- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h +++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h @@ -11,11 +11,13 @@ namespace MMIO { class Mapping; } -#define MEMCHECK_START \ +// If inv is true, invert the check (i.e. skip over the associated code if an exception hits, +// instead of skipping over the code if an exception isn't hit). +#define MEMCHECK_START(inv) \ Gen::FixupBranch memException; \ if (jit->js.memcheck) \ { TEST(32, PPCSTATE(Exceptions), Gen::Imm32(EXCEPTION_DSI)); \ - memException = J_CC(Gen::CC_NZ, true); } + memException = J_CC((inv) ? Gen::CC_Z : Gen::CC_NZ, true); } #define MEMCHECK_END \ if (jit->js.memcheck) \ @@ -32,15 +34,50 @@ namespace MMIO { class Mapping; } #define PPCSTATE_SRR0 PPCSTATE(spr[SPR_SRR0]) #define PPCSTATE_SRR1 PPCSTATE(spr[SPR_SRR1]) +// A place to throw blocks of code we don't want polluting the cache, e.g. rarely taken +// exception branches. +class FarCodeCache : public Gen::X64CodeBlock +{ +private: + bool m_enabled = false; +public: + bool Enabled() { return m_enabled; } + void Init(int size) { AllocCodeSpace(size); m_enabled = true; } + void Shutdown() { FreeCodeSpace(); m_enabled = false; } +}; + // Like XCodeBlock but has some utilities for memory access. class EmuCodeBlock : public Gen::X64CodeBlock { public: + static const int CODE_SIZE = 1024 * 1024 * 32; + + // a bit of a hack; the MMU results in a vast amount more code ending up in the far cache, + // mostly exception handling, so give it a whole bunch more space if the MMU is on. + static const int FARCODE_SIZE = 1024 * 1024 * 8; + static const int FARCODE_SIZE_MMU = 1024 * 1024 * 48; + + FarCodeCache farcode; + u8* nearcode; // Backed up when we switch to far code. + + // Simple functions to switch between near and far code emitting + void SwitchToFarCode() + { + nearcode = GetWritableCodePtr(); + SetCodePtr(farcode.GetWritableCodePtr()); + } + + void SwitchToNearCode() + { + farcode.SetCodePtr(GetWritableCodePtr()); + SetCodePtr(nearcode); + } + void LoadAndSwap(int size, Gen::X64Reg dst, const Gen::OpArg& src); void SwapAndStore(int size, const Gen::OpArg& dst, Gen::X64Reg src); void UnsafeLoadRegToReg(Gen::X64Reg reg_addr, Gen::X64Reg reg_value, int accessSize, s32 offset = 0, bool signExtend = false); - void UnsafeLoadRegToRegNoSwap(Gen::X64Reg reg_addr, Gen::X64Reg reg_value, int accessSize, s32 offset); + void UnsafeLoadRegToRegNoSwap(Gen::X64Reg reg_addr, Gen::X64Reg reg_value, int accessSize, s32 offset, bool signExtend = false); // these return the address of the MOV, for backpatching u8 *UnsafeWriteRegToReg(Gen::X64Reg reg_value, Gen::X64Reg reg_addr, int accessSize, s32 offset = 0, bool swap = true); u8 *UnsafeLoadToReg(Gen::X64Reg reg_value, Gen::OpArg opAddress, int accessSize, s32 offset, bool signExtend);
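The near/far switching contract introduced in Jit_Util.h can be modelled stand-alone (illustrative sketch, not part of the patch): byte vectors stand in for the executable code buffers, and the point demonstrated is that each cache keeps its own write cursor, so interleaved emission never overwrites anything. The real implementation achieves the same thing by saving and restoring the emitter's raw code pointer, since both caches share one X64 emitter.

#include <cstdint>
#include <vector>

struct TwoCacheEmitter
{
    std::vector<uint8_t> near_buf;  // hot-path code for the current block
    std::vector<uint8_t> far_buf;   // rarely-taken paths, e.g. exception exits
    std::vector<uint8_t>* cur = &near_buf;

    void Emit(uint8_t byte)  { cur->push_back(byte); }
    void SwitchToFarCode()   { cur = &far_buf;  }   // emission continues in the far cache
    void SwitchToNearCode()  { cur = &near_buf; }   // near code resumes where it left off
};

int main()
{
    TwoCacheEmitter e;
    e.Emit(0x90);            // near: hot path
    e.SwitchToFarCode();
    e.Emit(0xCC);            // far: exception path emitted out of line
    e.SwitchToNearCode();
    e.Emit(0xC3);            // near: the block continues
    return (e.near_buf.size() == 2 && e.far_buf.size() == 1) ? 0 : 1;
}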