mirror of
https://github.com/dolphin-emu/dolphin.git
synced 2025-01-25 15:31:17 +01:00
Merge pull request #927 from FioraAeterna/fastermmu
Fiora's Faster MMU Project
This commit is contained in:
commit
2ab19c7cec
@ -132,8 +132,6 @@ ps_adds1
|
|||||||
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
static int CODE_SIZE = 1024*1024*32;
|
|
||||||
|
|
||||||
void Jit64::Init()
|
void Jit64::Init()
|
||||||
{
|
{
|
||||||
jo.optimizeStack = true;
|
jo.optimizeStack = true;
|
||||||
@ -169,10 +167,13 @@ void Jit64::Init()
|
|||||||
|
|
||||||
trampolines.Init();
|
trampolines.Init();
|
||||||
AllocCodeSpace(CODE_SIZE);
|
AllocCodeSpace(CODE_SIZE);
|
||||||
|
|
||||||
blocks.Init();
|
blocks.Init();
|
||||||
asm_routines.Init();
|
asm_routines.Init();
|
||||||
|
|
||||||
|
// important: do this *after* generating the global asm routines, because we can't use farcode in them.
|
||||||
|
// it'll crash because the farcode functions get cleared on JIT clears.
|
||||||
|
farcode.Init(js.memcheck ? FARCODE_SIZE_MMU : FARCODE_SIZE);
|
||||||
|
|
||||||
code_block.m_stats = &js.st;
|
code_block.m_stats = &js.st;
|
||||||
code_block.m_gpa = &js.gpa;
|
code_block.m_gpa = &js.gpa;
|
||||||
code_block.m_fpa = &js.fpa;
|
code_block.m_fpa = &js.fpa;
|
||||||
@ -183,6 +184,7 @@ void Jit64::ClearCache()
|
|||||||
{
|
{
|
||||||
blocks.Clear();
|
blocks.Clear();
|
||||||
trampolines.ClearCodeSpace();
|
trampolines.ClearCodeSpace();
|
||||||
|
farcode.ClearCodeSpace();
|
||||||
ClearCodeSpace();
|
ClearCodeSpace();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -193,6 +195,7 @@ void Jit64::Shutdown()
|
|||||||
blocks.Shutdown();
|
blocks.Shutdown();
|
||||||
trampolines.Shutdown();
|
trampolines.Shutdown();
|
||||||
asm_routines.Shutdown();
|
asm_routines.Shutdown();
|
||||||
|
farcode.Shutdown();
|
||||||
}
|
}
|
||||||
|
|
||||||
// This is only called by FallBackToInterpreter() in this file. It will execute an instruction with the interpreter functions.
|
// This is only called by FallBackToInterpreter() in this file. It will execute an instruction with the interpreter functions.
|
||||||
@ -372,7 +375,8 @@ void Jit64::Trace()
|
|||||||
|
|
||||||
void STACKALIGN Jit64::Jit(u32 em_address)
|
void STACKALIGN Jit64::Jit(u32 em_address)
|
||||||
{
|
{
|
||||||
if (GetSpaceLeft() < 0x10000 || blocks.IsFull() || SConfig::GetInstance().m_LocalCoreStartupParameter.bJITNoBlockCache)
|
if (GetSpaceLeft() < 0x10000 || farcode.GetSpaceLeft() < 0x10000 || blocks.IsFull() ||
|
||||||
|
SConfig::GetInstance().m_LocalCoreStartupParameter.bJITNoBlockCache)
|
||||||
{
|
{
|
||||||
ClearCache();
|
ClearCache();
|
||||||
}
|
}
|
||||||
@ -525,12 +529,13 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc
|
|||||||
{
|
{
|
||||||
if ((opinfo->flags & FL_USE_FPU) && !js.firstFPInstructionFound)
|
if ((opinfo->flags & FL_USE_FPU) && !js.firstFPInstructionFound)
|
||||||
{
|
{
|
||||||
gpr.Flush();
|
|
||||||
fpr.Flush();
|
|
||||||
|
|
||||||
//This instruction uses FPU - needs to add FP exception bailout
|
//This instruction uses FPU - needs to add FP exception bailout
|
||||||
TEST(32, PPCSTATE(msr), Imm32(1 << 13)); // Test FP enabled bit
|
TEST(32, PPCSTATE(msr), Imm32(1 << 13)); // Test FP enabled bit
|
||||||
FixupBranch b1 = J_CC(CC_NZ, true);
|
FixupBranch b1 = J_CC(CC_Z, true);
|
||||||
|
SwitchToFarCode();
|
||||||
|
SetJumpTarget(b1);
|
||||||
|
gpr.Flush(FLUSH_MAINTAIN_STATE);
|
||||||
|
fpr.Flush(FLUSH_MAINTAIN_STATE);
|
||||||
|
|
||||||
// If a FPU exception occurs, the exception handler will read
|
// If a FPU exception occurs, the exception handler will read
|
||||||
// from PC. Update PC with the latest value in case that happens.
|
// from PC. Update PC with the latest value in case that happens.
|
||||||
@ -538,32 +543,34 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc
|
|||||||
OR(32, PPCSTATE(Exceptions), Imm32(EXCEPTION_FPU_UNAVAILABLE));
|
OR(32, PPCSTATE(Exceptions), Imm32(EXCEPTION_FPU_UNAVAILABLE));
|
||||||
WriteExceptionExit();
|
WriteExceptionExit();
|
||||||
|
|
||||||
SetJumpTarget(b1);
|
SwitchToNearCode();
|
||||||
|
|
||||||
js.firstFPInstructionFound = true;
|
js.firstFPInstructionFound = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Add an external exception check if the instruction writes to the FIFO.
|
// Add an external exception check if the instruction writes to the FIFO.
|
||||||
if (jit->js.fifoWriteAddresses.find(ops[i].address) != jit->js.fifoWriteAddresses.end())
|
if (jit->js.fifoWriteAddresses.find(ops[i].address) != jit->js.fifoWriteAddresses.end())
|
||||||
{
|
{
|
||||||
gpr.Flush();
|
|
||||||
fpr.Flush();
|
|
||||||
|
|
||||||
TEST(32, PPCSTATE(Exceptions), Imm32(EXCEPTION_ISI | EXCEPTION_PROGRAM | EXCEPTION_SYSCALL | EXCEPTION_FPU_UNAVAILABLE | EXCEPTION_DSI | EXCEPTION_ALIGNMENT));
|
TEST(32, PPCSTATE(Exceptions), Imm32(EXCEPTION_ISI | EXCEPTION_PROGRAM | EXCEPTION_SYSCALL | EXCEPTION_FPU_UNAVAILABLE | EXCEPTION_DSI | EXCEPTION_ALIGNMENT));
|
||||||
FixupBranch clearInt = J_CC(CC_NZ, true);
|
FixupBranch clearInt = J_CC(CC_NZ);
|
||||||
TEST(32, PPCSTATE(Exceptions), Imm32(EXCEPTION_EXTERNAL_INT));
|
TEST(32, PPCSTATE(Exceptions), Imm32(EXCEPTION_EXTERNAL_INT));
|
||||||
FixupBranch noExtException = J_CC(CC_Z, true);
|
FixupBranch extException = J_CC(CC_NZ, true);
|
||||||
|
SwitchToFarCode();
|
||||||
|
SetJumpTarget(extException);
|
||||||
TEST(32, PPCSTATE(msr), Imm32(0x0008000));
|
TEST(32, PPCSTATE(msr), Imm32(0x0008000));
|
||||||
FixupBranch noExtIntEnable = J_CC(CC_Z, true);
|
FixupBranch noExtIntEnable = J_CC(CC_Z, true);
|
||||||
TEST(32, M((void *)&ProcessorInterface::m_InterruptCause), Imm32(ProcessorInterface::INT_CAUSE_CP | ProcessorInterface::INT_CAUSE_PE_TOKEN | ProcessorInterface::INT_CAUSE_PE_FINISH));
|
TEST(32, M((void *)&ProcessorInterface::m_InterruptCause), Imm32(ProcessorInterface::INT_CAUSE_CP | ProcessorInterface::INT_CAUSE_PE_TOKEN | ProcessorInterface::INT_CAUSE_PE_FINISH));
|
||||||
FixupBranch noCPInt = J_CC(CC_Z, true);
|
FixupBranch noCPInt = J_CC(CC_Z, true);
|
||||||
|
|
||||||
|
gpr.Flush(FLUSH_MAINTAIN_STATE);
|
||||||
|
fpr.Flush(FLUSH_MAINTAIN_STATE);
|
||||||
|
|
||||||
MOV(32, PPCSTATE(pc), Imm32(ops[i].address));
|
MOV(32, PPCSTATE(pc), Imm32(ops[i].address));
|
||||||
WriteExternalExceptionExit();
|
WriteExternalExceptionExit();
|
||||||
|
|
||||||
|
SwitchToNearCode();
|
||||||
|
|
||||||
SetJumpTarget(noCPInt);
|
SetJumpTarget(noCPInt);
|
||||||
SetJumpTarget(noExtIntEnable);
|
SetJumpTarget(noExtIntEnable);
|
||||||
SetJumpTarget(noExtException);
|
|
||||||
SetJumpTarget(clearInt);
|
SetJumpTarget(clearInt);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -585,18 +592,20 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc
|
|||||||
|
|
||||||
if (js.memcheck && (opinfo->flags & FL_LOADSTORE))
|
if (js.memcheck && (opinfo->flags & FL_LOADSTORE))
|
||||||
{
|
{
|
||||||
// In case we are about to jump to the dispatcher, flush regs
|
|
||||||
gpr.Flush();
|
|
||||||
fpr.Flush();
|
|
||||||
|
|
||||||
TEST(32, PPCSTATE(Exceptions), Imm32(EXCEPTION_DSI));
|
TEST(32, PPCSTATE(Exceptions), Imm32(EXCEPTION_DSI));
|
||||||
FixupBranch noMemException = J_CC(CC_Z, true);
|
FixupBranch memException = J_CC(CC_NZ, true);
|
||||||
|
|
||||||
|
SwitchToFarCode();
|
||||||
|
SetJumpTarget(memException);
|
||||||
|
|
||||||
|
gpr.Flush(FLUSH_MAINTAIN_STATE);
|
||||||
|
fpr.Flush(FLUSH_MAINTAIN_STATE);
|
||||||
|
|
||||||
// If a memory exception occurs, the exception handler will read
|
// If a memory exception occurs, the exception handler will read
|
||||||
// from PC. Update PC with the latest value in case that happens.
|
// from PC. Update PC with the latest value in case that happens.
|
||||||
MOV(32, PPCSTATE(pc), Imm32(ops[i].address));
|
MOV(32, PPCSTATE(pc), Imm32(ops[i].address));
|
||||||
WriteExceptionExit();
|
WriteExceptionExit();
|
||||||
SetJumpTarget(noMemException);
|
SwitchToNearCode();
|
||||||
}
|
}
|
||||||
|
|
||||||
if (opinfo->flags & FL_LOADSTORE)
|
if (opinfo->flags & FL_LOADSTORE)
|
||||||
|
@ -238,7 +238,7 @@ void Jit64::lXXx(UGeckoInstruction inst)
|
|||||||
if (update && storeAddress)
|
if (update && storeAddress)
|
||||||
{
|
{
|
||||||
gpr.BindToRegister(a, true, true);
|
gpr.BindToRegister(a, true, true);
|
||||||
MEMCHECK_START
|
MEMCHECK_START(false)
|
||||||
MOV(32, gpr.R(a), opAddress);
|
MOV(32, gpr.R(a), opAddress);
|
||||||
MEMCHECK_END
|
MEMCHECK_END
|
||||||
}
|
}
|
||||||
@ -279,18 +279,20 @@ void Jit64::dcbz(UGeckoInstruction inst)
|
|||||||
ADD(32, R(RSCRATCH), gpr.R(a));
|
ADD(32, R(RSCRATCH), gpr.R(a));
|
||||||
AND(32, R(RSCRATCH), Imm32(~31));
|
AND(32, R(RSCRATCH), Imm32(~31));
|
||||||
TEST(32, R(RSCRATCH), Imm32(mem_mask));
|
TEST(32, R(RSCRATCH), Imm32(mem_mask));
|
||||||
FixupBranch fast = J_CC(CC_Z, true);
|
FixupBranch slow = J_CC(CC_NZ, true);
|
||||||
|
|
||||||
// Should this code ever run? I can't find any games that use DCBZ on non-physical addresses, but
|
// Should this code ever run? I can't find any games that use DCBZ on non-physical addresses, but
|
||||||
// supposedly there are, at least for some MMU titles. Let's be careful and support it to be sure.
|
// supposedly there are, at least for some MMU titles. Let's be careful and support it to be sure.
|
||||||
|
SwitchToFarCode();
|
||||||
|
SetJumpTarget(slow);
|
||||||
MOV(32, M(&PC), Imm32(jit->js.compilerPC));
|
MOV(32, M(&PC), Imm32(jit->js.compilerPC));
|
||||||
u32 registersInUse = CallerSavedRegistersInUse();
|
u32 registersInUse = CallerSavedRegistersInUse();
|
||||||
ABI_PushRegistersAndAdjustStack(registersInUse, 0);
|
ABI_PushRegistersAndAdjustStack(registersInUse, 0);
|
||||||
ABI_CallFunctionR((void *)&Memory::ClearCacheLine, RSCRATCH);
|
ABI_CallFunctionR((void *)&Memory::ClearCacheLine, RSCRATCH);
|
||||||
ABI_PopRegistersAndAdjustStack(registersInUse, 0);
|
ABI_PopRegistersAndAdjustStack(registersInUse, 0);
|
||||||
|
FixupBranch exit = J(true);
|
||||||
|
|
||||||
FixupBranch exit = J();
|
SwitchToNearCode();
|
||||||
SetJumpTarget(fast);
|
|
||||||
PXOR(XMM0, R(XMM0));
|
PXOR(XMM0, R(XMM0));
|
||||||
MOVAPS(MComplex(RMEM, RSCRATCH, SCALE_1, 0), XMM0);
|
MOVAPS(MComplex(RMEM, RSCRATCH, SCALE_1, 0), XMM0);
|
||||||
MOVAPS(MComplex(RMEM, RSCRATCH, SCALE_1, 16), XMM0);
|
MOVAPS(MComplex(RMEM, RSCRATCH, SCALE_1, 16), XMM0);
|
||||||
@ -411,7 +413,7 @@ void Jit64::stX(UGeckoInstruction inst)
|
|||||||
|
|
||||||
if (update && offset)
|
if (update && offset)
|
||||||
{
|
{
|
||||||
MEMCHECK_START
|
MEMCHECK_START(false)
|
||||||
gpr.KillImmediate(a, true, true);
|
gpr.KillImmediate(a, true, true);
|
||||||
|
|
||||||
ADD(32, gpr.R(a), Imm32((u32)offset));
|
ADD(32, gpr.R(a), Imm32((u32)offset));
|
||||||
@ -433,10 +435,11 @@ void Jit64::stXx(UGeckoInstruction inst)
|
|||||||
|
|
||||||
int a = inst.RA, b = inst.RB, s = inst.RS;
|
int a = inst.RA, b = inst.RB, s = inst.RS;
|
||||||
FALLBACK_IF(!a || a == s || a == b);
|
FALLBACK_IF(!a || a == s || a == b);
|
||||||
|
bool update = !!(inst.SUBOP10 & 32);
|
||||||
|
|
||||||
gpr.Lock(a, b, s);
|
gpr.Lock(a, b, s);
|
||||||
|
|
||||||
if (inst.SUBOP10 & 32)
|
if (update)
|
||||||
{
|
{
|
||||||
gpr.BindToRegister(a, true, true);
|
gpr.BindToRegister(a, true, true);
|
||||||
ADD(32, gpr.R(a), gpr.R(b));
|
ADD(32, gpr.R(a), gpr.R(b));
|
||||||
@ -483,6 +486,14 @@ void Jit64::stXx(UGeckoInstruction inst)
|
|||||||
}
|
}
|
||||||
SafeWriteRegToReg(reg_value, RSCRATCH2, accessSize, 0, CallerSavedRegistersInUse());
|
SafeWriteRegToReg(reg_value, RSCRATCH2, accessSize, 0, CallerSavedRegistersInUse());
|
||||||
|
|
||||||
|
if (update && js.memcheck)
|
||||||
|
{
|
||||||
|
// revert the address change if an exception occurred
|
||||||
|
MEMCHECK_START(true)
|
||||||
|
SUB(32, gpr.R(a), gpr.R(b));
|
||||||
|
MEMCHECK_END;
|
||||||
|
}
|
||||||
|
|
||||||
gpr.UnlockAll();
|
gpr.UnlockAll();
|
||||||
gpr.UnlockAllX();
|
gpr.UnlockAllX();
|
||||||
}
|
}
|
||||||
|
@ -27,13 +27,18 @@ void Jit64::lfXXX(UGeckoInstruction inst)
|
|||||||
int a = inst.RA;
|
int a = inst.RA;
|
||||||
int b = inst.RB;
|
int b = inst.RB;
|
||||||
|
|
||||||
FALLBACK_IF(!indexed && !a);
|
FALLBACK_IF((!indexed && !a) || (update && a == d));
|
||||||
|
|
||||||
if (update)
|
if (update)
|
||||||
gpr.BindToRegister(a, true, true);
|
gpr.BindToRegister(a, true, true);
|
||||||
|
|
||||||
s32 offset = 0;
|
s32 offset = 0;
|
||||||
OpArg addr = gpr.R(a);
|
OpArg addr = gpr.R(a);
|
||||||
|
if (update && js.memcheck)
|
||||||
|
{
|
||||||
|
addr = R(RSCRATCH2);
|
||||||
|
MOV(32, addr, gpr.R(a));
|
||||||
|
}
|
||||||
if (indexed)
|
if (indexed)
|
||||||
{
|
{
|
||||||
if (update)
|
if (update)
|
||||||
@ -58,14 +63,17 @@ void Jit64::lfXXX(UGeckoInstruction inst)
|
|||||||
if (update)
|
if (update)
|
||||||
ADD(32, addr, Imm32((s32)(s16)inst.SIMM_16));
|
ADD(32, addr, Imm32((s32)(s16)inst.SIMM_16));
|
||||||
else
|
else
|
||||||
offset = (s32)(s16)inst.SIMM_16;
|
offset = (s16)inst.SIMM_16;
|
||||||
}
|
}
|
||||||
|
|
||||||
SafeLoadToReg(RSCRATCH, addr, single ? 32 : 64, offset, CallerSavedRegistersInUse(), false);
|
u32 registersInUse = CallerSavedRegistersInUse();
|
||||||
|
if (update && js.memcheck)
|
||||||
|
registersInUse |= (1 << RSCRATCH2);
|
||||||
|
SafeLoadToReg(RSCRATCH, addr, single ? 32 : 64, offset, registersInUse, false);
|
||||||
fpr.Lock(d);
|
fpr.Lock(d);
|
||||||
fpr.BindToRegister(d, js.memcheck || !single);
|
fpr.BindToRegister(d, js.memcheck || !single);
|
||||||
|
|
||||||
MEMCHECK_START
|
MEMCHECK_START(false)
|
||||||
if (single)
|
if (single)
|
||||||
{
|
{
|
||||||
ConvertSingleToDouble(fpr.RX(d), RSCRATCH, true);
|
ConvertSingleToDouble(fpr.RX(d), RSCRATCH, true);
|
||||||
@ -75,6 +83,8 @@ void Jit64::lfXXX(UGeckoInstruction inst)
|
|||||||
MOVQ_xmm(XMM0, R(RSCRATCH));
|
MOVQ_xmm(XMM0, R(RSCRATCH));
|
||||||
MOVSD(fpr.RX(d), R(XMM0));
|
MOVSD(fpr.RX(d), R(XMM0));
|
||||||
}
|
}
|
||||||
|
if (update && js.memcheck)
|
||||||
|
MOV(32, gpr.R(a), addr);
|
||||||
MEMCHECK_END
|
MEMCHECK_END
|
||||||
fpr.UnlockAll();
|
fpr.UnlockAll();
|
||||||
gpr.UnlockAll();
|
gpr.UnlockAll();
|
||||||
@ -93,9 +103,10 @@ void Jit64::stfXXX(UGeckoInstruction inst)
|
|||||||
int a = inst.RA;
|
int a = inst.RA;
|
||||||
int b = inst.RB;
|
int b = inst.RB;
|
||||||
|
|
||||||
FALLBACK_IF(!indexed && !a);
|
FALLBACK_IF((!indexed && !a) || (update && (a == s || a == b)));
|
||||||
|
|
||||||
s32 offset = 0;
|
s32 offset = 0;
|
||||||
|
s32 imm = (s16)inst.SIMM_16;
|
||||||
if (indexed)
|
if (indexed)
|
||||||
{
|
{
|
||||||
if (update)
|
if (update)
|
||||||
@ -121,11 +132,11 @@ void Jit64::stfXXX(UGeckoInstruction inst)
|
|||||||
if (update)
|
if (update)
|
||||||
{
|
{
|
||||||
gpr.BindToRegister(a, true, true);
|
gpr.BindToRegister(a, true, true);
|
||||||
ADD(32, gpr.R(a), Imm32((s32)(s16)inst.SIMM_16));
|
ADD(32, gpr.R(a), Imm32(imm));
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
offset = (s32)(s16)inst.SIMM_16;
|
offset = imm;
|
||||||
}
|
}
|
||||||
MOV(32, R(RSCRATCH2), gpr.R(a));
|
MOV(32, R(RSCRATCH2), gpr.R(a));
|
||||||
}
|
}
|
||||||
@ -145,6 +156,13 @@ void Jit64::stfXXX(UGeckoInstruction inst)
|
|||||||
MOV(64, R(RSCRATCH), fpr.R(s));
|
MOV(64, R(RSCRATCH), fpr.R(s));
|
||||||
SafeWriteRegToReg(RSCRATCH, RSCRATCH2, 64, offset, CallerSavedRegistersInUse());
|
SafeWriteRegToReg(RSCRATCH, RSCRATCH2, 64, offset, CallerSavedRegistersInUse());
|
||||||
}
|
}
|
||||||
|
if (js.memcheck && update)
|
||||||
|
{
|
||||||
|
// revert the address change if an exception occurred
|
||||||
|
MEMCHECK_START(true)
|
||||||
|
SUB(32, gpr.R(a), indexed ? gpr.R(b) : Imm32(imm));
|
||||||
|
MEMCHECK_END
|
||||||
|
}
|
||||||
gpr.UnlockAll();
|
gpr.UnlockAll();
|
||||||
gpr.UnlockAllX();
|
gpr.UnlockAllX();
|
||||||
}
|
}
|
||||||
|
@ -20,30 +20,31 @@ void Jit64::psq_st(UGeckoInstruction inst)
|
|||||||
{
|
{
|
||||||
INSTRUCTION_START
|
INSTRUCTION_START
|
||||||
JITDISABLE(bJITLoadStorePairedOff);
|
JITDISABLE(bJITLoadStorePairedOff);
|
||||||
FALLBACK_IF(js.memcheck || !inst.RA);
|
FALLBACK_IF(!inst.RA);
|
||||||
|
|
||||||
bool update = inst.OPCD == 61;
|
bool update = inst.OPCD == 61;
|
||||||
|
|
||||||
int offset = inst.SIMM_12;
|
int offset = inst.SIMM_12;
|
||||||
int a = inst.RA;
|
int a = inst.RA;
|
||||||
int s = inst.RS; // Fp numbers
|
int s = inst.RS;
|
||||||
|
|
||||||
gpr.FlushLockX(RSCRATCH, RSCRATCH_EXTRA);
|
gpr.FlushLockX(RSCRATCH_EXTRA);
|
||||||
if (update)
|
if (update)
|
||||||
gpr.BindToRegister(inst.RA, true, true);
|
gpr.BindToRegister(a, true, true);
|
||||||
fpr.BindToRegister(inst.RS, true, false);
|
fpr.BindToRegister(s, true, false);
|
||||||
MOV(32, R(RSCRATCH_EXTRA), gpr.R(inst.RA));
|
MOV(32, R(RSCRATCH_EXTRA), gpr.R(a));
|
||||||
if (offset)
|
if (offset)
|
||||||
ADD(32, R(RSCRATCH_EXTRA), Imm32((u32)offset));
|
ADD(32, R(RSCRATCH_EXTRA), Imm32((u32)offset));
|
||||||
if (update && offset)
|
// In memcheck mode, don't update the address until the exception check
|
||||||
|
if (update && offset && !js.memcheck)
|
||||||
MOV(32, gpr.R(a), R(RSCRATCH_EXTRA));
|
MOV(32, gpr.R(a), R(RSCRATCH_EXTRA));
|
||||||
// Some games (e.g. Dirt 2) incorrectly set the unused bits which breaks the lookup table code.
|
// Some games (e.g. Dirt 2) incorrectly set the unused bits which breaks the lookup table code.
|
||||||
// Hence, we need to mask out the unused bits. The layout of the GQR register is
|
// Hence, we need to mask out the unused bits. The layout of the GQR register is
|
||||||
// UU[SCALE]UUUUU[TYPE] where SCALE is 6 bits and TYPE is 3 bits, so we have to AND with
|
// UU[SCALE]UUUUU[TYPE] where SCALE is 6 bits and TYPE is 3 bits, so we have to AND with
|
||||||
// 0b0011111100000111, or 0x3F07.
|
// 0b0011111100000111, or 0x3F07.
|
||||||
MOV(32, R(RSCRATCH), Imm32(0x3F07));
|
MOV(32, R(RSCRATCH2), Imm32(0x3F07));
|
||||||
AND(32, R(RSCRATCH), PPCSTATE(spr[SPR_GQR0 + inst.I]));
|
AND(32, R(RSCRATCH2), PPCSTATE(spr[SPR_GQR0 + inst.I]));
|
||||||
MOVZX(32, 8, RSCRATCH2, R(RSCRATCH));
|
MOVZX(32, 8, RSCRATCH, R(RSCRATCH2));
|
||||||
|
|
||||||
// FIXME: Fix ModR/M encoding to allow [RSCRATCH2*4+disp32] without a base register!
|
// FIXME: Fix ModR/M encoding to allow [RSCRATCH2*4+disp32] without a base register!
|
||||||
if (inst.W)
|
if (inst.W)
|
||||||
@ -51,13 +52,20 @@ void Jit64::psq_st(UGeckoInstruction inst)
|
|||||||
// One value
|
// One value
|
||||||
PXOR(XMM0, R(XMM0)); // TODO: See if we can get rid of this cheaply by tweaking the code in the singleStore* functions.
|
PXOR(XMM0, R(XMM0)); // TODO: See if we can get rid of this cheaply by tweaking the code in the singleStore* functions.
|
||||||
CVTSD2SS(XMM0, fpr.R(s));
|
CVTSD2SS(XMM0, fpr.R(s));
|
||||||
CALLptr(MScaled(RSCRATCH2, SCALE_8, (u32)(u64)asm_routines.singleStoreQuantized));
|
CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)asm_routines.singleStoreQuantized));
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
// Pair of values
|
// Pair of values
|
||||||
CVTPD2PS(XMM0, fpr.R(s));
|
CVTPD2PS(XMM0, fpr.R(s));
|
||||||
CALLptr(MScaled(RSCRATCH2, SCALE_8, (u32)(u64)asm_routines.pairedStoreQuantized));
|
CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)asm_routines.pairedStoreQuantized));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (update && offset && js.memcheck)
|
||||||
|
{
|
||||||
|
MEMCHECK_START(false)
|
||||||
|
ADD(32, gpr.R(a), Imm32((u32)offset));
|
||||||
|
MEMCHECK_END
|
||||||
}
|
}
|
||||||
gpr.UnlockAll();
|
gpr.UnlockAll();
|
||||||
gpr.UnlockAllX();
|
gpr.UnlockAllX();
|
||||||
@ -67,33 +75,38 @@ void Jit64::psq_l(UGeckoInstruction inst)
|
|||||||
{
|
{
|
||||||
INSTRUCTION_START
|
INSTRUCTION_START
|
||||||
JITDISABLE(bJITLoadStorePairedOff);
|
JITDISABLE(bJITLoadStorePairedOff);
|
||||||
FALLBACK_IF(js.memcheck || !inst.RA);
|
FALLBACK_IF(!inst.RA);
|
||||||
|
|
||||||
bool update = inst.OPCD == 57;
|
bool update = inst.OPCD == 57;
|
||||||
int offset = inst.SIMM_12;
|
int offset = inst.SIMM_12;
|
||||||
|
int a = inst.RA;
|
||||||
|
int s = inst.RS;
|
||||||
|
|
||||||
gpr.FlushLockX(RSCRATCH, RSCRATCH_EXTRA);
|
gpr.FlushLockX(RSCRATCH_EXTRA);
|
||||||
gpr.BindToRegister(inst.RA, true, update && offset);
|
gpr.BindToRegister(a, true, update && offset);
|
||||||
fpr.BindToRegister(inst.RS, false, true);
|
fpr.BindToRegister(s, false, true);
|
||||||
if (offset)
|
if (offset)
|
||||||
LEA(32, RSCRATCH_EXTRA, MDisp(gpr.RX(inst.RA), offset));
|
LEA(32, RSCRATCH_EXTRA, MDisp(gpr.RX(a), offset));
|
||||||
else
|
else
|
||||||
MOV(32, R(RSCRATCH_EXTRA), gpr.R(inst.RA));
|
MOV(32, R(RSCRATCH_EXTRA), gpr.R(a));
|
||||||
if (update && offset)
|
// In memcheck mode, don't update the address until the exception check
|
||||||
MOV(32, gpr.R(inst.RA), R(RSCRATCH_EXTRA));
|
if (update && offset && !js.memcheck)
|
||||||
MOV(32, R(RSCRATCH), Imm32(0x3F07));
|
MOV(32, gpr.R(a), R(RSCRATCH_EXTRA));
|
||||||
AND(32, R(RSCRATCH), M(((char *)&GQR(inst.I)) + 2));
|
MOV(32, R(RSCRATCH2), Imm32(0x3F07));
|
||||||
MOVZX(32, 8, RSCRATCH2, R(RSCRATCH));
|
AND(32, R(RSCRATCH2), M(((char *)&GQR(inst.I)) + 2));
|
||||||
|
MOVZX(32, 8, RSCRATCH, R(RSCRATCH2));
|
||||||
if (inst.W)
|
if (inst.W)
|
||||||
OR(32, R(RSCRATCH2), Imm8(8));
|
OR(32, R(RSCRATCH), Imm8(8));
|
||||||
|
|
||||||
CALLptr(MScaled(RSCRATCH2, SCALE_8, (u32)(u64)asm_routines.pairedLoadQuantized));
|
CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)asm_routines.pairedLoadQuantized));
|
||||||
|
|
||||||
// MEMCHECK_START // FIXME: MMU does not work here because of unsafe memory access
|
MEMCHECK_START(false)
|
||||||
|
CVTPS2PD(fpr.RX(s), R(XMM0));
|
||||||
CVTPS2PD(fpr.RX(inst.RS), R(XMM0));
|
if (update && offset && js.memcheck)
|
||||||
|
{
|
||||||
// MEMCHECK_END
|
ADD(32, gpr.R(a), Imm32((u32)offset));
|
||||||
|
}
|
||||||
|
MEMCHECK_END
|
||||||
|
|
||||||
gpr.UnlockAll();
|
gpr.UnlockAll();
|
||||||
gpr.UnlockAllX();
|
gpr.UnlockAllX();
|
||||||
|
@ -1590,13 +1590,13 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress)
|
|||||||
// Hence, we need to mask out the unused bits. The layout of the GQR register is
|
// Hence, we need to mask out the unused bits. The layout of the GQR register is
|
||||||
// UU[SCALE]UUUUU[TYPE] where SCALE is 6 bits and TYPE is 3 bits, so we have to AND with
|
// UU[SCALE]UUUUU[TYPE] where SCALE is 6 bits and TYPE is 3 bits, so we have to AND with
|
||||||
// 0b0011111100000111, or 0x3F07.
|
// 0b0011111100000111, or 0x3F07.
|
||||||
Jit->MOV(32, R(RSCRATCH), Imm32(0x3F07));
|
Jit->MOV(32, R(RSCRATCH2), Imm32(0x3F07));
|
||||||
Jit->AND(32, R(RSCRATCH), M(((char *)&GQR(quantreg)) + 2));
|
Jit->AND(32, R(RSCRATCH2), M(((char *)&GQR(quantreg)) + 2));
|
||||||
Jit->MOVZX(32, 8, RSCRATCH2, R(RSCRATCH));
|
Jit->MOVZX(32, 8, RSCRATCH, R(RSCRATCH2));
|
||||||
Jit->OR(32, R(RSCRATCH2), Imm8(w << 3));
|
Jit->OR(32, R(RSCRATCH), Imm8(w << 3));
|
||||||
|
|
||||||
Jit->MOV(32, R(RSCRATCH_EXTRA), regLocForInst(RI, getOp1(I)));
|
Jit->MOV(32, R(RSCRATCH_EXTRA), regLocForInst(RI, getOp1(I)));
|
||||||
Jit->CALLptr(MScaled(RSCRATCH2, SCALE_8, (u32)(u64)(((JitIL *)jit)->asm_routines.pairedLoadQuantized)));
|
Jit->CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)(((JitIL *)jit)->asm_routines.pairedLoadQuantized)));
|
||||||
Jit->MOVAPD(reg, R(XMM0));
|
Jit->MOVAPD(reg, R(XMM0));
|
||||||
RI.fregs[reg] = I;
|
RI.fregs[reg] = I;
|
||||||
regNormalRegClear(RI, I);
|
regNormalRegClear(RI, I);
|
||||||
@ -1641,13 +1641,13 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress)
|
|||||||
regSpill(RI, RSCRATCH);
|
regSpill(RI, RSCRATCH);
|
||||||
regSpill(RI, RSCRATCH2);
|
regSpill(RI, RSCRATCH2);
|
||||||
u32 quantreg = *I >> 24;
|
u32 quantreg = *I >> 24;
|
||||||
Jit->MOV(32, R(RSCRATCH), Imm32(0x3F07));
|
Jit->MOV(32, R(RSCRATCH2), Imm32(0x3F07));
|
||||||
Jit->AND(32, R(RSCRATCH), PPCSTATE(spr[SPR_GQR0 + quantreg]));
|
Jit->AND(32, R(RSCRATCH2), PPCSTATE(spr[SPR_GQR0 + quantreg]));
|
||||||
Jit->MOVZX(32, 8, RSCRATCH2, R(RSCRATCH));
|
Jit->MOVZX(32, 8, RSCRATCH, R(RSCRATCH2));
|
||||||
|
|
||||||
Jit->MOV(32, R(RSCRATCH_EXTRA), regLocForInst(RI, getOp2(I)));
|
Jit->MOV(32, R(RSCRATCH_EXTRA), regLocForInst(RI, getOp2(I)));
|
||||||
Jit->MOVAPD(XMM0, fregLocForInst(RI, getOp1(I)));
|
Jit->MOVAPD(XMM0, fregLocForInst(RI, getOp1(I)));
|
||||||
Jit->CALLptr(MScaled(RSCRATCH2, SCALE_8, (u32)(u64)(((JitIL *)jit)->asm_routines.pairedStoreQuantized)));
|
Jit->CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)(((JitIL *)jit)->asm_routines.pairedStoreQuantized)));
|
||||||
if (RI.IInfo[I - RI.FirstI] & 4)
|
if (RI.IInfo[I - RI.FirstI] & 4)
|
||||||
fregClearInst(RI, getOp1(I));
|
fregClearInst(RI, getOp1(I));
|
||||||
if (RI.IInfo[I - RI.FirstI] & 8)
|
if (RI.IInfo[I - RI.FirstI] & 8)
|
||||||
|
@ -241,8 +241,6 @@ namespace JitILProfiler
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
static int CODE_SIZE = 1024*1024*32;
|
|
||||||
|
|
||||||
void JitIL::Init()
|
void JitIL::Init()
|
||||||
{
|
{
|
||||||
jo.optimizeStack = true;
|
jo.optimizeStack = true;
|
||||||
@ -273,10 +271,11 @@ void JitIL::Init()
|
|||||||
|
|
||||||
trampolines.Init();
|
trampolines.Init();
|
||||||
AllocCodeSpace(CODE_SIZE);
|
AllocCodeSpace(CODE_SIZE);
|
||||||
|
|
||||||
blocks.Init();
|
blocks.Init();
|
||||||
asm_routines.Init();
|
asm_routines.Init();
|
||||||
|
|
||||||
|
farcode.Init(js.memcheck ? FARCODE_SIZE_MMU : FARCODE_SIZE);
|
||||||
|
|
||||||
code_block.m_stats = &js.st;
|
code_block.m_stats = &js.st;
|
||||||
code_block.m_gpa = &js.gpa;
|
code_block.m_gpa = &js.gpa;
|
||||||
code_block.m_fpa = &js.fpa;
|
code_block.m_fpa = &js.fpa;
|
||||||
@ -306,6 +305,7 @@ void JitIL::Shutdown()
|
|||||||
blocks.Shutdown();
|
blocks.Shutdown();
|
||||||
trampolines.Shutdown();
|
trampolines.Shutdown();
|
||||||
asm_routines.Shutdown();
|
asm_routines.Shutdown();
|
||||||
|
farcode.Shutdown();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -504,7 +504,8 @@ void JitIL::Trace()
|
|||||||
|
|
||||||
void STACKALIGN JitIL::Jit(u32 em_address)
|
void STACKALIGN JitIL::Jit(u32 em_address)
|
||||||
{
|
{
|
||||||
if (GetSpaceLeft() < 0x10000 || blocks.IsFull() || SConfig::GetInstance().m_LocalCoreStartupParameter.bJITNoBlockCache)
|
if (GetSpaceLeft() < 0x10000 || farcode.GetSpaceLeft() < 0x10000 || blocks.IsFull() ||
|
||||||
|
SConfig::GetInstance().m_LocalCoreStartupParameter.bJITNoBlockCache)
|
||||||
{
|
{
|
||||||
ClearCache();
|
ClearCache();
|
||||||
}
|
}
|
||||||
|
@ -17,6 +17,8 @@
|
|||||||
(1 << (XMM0+16)) | \
|
(1 << (XMM0+16)) | \
|
||||||
(1 << (XMM1+16))))
|
(1 << (XMM1+16))))
|
||||||
|
|
||||||
|
#define QUANTIZED_REGS_TO_SAVE_LOAD (QUANTIZED_REGS_TO_SAVE | (1 << RSCRATCH2))
|
||||||
|
|
||||||
using namespace Gen;
|
using namespace Gen;
|
||||||
|
|
||||||
static int temp32;
|
static int temp32;
|
||||||
@ -250,24 +252,29 @@ void CommonAsmRoutines::GenQuantizedStores()
|
|||||||
UD2();
|
UD2();
|
||||||
const u8* storePairedFloat = AlignCode4();
|
const u8* storePairedFloat = AlignCode4();
|
||||||
|
|
||||||
|
FixupBranch skip_complex, too_complex;
|
||||||
SHUFPS(XMM0, R(XMM0), 1);
|
SHUFPS(XMM0, R(XMM0), 1);
|
||||||
MOVQ_xmm(M(&psTemp[0]), XMM0);
|
MOVQ_xmm(M(&psTemp[0]), XMM0);
|
||||||
|
if (!jit->js.memcheck)
|
||||||
|
{
|
||||||
TEST(32, R(RSCRATCH_EXTRA), Imm32(0x0C000000));
|
TEST(32, R(RSCRATCH_EXTRA), Imm32(0x0C000000));
|
||||||
FixupBranch too_complex = J_CC(CC_NZ, true);
|
too_complex = J_CC(CC_NZ, true);
|
||||||
MOV(64, R(RSCRATCH), M(&psTemp[0]));
|
MOV(64, R(RSCRATCH), M(&psTemp[0]));
|
||||||
SwapAndStore(64, MComplex(RMEM, RSCRATCH_EXTRA, SCALE_1, 0), RSCRATCH);
|
SwapAndStore(64, MComplex(RMEM, RSCRATCH_EXTRA, SCALE_1, 0), RSCRATCH);
|
||||||
FixupBranch skip_complex = J(true);
|
skip_complex = J(true);
|
||||||
SetJumpTarget(too_complex);
|
SetJumpTarget(too_complex);
|
||||||
|
}
|
||||||
// RSP alignment here is 8 due to the call.
|
// RSP alignment here is 8 due to the call.
|
||||||
ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
|
ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
|
||||||
ABI_CallFunctionR((void *)&WriteDual32, RSCRATCH_EXTRA);
|
ABI_CallFunctionR((void *)&WriteDual32, RSCRATCH_EXTRA);
|
||||||
ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
|
ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
|
||||||
|
if (!jit->js.memcheck)
|
||||||
SetJumpTarget(skip_complex);
|
SetJumpTarget(skip_complex);
|
||||||
RET();
|
RET();
|
||||||
|
|
||||||
const u8* storePairedU8 = AlignCode4();
|
const u8* storePairedU8 = AlignCode4();
|
||||||
SHR(32, R(RSCRATCH), Imm8(6));
|
SHR(32, R(RSCRATCH2), Imm8(6));
|
||||||
MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_quantizeTableS));
|
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
||||||
PUNPCKLDQ(XMM1, R(XMM1));
|
PUNPCKLDQ(XMM1, R(XMM1));
|
||||||
MULPS(XMM0, R(XMM1));
|
MULPS(XMM0, R(XMM1));
|
||||||
#ifdef QUANTIZE_OVERFLOW_SAFE
|
#ifdef QUANTIZE_OVERFLOW_SAFE
|
||||||
@ -284,8 +291,8 @@ void CommonAsmRoutines::GenQuantizedStores()
|
|||||||
RET();
|
RET();
|
||||||
|
|
||||||
const u8* storePairedS8 = AlignCode4();
|
const u8* storePairedS8 = AlignCode4();
|
||||||
SHR(32, R(RSCRATCH), Imm8(6));
|
SHR(32, R(RSCRATCH2), Imm8(6));
|
||||||
MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_quantizeTableS));
|
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
||||||
PUNPCKLDQ(XMM1, R(XMM1));
|
PUNPCKLDQ(XMM1, R(XMM1));
|
||||||
MULPS(XMM0, R(XMM1));
|
MULPS(XMM0, R(XMM1));
|
||||||
#ifdef QUANTIZE_OVERFLOW_SAFE
|
#ifdef QUANTIZE_OVERFLOW_SAFE
|
||||||
@ -303,8 +310,8 @@ void CommonAsmRoutines::GenQuantizedStores()
|
|||||||
RET();
|
RET();
|
||||||
|
|
||||||
const u8* storePairedU16 = AlignCode4();
|
const u8* storePairedU16 = AlignCode4();
|
||||||
SHR(32, R(RSCRATCH), Imm8(6));
|
SHR(32, R(RSCRATCH2), Imm8(6));
|
||||||
MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_quantizeTableS));
|
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
||||||
PUNPCKLDQ(XMM1, R(XMM1));
|
PUNPCKLDQ(XMM1, R(XMM1));
|
||||||
MULPS(XMM0, R(XMM1));
|
MULPS(XMM0, R(XMM1));
|
||||||
|
|
||||||
@ -329,8 +336,8 @@ void CommonAsmRoutines::GenQuantizedStores()
|
|||||||
RET();
|
RET();
|
||||||
|
|
||||||
const u8* storePairedS16 = AlignCode4();
|
const u8* storePairedS16 = AlignCode4();
|
||||||
SHR(32, R(RSCRATCH), Imm8(6));
|
SHR(32, R(RSCRATCH2), Imm8(6));
|
||||||
MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_quantizeTableS));
|
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
||||||
// SHUFPS or UNPCKLPS might be a better choice here. The last one might just be an alias though.
|
// SHUFPS or UNPCKLPS might be a better choice here. The last one might just be an alias though.
|
||||||
PUNPCKLDQ(XMM1, R(XMM1));
|
PUNPCKLDQ(XMM1, R(XMM1));
|
||||||
MULPS(XMM0, R(XMM1));
|
MULPS(XMM0, R(XMM1));
|
||||||
@ -388,8 +395,8 @@ void CommonAsmRoutines::GenQuantizedSingleStores()
|
|||||||
}*/
|
}*/
|
||||||
|
|
||||||
const u8* storeSingleU8 = AlignCode4(); // Used by MKWii
|
const u8* storeSingleU8 = AlignCode4(); // Used by MKWii
|
||||||
SHR(32, R(RSCRATCH), Imm8(6));
|
SHR(32, R(RSCRATCH2), Imm8(6));
|
||||||
MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_quantizeTableS));
|
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
||||||
MULSS(XMM0, R(XMM1));
|
MULSS(XMM0, R(XMM1));
|
||||||
PXOR(XMM1, R(XMM1));
|
PXOR(XMM1, R(XMM1));
|
||||||
MAXSS(XMM0, R(XMM1));
|
MAXSS(XMM0, R(XMM1));
|
||||||
@ -399,8 +406,8 @@ void CommonAsmRoutines::GenQuantizedSingleStores()
|
|||||||
RET();
|
RET();
|
||||||
|
|
||||||
const u8* storeSingleS8 = AlignCode4();
|
const u8* storeSingleS8 = AlignCode4();
|
||||||
SHR(32, R(RSCRATCH), Imm8(6));
|
SHR(32, R(RSCRATCH2), Imm8(6));
|
||||||
MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_quantizeTableS));
|
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
||||||
MULSS(XMM0, R(XMM1));
|
MULSS(XMM0, R(XMM1));
|
||||||
MAXSS(XMM0, M((void *)&m_m128));
|
MAXSS(XMM0, M((void *)&m_m128));
|
||||||
MINSS(XMM0, M((void *)&m_127));
|
MINSS(XMM0, M((void *)&m_127));
|
||||||
@ -409,8 +416,8 @@ void CommonAsmRoutines::GenQuantizedSingleStores()
|
|||||||
RET();
|
RET();
|
||||||
|
|
||||||
const u8* storeSingleU16 = AlignCode4(); // Used by MKWii
|
const u8* storeSingleU16 = AlignCode4(); // Used by MKWii
|
||||||
SHR(32, R(RSCRATCH), Imm8(6));
|
SHR(32, R(RSCRATCH2), Imm8(6));
|
||||||
MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_quantizeTableS));
|
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
||||||
MULSS(XMM0, R(XMM1));
|
MULSS(XMM0, R(XMM1));
|
||||||
PXOR(XMM1, R(XMM1));
|
PXOR(XMM1, R(XMM1));
|
||||||
MAXSS(XMM0, R(XMM1));
|
MAXSS(XMM0, R(XMM1));
|
||||||
@ -420,8 +427,8 @@ void CommonAsmRoutines::GenQuantizedSingleStores()
|
|||||||
RET();
|
RET();
|
||||||
|
|
||||||
const u8* storeSingleS16 = AlignCode4();
|
const u8* storeSingleS16 = AlignCode4();
|
||||||
SHR(32, R(RSCRATCH), Imm8(6));
|
SHR(32, R(RSCRATCH2), Imm8(6));
|
||||||
MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_quantizeTableS));
|
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
||||||
MULSS(XMM0, R(XMM1));
|
MULSS(XMM0, R(XMM1));
|
||||||
MAXSS(XMM0, M((void *)&m_m32768));
|
MAXSS(XMM0, M((void *)&m_m32768));
|
||||||
MINSS(XMM0, M((void *)&m_32767));
|
MINSS(XMM0, M((void *)&m_32767));
|
||||||
@ -448,7 +455,13 @@ void CommonAsmRoutines::GenQuantizedLoads()
|
|||||||
UD2();
|
UD2();
|
||||||
|
|
||||||
const u8* loadPairedFloatTwo = AlignCode4();
|
const u8* loadPairedFloatTwo = AlignCode4();
|
||||||
if (cpu_info.bSSSE3)
|
if (jit->js.memcheck)
|
||||||
|
{
|
||||||
|
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 64, 0, QUANTIZED_REGS_TO_SAVE, false, SAFE_LOADSTORE_NO_PROLOG);
|
||||||
|
ROL(64, R(RSCRATCH_EXTRA), Imm8(32));
|
||||||
|
MOVQ_xmm(XMM0, R(RSCRATCH_EXTRA));
|
||||||
|
}
|
||||||
|
else if (cpu_info.bSSSE3)
|
||||||
{
|
{
|
||||||
MOVQ_xmm(XMM0, MComplex(RMEM, RSCRATCH_EXTRA, 1, 0));
|
MOVQ_xmm(XMM0, MComplex(RMEM, RSCRATCH_EXTRA, 1, 0));
|
||||||
PSHUFB(XMM0, M((void *)pbswapShuffle2x4));
|
PSHUFB(XMM0, M((void *)pbswapShuffle2x4));
|
||||||
@ -462,7 +475,13 @@ void CommonAsmRoutines::GenQuantizedLoads()
|
|||||||
RET();
|
RET();
|
||||||
|
|
||||||
const u8* loadPairedFloatOne = AlignCode4();
|
const u8* loadPairedFloatOne = AlignCode4();
|
||||||
if (cpu_info.bSSSE3)
|
if (jit->js.memcheck)
|
||||||
|
{
|
||||||
|
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE, false, SAFE_LOADSTORE_NO_PROLOG);
|
||||||
|
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
||||||
|
UNPCKLPS(XMM0, M((void*)m_one));
|
||||||
|
}
|
||||||
|
else if (cpu_info.bSSSE3)
|
||||||
{
|
{
|
||||||
MOVD_xmm(XMM0, MComplex(RMEM, RSCRATCH_EXTRA, 1, 0));
|
MOVD_xmm(XMM0, MComplex(RMEM, RSCRATCH_EXTRA, 1, 0));
|
||||||
PSHUFB(XMM0, M((void *)pbswapShuffle1x4));
|
PSHUFB(XMM0, M((void *)pbswapShuffle1x4));
|
||||||
@ -477,99 +496,130 @@ void CommonAsmRoutines::GenQuantizedLoads()
|
|||||||
RET();
|
RET();
|
||||||
|
|
||||||
const u8* loadPairedU8Two = AlignCode4();
|
const u8* loadPairedU8Two = AlignCode4();
|
||||||
|
if (jit->js.memcheck)
|
||||||
|
{
|
||||||
|
// TODO: Support not swapping in safeLoadToReg to avoid bswapping twice
|
||||||
|
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_PROLOG);
|
||||||
|
ROR(16, R(RSCRATCH_EXTRA), Imm8(8));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0);
|
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0);
|
||||||
|
}
|
||||||
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
||||||
PXOR(XMM1, R(XMM1));
|
PXOR(XMM1, R(XMM1));
|
||||||
PUNPCKLBW(XMM0, R(XMM1));
|
PUNPCKLBW(XMM0, R(XMM1));
|
||||||
PUNPCKLWD(XMM0, R(XMM1));
|
PUNPCKLWD(XMM0, R(XMM1));
|
||||||
CVTDQ2PS(XMM0, R(XMM0));
|
CVTDQ2PS(XMM0, R(XMM0));
|
||||||
SHR(32, R(RSCRATCH), Imm8(6));
|
SHR(32, R(RSCRATCH2), Imm8(6));
|
||||||
MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_dequantizeTableS));
|
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
||||||
PUNPCKLDQ(XMM1, R(XMM1));
|
PUNPCKLDQ(XMM1, R(XMM1));
|
||||||
MULPS(XMM0, R(XMM1));
|
MULPS(XMM0, R(XMM1));
|
||||||
RET();
|
RET();
|
||||||
|
|
||||||
const u8* loadPairedU8One = AlignCode4();
|
const u8* loadPairedU8One = AlignCode4();
|
||||||
|
if (jit->js.memcheck)
|
||||||
|
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 8, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_PROLOG);
|
||||||
|
else
|
||||||
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0); // RSCRATCH_EXTRA = 0x000000xx
|
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0); // RSCRATCH_EXTRA = 0x000000xx
|
||||||
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
||||||
CVTDQ2PS(XMM0, R(XMM0)); // Is CVTSI2SS better?
|
CVTDQ2PS(XMM0, R(XMM0)); // Is CVTSI2SS better?
|
||||||
SHR(32, R(RSCRATCH), Imm8(6));
|
SHR(32, R(RSCRATCH2), Imm8(6));
|
||||||
MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_dequantizeTableS));
|
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
||||||
MULSS(XMM0, R(XMM1));
|
MULSS(XMM0, R(XMM1));
|
||||||
UNPCKLPS(XMM0, M((void*)m_one));
|
UNPCKLPS(XMM0, M((void*)m_one));
|
||||||
RET();
|
RET();
|
||||||
|
|
||||||
const u8* loadPairedS8Two = AlignCode4();
|
const u8* loadPairedS8Two = AlignCode4();
|
||||||
|
if (jit->js.memcheck)
|
||||||
|
{
|
||||||
|
// TODO: Support not swapping in safeLoadToReg to avoid bswapping twice
|
||||||
|
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_PROLOG);
|
||||||
|
ROR(16, R(RSCRATCH_EXTRA), Imm8(8));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0);
|
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0);
|
||||||
|
}
|
||||||
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
||||||
PUNPCKLBW(XMM0, R(XMM0));
|
PUNPCKLBW(XMM0, R(XMM0));
|
||||||
PUNPCKLWD(XMM0, R(XMM0));
|
PUNPCKLWD(XMM0, R(XMM0));
|
||||||
PSRAD(XMM0, 24);
|
PSRAD(XMM0, 24);
|
||||||
CVTDQ2PS(XMM0, R(XMM0));
|
CVTDQ2PS(XMM0, R(XMM0));
|
||||||
SHR(32, R(RSCRATCH), Imm8(6));
|
SHR(32, R(RSCRATCH2), Imm8(6));
|
||||||
MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_dequantizeTableS));
|
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
||||||
PUNPCKLDQ(XMM1, R(XMM1));
|
PUNPCKLDQ(XMM1, R(XMM1));
|
||||||
MULPS(XMM0, R(XMM1));
|
MULPS(XMM0, R(XMM1));
|
||||||
RET();
|
RET();
|
||||||
|
|
||||||
const u8* loadPairedS8One = AlignCode4();
|
const u8* loadPairedS8One = AlignCode4();
|
||||||
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0);
|
if (jit->js.memcheck)
|
||||||
SHL(32, R(RSCRATCH_EXTRA), Imm8(24));
|
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 8, 0, QUANTIZED_REGS_TO_SAVE_LOAD, true, SAFE_LOADSTORE_NO_PROLOG);
|
||||||
SAR(32, R(RSCRATCH_EXTRA), Imm8(24));
|
else
|
||||||
|
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0, true);
|
||||||
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
||||||
CVTDQ2PS(XMM0, R(XMM0));
|
CVTDQ2PS(XMM0, R(XMM0));
|
||||||
SHR(32, R(RSCRATCH), Imm8(6));
|
SHR(32, R(RSCRATCH2), Imm8(6));
|
||||||
MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_dequantizeTableS));
|
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
||||||
MULSS(XMM0, R(XMM1));
|
MULSS(XMM0, R(XMM1));
|
||||||
UNPCKLPS(XMM0, M((void*)m_one));
|
UNPCKLPS(XMM0, M((void*)m_one));
|
||||||
RET();
|
RET();
|
||||||
|
|
||||||
const u8* loadPairedU16Two = AlignCode4();
|
const u8* loadPairedU16Two = AlignCode4();
|
||||||
|
// TODO: Support not swapping in (un)safeLoadToReg to avoid bswapping twice
|
||||||
|
if (jit->js.memcheck)
|
||||||
|
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_PROLOG);
|
||||||
|
else
|
||||||
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false);
|
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false);
|
||||||
ROL(32, R(RSCRATCH_EXTRA), Imm8(16));
|
ROL(32, R(RSCRATCH_EXTRA), Imm8(16));
|
||||||
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
||||||
PXOR(XMM1, R(XMM1));
|
PXOR(XMM1, R(XMM1));
|
||||||
PUNPCKLWD(XMM0, R(XMM1));
|
PUNPCKLWD(XMM0, R(XMM1));
|
||||||
CVTDQ2PS(XMM0, R(XMM0));
|
CVTDQ2PS(XMM0, R(XMM0));
|
||||||
SHR(32, R(RSCRATCH), Imm8(6));
|
SHR(32, R(RSCRATCH2), Imm8(6));
|
||||||
MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_dequantizeTableS));
|
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
||||||
PUNPCKLDQ(XMM1, R(XMM1));
|
PUNPCKLDQ(XMM1, R(XMM1));
|
||||||
MULPS(XMM0, R(XMM1));
|
MULPS(XMM0, R(XMM1));
|
||||||
RET();
|
RET();
|
||||||
|
|
||||||
const u8* loadPairedU16One = AlignCode4();
|
const u8* loadPairedU16One = AlignCode4();
|
||||||
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false);
|
if (jit->js.memcheck)
|
||||||
SHR(32, R(RSCRATCH_EXTRA), Imm8(16));
|
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_PROLOG);
|
||||||
|
else
|
||||||
|
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, false);
|
||||||
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
||||||
CVTDQ2PS(XMM0, R(XMM0));
|
CVTDQ2PS(XMM0, R(XMM0));
|
||||||
SHR(32, R(RSCRATCH), Imm8(6));
|
SHR(32, R(RSCRATCH2), Imm8(6));
|
||||||
MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_dequantizeTableS));
|
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
||||||
MULSS(XMM0, R(XMM1));
|
MULSS(XMM0, R(XMM1));
|
||||||
UNPCKLPS(XMM0, M((void*)m_one));
|
UNPCKLPS(XMM0, M((void*)m_one));
|
||||||
RET();
|
RET();
|
||||||
|
|
||||||
const u8* loadPairedS16Two = AlignCode4();
|
const u8* loadPairedS16Two = AlignCode4();
|
||||||
|
if (jit->js.memcheck)
|
||||||
|
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_PROLOG);
|
||||||
|
else
|
||||||
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false);
|
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false);
|
||||||
ROL(32, R(RSCRATCH_EXTRA), Imm8(16));
|
ROL(32, R(RSCRATCH_EXTRA), Imm8(16));
|
||||||
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
||||||
PUNPCKLWD(XMM0, R(XMM0));
|
PUNPCKLWD(XMM0, R(XMM0));
|
||||||
PSRAD(XMM0, 16);
|
PSRAD(XMM0, 16);
|
||||||
CVTDQ2PS(XMM0, R(XMM0));
|
CVTDQ2PS(XMM0, R(XMM0));
|
||||||
SHR(32, R(RSCRATCH), Imm8(6));
|
SHR(32, R(RSCRATCH2), Imm8(6));
|
||||||
AND(32, R(RSCRATCH), Imm32(0xFC));
|
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
||||||
MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_dequantizeTableS));
|
|
||||||
PUNPCKLDQ(XMM1, R(XMM1));
|
PUNPCKLDQ(XMM1, R(XMM1));
|
||||||
MULPS(XMM0, R(XMM1));
|
MULPS(XMM0, R(XMM1));
|
||||||
RET();
|
RET();
|
||||||
|
|
||||||
const u8* loadPairedS16One = AlignCode4();
|
const u8* loadPairedS16One = AlignCode4();
|
||||||
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false);
|
if (jit->js.memcheck)
|
||||||
SAR(32, R(RSCRATCH_EXTRA), Imm8(16));
|
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, true, SAFE_LOADSTORE_NO_PROLOG);
|
||||||
|
else
|
||||||
|
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, true);
|
||||||
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
||||||
CVTDQ2PS(XMM0, R(XMM0));
|
CVTDQ2PS(XMM0, R(XMM0));
|
||||||
SHR(32, R(RSCRATCH), Imm8(6));
|
SHR(32, R(RSCRATCH2), Imm8(6));
|
||||||
AND(32, R(RSCRATCH), Imm32(0xFC));
|
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
||||||
MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_dequantizeTableS));
|
|
||||||
MULSS(XMM0, R(XMM1));
|
MULSS(XMM0, R(XMM1));
|
||||||
UNPCKLPS(XMM0, M((void*)m_one));
|
UNPCKLPS(XMM0, M((void*)m_one));
|
||||||
RET();
|
RET();
|
||||||
|
@ -61,8 +61,11 @@ void EmuCodeBlock::UnsafeLoadRegToReg(X64Reg reg_addr, X64Reg reg_value, int acc
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void EmuCodeBlock::UnsafeLoadRegToRegNoSwap(X64Reg reg_addr, X64Reg reg_value, int accessSize, s32 offset)
|
void EmuCodeBlock::UnsafeLoadRegToRegNoSwap(X64Reg reg_addr, X64Reg reg_value, int accessSize, s32 offset, bool signExtend)
|
||||||
{
|
{
|
||||||
|
if (signExtend)
|
||||||
|
MOVSX(32, accessSize, reg_value, MComplex(RMEM, reg_addr, SCALE_1, offset));
|
||||||
|
else
|
||||||
MOVZX(32, accessSize, reg_value, MComplex(RMEM, reg_addr, SCALE_1, offset));
|
MOVZX(32, accessSize, reg_value, MComplex(RMEM, reg_addr, SCALE_1, offset));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -315,8 +318,7 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress,
|
|||||||
}
|
}
|
||||||
ABI_PopRegistersAndAdjustStack(registersInUse, 0);
|
ABI_PopRegistersAndAdjustStack(registersInUse, 0);
|
||||||
|
|
||||||
MEMCHECK_START
|
MEMCHECK_START(false)
|
||||||
|
|
||||||
if (signExtend && accessSize < 32)
|
if (signExtend && accessSize < 32)
|
||||||
{
|
{
|
||||||
// Need to sign extend values coming from the Read_U* functions.
|
// Need to sign extend values coming from the Read_U* functions.
|
||||||
@ -326,7 +328,6 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress,
|
|||||||
{
|
{
|
||||||
MOVZX(64, accessSize, reg_value, R(ABI_RETURN));
|
MOVZX(64, accessSize, reg_value, R(ABI_RETURN));
|
||||||
}
|
}
|
||||||
|
|
||||||
MEMCHECK_END
|
MEMCHECK_END
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -348,9 +349,17 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress,
|
|||||||
}
|
}
|
||||||
TEST(32, addr_loc, Imm32(mem_mask));
|
TEST(32, addr_loc, Imm32(mem_mask));
|
||||||
|
|
||||||
FixupBranch fast = J_CC(CC_Z, true);
|
FixupBranch slow, exit;
|
||||||
|
slow = J_CC(CC_NZ, farcode.Enabled());
|
||||||
|
UnsafeLoadToReg(reg_value, addr_loc, accessSize, 0, signExtend);
|
||||||
|
if (farcode.Enabled())
|
||||||
|
SwitchToFarCode();
|
||||||
|
else
|
||||||
|
exit = J(true);
|
||||||
|
SetJumpTarget(slow);
|
||||||
|
|
||||||
ABI_PushRegistersAndAdjustStack(registersInUse, 0);
|
size_t rsp_alignment = (flags & SAFE_LOADSTORE_NO_PROLOG) ? 8 : 0;
|
||||||
|
ABI_PushRegistersAndAdjustStack(registersInUse, rsp_alignment);
|
||||||
switch (accessSize)
|
switch (accessSize)
|
||||||
{
|
{
|
||||||
case 64:
|
case 64:
|
||||||
@ -366,10 +375,9 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress,
|
|||||||
ABI_CallFunctionA((void *)&Memory::Read_U8_ZX, addr_loc);
|
ABI_CallFunctionA((void *)&Memory::Read_U8_ZX, addr_loc);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
ABI_PopRegistersAndAdjustStack(registersInUse, 0);
|
ABI_PopRegistersAndAdjustStack(registersInUse, rsp_alignment);
|
||||||
|
|
||||||
MEMCHECK_START
|
|
||||||
|
|
||||||
|
MEMCHECK_START(false)
|
||||||
if (signExtend && accessSize < 32)
|
if (signExtend && accessSize < 32)
|
||||||
{
|
{
|
||||||
// Need to sign extend values coming from the Read_U* functions.
|
// Need to sign extend values coming from the Read_U* functions.
|
||||||
@ -379,12 +387,13 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress,
|
|||||||
{
|
{
|
||||||
MOVZX(64, accessSize, reg_value, R(ABI_RETURN));
|
MOVZX(64, accessSize, reg_value, R(ABI_RETURN));
|
||||||
}
|
}
|
||||||
|
|
||||||
MEMCHECK_END
|
MEMCHECK_END
|
||||||
|
|
||||||
FixupBranch exit = J();
|
if (farcode.Enabled())
|
||||||
SetJumpTarget(fast);
|
{
|
||||||
UnsafeLoadToReg(reg_value, addr_loc, accessSize, 0, signExtend);
|
exit = J(true);
|
||||||
|
SwitchToNearCode();
|
||||||
|
}
|
||||||
SetJumpTarget(exit);
|
SetJumpTarget(exit);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -466,12 +475,21 @@ void EmuCodeBlock::SafeWriteRegToReg(X64Reg reg_value, X64Reg reg_addr, int acce
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
bool swap = !(flags & SAFE_LOADSTORE_NO_SWAP);
|
||||||
|
|
||||||
|
FixupBranch slow, exit;
|
||||||
TEST(32, R(reg_addr), Imm32(mem_mask));
|
TEST(32, R(reg_addr), Imm32(mem_mask));
|
||||||
FixupBranch fast = J_CC(CC_Z, true);
|
slow = J_CC(CC_NZ, farcode.Enabled());
|
||||||
|
UnsafeWriteRegToReg(reg_value, reg_addr, accessSize, 0, swap);
|
||||||
|
if (farcode.Enabled())
|
||||||
|
SwitchToFarCode();
|
||||||
|
else
|
||||||
|
exit = J(true);
|
||||||
|
SetJumpTarget(slow);
|
||||||
// PC is used by memory watchpoints (if enabled) or to print accurate PC locations in debug logs
|
// PC is used by memory watchpoints (if enabled) or to print accurate PC locations in debug logs
|
||||||
MOV(32, PPCSTATE(pc), Imm32(jit->js.compilerPC));
|
MOV(32, PPCSTATE(pc), Imm32(jit->js.compilerPC));
|
||||||
|
|
||||||
size_t rsp_alignment = (flags & SAFE_LOADSTORE_NO_PROLOG) ? 8 : 0;
|
size_t rsp_alignment = (flags & SAFE_LOADSTORE_NO_PROLOG) ? 8 : 0;
|
||||||
bool swap = !(flags & SAFE_LOADSTORE_NO_SWAP);
|
|
||||||
ABI_PushRegistersAndAdjustStack(registersInUse, rsp_alignment);
|
ABI_PushRegistersAndAdjustStack(registersInUse, rsp_alignment);
|
||||||
switch (accessSize)
|
switch (accessSize)
|
||||||
{
|
{
|
||||||
@ -489,9 +507,11 @@ void EmuCodeBlock::SafeWriteRegToReg(X64Reg reg_value, X64Reg reg_addr, int acce
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
ABI_PopRegistersAndAdjustStack(registersInUse, rsp_alignment);
|
ABI_PopRegistersAndAdjustStack(registersInUse, rsp_alignment);
|
||||||
FixupBranch exit = J();
|
if (farcode.Enabled())
|
||||||
SetJumpTarget(fast);
|
{
|
||||||
UnsafeWriteRegToReg(reg_value, reg_addr, accessSize, 0, swap);
|
exit = J(true);
|
||||||
|
SwitchToNearCode();
|
||||||
|
}
|
||||||
SetJumpTarget(exit);
|
SetJumpTarget(exit);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -655,15 +675,17 @@ void EmuCodeBlock::ConvertDoubleToSingle(X64Reg dst, X64Reg src)
|
|||||||
// to save an instruction, since diverting a few more floats to the slow path can't hurt much.
|
// to save an instruction, since diverting a few more floats to the slow path can't hurt much.
|
||||||
SUB(8, R(RSCRATCH), Imm8(0x6D));
|
SUB(8, R(RSCRATCH), Imm8(0x6D));
|
||||||
CMP(8, R(RSCRATCH), Imm8(0x3));
|
CMP(8, R(RSCRATCH), Imm8(0x3));
|
||||||
FixupBranch x87Conversion = J_CC(CC_BE);
|
FixupBranch x87Conversion = J_CC(CC_BE, true);
|
||||||
CVTSD2SS(dst, R(src));
|
CVTSD2SS(dst, R(src));
|
||||||
FixupBranch continue1 = J();
|
|
||||||
|
|
||||||
|
SwitchToFarCode();
|
||||||
SetJumpTarget(x87Conversion);
|
SetJumpTarget(x87Conversion);
|
||||||
MOVSD(M(&temp64), src);
|
MOVSD(M(&temp64), src);
|
||||||
FLD(64, M(&temp64));
|
FLD(64, M(&temp64));
|
||||||
FSTP(32, M(&temp32));
|
FSTP(32, M(&temp32));
|
||||||
MOVSS(dst, M(&temp32));
|
MOVSS(dst, M(&temp32));
|
||||||
|
FixupBranch continue1 = J(true);
|
||||||
|
SwitchToNearCode();
|
||||||
|
|
||||||
SetJumpTarget(continue1);
|
SetJumpTarget(continue1);
|
||||||
// We'd normally need to MOVDDUP here to put the single in the top half of the output register too, but
|
// We'd normally need to MOVDDUP here to put the single in the top half of the output register too, but
|
||||||
@ -692,16 +714,17 @@ void EmuCodeBlock::ConvertSingleToDouble(X64Reg dst, X64Reg src, bool src_is_gpr
|
|||||||
// through the slow path (0x00800000), but the performance effects of that should be negligible.
|
// through the slow path (0x00800000), but the performance effects of that should be negligible.
|
||||||
SUB(32, R(gprsrc), Imm8(1));
|
SUB(32, R(gprsrc), Imm8(1));
|
||||||
TEST(32, R(gprsrc), Imm32(0x7f800000));
|
TEST(32, R(gprsrc), Imm32(0x7f800000));
|
||||||
|
FixupBranch x87Conversion = J_CC(CC_Z, true);
|
||||||
FixupBranch x87Conversion = J_CC(CC_Z);
|
|
||||||
CVTSS2SD(dst, R(dst));
|
CVTSS2SD(dst, R(dst));
|
||||||
FixupBranch continue1 = J();
|
|
||||||
|
|
||||||
|
SwitchToFarCode();
|
||||||
SetJumpTarget(x87Conversion);
|
SetJumpTarget(x87Conversion);
|
||||||
MOVSS(M(&temp32), dst);
|
MOVSS(M(&temp32), dst);
|
||||||
FLD(32, M(&temp32));
|
FLD(32, M(&temp32));
|
||||||
FSTP(64, M(&temp64));
|
FSTP(64, M(&temp64));
|
||||||
MOVSD(dst, M(&temp64));
|
MOVSD(dst, M(&temp64));
|
||||||
|
FixupBranch continue1 = J(true);
|
||||||
|
SwitchToNearCode();
|
||||||
|
|
||||||
SetJumpTarget(continue1);
|
SetJumpTarget(continue1);
|
||||||
MOVDDUP(dst, R(dst));
|
MOVDDUP(dst, R(dst));
|
||||||
|
@ -11,11 +11,13 @@
|
|||||||
|
|
||||||
namespace MMIO { class Mapping; }
|
namespace MMIO { class Mapping; }
|
||||||
|
|
||||||
#define MEMCHECK_START \
|
// If inv is true, invert the check (i.e. skip over the associated code if an exception hits,
|
||||||
|
// instead of skipping over the code if an exception isn't hit).
|
||||||
|
#define MEMCHECK_START(inv) \
|
||||||
Gen::FixupBranch memException; \
|
Gen::FixupBranch memException; \
|
||||||
if (jit->js.memcheck) \
|
if (jit->js.memcheck) \
|
||||||
{ TEST(32, PPCSTATE(Exceptions), Gen::Imm32(EXCEPTION_DSI)); \
|
{ TEST(32, PPCSTATE(Exceptions), Gen::Imm32(EXCEPTION_DSI)); \
|
||||||
memException = J_CC(Gen::CC_NZ, true); }
|
memException = J_CC((inv) ? Gen::CC_Z : Gen::CC_NZ, true); }
|
||||||
|
|
||||||
#define MEMCHECK_END \
|
#define MEMCHECK_END \
|
||||||
if (jit->js.memcheck) \
|
if (jit->js.memcheck) \
|
||||||
@ -32,15 +34,50 @@ namespace MMIO { class Mapping; }
|
|||||||
#define PPCSTATE_SRR0 PPCSTATE(spr[SPR_SRR0])
|
#define PPCSTATE_SRR0 PPCSTATE(spr[SPR_SRR0])
|
||||||
#define PPCSTATE_SRR1 PPCSTATE(spr[SPR_SRR1])
|
#define PPCSTATE_SRR1 PPCSTATE(spr[SPR_SRR1])
|
||||||
|
|
||||||
|
// A place to throw blocks of code we don't want polluting the cache, e.g. rarely taken
|
||||||
|
// exception branches.
|
||||||
|
class FarCodeCache : public Gen::X64CodeBlock
|
||||||
|
{
|
||||||
|
private:
|
||||||
|
bool m_enabled = false;
|
||||||
|
public:
|
||||||
|
bool Enabled() { return m_enabled; }
|
||||||
|
void Init(int size) { AllocCodeSpace(size); m_enabled = true; }
|
||||||
|
void Shutdown() { FreeCodeSpace(); m_enabled = false; }
|
||||||
|
};
|
||||||
|
|
||||||
// Like XCodeBlock but has some utilities for memory access.
|
// Like XCodeBlock but has some utilities for memory access.
|
||||||
class EmuCodeBlock : public Gen::X64CodeBlock
|
class EmuCodeBlock : public Gen::X64CodeBlock
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
|
static const int CODE_SIZE = 1024 * 1024 * 32;
|
||||||
|
|
||||||
|
// a bit of a hack; the MMU results in a vast amount more code ending up in the far cache,
|
||||||
|
// mostly exception handling, so give it a whole bunch more space if the MMU is on.
|
||||||
|
static const int FARCODE_SIZE = 1024 * 1024 * 8;
|
||||||
|
static const int FARCODE_SIZE_MMU = 1024 * 1024 * 48;
|
||||||
|
|
||||||
|
FarCodeCache farcode;
|
||||||
|
u8* nearcode; // Backed up when we switch to far code.
|
||||||
|
|
||||||
|
// Simple functions to switch between near and far code emitting
|
||||||
|
void SwitchToFarCode()
|
||||||
|
{
|
||||||
|
nearcode = GetWritableCodePtr();
|
||||||
|
SetCodePtr(farcode.GetWritableCodePtr());
|
||||||
|
}
|
||||||
|
|
||||||
|
void SwitchToNearCode()
|
||||||
|
{
|
||||||
|
farcode.SetCodePtr(GetWritableCodePtr());
|
||||||
|
SetCodePtr(nearcode);
|
||||||
|
}
|
||||||
|
|
||||||
void LoadAndSwap(int size, Gen::X64Reg dst, const Gen::OpArg& src);
|
void LoadAndSwap(int size, Gen::X64Reg dst, const Gen::OpArg& src);
|
||||||
void SwapAndStore(int size, const Gen::OpArg& dst, Gen::X64Reg src);
|
void SwapAndStore(int size, const Gen::OpArg& dst, Gen::X64Reg src);
|
||||||
|
|
||||||
void UnsafeLoadRegToReg(Gen::X64Reg reg_addr, Gen::X64Reg reg_value, int accessSize, s32 offset = 0, bool signExtend = false);
|
void UnsafeLoadRegToReg(Gen::X64Reg reg_addr, Gen::X64Reg reg_value, int accessSize, s32 offset = 0, bool signExtend = false);
|
||||||
void UnsafeLoadRegToRegNoSwap(Gen::X64Reg reg_addr, Gen::X64Reg reg_value, int accessSize, s32 offset);
|
void UnsafeLoadRegToRegNoSwap(Gen::X64Reg reg_addr, Gen::X64Reg reg_value, int accessSize, s32 offset, bool signExtend = false);
|
||||||
// these return the address of the MOV, for backpatching
|
// these return the address of the MOV, for backpatching
|
||||||
u8 *UnsafeWriteRegToReg(Gen::X64Reg reg_value, Gen::X64Reg reg_addr, int accessSize, s32 offset = 0, bool swap = true);
|
u8 *UnsafeWriteRegToReg(Gen::X64Reg reg_value, Gen::X64Reg reg_addr, int accessSize, s32 offset = 0, bool swap = true);
|
||||||
u8 *UnsafeLoadToReg(Gen::X64Reg reg_value, Gen::OpArg opAddress, int accessSize, s32 offset, bool signExtend);
|
u8 *UnsafeLoadToReg(Gen::X64Reg reg_value, Gen::OpArg opAddress, int accessSize, s32 offset, bool signExtend);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user