JIT a few more instructions. Very minor speed boost in a few games.

git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@961 8ced0084-cf51-0410-be5f-012b33b47a6e
This commit is contained in:
hrydgard 2008-10-25 15:59:09 +00:00
parent f77624147d
commit 5247f6661b
13 changed files with 228 additions and 29 deletions

View File

@ -148,6 +148,8 @@ fnegx
frspx
frsqrtex
ps_sum0
ps_muls0
ps_adds1
*/

View File

@ -96,6 +96,7 @@ namespace Jit64
void addx(UGeckoInstruction inst);
void orx(UGeckoInstruction inst);
void xorx(UGeckoInstruction inst);
void andx(UGeckoInstruction inst);
void mulli(UGeckoInstruction inst);
void mulhwux(UGeckoInstruction inst);
@ -120,6 +121,7 @@ namespace Jit64
void mtmsr(UGeckoInstruction inst);
void mfmsr(UGeckoInstruction inst);
void mftb(UGeckoInstruction inst);
void mtcrf(UGeckoInstruction inst);
void reg_imm(UGeckoInstruction inst);
@ -130,6 +132,8 @@ namespace Jit64
void ps_mergeXX(UGeckoInstruction inst);
void ps_maddXX(UGeckoInstruction inst);
void ps_rsqrte(UGeckoInstruction inst);
void ps_sum(UGeckoInstruction inst);
void ps_muls(UGeckoInstruction inst);
void fp_arith_s(UGeckoInstruction inst);

View File

@ -54,6 +54,7 @@ const u8 *fifoDirectWrite8;
const u8 *fifoDirectWrite16;
const u8 *fifoDirectWrite32;
const u8 *fifoDirectWriteFloat;
const u8 *fifoDirectWriteXmm64;
bool compareEnabled = false;
@ -308,6 +309,19 @@ void GenFifoFloatWrite()
RET();
}
// Emits a small callable routine that appends 8 bytes from XMM0 to the
// GPFifo gather pipe and advances GPFifo::m_gatherPipeCount by 8.
// The emitted code ends in RET, so it is meant to be reached via CALL.
void GenFifoXmm64Write()
{
// Assume value in XMM0. Assume pre-byteswapped (unlike the others here!)
// ESI is used as scratch for the pipe write offset; preserve the caller's value.
PUSH(ESI);
// EAX = address of the gather pipe buffer.
// NOTE(review): the pointer is truncated to 32 bits here - presumably the
// buffer is expected to live in the low 4 GB on 64-bit builds; confirm.
MOV(32, R(EAX), Imm32((u32)(u64)GPFifo::m_gatherPipe));
// ESI = current number of bytes in the pipe (used as the write offset).
MOV(32, R(ESI), M(&GPFifo::m_gatherPipeCount));
// Store from XMM0 at [RAX + RSI*1 + 0], i.e. at the current end of the pipe.
MOVQ_xmm(MComplex(RAX, RSI, 1, 0), XMM0);
// Account for the 8 bytes just written and publish the new count.
ADD(32, R(ESI), Imm8(8));
MOV(32, M(&GPFifo::m_gatherPipeCount), R(ESI));
POP(ESI);
RET();
}
void GenerateCommon()
{
computeRc = AlignCode16();
@ -332,6 +346,8 @@ void GenerateCommon()
GenFifoWrite(32);
fifoDirectWriteFloat = AlignCode4();
GenFifoFloatWrite();
fifoDirectWriteXmm64 = AlignCode4();
GenFifoXmm64Write();
computeRcFp = AlignCode16();
//CMPSD(R(XMM0), M(&zero),

View File

@ -39,6 +39,7 @@ namespace Jit64
extern const u8 *fifoDirectWrite16;
extern const u8 *fifoDirectWrite32;
extern const u8 *fifoDirectWriteFloat;
extern const u8 *fifoDirectWriteXmm64;
extern bool compareEnabled;
void Generate();

View File

@ -101,7 +101,19 @@ namespace Jit64
int d = inst.RD, a = inst.RA, s = inst.RS;
switch (inst.OPCD)
{
case 14: regimmop(d, a, false, (u32)(s32)inst.SIMM_16, Add, ADD); break; //addi
case 14: // addi
// occasionally used as MOV - emulate, with immediate propagation
if (gpr.R(a).IsImm() && d != a && a != 0) {
gpr.SetImmediate32(d, (u32)gpr.R(a).offset + (u32)(s32)(s16)inst.SIMM_16);
} else if (inst.SIMM_16 == 0 && d != a && a != 0) {
gpr.Lock(a);
gpr.LoadToX64(d, false, true);
MOV(32, gpr.R(d), gpr.R(a));
gpr.UnlockAll();
} else {
regimmop(d, a, false, (u32)(s32)inst.SIMM_16, Add, ADD); //addi
}
break;
case 15: regimmop(d, a, false, (u32)inst.SIMM_16 << 16, Add, ADD); break; //addis
case 24:
if (a == 0 && s == 0 && inst.UIMM == 0 && !inst.Rc) //check for nop
@ -292,6 +304,39 @@ namespace Jit64
}
}
// m_GPR[_inst.RA] = m_GPR[_inst.RS] ^ m_GPR[_inst.RB];
// JIT handler for xorx: emits code computing RA = RS ^ RB and, when the
// instruction's Rc bit is set, a call to Asm::computeRc with the result in EAX.
void xorx(UGeckoInstruction inst)
{
#ifdef JIT_OFF_OPTIONS
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITIntegerOff)
{Default(inst); return;} // turn off from debugger
#endif
INSTRUCTION_START;
int a = inst.RA;
int s = inst.RS;
int b = inst.RB;
// x ^ x is always 0, so no XOR needs to be emitted: just record the
// destination as the known immediate 0 in the register cache.
if (s == b) {
gpr.SetImmediate32(a, 0);
}
else
{
// NOTE(review): the second argument is true only when a is also a source
// operand - presumably "load the current guest value"; the third is
// presumably "mark dirty". Confirm against the register cache API.
gpr.LoadToX64(a, a == s || a == b, true);
gpr.Lock(a, s, b);
// Compute in EAX first so the destination is only written once both
// sources have been read (a may be the same register as s or b).
MOV(32, R(EAX), gpr.R(s));
XOR(32, R(EAX), gpr.R(b));
MOV(32, gpr.R(a), R(EAX));
gpr.UnlockAll();
}
if (inst.Rc)
{
// Record form: hand the result to the shared computeRc routine in EAX.
MOV(32, R(EAX), gpr.R(a));
CALL((u8*)Asm::computeRc);
}
}
void andx(UGeckoInstruction inst)
{
#ifdef JIT_OFF_OPTIONS

View File

@ -91,7 +91,7 @@ namespace Jit64
// TODO(ector): Make it dynamically enable/disable idle skipping where appropriate
// Will give nice boost to dual core mode
// (mb2): I agree,
// IMHO those Idles should be always skipped and replaced by a more controlable "native" Idle methode
// IMHO those Idles should always be skipped and replaced by a more controllable "native" Idle method
// ... maybe the throttle one already does that :p
// if (CommandProcessor::AllowIdleSkipping() && PixelEngine::AllowIdleSkipping())
if (Core::GetStartupParameter().bSkipIdle &&
@ -100,17 +100,16 @@ namespace Jit64
Memory::ReadUnchecked_U32(js.compilerPC + 4) == 0x28000000 &&
Memory::ReadUnchecked_U32(js.compilerPC + 8) == 0x4182fff8)
{
gpr.Flush(FLUSH_ALL);
fpr.Flush(FLUSH_ALL);
if (Core::GetStartupParameter().bUseDualCore)
CALL((void *)&PowerPC::OnIdleDC);
else
ABI_CallFunctionC((void *)&PowerPC::OnIdle, PowerPC::ppcState.gpr[a] + (s32)(s16)inst.SIMM_16);
MOV(32, M(&PowerPC::ppcState.pc), Imm32(js.compilerPC + 12));
JMP(Asm::testExceptions, true);
js.compilerPC += 8;
return;
gpr.Flush(FLUSH_ALL);
fpr.Flush(FLUSH_ALL);
if (Core::GetStartupParameter().bUseDualCore)
CALL((void *)&PowerPC::OnIdleDC);
else
ABI_CallFunctionC((void *)&PowerPC::OnIdle, PowerPC::ppcState.gpr[a] + (s32)(s16)inst.SIMM_16);
MOV(32, M(&PowerPC::ppcState.pc), Imm32(js.compilerPC + 12));
JMP(Asm::testExceptions, true);
js.compilerPC += 8;
return;
}
s32 offset = (s32)(s16)inst.SIMM_16;
@ -236,7 +235,7 @@ namespace Jit64
default: _assert_msg_(DYNA_REC, 0, "AWETKLJASDLKF"); return;
}
if (gpr.R(a).IsImm() && !update)
if (gpr.R(a).IsImm())
{
// If we already know the address through constant folding, we can do some
// fun tricks...
@ -244,6 +243,8 @@ namespace Jit64
addr += offset;
if ((addr & 0xFFFFF000) == 0xCC008000 && jo.optimizeGatherPipe)
{
if (offset && update)
gpr.SetImmediate32(a, addr);
gpr.FlushLockX(ABI_PARAM1);
MOV(32, R(ABI_PARAM1), gpr.R(s));
// INT3();
@ -261,6 +262,8 @@ namespace Jit64
}
else if (Memory::IsRAMAddress(addr) && accessSize == 32)
{
if (offset && update)
gpr.SetImmediate32(a, addr);
MOV(accessSize, R(EAX), gpr.R(s));
BSWAP(accessSize, EAX);
WriteToConstRamAddress(accessSize, R(EAX), addr);

View File

@ -52,8 +52,11 @@
namespace Jit64 {
double GC_ALIGNED16(psTemp[2]) = {1.0, 1.0};
u64 GC_ALIGNED16(temp64);
const u8 GC_ALIGNED16(pbswapShuffle2x4[16]) = {3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15};
const u8 GC_ALIGNED16(pbswapShuffleNoop[16]) = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
static double GC_ALIGNED16(psTemp[2]) = {1.0, 1.0};
static u64 GC_ALIGNED16(temp64);
// TODO(ector): Improve 64-bit version
void WriteDual32(u64 value, u32 address)
@ -183,6 +186,20 @@ void psq_st(UGeckoInstruction inst)
if (stType == QUANTIZE_FLOAT)
{
DISABLE_32BIT;
if (gpr.R(a).IsImm() && !update && cpu_info.bSSSE3)
{
u32 addr = gpr.R(a).offset + offset;
if (addr == 0xCC008000) {
// Writing to FIFO. Let's do fast method.
CVTPD2PS(XMM0, fpr.R(s));
PSHUFB(XMM0, M((void*)&pbswapShuffle2x4));
CALL((void*)Asm::fifoDirectWriteXmm64);
js.fifoBytesThisBlock += 8;
return;
}
}
gpr.FlushLockX(ABI_PARAM1, ABI_PARAM2);
gpr.Lock(a);
fpr.Lock(s);
@ -282,9 +299,6 @@ void psq_st(UGeckoInstruction inst)
}
}
const u8 GC_ALIGNED16(pbswapShuffle2x4[16]) = {3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15};
const u8 GC_ALIGNED16(pbswapShuffleNoop[16]) = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
void psq_l(UGeckoInstruction inst)
{
#ifdef JIT_OFF_OPTIONS

View File

@ -247,6 +247,92 @@ namespace Jit64
}
}
// JIT handler shared by the two paired-single sum sub-opcodes
// (SUBOP5 == 10 and SUBOP5 == 11). Record-form (Rc) encodings and the
// debugger "JIT off" switches fall back to Default(inst).
// Lane notes below write a register pair as {low, high} and assume the
// emitter helpers take operands in (destination, source) order.
void ps_sum(UGeckoInstruction inst)
{
#ifdef JIT_OFF_OPTIONS
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITPairedOff)
{Default(inst); return;} // turn off from debugger
#endif
INSTRUCTION_START;
if (inst.Rc) {
Default(inst); return;
}
int d = inst.FD;
int a = inst.FA;
int b = inst.FB;
int c = inst.FC;
fpr.Lock(a,b,c,d);
// NOTE(review): the second argument is true only when d is also read as a
// source - presumably "load the current guest value"; confirm.
fpr.LoadToX64(d, d == a || d == b || d == c, true);
switch (inst.SUBOP5)
{
case 10:
// Do the sum in upper subregisters, merge uppers
// XMM0 = {a.lo, a.lo}
MOVDDUP(XMM0, fpr.R(a));
// XMM1 = {b.lo, b.hi}
MOVAPD(XMM1, fpr.R(b));
// XMM0 = {a.lo + b.lo, a.lo + b.hi}
ADDPD(XMM0, R(XMM1));
// XMM0 = {a.lo + b.hi, c.hi}
UNPCKHPD(XMM0, fpr.R(c)); //merge
MOVAPD(fpr.R(d), XMM0);
break;
case 11:
// Do the sum in lower subregisters, merge lowers
// XMM0 = {a.lo, a.hi}
MOVAPD(XMM0, fpr.R(a));
MOVAPD(XMM1, fpr.R(b));
// XMM1 = {b.hi, b.lo}
SHUFPD(XMM1, R(XMM1), 5); // copy higher to lower
// XMM0.lo = a.lo + b.hi (the high lane is discarded by the merge below)
ADDPD(XMM0, R(XMM1)); // sum lowers
MOVAPD(XMM1, fpr.R(c));
// XMM1 = {c.lo, a.lo + b.hi}
UNPCKLPD(XMM1, R(XMM0)); // merge
MOVAPD(fpr.R(d), XMM1);
break;
default:
// Only sub-opcodes 10 and 11 are expected to be dispatched here.
PanicAlert("ps_sum WTF!!!");
}
ForceSinglePrecisionP(fpr.RX(d));
fpr.UnlockAll();
}
// JIT handler shared by ps_muls0 (SUBOP5 == 12) and ps_muls1 (SUBOP5 == 13):
//   ps_muls0: d = {a.ps0 * c.ps0, a.ps1 * c.ps0}
//   ps_muls1: d = {a.ps0 * c.ps1, a.ps1 * c.ps1}
// Record-form (Rc) encodings and the debugger "JIT off" switches fall back to
// Default(inst).
// Lane notes below write a register pair as {low, high}; emitter helpers take
// operands in (destination, source) order.
void ps_muls(UGeckoInstruction inst)
{
	// Still routed to the interpreter fallback: everything below this line is
	// unreachable until this guard is removed. Keep it in place until the
	// emitted code has been validated against Interpreter::ps_muls0/ps_muls1.
	Default(inst); return;
#ifdef JIT_OFF_OPTIONS
	if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITPairedOff)
	{Default(inst); return;} // turn off from debugger
#endif
	INSTRUCTION_START;
	if (inst.Rc) {
		Default(inst); return;
	}
	int d = inst.FD;
	int a = inst.FA;
	int c = inst.FC;
	fpr.Lock(a, c, d);
	fpr.LoadToX64(d, d == a || d == c, true);
	switch (inst.SUBOP5)
	{
	case 12:
		// Multiply both lanes of a by the low lane of c.
		// TODO - faster version for when regs are different
		MOVAPD(XMM0, fpr.R(a));      // XMM0 = {a.lo, a.hi}
		MOVDDUP(XMM1, fpr.R(c));     // XMM1 = {c.lo, c.lo}
		// Registers hold doubles here, so the packed-double multiply is required.
		MULPD(XMM0, R(XMM1));        // XMM0 = {a.lo * c.lo, a.hi * c.lo}
		MOVAPD(fpr.R(d), XMM0);
		break;
	case 13:
		// Multiply both lanes of a by the high lane of c.
		// TODO - faster version for when regs are different
		MOVAPD(XMM0, fpr.R(a));      // XMM0 = {a.lo, a.hi}
		MOVAPD(XMM1, fpr.R(c));
		// Immediate 3 selects the high element for both result lanes.
		SHUFPD(XMM1, R(XMM1), 3);    // XMM1 = {c.hi, c.hi}
		MULPD(XMM0, R(XMM1));        // XMM0 = {a.lo * c.hi, a.hi * c.hi}
		MOVAPD(fpr.R(d), XMM0);
		break;
	default:
		// Only sub-opcodes 12 and 13 are expected to be dispatched here.
		PanicAlert("ps_muls WTF!!!");
	}
	ForceSinglePrecisionP(fpr.RX(d));
	fpr.UnlockAll();
}
//TODO: find easy cases and optimize them, do a breakout like ps_arith
void ps_mergeXX(UGeckoInstruction inst)
{

View File

@ -156,5 +156,29 @@ namespace Jit64
INSTRUCTION_START;
mfspr(inst);
}
// JIT handler for mtcrf: copies the CR fields selected by the instruction's
// 8-bit CRM mask from GPR[RS] into PowerPC::ppcState.cr. Bit i of CRM selects
// the 4-bit field at bits [4*i, 4*i + 3] of the condition register.
void mtcrf(UGeckoInstruction inst)
{
	INSTRUCTION_START;
	const u32 crm = inst.CRM;
	// ECX is used as scratch in the partial-update path below.
	gpr.FlushLockX(ECX);
	if (crm == 0xFF) {
		// Every field selected: plain copy of the source register into CR.
		MOV(32, R(EAX), gpr.R(inst.RS));
		MOV(32, M(&PowerPC::ppcState.cr), R(EAX));
	} else {
		// Expand each CRM bit into a 4-bit field mask at translation time.
		// Unsigned literals keep the i == 7 shift (into bit 31) well defined.
		//TODO: use lookup table? probably not worth it
		u32 mask = 0;
		for (int i = 0; i < 8; i++) {
			if (crm & (1u << i))
				mask |= 0xFu << (i * 4);
		}
		// cr = (RS & mask) | (cr & ~mask)
		MOV(32, R(EAX), gpr.R(inst.RS));
		MOV(32, R(ECX), M(&PowerPC::ppcState.cr));
		AND(32, R(EAX), Imm32(mask));
		AND(32, R(ECX), Imm32(~mask));
		OR(32, R(EAX), R(ECX));
		MOV(32, M(&PowerPC::ppcState.cr), R(EAX));
	}
	gpr.UnlockAllX();
}
}

View File

@ -55,7 +55,9 @@ void UnsafeLoadRegToReg(X64Reg reg_addr, X64Reg reg_value, int accessSize, s32 o
SHR(32, R(EAX), Imm8(16));
}
if (signExtend && accessSize < 32) {
MOVSX(32, accessSize, EAX, R(EAX));
// For 16-bit, this must be done AFTER the BSWAP.
// TODO: bake 8-bit into the original load.
MOVSX(32, accessSize, EAX, R(EAX));
}
}
@ -75,6 +77,7 @@ void SafeLoadRegToEAX(X64Reg reg, int accessSize, s32 offset, bool signExtend)
case 8: ABI_CallFunctionR(ProtectFunction((void *)&Memory::Read_U8, 1), reg); break;
}
if (signExtend && accessSize < 32) {
// Need to sign extend values coming from the Read_U* functions.
MOVSX(32, accessSize, EAX, R(EAX));
}
SetJumpTarget(arg2);
@ -111,7 +114,7 @@ void SafeWriteRegToReg(X64Reg reg_value, X64Reg reg_addr, int accessSize, s32 of
void WriteToConstRamAddress(int accessSize, const Gen::OpArg& arg, u32 address)
{
#ifdef _M_X64
MOV(accessSize, MDisp(RBX, address & 0x3FFFFFFF), arg);
MOV(accessSize, MDisp(RBX, address & 0x3FFFFFFF), arg);
#else
MOV(accessSize, M((void*)(Memory::base + (address & Memory::MEMVIEW32_MASK))), arg);
#endif

View File

@ -33,4 +33,4 @@ void WriteFloatToConstRamAddress(const Gen::X64Reg& xmm_reg, u32 address);
void ForceSinglePrecisionS(X64Reg xmm);
void ForceSinglePrecisionP(X64Reg xmm);
} // namespace
} // namespace

View File

@ -659,7 +659,8 @@ void FindFunctions(u32 startAddr, u32 endAddr, SymbolDB *func_db)
//Step 2:
func_db->FillInCallers();
int numLeafs = 0, numNice = 0, numUnNice = 0, numTimer=0, numRFI=0, numStraightLeaf=0;
int numLeafs = 0, numNice = 0, numUnNice = 0;
int numTimer = 0, numRFI = 0, numStraightLeaf = 0;
int leafSize = 0, niceSize = 0, unniceSize = 0;
for (SymbolDB::XFuncMap::iterator iter = func_db->GetIterator(); iter != func_db->End(); iter++)
{

View File

@ -224,10 +224,10 @@ GekkoOPTemplate table4[] =
GekkoOPTemplate table4_2[] =
{
{10, Interpreter::ps_sum0, Jit64::Default, {"ps_sum0", OPTYPE_PS, 0}},
{11, Interpreter::ps_sum1, Jit64::Default, {"ps_sum1", OPTYPE_PS, 0}},
{12, Interpreter::ps_muls0, Jit64::Default, {"ps_muls0", OPTYPE_PS, 0}},
{13, Interpreter::ps_muls1, Jit64::Default, {"ps_muls1", OPTYPE_PS, 0}},
{10, Interpreter::ps_sum0, Jit64::ps_sum, {"ps_sum0", OPTYPE_PS, 0}},
{11, Interpreter::ps_sum1, Jit64::ps_sum, {"ps_sum1", OPTYPE_PS, 0}},
{12, Interpreter::ps_muls0, Jit64::ps_muls, {"ps_muls0", OPTYPE_PS, 0}},
{13, Interpreter::ps_muls1, Jit64::ps_muls, {"ps_muls1", OPTYPE_PS, 0}},
{14, Interpreter::ps_madds0, Jit64::Default, {"ps_madds0", OPTYPE_PS, 0}},
{15, Interpreter::ps_madds1, Jit64::Default, {"ps_madds1", OPTYPE_PS, 0}},
{18, Interpreter::ps_div, Jit64::ps_arith, {"ps_div", OPTYPE_PS, 0, 16}},
@ -278,7 +278,7 @@ GekkoOPTemplate table31[] =
{60, Interpreter::andcx, Jit64::Default, {"andcx", OPTYPE_INTEGER, FL_IN_AB | FL_OUT_S | FL_RC_BIT}},
{444, Interpreter::orx, Jit64::orx, {"orx", OPTYPE_INTEGER, FL_IN_AB | FL_OUT_S | FL_RC_BIT}},
{124, Interpreter::norx, Jit64::Default, {"norx", OPTYPE_INTEGER, FL_IN_AB | FL_OUT_S | FL_RC_BIT}},
{316, Interpreter::xorx, Jit64::Default, {"xorx", OPTYPE_INTEGER, FL_IN_AB | FL_OUT_S | FL_RC_BIT}},
{316, Interpreter::xorx, Jit64::xorx, {"xorx", OPTYPE_INTEGER, FL_IN_AB | FL_OUT_S | FL_RC_BIT}},
{412, Interpreter::orcx, Jit64::Default, {"orcx", OPTYPE_INTEGER, FL_IN_AB | FL_OUT_S | FL_RC_BIT}},
{476, Interpreter::nandx, Jit64::Default, {"nandx", OPTYPE_INTEGER, FL_IN_AB | FL_OUT_S | FL_RC_BIT}},
{284, Interpreter::eqvx, Jit64::Default, {"eqvx", OPTYPE_INTEGER, FL_IN_AB | FL_OUT_S | FL_RC_BIT}},
@ -359,7 +359,7 @@ GekkoOPTemplate table31[] =
{19, Interpreter::mfcr, Jit64::Default, {"mfcr", OPTYPE_SYSTEM, 0}},
{83, Interpreter::mfmsr, Jit64::mfmsr, {"mfmsr", OPTYPE_SYSTEM, 0}},
{144, Interpreter::mtcrf, Jit64::Default, {"mtcrf", OPTYPE_SYSTEM, 0}},
{144, Interpreter::mtcrf, Jit64::mtcrf, {"mtcrf", OPTYPE_SYSTEM, 0}},
{146, Interpreter::mtmsr, Jit64::mtmsr, {"mtmsr", OPTYPE_SYSTEM, FL_ENDBLOCK}},
{210, Interpreter::mtsr, Jit64::Default, {"mtsr", OPTYPE_SYSTEM, 0}},
{242, Interpreter::mtsrin, Jit64::Default, {"mtsrin", OPTYPE_SYSTEM, 0}},