From b4d78829c32415a1df05f35c104e8b096c3e0f82 Mon Sep 17 00:00:00 2001
From: magumagu9
Date: Sun, 4 Jan 2009 08:28:45 +0000
Subject: [PATCH] A bit more WIP JIT work; primary change is psq_st implementation.

git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@1758 8ced0084-cf51-0410-be5f-012b33b47a6e
---
 Source/Core/Core/Src/PowerPC/Jit64IL/IR.cpp   |  38 ++++-
 Source/Core/Core/Src/PowerPC/Jit64IL/IR.h     |  14 +-
 Source/Core/Core/Src/PowerPC/Jit64IL/Jit.h    |   3 +
 .../Core/Core/Src/PowerPC/Jit64IL/JitAsm.cpp  | 141 ++++++++++++++++++
 Source/Core/Core/Src/PowerPC/Jit64IL/JitAsm.h |   2 +
 .../Core/Src/PowerPC/Jit64IL/Jit_Branch.cpp   |   2 +
 .../Src/PowerPC/Jit64IL/Jit_FloatingPoint.cpp |   3 -
 .../PowerPC/Jit64IL/Jit_LoadStoreFloating.cpp |   3 -
 .../PowerPC/Jit64IL/Jit_LoadStorePaired.cpp   |  13 +-
 .../Core/Src/PowerPC/Jit64IL/Jit_Paired.cpp   |   3 -
 10 files changed, 203 insertions(+), 19 deletions(-)

diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/IR.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/IR.cpp
index b38b645d82..7db7c6e954 100644
--- a/Source/Core/Core/Src/PowerPC/Jit64IL/IR.cpp
+++ b/Source/Core/Core/Src/PowerPC/Jit64IL/IR.cpp
@@ -153,7 +153,7 @@ InstLoc IRBuilder::EmitUOp(unsigned Opcode, InstLoc Op1, unsigned extra) {
 	return curIndex;
 }
 
-InstLoc IRBuilder::EmitBiOp(unsigned Opcode, InstLoc Op1, InstLoc Op2) {
+InstLoc IRBuilder::EmitBiOp(unsigned Opcode, InstLoc Op1, InstLoc Op2, unsigned extra) {
 	InstLoc curIndex = &InstList[InstList.size()];
 	unsigned backOp1 = curIndex - 1 - Op1;
 	if (backOp1 >= 255) {
@@ -168,7 +168,7 @@
 		backOp1++;
 		curIndex++;
 	}
-	InstList.push_back(Opcode | backOp1 << 8 | backOp2 << 16);
+	InstList.push_back(Opcode | (backOp1 << 8) | (backOp2 << 16) | (extra << 24));
 	return curIndex;
 }
 
@@ -451,7 +451,7 @@ InstLoc IRBuilder::FoldInterpreterFallback(InstLoc Op1, InstLoc Op2) {
 	return EmitBiOp(InterpreterFallback, Op1, Op2);
 }
 
-InstLoc IRBuilder::FoldBiOp(unsigned Opcode, InstLoc Op1, InstLoc Op2) {
+InstLoc IRBuilder::FoldBiOp(unsigned Opcode, InstLoc Op1, InstLoc Op2, unsigned extra) {
 	switch (Opcode) {
 	case Add: return FoldAdd(Op1, Op2);
 	case And: return FoldAnd(Op1, Op2);
@@ -462,7 +462,7 @@
 	case Rol: return FoldRol(Op1, Op2);
 	case BranchCond: return FoldBranchCond(Op1, Op2);
 	case InterpreterFallback: return FoldInterpreterFallback(Op1, Op2);
-	default: return EmitBiOp(Opcode, Op1, Op2);
+	default: return EmitBiOp(Opcode, Op1, Op2, extra);
 	}
 }
 
@@ -1019,6 +1019,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
 		case DupSingleToMReg:
 		case DoubleToSingle:
 		case ExpandPackedToMReg:
+		case CompactMRegToPacked:
 			if (thisUsed)
 				regMarkUse(RI, I, getOp1(I), 1);
 			break;
@@ -1075,6 +1076,10 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
 			regMarkUse(RI, I, getOp1(I), 1);
 			regMarkMemAddress(RI, I, getOp2(I), 2);
 			break;
+		case StorePaired:
+			regMarkUse(RI, I, getOp1(I), 1);
+			regMarkUse(RI, I, getOp2(I), 2);
+			break;
 		case BranchUncond:
 			if (!isImm(*getOp1(I))) regMarkUse(RI, I, getOp1(I), 1);
 			break;
@@ -1390,6 +1395,23 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
 			regNormalRegClear(RI, I);
 			break;
 		}
+		case StorePaired: {
+			regSpill(RI, EAX);
+			regSpill(RI, EDX);
+			unsigned quantreg = *I >> 24;
+			Jit->MOVZX(32, 16, EAX, M(&PowerPC::ppcState.spr[SPR_GQR0 + quantreg]));
+			Jit->MOVZX(32, 8, EDX, R(AL));
+			// FIXME: Fix ModR/M encoding to allow [EDX*4+disp32]!
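+			// (AL here is the low byte of the GQR, i.e. the ST_TYPE field;
+			// the SHL below scales it into a byte offset into the
+			// pairedStoreQuantized pointer table, 4-byte entries in the
+			// 32-bit build this CALLptr encoding targets.)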
+			Jit->SHL(32, R(EDX), Imm8(2));
+			Jit->MOV(32, R(ECX), regLocForInst(RI, getOp2(I)));
+			Jit->MOVAPD(XMM0, fregLocForInst(RI, getOp1(I)));
+			Jit->CALLptr(MDisp(EDX, (u32)asm_routines.pairedStoreQuantized));
+			if (RI.IInfo[I - RI.FirstI] & 4)
+				fregClearInst(RI, getOp1(I));
+			if (RI.IInfo[I - RI.FirstI] & 8)
+				regClearInst(RI, getOp2(I));
+			break;
+		}
 		case DupSingleToMReg: {
 			if (!thisUsed) break;
 			X64Reg reg = fregFindFreeReg(RI);
@@ -1417,6 +1439,14 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
 			fregNormalRegClear(RI, I);
 			break;
 		}
+		case CompactMRegToPacked: {
+			if (!thisUsed) break;
+			X64Reg reg = fregFindFreeReg(RI);
+			Jit->CVTPD2PS(reg, fregLocForInst(RI, getOp1(I)));
+			RI.fregs[reg] = I;
+			fregNormalRegClear(RI, I);
+			break;
+		}
 		case LoadFReg: {
 			if (!thisUsed) break;
 			X64Reg reg = fregFindFreeReg(RI);
diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/IR.h b/Source/Core/Core/Src/PowerPC/Jit64IL/IR.h
index 4e0f734581..4fafd65b81 100644
--- a/Source/Core/Core/Src/PowerPC/Jit64IL/IR.h
+++ b/Source/Core/Core/Src/PowerPC/Jit64IL/IR.h
@@ -146,10 +146,12 @@ namespace IREmitter {
 		LoadSingle,
 		LoadDouble,
 		LoadPaired, // This handles quantizers itself
+		StorePaired,
 		DoubleToSingle,
 		DupSingleToMReg,
 		InsertDoubleInMReg,
 		ExpandPackedToMReg,
+		CompactMRegToPacked,
 		LoadFReg,
 		StoreFReg,
 		FSMul,
@@ -232,7 +234,8 @@ namespace IREmitter {
 
 	InstLoc EmitZeroOp(unsigned Opcode, unsigned extra);
 	InstLoc EmitUOp(unsigned OpCode, InstLoc Op1, unsigned extra = 0);
-	InstLoc EmitBiOp(unsigned OpCode, InstLoc Op1, InstLoc Op2);
+	InstLoc EmitBiOp(unsigned OpCode, InstLoc Op1, InstLoc Op2,
+		unsigned extra = 0);
 
 	InstLoc FoldAdd(InstLoc Op1, InstLoc Op2);
 	InstLoc FoldAnd(InstLoc Op1, InstLoc Op2);
@@ -248,7 +251,8 @@ namespace IREmitter {
 
 	InstLoc FoldZeroOp(unsigned Opcode, unsigned extra);
 	InstLoc FoldUOp(unsigned OpCode, InstLoc Op1, unsigned extra = 0);
-	InstLoc FoldBiOp(unsigned OpCode, InstLoc Op1, InstLoc Op2);
+	InstLoc FoldBiOp(unsigned OpCode, InstLoc Op1, InstLoc Op2,
+		unsigned extra = 0);
 
 	unsigned ComputeKnownZeroBits(InstLoc I);
 
@@ -389,6 +393,9 @@ namespace IREmitter {
 	InstLoc EmitLoadPaired(InstLoc addr, unsigned quantReg) {
 		return FoldUOp(LoadPaired, addr, quantReg);
 	}
+	InstLoc EmitStorePaired(InstLoc value, InstLoc addr, unsigned quantReg) {
+		return FoldBiOp(StorePaired, value, addr, quantReg);
+	}
 	InstLoc EmitLoadFReg(unsigned freg) {
 		return FoldZeroOp(LoadFReg, freg);
 	}
@@ -404,6 +411,9 @@ namespace IREmitter {
 	InstLoc EmitExpandPackedToMReg(InstLoc val) {
 		return FoldUOp(ExpandPackedToMReg, val);
 	}
+	InstLoc EmitCompactMRegToPacked(InstLoc val) {
+		return FoldUOp(CompactMRegToPacked, val);
+	}
 	InstLoc EmitFSMul(InstLoc op1, InstLoc op2) {
 		return FoldBiOp(FSMul, op1, op2);
 	}
diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit.h b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit.h
index 8f059656c2..4204e512f8 100644
--- a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit.h
+++ b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit.h
@@ -58,6 +58,9 @@ struct CONTEXT
 
 #endif
 
+// #define INSTRUCTION_START Default(inst); return;
+// #define INSTRUCTION_START PPCTables::CountInstruction(inst);
+#define INSTRUCTION_START
 
 class TrampolineCache : public Gen::XCodeBlock
 {
diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/JitAsm.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/JitAsm.cpp
index 1f4a95910c..87d12da355 100644
--- a/Source/Core/Core/Src/PowerPC/Jit64IL/JitAsm.cpp
+++ b/Source/Core/Core/Src/PowerPC/Jit64IL/JitAsm.cpp
@@ -28,6 +28,7 @@
 #include "ABI.h"
 #include "Jit.h"
 #include "JitCache.h"
+#include "Thunk.h"
"Thunk.h" #include "../../HW/CPUCompare.h" #include "../../HW/GPFifo.h" @@ -213,6 +214,145 @@ const float m_dequantizeTableS[] = float psTemp[2]; +void AsmRoutineManager::GenQuantizedStores() { + const u8* storePairedIllegal = AlignCode4(); + UD2(); + const u8* storePairedFloat = AlignCode4(); + if (cpu_info.bSSSE3) { + PSHUFB(XMM0, M((void *)pbswapShuffle2x4)); +#ifdef _M_X64 + MOVQ_xmm(MComplex(RBX, RCX, 1, 0), XMM0); +#else + AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK)); + MOVQ_xmm(MDisp(ECX, (u32)Memory::base), XMM0); +#endif + } else { +#ifdef _M_X64 + MOVQ_xmm(R(RCX), XMM0); + ROL(64, RCX, Imm8(32)); + BSWAP(64, RCX); + MOV(64, MComplex(RBX, RCX, 1, 0), R(RCX)); +#else +#if 0 + AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK)); + MOVQ_xmm(XMM0, MDisp(ECX, (u32)Memory::base)); + PXOR(XMM1, R(XMM1)); + PSHUFLW(XMM0, R(XMM0), 0xB1); + MOVAPD(XMM1, R(XMM0)); + PSRLW(XMM0, 8); + PSLLW(XMM1, 8); + POR(XMM0, R(XMM1)); +#else + MOVQ_xmm(M(&psTemp[0]), XMM0); +#if 0 + TEST(32, R(ECX), Imm32(0x0C000000)); + FixupBranch argh = J_CC(CC_NZ); + MOV(32, R(EAX), M(&psTemp)); + BSWAP(32, EAX); + AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK)); + MOV(32, MDisp(ECX, (u32)Memory::base), R(EAX)); + MOV(32, R(EAX), M(((char*)&psTemp) + 4)); + BSWAP(32, EAX); + MOV(32, MDisp(ECX, 4+(u32)Memory::base), R(EAX)); + FixupBranch arg2 = J(); + SetJumpTarget(argh); +#endif + MOV(32, R(EAX), M(((char*)&psTemp))); + ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U32, 2), EAX, ECX); + MOV(32, R(EAX), M(((char*)&psTemp)+4)); + ADD(32, R(ECX), Imm32(4)); + ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U32, 2), EAX, ECX); +#if 0 + SetJumpTarget(arg2); +#endif +#endif +#endif + } + RET(); + + const u8* storePairedU8 = AlignCode4(); + SHR(32, R(EAX), Imm8(6)); + MOVSS(XMM1, MDisp(EAX, (u32)m_quantizeTableS)); + PUNPCKLDQ(XMM1, R(XMM1)); + MULPS(XMM0, R(XMM1)); + CVTPS2DQ(XMM0, R(XMM0)); + PACKSSDW(XMM0, R(XMM0)); + PACKUSWB(XMM0, R(XMM0)); + MOVD_xmm(R(EAX), XMM0); +#ifdef _M_X64 + MOV(16, MComplex(RBX, RCX, 1, 0), R(AX)); +#else + AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK)); + MOV(16, MDisp(ECX, (u32)Memory::base), R(AX)); +#endif + RET(); + + const u8* storePairedS8 = AlignCode4(); + SHR(32, R(EAX), Imm8(6)); + MOVSS(XMM1, MDisp(EAX, (u32)m_quantizeTableS)); + PUNPCKLDQ(XMM1, R(XMM1)); + MULPS(XMM0, R(XMM1)); + CVTPS2DQ(XMM0, R(XMM0)); + PACKSSDW(XMM0, R(XMM0)); + PACKSSWB(XMM0, R(XMM0)); + MOVD_xmm(R(EAX), XMM0); +#ifdef _M_X64 + MOV(16, MComplex(RBX, RCX, 1, 0), R(AX)); +#else + AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK)); + MOV(16, MDisp(ECX, (u32)Memory::base), R(AX)); +#endif + RET(); + + const u8* storePairedU16 = AlignCode4(); + SHR(32, R(EAX), Imm8(6)); + MOVSS(XMM1, MDisp(EAX, (u32)m_quantizeTableS)); + PUNPCKLDQ(XMM1, R(XMM1)); + MULPS(XMM0, R(XMM1)); + CVTPS2DQ(XMM0, R(XMM0)); + PXOR(XMM1, R(XMM1)); + PCMPGTD(XMM1, R(XMM0)); + PANDN(XMM0, R(XMM1)); + PACKSSDW(XMM0, R(XMM0)); //PACKUSDW(XMM0, R(XMM0)); // FIXME: Wrong! 
+	MOVD_xmm(R(EAX), XMM0);
+	BSWAP(32, EAX);
+	ROL(32, R(EAX), Imm8(16));
+#ifdef _M_X64
+	MOV(32, MComplex(RBX, RCX, 1, 0), R(EAX));
+#else
+	AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
+	MOV(32, MDisp(ECX, (u32)Memory::base), R(EAX));
+#endif
+	RET();
+
+	const u8* storePairedS16 = AlignCode4();
+	SHR(32, R(EAX), Imm8(6));
+	MOVSS(XMM1, MDisp(EAX, (u32)m_quantizeTableS));
+	PUNPCKLDQ(XMM1, R(XMM1));
+	MULPS(XMM0, R(XMM1));
+	CVTPS2DQ(XMM0, R(XMM0));
+	PACKSSDW(XMM0, R(XMM0));
+	MOVD_xmm(R(EAX), XMM0);
+	BSWAP(32, EAX);
+	ROL(32, R(EAX), Imm8(16));
+#ifdef _M_X64
+	MOV(32, MComplex(RBX, RCX, 1, 0), R(EAX));
+#else
+	AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
+	MOV(32, MDisp(ECX, (u32)Memory::base), R(EAX));
+#endif
+	RET();
+
+	pairedStoreQuantized[0] = storePairedFloat;
+	pairedStoreQuantized[1] = storePairedIllegal;
+	pairedStoreQuantized[2] = storePairedIllegal;
+	pairedStoreQuantized[3] = storePairedIllegal;
+	pairedStoreQuantized[4] = storePairedU8;
+	pairedStoreQuantized[5] = storePairedU16;
+	pairedStoreQuantized[6] = storePairedS8;
+	pairedStoreQuantized[7] = storePairedS16;
+}
+
 void AsmRoutineManager::GenQuantizedLoads() {
 	const u8* loadPairedIllegal = AlignCode4();
 	UD2();
@@ -429,6 +569,7 @@ void AsmRoutineManager::GenerateCommon()
 	JMP(dispatcher, true);
 
 	GenQuantizedLoads();
+	GenQuantizedStores();
 
 	computeRcFp = AlignCode16();
 	//CMPSD(R(XMM0), M(&zero),
diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/JitAsm.h b/Source/Core/Core/Src/PowerPC/Jit64IL/JitAsm.h
index 7fd646d0fe..c18be7193c 100644
--- a/Source/Core/Core/Src/PowerPC/Jit64IL/JitAsm.h
+++ b/Source/Core/Core/Src/PowerPC/Jit64IL/JitAsm.h
@@ -43,6 +43,7 @@ private:
 	void GenFifoFloatWrite();
 	void GenFifoXmm64Write();
 	void GenQuantizedLoads();
+	void GenQuantizedStores();
 
 public:
 	void Init() {
@@ -82,6 +83,7 @@ public:
 	const u8 *doReJit;
 
 	const u8 *pairedLoadQuantized[8];
+	const u8 *pairedStoreQuantized[8];
 
 	bool compareEnabled;
 };
diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_Branch.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_Branch.cpp
index 4185956172..d802e4e780 100644
--- a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_Branch.cpp
+++ b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_Branch.cpp
@@ -57,6 +57,8 @@ using namespace Gen;
 void Jit64::bx(UGeckoInstruction inst)
 {
 	NORMALBRANCH_START
+	INSTRUCTION_START;
+
 	if (inst.LK)
 		ibuild.EmitStoreLink(ibuild.EmitIntConst(js.compilerPC + 4));
 
diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_FloatingPoint.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_FloatingPoint.cpp
index 4d2d67be8e..f64a5e1556 100644
--- a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_FloatingPoint.cpp
+++ b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_FloatingPoint.cpp
@@ -26,9 +26,6 @@
 #include "JitCache.h"
 #include "JitRegCache.h"
 
-#define INSTRUCTION_START
-// #define INSTRUCTION_START Default(inst); return;
-
 void Jit64::fp_arith_s(UGeckoInstruction inst)
 {
 	if (inst.Rc || inst.OPCD != 59 || inst.SUBOP5 != 25) {
diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_LoadStoreFloating.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_LoadStoreFloating.cpp
index 2b63931d28..9e70c74b4f 100644
--- a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_LoadStoreFloating.cpp
+++ b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_LoadStoreFloating.cpp
@@ -36,9 +36,6 @@
 #include "JitAsm.h"
 #include "JitRegCache.h"
 
-// #define INSTRUCTION_START Default(inst); return;
-#define INSTRUCTION_START
-
 // pshufb todo: MOVQ
 const u8 GC_ALIGNED16(bswapShuffle1x4[16]) = {3, 2, 1, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
 const u8 GC_ALIGNED16(bswapShuffle2x4[16]) = {3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15};
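
Note: the bswapShuffle2x4 mask in the context lines above matches the shape GenQuantizedStores feeds to PSHUFB in its SSSE3 path (via pbswapShuffle2x4, presumably the same constant): entries {3, 2, 1, 0} and {7, 6, 5, 4} byte-reverse each of the two packed singles, producing the big-endian pair in one shuffle. For reference, a minimal sketch of PSHUFB's byte-level semantics (illustrative C++, not part of the patch):

	#include <stdint.h>

	// Each dst byte becomes src[mask[i] & 15], or zero when the mask
	// byte's high bit is set; this is what PSHUFB computes in-register.
	static void pshufb_sketch(uint8_t dst[16], const uint8_t mask[16])
	{
		uint8_t src[16];
		for (int i = 0; i < 16; i++) src[i] = dst[i];
		for (int i = 0; i < 16; i++)
			dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 0x0F];
	}
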
diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_LoadStorePaired.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_LoadStorePaired.cpp
index d0c2f330c2..40c774bc55 100644
--- a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_LoadStorePaired.cpp
+++ b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_LoadStorePaired.cpp
@@ -37,14 +37,19 @@
 #include "JitAsm.h"
 #include "JitRegCache.h"
 
-#define INSTRUCTION_START
-// #define INSTRUCTION_START Default(inst); return;
-
 // The big problem is likely instructions that set the quantizers in the same block.
 // We will have to break block after quantizers are written to.
 void Jit64::psq_st(UGeckoInstruction inst)
 {
-	Default(inst); return;
+	if (inst.W) {Default(inst); return;}
+	IREmitter::InstLoc addr = ibuild.EmitIntConst(inst.SIMM_12), val;
+	if (inst.RA)
+		addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA));
+	if (inst.OPCD == 61)
+		ibuild.EmitStoreGReg(addr, inst.RA);
+	val = ibuild.EmitLoadFReg(inst.RS);
+	val = ibuild.EmitCompactMRegToPacked(val);
+	ibuild.EmitStorePaired(val, addr, inst.I);
 }
 
 void Jit64::psq_l(UGeckoInstruction inst)
diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_Paired.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_Paired.cpp
index 91ca5829e5..dd2f95ba25 100644
--- a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_Paired.cpp
+++ b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_Paired.cpp
@@ -35,9 +35,6 @@
 // cmppd, andpd, andnpd, or
 // lfsx, ps_merge01 etc
 
-// #define INSTRUCTION_START Default(inst); return;
-#define INSTRUCTION_START
-
 const u64 GC_ALIGNED16(psSignBits[2]) = {0x8000000000000000ULL, 0x8000000000000000ULL};
 const u64 GC_ALIGNED16(psAbsMask[2]) = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL};
 const double GC_ALIGNED16(psOneOne[2]) = {1.0, 1.0};
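
Note on the new store path as a whole: Jit64::psq_st emits a StorePaired IR op whose extra byte carries the quantizer register number (inst.I); the backend loads GQRn, takes the store type from its low byte, and calls through the pairedStoreQuantized table built by GenQuantizedStores(). A rough sketch of what one of those routines (storePairedS16) computes per element, with quantScale standing in for the m_quantizeTableS lookup; this is illustrative C++, not Dolphin code:

	#include <stdint.h>

	// Quantize one float for a big-endian s16 store, as storePairedS16 does.
	// quantScale plays the role of m_quantizeTableS[ST_SCALE].
	static uint16_t QuantizeS16(float x, float quantScale)
	{
		float scaled = x * quantScale;     // MULPS by the splatted scale
		if (scaled >= 32767.0f)            // PACKSSDW saturates to the
			return 32767;                  // signed 16-bit range
		if (scaled <= -32768.0f)
			return (uint16_t)-32768;
		return (uint16_t)(int32_t)scaled;  // CVTPS2DQ (the real code rounds per
		                                   // MXCSR); BSWAP/ROL then store the
		                                   // pair big-endian
	}

The U8/S8 variants have the same shape with PACKUSWB/PACKSSWB as the final saturation step, and storePairedU16 is the one still flagged with a FIXME above.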