diff --git a/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp b/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp
index f96495cfa1..bcf0e801dd 100644
--- a/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp
@@ -237,6 +237,7 @@ void Jit64AsmRoutineManager::GenerateCommon()
   GenMfcr();
 
   GenQuantizedLoads();
+  GenQuantizedSingleLoads();
   GenQuantizedStores();
   GenQuantizedSingleStores();
 
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStorePaired.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStorePaired.cpp
index 26cb06c37e..9da4185d0e 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStorePaired.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStorePaired.cpp
@@ -148,16 +148,18 @@ void Jit64::psq_lXX(UGeckoInstruction inst)
   }
   else
   {
-    MOV(32, R(RSCRATCH2), Imm32(0x3F07));
-
     // Get the high part of the GQR register
     OpArg gqr = PPCSTATE(spr[SPR_GQR0 + i]);
     gqr.AddMemOffset(2);
 
+    MOV(32, R(RSCRATCH2), Imm32(0x3F07));
     AND(32, R(RSCRATCH2), gqr);
-    MOVZX(32, 8, RSCRATCH, R(RSCRATCH2));
-
-    CALLptr(MScaled(RSCRATCH, SCALE_8, PtrOffset(&asm_routines.pairedLoadQuantized[w * 8])));
+    LEA(64, RSCRATCH, M(w ? asm_routines.singleLoadQuantized : asm_routines.pairedLoadQuantized));
+    // 8-bit operations do not zero upper 32-bits of 64-bit registers.
+    // Here we know that RSCRATCH's least significant byte is zero.
+    OR(8, R(RSCRATCH), R(RSCRATCH2));
+    SHL(8, R(RSCRATCH), Imm8(3));
+    CALLptr(MatR(RSCRATCH));
   }
 
   CVTPS2PD(fpr.RX(s), R(XMM0));
diff --git a/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp b/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp
index d5b1652a5a..d2d8873e90 100644
--- a/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp
+++ b/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp
@@ -277,13 +277,22 @@ const u8* CommonAsmRoutines::GenQuantizedStoreRuntime(bool single, EQuantizeType
 void CommonAsmRoutines::GenQuantizedLoads()
 {
-  pairedLoadQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
-  ReserveCodeSpace(16 * sizeof(u8*));
+  // Aligned to 256 bytes as least significant byte needs to be zero (See: Jit64::psq_lXX).
+  pairedLoadQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCodeTo(256)));
+  ReserveCodeSpace(8 * sizeof(u8*));
 
   for (int type = 0; type < 8; type++)
     pairedLoadQuantized[type] = GenQuantizedLoadRuntime(false, static_cast<EQuantizeType>(type));
 
+}
+
+void CommonAsmRoutines::GenQuantizedSingleLoads()
+{
+  // Aligned to 256 bytes as least significant byte needs to be zero (See: Jit64::psq_lXX).
+  singleLoadQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCodeTo(256)));
+  ReserveCodeSpace(8 * sizeof(u8*));
+
   for (int type = 0; type < 8; type++)
-    pairedLoadQuantized[type + 8] = GenQuantizedLoadRuntime(true, static_cast<EQuantizeType>(type));
+    singleLoadQuantized[type] = GenQuantizedLoadRuntime(true, static_cast<EQuantizeType>(type));
 }
 
 const u8* CommonAsmRoutines::GenQuantizedLoadRuntime(bool single, EQuantizeType type)
diff --git a/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.h b/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.h
index c098400f55..09087f94b1 100644
--- a/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.h
+++ b/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.h
@@ -33,6 +33,7 @@ protected:
   const u8* GenQuantizedLoadRuntime(bool single, EQuantizeType type);
   const u8* GenQuantizedStoreRuntime(bool single, EQuantizeType type);
   void GenQuantizedLoads();
+  void GenQuantizedSingleLoads();
   void GenQuantizedStores();
   void GenQuantizedSingleStores();
 };
diff --git a/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp b/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp
index b24d16c95a..bf03519aee 100644
--- a/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp
+++ b/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp
@@ -1616,10 +1616,12 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress)
       Jit->MOV(32, R(RSCRATCH2), Imm32(0x3F07));
       Jit->AND(32, R(RSCRATCH2), M(((char*)&GQR(quantreg)) + 2));
       Jit->MOVZX(32, 8, RSCRATCH, R(RSCRATCH2));
-      Jit->OR(32, R(RSCRATCH), Imm8(w << 3));
+
+      const u8** table =
+          w ? Jit->asm_routines.singleLoadQuantized : Jit->asm_routines.pairedLoadQuantized;
 
       Jit->MOV(32, R(RSCRATCH_EXTRA), regLocForInst(RI, getOp1(I)));
-      Jit->CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)(Jit->asm_routines.pairedLoadQuantized)));
+      Jit->CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)table));
       Jit->MOVAPD(reg, R(XMM0));
       RI.fregs[reg] = I;
       regNormalRegClear(RI, I);
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp
index 4742fc6f5a..bf080d1ecf 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp
@@ -81,7 +81,7 @@ void JitArm64::psq_l(UGeckoInstruction inst)
   UBFM(type_reg, scale_reg, 16, 18);   // Type
   UBFM(scale_reg, scale_reg, 24, 29);  // Scale
 
-  MOVP2R(X30, &pairedLoadQuantized[inst.W * 8]);
+  MOVP2R(X30, inst.W ? singleLoadQuantized : pairedLoadQuantized);
   LDR(X30, X30, ArithOption(EncodeRegTo64(type_reg), true));
   BLR(X30);
 
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
index 8c105c4cc1..dfc37bef1b 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
@@ -336,7 +336,7 @@ void JitArm64::GenerateCommonAsm()
   JitRegister::Register(start, GetCodePtr(), "JIT_QuantizedLoad");
 
   pairedLoadQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
-  ReserveCodeSpace(16 * sizeof(u8*));
+  ReserveCodeSpace(8 * sizeof(u8*));
 
   pairedLoadQuantized[0] = loadPairedFloatTwo;
   pairedLoadQuantized[1] = loadPairedIllegal;
@@ -347,14 +347,17 @@ void JitArm64::GenerateCommonAsm()
   pairedLoadQuantized[6] = loadPairedS8Two;
   pairedLoadQuantized[7] = loadPairedS16Two;
 
-  pairedLoadQuantized[8] = loadPairedFloatOne;
-  pairedLoadQuantized[9] = loadPairedIllegal;
-  pairedLoadQuantized[10] = loadPairedIllegal;
-  pairedLoadQuantized[11] = loadPairedIllegal;
-  pairedLoadQuantized[12] = loadPairedU8One;
-  pairedLoadQuantized[13] = loadPairedU16One;
-  pairedLoadQuantized[14] = loadPairedS8One;
-  pairedLoadQuantized[15] = loadPairedS16One;
+  singleLoadQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
+  ReserveCodeSpace(8 * sizeof(u8*));
+
+  singleLoadQuantized[0] = loadPairedFloatOne;
+  singleLoadQuantized[1] = loadPairedIllegal;
+  singleLoadQuantized[2] = loadPairedIllegal;
+  singleLoadQuantized[3] = loadPairedIllegal;
+  singleLoadQuantized[4] = loadPairedU8One;
+  singleLoadQuantized[5] = loadPairedU16One;
+  singleLoadQuantized[6] = loadPairedS8One;
+  singleLoadQuantized[7] = loadPairedS16One;
 
   // Stores
   start = GetCodePtr();
diff --git a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h
index 73c920cf18..3322dccade 100644
--- a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h
+++ b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h
@@ -39,6 +39,12 @@ public:
   // Trashes: all three RSCRATCH
   const u8** pairedLoadQuantized;
 
+  // In: array index: GQR to use.
+  // In: ECX: Address to read from.
+  // Out: XMM0: Bottom 32-bit slot holds the read value.
+  // Trashes: all three RSCRATCH
+  const u8** singleLoadQuantized;
+
   // In: array index: GQR to use.
   // In: ECX: Address to write to.
   // In: XMM0: Bottom two 32-bit slots hold the pair of floats to be written.
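
Note on the x86-64 lookup in Jit64::psq_lXX: AlignCodeTo(256) guarantees that the low byte of each table's base address is zero, and the largest entry offset is type * 8 = 56, so the entry address can be built with byte-sized OR and SHL on RSCRATCH alone, dropping the old MOVZX plus scaled-index addressing. Because 8-bit x86 operations leave the upper bits of the register untouched, the rest of the pointer survives intact. Below is a minimal standalone C++ sketch of the same pointer arithmetic; the table array and EntryAddress helper are hypothetical names for illustration only (not Dolphin code), and 8-byte code pointers on a 64-bit host are assumed.

// Sketch of the dispatch-address computation emitted in Jit64::psq_lXX.
#include <cassert>
#include <cstdint>

// Stand-in for one of the 256-byte-aligned tables of eight code pointers.
alignas(256) static const void* table[8];

static const void* const* EntryAddress(std::uint64_t base, std::uint64_t type)
{
  assert((base & 0xFF) == 0);  // guaranteed by AlignCodeTo(256)
  assert(type < 8);            // GQR type field, masked to 3 bits

  // Mirrors OR(8, ...) then SHL(8, ..., 3): only the low byte is touched.
  std::uint8_t low = static_cast<std::uint8_t>(base);  // known to be zero
  low |= static_cast<std::uint8_t>(type);              // low = type
  low <<= 3;                                           // low = type * 8, at most 56: no carry out
  return reinterpret_cast<const void* const*>((base & ~0xFFULL) | low);
}

int main()
{
  // Same result as the old MOVZX + MScaled(RSCRATCH, SCALE_8, ...) path.
  for (std::uint64_t type = 0; type < 8; type++)
    assert(EntryAddress(reinterpret_cast<std::uint64_t>(&table[0]), type) == &table[type]);
}

The alignment costs a little padding per table, but it is what permits the single-register dispatch. It is also why the old 16-entry table (indexed by w * 8 + type) is split in two: w is known at JIT time, so the LEA selects the table base statically and the runtime OR of w << 3 disappears.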