diff --git a/dependencies/ih264d/CMakeLists.txt b/dependencies/ih264d/CMakeLists.txt index 295b028a..212cf346 100644 --- a/dependencies/ih264d/CMakeLists.txt +++ b/dependencies/ih264d/CMakeLists.txt @@ -2,10 +2,6 @@ project ("ih264d") -set(LIBAVCDEC_X86_INCLUDES "common/x86" "decoder/x86") - -include_directories("common/" "decoder/" ${LIBAVCDEC_X86_INCLUDES}) - add_library (ih264d "common/ih264_buf_mgr.c" "common/ih264_buf_mgr.h" @@ -53,21 +49,6 @@ add_library (ih264d "common/ih264_weighted_pred.h" "common/ithread.c" "common/ithread.h" -"common/x86/ih264_chroma_intra_pred_filters_ssse3.c" -"common/x86/ih264_deblk_chroma_ssse3.c" -"common/x86/ih264_deblk_luma_ssse3.c" -"common/x86/ih264_ihadamard_scaling_sse42.c" -"common/x86/ih264_ihadamard_scaling_ssse3.c" -"common/x86/ih264_inter_pred_filters_ssse3.c" -"common/x86/ih264_iquant_itrans_recon_dc_ssse3.c" -"common/x86/ih264_iquant_itrans_recon_sse42.c" -"common/x86/ih264_iquant_itrans_recon_ssse3.c" -"common/x86/ih264_luma_intra_pred_filters_ssse3.c" -"common/x86/ih264_mem_fns_ssse3.c" -"common/x86/ih264_padding_ssse3.c" -"common/x86/ih264_platform_macros.h" -"common/x86/ih264_resi_trans_quant_sse42.c" -"common/x86/ih264_weighted_pred_sse42.c" "decoder/ih264d.h" "decoder/ih264d_api.c" "decoder/ih264d_bitstrm.c" @@ -134,10 +115,71 @@ add_library (ih264d "decoder/ih264d_vui.h" "decoder/iv.h" "decoder/ivd.h" +) + +if (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "amd64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64") +set(LIBAVCDEC_X86_INCLUDES "common/x86" "decoder/x86") +include_directories("common/" "decoder/" ${LIBAVCDEC_X86_INCLUDES}) +target_sources(ih264d PRIVATE +"common/x86/ih264_chroma_intra_pred_filters_ssse3.c" +"common/x86/ih264_deblk_chroma_ssse3.c" +"common/x86/ih264_deblk_luma_ssse3.c" +"common/x86/ih264_ihadamard_scaling_sse42.c" +"common/x86/ih264_ihadamard_scaling_ssse3.c" +"common/x86/ih264_inter_pred_filters_ssse3.c" +"common/x86/ih264_iquant_itrans_recon_dc_ssse3.c" +"common/x86/ih264_iquant_itrans_recon_sse42.c" +"common/x86/ih264_iquant_itrans_recon_ssse3.c" +"common/x86/ih264_luma_intra_pred_filters_ssse3.c" +"common/x86/ih264_mem_fns_ssse3.c" +"common/x86/ih264_padding_ssse3.c" +"common/x86/ih264_platform_macros.h" +"common/x86/ih264_resi_trans_quant_sse42.c" +"common/x86/ih264_weighted_pred_sse42.c" "decoder/x86/ih264d_function_selector.c" "decoder/x86/ih264d_function_selector_sse42.c" "decoder/x86/ih264d_function_selector_ssse3.c" ) +elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64") +enable_language( C CXX ASM ) +set(LIBAVCDEC_ARM_INCLUDES "common/armv8" "decoder/arm") +include_directories("common/" "decoder/" ${LIBAVCDEC_ARM_INCLUDES}) +target_sources(ih264d PRIVATE +"common/armv8/ih264_deblk_chroma_av8.s" +"common/armv8/ih264_deblk_luma_av8.s" +"common/armv8/ih264_default_weighted_pred_av8.s" +"common/armv8/ih264_ihadamard_scaling_av8.s" +"common/armv8/ih264_inter_pred_chroma_av8.s" +"common/armv8/ih264_inter_pred_filters_luma_horz_av8.s" +"common/armv8/ih264_inter_pred_filters_luma_vert_av8.s" +"common/armv8/ih264_inter_pred_luma_copy_av8.s" +"common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s" +"common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s" +"common/armv8/ih264_inter_pred_luma_horz_qpel_av8.s" +"common/armv8/ih264_inter_pred_luma_horz_qpel_vert_hpel_av8.s" +"common/armv8/ih264_inter_pred_luma_horz_qpel_vert_qpel_av8.s" +"common/armv8/ih264_inter_pred_luma_vert_qpel_av8.s" +"common/armv8/ih264_intra_pred_chroma_av8.s" +"common/armv8/ih264_intra_pred_luma_16x16_av8.s" +"common/armv8/ih264_intra_pred_luma_4x4_av8.s" +"common/armv8/ih264_intra_pred_luma_8x8_av8.s" +"common/armv8/ih264_iquant_itrans_recon_av8.s" +"common/armv8/ih264_iquant_itrans_recon_dc_av8.s" +"common/armv8/ih264_mem_fns_neon_av8.s" +"common/armv8/ih264_neon_macros.s" +"common/armv8/ih264_padding_neon_av8.s" +"common/armv8/ih264_platform_macros.h" +"common/armv8/ih264_resi_trans_quant_av8.s" +"common/armv8/ih264_weighted_bi_pred_av8.s" +"common/armv8/ih264_weighted_pred_av8.s" +"decoder/arm/ih264d_function_selector_a9q.c" +"decoder/arm/ih264d_function_selector_av8.c" +"decoder/arm/ih264d_function_selector.c" +) +target_compile_options(ih264d PRIVATE -DARMV8) +else() +message(FATAL_ERROR "ih264d unknown architecture: ${CMAKE_SYSTEM_PROCESSOR}") +endif() if(MSVC) set_property(TARGET ih264d PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") diff --git a/src/Cafe/GamePatch.cpp b/src/Cafe/GamePatch.cpp index 0d5e30ab..bd4630fe 100644 --- a/src/Cafe/GamePatch.cpp +++ b/src/Cafe/GamePatch.cpp @@ -83,7 +83,7 @@ void hleExport_xcx_enterCriticalSection(PPCInterpreter_t* hCPU) osLib_returnFromFunction(hCPU, 0); return; } - _mm_pause(); + _mm_pause(); } PPCCore_switchToScheduler(); } diff --git a/src/Cafe/HW/Espresso/PPCState.h b/src/Cafe/HW/Espresso/PPCState.h index 1e36c099..a9f2d3ee 100644 --- a/src/Cafe/HW/Espresso/PPCState.h +++ b/src/Cafe/HW/Espresso/PPCState.h @@ -203,7 +203,6 @@ extern uint64 ppcMainThreadDECCycleStart; // at which cycle the dec register was void PPCTimer_init(); void PPCTimer_waitForInit(); uint64 PPCTimer_getFromRDTSC(); -bool PPCTimer_hasInvariantRDTSCSupport(); uint64 PPCTimer_microsecondsToTsc(uint64 us); uint64 PPCTimer_tscToMicroseconds(uint64 us); diff --git a/src/Cafe/HW/Espresso/PPCTimer.cpp b/src/Cafe/HW/Espresso/PPCTimer.cpp index 153458d8..fe105962 100644 --- a/src/Cafe/HW/Espresso/PPCTimer.cpp +++ b/src/Cafe/HW/Espresso/PPCTimer.cpp @@ -1,9 +1,14 @@ #include "Cafe/HW/Espresso/Const.h" -#include #include "asm/x64util.h" #include "config/ActiveSettings.h" #include "util/helpers/fspinlock.h" #include "util/highresolutiontimer/HighResolutionTimer.h" +#include "Common/cpu_features.h" + +#if defined(ARCH_X86_64) +#include +#pragma intrinsic(__rdtsc) +#endif uint64 _rdtscLastMeasure = 0; uint64 _rdtscFrequency = 0; @@ -18,8 +23,6 @@ static_assert(sizeof(uint128_t) == 16); uint128_t _rdtscAcc{}; -#pragma intrinsic(__rdtsc) - uint64 muldiv64(uint64 a, uint64 b, uint64 d) { uint64 diva = a / d; @@ -29,17 +32,12 @@ uint64 muldiv64(uint64 a, uint64 b, uint64 d) return diva * b + moda * divb + moda * modb / d; } -bool PPCTimer_hasInvariantRDTSCSupport() -{ - uint32 cpuv[4]; - cpuid((int*)cpuv, 0x80000007); - return ((cpuv[3] >> 8) & 1); -} - uint64 PPCTimer_estimateRDTSCFrequency() { - if (PPCTimer_hasInvariantRDTSCSupport() == false) - forceLog_printf("Invariant TSC not supported"); + #if defined(ARCH_X86_64) + if (!g_CPUFeatures.x86.invariant_tsc) + cemuLog_log(LogType::Force, "Invariant TSC not supported"); + #endif _mm_mfence(); uint64 tscStart = __rdtsc(); diff --git a/src/Cafe/HW/Espresso/Recompiler/PPCRecompiler.cpp b/src/Cafe/HW/Espresso/Recompiler/PPCRecompiler.cpp index 588e5397..92b72b82 100644 --- a/src/Cafe/HW/Espresso/Recompiler/PPCRecompiler.cpp +++ b/src/Cafe/HW/Espresso/Recompiler/PPCRecompiler.cpp @@ -8,11 +8,10 @@ #include "Cafe/OS/libs/coreinit/coreinit_CodeGen.h" #include "config/ActiveSettings.h" #include "config/LaunchSettings.h" - -#include "util/helpers/fspinlock.h" #include "Common/ExceptionHandler/ExceptionHandler.h" +#include "Common/cpu_features.h" +#include "util/helpers/fspinlock.h" #include "util/helpers/helpers.h" - #include "util/MemMapper/MemMapper.h" struct PPCInvalidationRange @@ -461,6 +460,20 @@ void PPCRecompiler_invalidateRange(uint32 startAddr, uint32 endAddr) PPCRecompilerState.recompilerSpinlock.unlock(); } +#if defined(ARCH_X86_64) +void PPCRecompiler_initPlatform() +{ + // mxcsr + ppcRecompilerInstanceData->_x64XMM_mxCsr_ftzOn = 0x1F80 | 0x8000; + ppcRecompilerInstanceData->_x64XMM_mxCsr_ftzOff = 0x1F80; +} +#else +void PPCRecompiler_initPlatform() +{ + +} +#endif + void PPCRecompiler_init() { if (ActiveSettings::GetCPUMode() == CPUMode::SinglecoreInterpreter) @@ -569,21 +582,9 @@ void PPCRecompiler_init() ppcRecompilerInstanceData->_psq_st_scale_ps0_ps1[(i + 32) * 2 + 1] = br; } - // mxcsr - ppcRecompilerInstanceData->_x64XMM_mxCsr_ftzOn = 0x1F80 | 0x8000; - ppcRecompilerInstanceData->_x64XMM_mxCsr_ftzOff = 0x1F80; - - // query processor extensions - int cpuInfo[4]; - cpuid(cpuInfo, 0x80000001); - hasLZCNTSupport = ((cpuInfo[2] >> 5) & 1) != 0; - cpuid(cpuInfo, 0x1); - hasMOVBESupport = ((cpuInfo[2] >> 22) & 1) != 0; - hasAVXSupport = ((cpuInfo[2] >> 28) & 1) != 0; - cpuidex(cpuInfo, 0x7, 0); - hasBMI2Support = ((cpuInfo[1] >> 8) & 1) != 0; - - forceLog_printf("Recompiler initialized. CPU extensions: %s%s%s", hasLZCNTSupport ? "LZCNT " : "", hasMOVBESupport ? "MOVBE " : "", hasAVXSupport ? "AVX " : ""); + PPCRecompiler_initPlatform(); + + forceLog_printf("Recompiler initialized"); ppcRecompilerEnabled = true; diff --git a/src/Cafe/HW/Espresso/Recompiler/PPCRecompiler.h b/src/Cafe/HW/Espresso/Recompiler/PPCRecompiler.h index 82fa15e6..4f89b985 100644 --- a/src/Cafe/HW/Espresso/Recompiler/PPCRecompiler.h +++ b/src/Cafe/HW/Espresso/Recompiler/PPCRecompiler.h @@ -384,12 +384,6 @@ extern void ATTR_MS_ABI (*PPCRecompiler_leaveRecompilerCode_unvisited)(); #define PPC_REC_INVALID_FUNCTION ((PPCRecFunction_t*)-1) -// CPUID -extern bool hasLZCNTSupport; -extern bool hasMOVBESupport; -extern bool hasBMI2Support; -extern bool hasAVXSupport; - // todo - move some of the stuff above into PPCRecompilerInternal.h // recompiler interface diff --git a/src/Cafe/HW/Espresso/Recompiler/PPCRecompilerImlGenFPU.cpp b/src/Cafe/HW/Espresso/Recompiler/PPCRecompilerImlGenFPU.cpp index e30a9683..1efc41b8 100644 --- a/src/Cafe/HW/Espresso/Recompiler/PPCRecompilerImlGenFPU.cpp +++ b/src/Cafe/HW/Espresso/Recompiler/PPCRecompilerImlGenFPU.cpp @@ -3,14 +3,6 @@ #include "PPCRecompilerIml.h" #include "Cafe/GameProfile/GameProfile.h" -bool hasSSE1Support = true; -bool hasSSE2Support = true; -bool hasSSE3Support = true; -bool hasLZCNTSupport = false; -bool hasMOVBESupport = false; -bool hasBMI2Support = false; -bool hasAVXSupport = false; - void PPCRecompilerImlGen_generateNewInstruction_fpr_r_memory(ppcImlGenContext_t* ppcImlGenContext, uint8 registerDestination, uint8 registerMemory, sint32 immS32, uint32 mode, bool switchEndian, uint8 registerGQR = PPC_REC_INVALID_REGISTER) { // load from memory @@ -145,8 +137,6 @@ void PPRecompilerImmGen_optionalRoundPairFPRToSinglePrecision(ppcImlGenContext_t bool PPCRecompilerImlGen_LFS(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode) { - if( hasSSE1Support == false ) - return false; sint32 rA, frD; uint32 imm; PPC_OPC_TEMPL_D_SImm(opcode, frD, rA, imm); @@ -167,8 +157,6 @@ bool PPCRecompilerImlGen_LFS(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode bool PPCRecompilerImlGen_LFSU(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode) { - if( hasSSE1Support == false ) - return false; sint32 rA, frD; uint32 imm; PPC_OPC_TEMPL_D_SImm(opcode, frD, rA, imm); @@ -191,8 +179,6 @@ bool PPCRecompilerImlGen_LFSU(ppcImlGenContext_t* ppcImlGenContext, uint32 opcod bool PPCRecompilerImlGen_LFSX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode) { - if( hasSSE2Support == false ) - return false; sint32 rA, frD, rB; PPC_OPC_TEMPL_X(opcode, frD, rA, rB); if( rA == 0 ) @@ -218,8 +204,6 @@ bool PPCRecompilerImlGen_LFSX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcod bool PPCRecompilerImlGen_LFSUX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode) { - if( hasSSE2Support == false ) - return false; sint32 rA, frD, rB; PPC_OPC_TEMPL_X(opcode, frD, rA, rB); if( rA == 0 ) @@ -247,8 +231,6 @@ bool PPCRecompilerImlGen_LFSUX(ppcImlGenContext_t* ppcImlGenContext, uint32 opco bool PPCRecompilerImlGen_LFD(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode) { - if( hasSSE1Support == false ) - return false; sint32 rA, frD; uint32 imm; PPC_OPC_TEMPL_D_SImm(opcode, frD, rA, imm); @@ -266,8 +248,6 @@ bool PPCRecompilerImlGen_LFD(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode bool PPCRecompilerImlGen_LFDU(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode) { - if( hasSSE1Support == false ) - return false; sint32 rA, frD; uint32 imm; PPC_OPC_TEMPL_D_SImm(opcode, frD, rA, imm); @@ -288,8 +268,6 @@ bool PPCRecompilerImlGen_LFDU(ppcImlGenContext_t* ppcImlGenContext, uint32 opcod bool PPCRecompilerImlGen_LFDX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode) { - if( hasSSE2Support == false ) - return false; sint32 rA, frD, rB; PPC_OPC_TEMPL_X(opcode, frD, rA, rB); if( rA == 0 ) @@ -308,8 +286,6 @@ bool PPCRecompilerImlGen_LFDX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcod bool PPCRecompilerImlGen_LFDUX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode) { - if( hasSSE2Support == false ) - return false; sint32 rA, frD, rB; PPC_OPC_TEMPL_X(opcode, frD, rA, rB); if( rA == 0 ) @@ -330,8 +306,6 @@ bool PPCRecompilerImlGen_LFDUX(ppcImlGenContext_t* ppcImlGenContext, uint32 opco bool PPCRecompilerImlGen_STFS(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode) { - if( hasSSE1Support == false ) - return false; sint32 rA, frD; uint32 imm; PPC_OPC_TEMPL_D_SImm(opcode, frD, rA, imm); @@ -346,8 +320,6 @@ bool PPCRecompilerImlGen_STFS(ppcImlGenContext_t* ppcImlGenContext, uint32 opcod bool PPCRecompilerImlGen_STFSU(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode) { - if( hasSSE1Support == false ) - return false; sint32 rA, frD; uint32 imm; PPC_OPC_TEMPL_D_SImm(opcode, frD, rA, imm); @@ -364,8 +336,6 @@ bool PPCRecompilerImlGen_STFSU(ppcImlGenContext_t* ppcImlGenContext, uint32 opco bool PPCRecompilerImlGen_STFSX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode) { - if( hasSSE2Support == false ) - return false; sint32 rA, frS, rB; PPC_OPC_TEMPL_X(opcode, frS, rA, rB); if( rA == 0 ) @@ -392,8 +362,6 @@ bool PPCRecompilerImlGen_STFSX(ppcImlGenContext_t* ppcImlGenContext, uint32 opco bool PPCRecompilerImlGen_STFSUX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode) { - if( hasSSE2Support == false ) - return false; sint32 rA, frS, rB; PPC_OPC_TEMPL_X(opcode, frS, rA, rB); if( rA == 0 ) @@ -415,8 +383,6 @@ bool PPCRecompilerImlGen_STFSUX(ppcImlGenContext_t* ppcImlGenContext, uint32 opc bool PPCRecompilerImlGen_STFD(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode) { - if( hasSSE1Support == false ) - return false; sint32 rA, frD; uint32 imm; PPC_OPC_TEMPL_D_SImm(opcode, frD, rA, imm); @@ -435,8 +401,6 @@ bool PPCRecompilerImlGen_STFD(ppcImlGenContext_t* ppcImlGenContext, uint32 opcod bool PPCRecompilerImlGen_STFDU(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode) { - if( hasSSE1Support == false ) - return false; sint32 rA, frD; uint32 imm; PPC_OPC_TEMPL_D_SImm(opcode, frD, rA, imm); @@ -458,8 +422,6 @@ bool PPCRecompilerImlGen_STFDU(ppcImlGenContext_t* ppcImlGenContext, uint32 opco bool PPCRecompilerImlGen_STFDX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode) { - if( hasSSE2Support == false ) - return false; sint32 rA, frS, rB; PPC_OPC_TEMPL_X(opcode, frS, rA, rB); if( rA == 0 ) @@ -485,8 +447,6 @@ bool PPCRecompilerImlGen_STFDX(ppcImlGenContext_t* ppcImlGenContext, uint32 opco bool PPCRecompilerImlGen_STFIWX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode) { - if( hasSSE2Support == false ) - return false; sint32 rA, frS, rB; PPC_OPC_TEMPL_X(opcode, frS, rA, rB); // get memory gpr registers @@ -959,10 +919,6 @@ bool PPCRecompilerImlGen_FCMPO(ppcImlGenContext_t* ppcImlGenContext, uint32 opco sint32 crfD, frA, frB; PPC_OPC_TEMPL_X(opcode, crfD, frA, frB); crfD >>= 2; - if( hasSSE2Support == false ) - { - return false; - } uint32 fprRegisterA = PPCRecompilerImlGen_loadFPRRegister(ppcImlGenContext, PPCREC_NAME_FPR0+frA); uint32 fprRegisterB = PPCRecompilerImlGen_loadFPRRegister(ppcImlGenContext, PPCREC_NAME_FPR0+frB); PPCRecompilerImlGen_generateNewInstruction_fpr_r_r(ppcImlGenContext, PPCREC_IML_OP_FPR_FCMPO_BOTTOM, fprRegisterA, fprRegisterB, crfD); @@ -974,10 +930,6 @@ bool PPCRecompilerImlGen_FCMPU(ppcImlGenContext_t* ppcImlGenContext, uint32 opco sint32 crfD, frA, frB; PPC_OPC_TEMPL_X(opcode, crfD, frA, frB); crfD >>= 2; - if( hasSSE2Support == false ) - { - return false; - } uint32 fprRegisterA = PPCRecompilerImlGen_loadFPRRegister(ppcImlGenContext, PPCREC_NAME_FPR0+frA); uint32 fprRegisterB = PPCRecompilerImlGen_loadFPRRegister(ppcImlGenContext, PPCREC_NAME_FPR0+frB); PPCRecompilerImlGen_generateNewInstruction_fpr_r_r(ppcImlGenContext, PPCREC_IML_OP_FPR_FCMPU_BOTTOM, fprRegisterA, fprRegisterB, crfD); @@ -1120,8 +1072,6 @@ bool PPCRecompilerImlGen_FCTIWZ(ppcImlGenContext_t* ppcImlGenContext, uint32 opc bool PPCRecompilerImlGen_PSQ_L(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode) { - if (hasSSE2Support == false) - return false; int rA, frD; uint32 immUnused; PPC_OPC_TEMPL_D_SImm(opcode, frD, rA, immUnused); @@ -1146,8 +1096,6 @@ bool PPCRecompilerImlGen_PSQ_L(ppcImlGenContext_t* ppcImlGenContext, uint32 opco bool PPCRecompilerImlGen_PSQ_LU(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode) { - if (hasSSE2Support == false) - return false; int rA, frD; uint32 immUnused; PPC_OPC_TEMPL_D_SImm(opcode, frD, rA, immUnused); diff --git a/src/Cafe/HW/Espresso/Recompiler/PPCRecompilerX64.cpp b/src/Cafe/HW/Espresso/Recompiler/PPCRecompilerX64.cpp index 8e8a63d9..14d2febb 100644 --- a/src/Cafe/HW/Espresso/Recompiler/PPCRecompilerX64.cpp +++ b/src/Cafe/HW/Espresso/Recompiler/PPCRecompilerX64.cpp @@ -5,8 +5,8 @@ #include "PPCRecompilerIml.h" #include "PPCRecompilerX64.h" #include "Cafe/OS/libs/coreinit/coreinit_Time.h" - #include "util/MemMapper/MemMapper.h" +#include "Common/cpu_features.h" sint32 x64Gen_registerMap[12] = // virtual GPR to x64 register mapping { @@ -381,7 +381,7 @@ bool PPCRecompilerX64Gen_imlInstruction_load(PPCRecFunction_t* PPCRecFunction, p { x64Gen_lea_reg64Low32_reg64Low32PlusReg64Low32(x64GenContext, REG_RESV_TEMP, realRegisterMem, realRegisterMem2); } - if( hasMOVBESupport && switchEndian ) + if( g_CPUFeatures.x86.movbe && switchEndian ) { if (indexed) { @@ -419,7 +419,7 @@ bool PPCRecompilerX64Gen_imlInstruction_load(PPCRecFunction_t* PPCRecFunction, p { x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); } - if( hasMOVBESupport && switchEndian ) + if( g_CPUFeatures.x86.movbe && switchEndian ) { x64Gen_movBEZeroExtend_reg64Low16_mem16Reg64PlusReg64(x64GenContext, realRegisterData, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32); if( indexed && realRegisterMem != realRegisterData ) @@ -477,7 +477,7 @@ bool PPCRecompilerX64Gen_imlInstruction_load(PPCRecFunction_t* PPCRecFunction, p assert_dbg(); if( indexed ) x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); // can be replaced with LEA temp, [memReg1+memReg2] (this way we can avoid the SUB instruction after the move) - if( hasMOVBESupport ) + if( g_CPUFeatures.x86.movbe ) { x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, realRegisterData, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32); if( indexed && realRegisterMem != realRegisterData ) @@ -537,7 +537,7 @@ bool PPCRecompilerX64Gen_imlInstruction_store(PPCRecFunction_t* PPCRecFunction, if (indexed) PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext); uint32 valueRegister; - if ((swapEndian == false || hasMOVBESupport) && realRegisterMem != realRegisterData) + if ((swapEndian == false || g_CPUFeatures.x86.movbe) && realRegisterMem != realRegisterData) { valueRegister = realRegisterData; } @@ -546,11 +546,11 @@ bool PPCRecompilerX64Gen_imlInstruction_store(PPCRecFunction_t* PPCRecFunction, x64Gen_mov_reg64_reg64(x64GenContext, REG_RESV_TEMP, realRegisterData); valueRegister = REG_RESV_TEMP; } - if (hasMOVBESupport == false && swapEndian) + if (g_CPUFeatures.x86.movbe == false && swapEndian) x64Gen_bswap_reg64Lower32bit(x64GenContext, valueRegister); if (indexed) x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); - if (hasMOVBESupport && swapEndian) + if (g_CPUFeatures.x86.movbe && swapEndian) x64Gen_movBETruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32, valueRegister); else x64Gen_movTruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32, valueRegister); @@ -802,8 +802,7 @@ bool PPCRecompilerX64Gen_imlInstruction_r_r(PPCRecFunction_t* PPCRecFunction, pp // count leading zeros PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext); cemu_assert_debug(imlInstruction->crRegister == PPC_REC_INVALID_REGISTER); - // LZCNT instruction (part of SSE4, CPUID.80000001H:ECX.ABM[Bit 5]) - if( hasLZCNTSupport ) + if( g_CPUFeatures.x86.lzcnt ) { x64Gen_lzcnt_reg64Low32_reg64Low32(x64GenContext, tempToRealRegister(imlInstruction->op_r_r.registerResult), tempToRealRegister(imlInstruction->op_r_r.registerA)); } @@ -1521,12 +1520,12 @@ bool PPCRecompilerX64Gen_imlInstruction_r_r_r(PPCRecFunction_t* PPCRecFunction, sint32 rRegOperand1 = tempToRealRegister(imlInstruction->op_r_r_r.registerA); sint32 rRegOperand2 = tempToRealRegister(imlInstruction->op_r_r_r.registerB); - if (hasBMI2Support && imlInstruction->operation == PPCREC_IML_OP_SRW) + if (g_CPUFeatures.x86.bmi2 && imlInstruction->operation == PPCREC_IML_OP_SRW) { // use BMI2 SHRX if available x64Gen_shrx_reg64_reg64_reg64(x64GenContext, rRegResult, rRegOperand1, rRegOperand2); } - else if (hasBMI2Support && imlInstruction->operation == PPCREC_IML_OP_SLW) + else if (g_CPUFeatures.x86.bmi2 && imlInstruction->operation == PPCREC_IML_OP_SLW) { // use BMI2 SHLX if available x64Gen_shlx_reg64_reg64_reg64(x64GenContext, rRegResult, rRegOperand1, rRegOperand2); diff --git a/src/Cafe/HW/Espresso/Recompiler/PPCRecompilerX64FPU.cpp b/src/Cafe/HW/Espresso/Recompiler/PPCRecompilerX64FPU.cpp index fbb95b2f..d83f67de 100644 --- a/src/Cafe/HW/Espresso/Recompiler/PPCRecompilerX64FPU.cpp +++ b/src/Cafe/HW/Espresso/Recompiler/PPCRecompilerX64FPU.cpp @@ -2,6 +2,7 @@ #include "PPCRecompilerIml.h" #include "PPCRecompilerX64.h" #include "asm/x64util.h" +#include "Common/cpu_features.h" void PPCRecompilerX64Gen_imlInstruction_fpr_r_name(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, PPCRecImlInstruction_t* imlInstruction) { @@ -86,7 +87,7 @@ void PPCRecompilerX64Gen_imlInstr_psq_load(ppcImlGenContext_t* ppcImlGenContext, { x64Gen_mov_reg64Low32_reg64Low32(x64GenContext, REG_RESV_TEMP, memRegEx); x64Gen_add_reg64Low32_reg64Low32(x64GenContext, REG_RESV_TEMP, memReg); - if (hasMOVBESupport) + if (g_CPUFeatures.x86.movbe) { x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, REG_RESV_TEMP, memImmS32); } @@ -98,7 +99,7 @@ void PPCRecompilerX64Gen_imlInstr_psq_load(ppcImlGenContext_t* ppcImlGenContext, } else { - if (hasMOVBESupport) + if (g_CPUFeatures.x86.movbe) { x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, memReg, memImmS32); } @@ -108,7 +109,7 @@ void PPCRecompilerX64Gen_imlInstr_psq_load(ppcImlGenContext_t* ppcImlGenContext, x64Gen_bswap_reg64Lower32bit(x64GenContext, REG_RESV_TEMP); } } - if (hasAVXSupport) + if (g_CPUFeatures.x86.avx) { x64Gen_movd_xmmReg_reg64Low32(x64GenContext, REG_RESV_FPR_TEMP, REG_RESV_TEMP); } @@ -280,21 +281,21 @@ bool PPCRecompilerX64Gen_imlInstruction_fpr_load(PPCRecFunction_t* PPCRecFunctio { x64Gen_mov_reg64Low32_reg64Low32(x64GenContext, REG_RESV_TEMP, realRegisterMem2); x64Gen_add_reg64Low32_reg64Low32(x64GenContext, REG_RESV_TEMP, realRegisterMem); - if( hasMOVBESupport ) + if( g_CPUFeatures.x86.movbe ) x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, REG_RESV_TEMP, imlInstruction->op_storeLoad.immS32); else x64Emit_mov_reg32_mem32(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, REG_RESV_TEMP, imlInstruction->op_storeLoad.immS32); } else { - if( hasMOVBESupport ) + if( g_CPUFeatures.x86.movbe ) x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, realRegisterMem, imlInstruction->op_storeLoad.immS32); else x64Emit_mov_reg32_mem32(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, realRegisterMem, imlInstruction->op_storeLoad.immS32); } - if( hasMOVBESupport == false ) + if( g_CPUFeatures.x86.movbe == false ) x64Gen_bswap_reg64Lower32bit(x64GenContext, REG_RESV_TEMP); - if( hasAVXSupport ) + if( g_CPUFeatures.x86.avx ) { x64Gen_movd_xmmReg_reg64Low32(x64GenContext, realRegisterXMM, REG_RESV_TEMP); } @@ -316,7 +317,7 @@ bool PPCRecompilerX64Gen_imlInstruction_fpr_load(PPCRecFunction_t* PPCRecFunctio } else if( mode == PPCREC_FPR_LD_MODE_DOUBLE_INTO_PS0 ) { - if( hasAVXSupport ) + if( g_CPUFeatures.x86.avx ) { if( indexed ) { @@ -419,7 +420,7 @@ void PPCRecompilerX64Gen_imlInstr_psq_store(ppcImlGenContext_t* ppcImlGenContext if (mode == PPCREC_FPR_ST_MODE_PSQ_FLOAT_PS0) { x64Gen_cvtsd2ss_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, registerXMM); - if (hasAVXSupport) + if (g_CPUFeatures.x86.avx) { x64Gen_movd_reg64Low32_xmmReg(x64GenContext, REG_RESV_TEMP, REG_RESV_FPR_TEMP); } @@ -428,14 +429,14 @@ void PPCRecompilerX64Gen_imlInstr_psq_store(ppcImlGenContext_t* ppcImlGenContext x64Gen_movsd_memReg64_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, REG_RSP, offsetof(PPCInterpreter_t, temporaryFPR)); x64Emit_mov_reg64_mem32(x64GenContext, REG_RESV_TEMP, REG_RSP, offsetof(PPCInterpreter_t, temporaryFPR)); } - if (hasMOVBESupport == false) + if (g_CPUFeatures.x86.movbe == false) x64Gen_bswap_reg64Lower32bit(x64GenContext, REG_RESV_TEMP); if (indexed) { cemu_assert_debug(memReg != memRegEx); x64Gen_add_reg64Low32_reg64Low32(x64GenContext, memReg, memRegEx); } - if (hasMOVBESupport) + if (g_CPUFeatures.x86.movbe) x64Gen_movBETruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_R13, memReg, memImmS32, REG_RESV_TEMP); else x64Gen_movTruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_R13, memReg, memImmS32, REG_RESV_TEMP); @@ -604,7 +605,7 @@ bool PPCRecompilerX64Gen_imlInstruction_fpr_store(PPCRecFunction_t* PPCRecFuncti if (imlInstruction->op_storeLoad.flags2.notExpanded) { // value is already in single format - if (hasAVXSupport) + if (g_CPUFeatures.x86.avx) { x64Gen_movd_reg64Low32_xmmReg(x64GenContext, REG_RESV_TEMP, realRegisterXMM); } @@ -617,7 +618,7 @@ bool PPCRecompilerX64Gen_imlInstruction_fpr_store(PPCRecFunction_t* PPCRecFuncti else { x64Gen_cvtsd2ss_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, realRegisterXMM); - if (hasAVXSupport) + if (g_CPUFeatures.x86.avx) { x64Gen_movd_reg64Low32_xmmReg(x64GenContext, REG_RESV_TEMP, REG_RESV_FPR_TEMP); } @@ -627,7 +628,7 @@ bool PPCRecompilerX64Gen_imlInstruction_fpr_store(PPCRecFunction_t* PPCRecFuncti x64Emit_mov_reg64_mem32(x64GenContext, REG_RESV_TEMP, REG_RSP, offsetof(PPCInterpreter_t, temporaryFPR)); } } - if( hasMOVBESupport == false ) + if( g_CPUFeatures.x86.movbe == false ) x64Gen_bswap_reg64Lower32bit(x64GenContext, REG_RESV_TEMP); if( indexed ) { @@ -635,7 +636,7 @@ bool PPCRecompilerX64Gen_imlInstruction_fpr_store(PPCRecFunction_t* PPCRecFuncti assert_dbg(); x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); } - if( hasMOVBESupport ) + if( g_CPUFeatures.x86.movbe ) x64Gen_movBETruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32, REG_RESV_TEMP); else x64Gen_movTruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32, REG_RESV_TEMP); @@ -668,7 +669,7 @@ bool PPCRecompilerX64Gen_imlInstruction_fpr_store(PPCRecFunction_t* PPCRecFuncti } else if( mode == PPCREC_FPR_ST_MODE_UI32_FROM_PS0 ) { - if( hasAVXSupport ) + if( g_CPUFeatures.x86.avx ) { x64Gen_movd_reg64Low32_xmmReg(x64GenContext, REG_RESV_TEMP, realRegisterXMM); } @@ -749,7 +750,7 @@ void PPCRecompilerX64Gen_imlInstruction_fpr_r_r(PPCRecFunction_t* PPCRecFunction // unpack top to bottom and top x64Gen_unpckhpd_xmmReg_xmmReg(x64GenContext, imlInstruction->op_fpr_r_r.registerResult, imlInstruction->op_fpr_r_r.registerOperand); } - //else if ( hasAVXSupport ) + //else if ( g_CPUFeatures.x86.avx ) //{ // // unpack top to bottom and top with non-destructive destination // // update: On Ivy Bridge this causes weird stalls? @@ -1056,7 +1057,7 @@ void PPCRecompilerX64Gen_imlInstruction_fpr_r_r_r(PPCRecFunction_t* PPCRecFuncti { x64Gen_subpd_xmmReg_xmmReg(x64GenContext, imlInstruction->op_fpr_r_r_r.registerResult, imlInstruction->op_fpr_r_r_r.registerOperandB); } - else if (hasAVXSupport) + else if (g_CPUFeatures.x86.avx) { x64Gen_avx_VSUBPD_xmm_xmm_xmm(x64GenContext, imlInstruction->op_fpr_r_r_r.registerResult, imlInstruction->op_fpr_r_r_r.registerOperandA, imlInstruction->op_fpr_r_r_r.registerOperandB); } diff --git a/src/Cafe/HW/Latte/Core/LatteIndices.cpp b/src/Cafe/HW/Latte/Core/LatteIndices.cpp index e5f3364a..fa52332a 100644 --- a/src/Cafe/HW/Latte/Core/LatteIndices.cpp +++ b/src/Cafe/HW/Latte/Core/LatteIndices.cpp @@ -1,8 +1,9 @@ #include "Cafe/HW/Latte/Core/LatteConst.h" #include "Cafe/HW/Latte/Renderer/Renderer.h" - #include "Cafe/HW/Latte/ISA/RegDefines.h" +#include "Common/cpu_features.h" +#if defined(ARCH_X86_64) #if __GNUC__ #include #endif @@ -14,6 +15,7 @@ #define ATTRIBUTE_AVX2 #define ATTRIBUTE_SSE41 #endif +#endif struct { @@ -292,6 +294,7 @@ void LatteIndices_generateAutoLineLoopIndices(void* indexDataOutput, uint32 coun indexMax = std::max(count, 1u) - 1; } +#if defined(ARCH_X86_64) ATTRIBUTE_AVX2 void LatteIndices_fastConvertU16_AVX2(const void* indexDataInput, void* indexDataOutput, uint32 count, uint32& indexMin, uint32& indexMax) { @@ -487,6 +490,7 @@ void LatteIndices_fastConvertU32_AVX2(const void* indexDataInput, void* indexDat indexMax = std::max(indexMax, _maxIndex); indexMin = std::min(indexMin, _minIndex); } +#endif template void _LatteIndices_alternativeCalculateIndexMinMax(const void* indexData, uint32 count, uint32 primitiveRestartIndex, uint32& indexMin, uint32& indexMax) @@ -669,19 +673,27 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 { if (indexType == LatteIndexType::U16_BE) { - if (_cpuExtension_AVX2) + #if defined(ARCH_X86_64) + if (g_CPUFeatures.x86.avx2) LatteIndices_fastConvertU16_AVX2(indexData, indexOutputPtr, count, indexMin, indexMax); - else if (_cpuExtension_SSE4_1 && _cpuExtension_SSSE3) + else if (g_CPUFeatures.x86.sse4_1 && g_CPUFeatures.x86.ssse3) LatteIndices_fastConvertU16_SSE41(indexData, indexOutputPtr, count, indexMin, indexMax); else LatteIndices_convertBE(indexData, indexOutputPtr, count, indexMin, indexMax); + #else + LatteIndices_convertBE(indexData, indexOutputPtr, count, indexMin, indexMax); + #endif } else if (indexType == LatteIndexType::U32_BE) { - if (_cpuExtension_AVX2) + #if defined(ARCH_X86_64) + if (g_CPUFeatures.x86.avx2) LatteIndices_fastConvertU32_AVX2(indexData, indexOutputPtr, count, indexMin, indexMax); else LatteIndices_convertBE(indexData, indexOutputPtr, count, indexMin, indexMax); + #else + LatteIndices_convertBE(indexData, indexOutputPtr, count, indexMin, indexMax); + #endif } else if (indexType == LatteIndexType::U16_LE) { @@ -714,4 +726,4 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 LatteIndexCache.outputCount = outputCount; LatteIndexCache.indexBufferOffset = indexBufferOffset; LatteIndexCache.indexBufferIndex = indexBufferIndex; -} \ No newline at end of file +} diff --git a/src/Cafe/HW/Latte/Core/LatteTextureCache.cpp b/src/Cafe/HW/Latte/Core/LatteTextureCache.cpp index 2caa2cd0..a71bd6a6 100644 --- a/src/Cafe/HW/Latte/Core/LatteTextureCache.cpp +++ b/src/Cafe/HW/Latte/Core/LatteTextureCache.cpp @@ -2,6 +2,7 @@ #include "Cafe/HW/Latte/Core/LatteDraw.h" #include "Cafe/HW/Latte/Core/LatteTexture.h" #include "Cafe/HW/Latte/Renderer/Renderer.h" +#include "Common/cpu_features.h" std::unordered_set g_allTextures; @@ -146,7 +147,7 @@ uint32 LatteTexture_CalculateTextureDataHash(LatteTexture* hostTexture) if( isCompressedFormat == false ) { #if BOOST_OS_WINDOWS - if (_cpuExtension_AVX2) + if (g_CPUFeatures.x86.avx2) { __m256i h256 = { 0 }; __m256i* readPtr = (__m256i*)texDataU32; diff --git a/src/Cafe/OS/libs/coreinit/coreinit_Thread.cpp b/src/Cafe/OS/libs/coreinit/coreinit_Thread.cpp index d57580f3..406068eb 100644 --- a/src/Cafe/OS/libs/coreinit/coreinit_Thread.cpp +++ b/src/Cafe/OS/libs/coreinit/coreinit_Thread.cpp @@ -1075,7 +1075,9 @@ namespace coreinit { OSHostThread* hostThread = (OSHostThread*)_thread; + #if defined(ARCH_X86_64) _mm_setcsr(_mm_getcsr() | 0x8000); // flush denormals to zero + #endif uint32 lehmer_lcg = 12345; @@ -1116,7 +1118,9 @@ namespace coreinit { SetThreadName(fmt::format("OSSchedulerThread[core={}]", (uintptr_t)_assignedCoreIndex).c_str()); t_assignedCoreIndex = (sint32)(uintptr_t)_assignedCoreIndex; + #if defined(ARCH_X86_64) _mm_setcsr(_mm_getcsr() | 0x8000); // flush denormals to zero + #endif t_schedulerFiber = Fiber::PrepareCurrentThread(); // create scheduler idle fiber and switch to it diff --git a/src/Common/CMakeLists.txt b/src/Common/CMakeLists.txt index 7e5825f1..f07150c5 100644 --- a/src/Common/CMakeLists.txt +++ b/src/Common/CMakeLists.txt @@ -1,5 +1,7 @@ add_library(CemuCommon betype.h + cpu_features.cpp + cpu_features.h enumFlags.h ExceptionHandler/ExceptionHandler.h FileStream.h diff --git a/src/Common/cpu_features.cpp b/src/Common/cpu_features.cpp new file mode 100644 index 00000000..dfea8851 --- /dev/null +++ b/src/Common/cpu_features.cpp @@ -0,0 +1,101 @@ +#include "cpu_features.h" + +// wrappers with uniform prototype for implementation-specific x86 CPU id +#if defined(ARCH_X86_64) +#ifdef __GNUC__ +#include +#endif + +inline void cpuid(int cpuInfo[4], int functionId) { +#if defined(_MSC_VER) + __cpuid(cpuInfo, functionId); +#elif defined(__GNUC__) + __cpuid(functionId, cpuInfo[0], cpuInfo[1], cpuInfo[2], cpuInfo[3]); +#else +#error No definition for cpuid +#endif +} + +inline void cpuidex(int cpuInfo[4], int functionId, int subFunctionId) { +#if defined(_MSC_VER) + __cpuidex(cpuInfo, functionId, subFunctionId); +#elif defined(__GNUC__) + __cpuid_count(functionId, subFunctionId, cpuInfo[0], cpuInfo[1], cpuInfo[2], cpuInfo[3]); +#else +#error No definition for cpuidex +#endif +} +#endif + + +CPUFeaturesImpl::CPUFeaturesImpl() +{ +#if defined(ARCH_X86_64) + int cpuInfo[4]; + cpuid(cpuInfo, 0x80000001); + x86.lzcnt = ((cpuInfo[2] >> 5) & 1) != 0; + cpuid(cpuInfo, 0x1); + x86.movbe = ((cpuInfo[2] >> 22) & 1) != 0; + x86.avx = ((cpuInfo[2] >> 28) & 1) != 0; + x86.aesni = ((cpuInfo[2] >> 25) & 1) != 0; + x86.ssse3 = ((cpuInfo[2] >> 9) & 1) != 0; + x86.sse4_1 = ((cpuInfo[2] >> 19) & 1) != 0; + cpuidex(cpuInfo, 0x7, 0); + x86.avx2 = ((cpuInfo[1] >> 5) & 1) != 0; + x86.bmi2 = ((cpuInfo[1] >> 8) & 1) != 0; + cpuid(cpuInfo, 0x80000007); + x86.invariant_tsc = ((cpuInfo[3] >> 8) & 1); + // get CPU brand name + uint32_t nExIds, i = 0; + memset(m_cpuBrandName, 0, sizeof(m_cpuBrandName)); + cpuid(cpuInfo, 0x80000000); + nExIds = (uint32_t)cpuInfo[0]; + for (uint32_t i = 0x80000000; i <= nExIds; ++i) + { + cpuid(cpuInfo, i); + if (i == 0x80000002) + memcpy(m_cpuBrandName, cpuInfo, sizeof(cpuInfo)); + else if (i == 0x80000003) + memcpy(m_cpuBrandName + 16, cpuInfo, sizeof(cpuInfo)); + else if (i == 0x80000004) + memcpy(m_cpuBrandName + 32, cpuInfo, sizeof(cpuInfo)); + } +#endif +} + +std::string CPUFeaturesImpl::GetCPUName() +{ + return { m_cpuBrandName }; +} + +std::string CPUFeaturesImpl::GetCommaSeparatedExtensionList() +{ + std::string tmp; + auto appendExt = [&tmp](const char* str) + { + if (!tmp.empty()) + tmp.append(", "); + tmp.append(str); + }; + if (x86.ssse3) + appendExt("SSSE3"); + if (x86.sse4_1) + appendExt("SSE4.1"); + if (x86.avx) + appendExt("AVX"); + if (x86.avx2) + appendExt("AVX2"); + if (x86.lzcnt) + appendExt("LZCNT"); + if (x86.movbe) + appendExt("MOVBE"); + if (x86.bmi2) + appendExt("BMI2"); + if (x86.aesni) + appendExt("AES-NI"); + if(x86.invariant_tsc) + appendExt("INVARIANT-TSC"); + return tmp; +} + +CPUFeaturesImpl g_CPUFeatures; diff --git a/src/Common/cpu_features.h b/src/Common/cpu_features.h new file mode 100644 index 00000000..d2bec82c --- /dev/null +++ b/src/Common/cpu_features.h @@ -0,0 +1,26 @@ + +class CPUFeaturesImpl +{ +public: + CPUFeaturesImpl(); + + std::string GetCPUName(); // empty if not available + std::string GetCommaSeparatedExtensionList(); + + struct + { + bool ssse3{ false }; + bool sse4_1{ false }; + bool avx{ false }; + bool avx2{ false }; + bool lzcnt{ false }; + bool movbe{ false }; + bool bmi2{ false }; + bool aesni{ false }; + bool invariant_tsc{ false }; + }x86; +private: + char m_cpuBrandName[0x40]{ 0 }; +}; + +extern CPUFeaturesImpl g_CPUFeatures; \ No newline at end of file diff --git a/src/Common/precompiled.h b/src/Common/precompiled.h index 898a6883..1eceaabd 100644 --- a/src/Common/precompiled.h +++ b/src/Common/precompiled.h @@ -24,13 +24,22 @@ // } // #endif +// arch defines + +#if defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64) +#define ARCH_X86_64 +#endif + // c includes #include #include #include #include #include + +#if defined(ARCH_X86_64) #include +#endif // c++ includes #include @@ -106,11 +115,6 @@ using uint8le = uint8_t; #include "Cemu/Logging/CemuDebugLogging.h" #include "Cemu/Logging/CemuLogging.h" -// CPU extensions -extern bool _cpuExtension_SSSE3; -extern bool _cpuExtension_SSE4_1; -extern bool _cpuExtension_AVX2; - // manual endian-swapping #if _MSC_VER @@ -251,30 +255,35 @@ inline uint64 _udiv128(uint64 highDividend, uint64 lowDividend, uint64 divisor, #define NOEXPORT __attribute__ ((visibility ("hidden"))) #endif -#ifdef __GNUC__ -#include -#endif +// On aarch64 we handle some of the x86 intrinsics by implementing them as wrappers +#if defined(__aarch64__) -inline void cpuid(int cpuInfo[4], int functionId) { -#if defined(_MSC_VER) - __cpuid(cpuInfo, functionId); -#elif defined(__GNUC__) - __cpuid(functionId, cpuInfo[0], cpuInfo[1], cpuInfo[2], cpuInfo[3]); -#else - #error No definition for cpuid -#endif +inline void _mm_pause() +{ + asm volatile("yield"); } -inline void cpuidex(int cpuInfo[4], int functionId, int subFunctionId) { -#if defined(_MSC_VER) - __cpuidex(cpuInfo, functionId, subFunctionId); -#elif defined(__GNUC__) - __cpuid_count(functionId, subFunctionId, cpuInfo[0], cpuInfo[1], cpuInfo[2], cpuInfo[3]); -#else - #error No definition for cpuidex -#endif +inline uint64 __rdtsc() +{ + uint64 t; + asm volatile("mrs %0, cntvct_el0" : "=r" (t)); + return t; } +inline void _mm_mfence() +{ + +} + +inline unsigned char _addcarry_u64(unsigned char carry, unsigned long long a, unsigned long long b, unsigned long long *result) +{ + *result = a + b + (unsigned long long)carry; + if (*result < a) + return 1; + return 0; +} + +#endif // MEMPTR #include "Common/MemPtr.h" diff --git a/src/asm/CMakeLists.txt b/src/asm/CMakeLists.txt index ff7f2ab4..5d9f84c2 100644 --- a/src/asm/CMakeLists.txt +++ b/src/asm/CMakeLists.txt @@ -1,39 +1,47 @@ project(CemuAsm C) -if (WIN32) +if (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)") - enable_language(C ASM_MASM) + if (WIN32) - add_library(CemuAsm x64util_masm.asm) - set_source_files_properties(x64util_masm.asm PROPERTIES LANGUAGE ASM_MASM) + enable_language(C ASM_MASM) - # workaround for cr flag being passed to LINK.exe which considers it an input file and thus fails - # doesn't always seem to happen. The Windows CI builds were fine, but locally I would run into this problem - # possibly related to https://gitlab.kitware.com/cmake/cmake/-/issues/18889 - set(CMAKE_ASM_MASM_CREATE_STATIC_LIBRARY " /OUT: ") + add_library(CemuAsm x64util_masm.asm) + set_source_files_properties(x64util_masm.asm PROPERTIES LANGUAGE ASM_MASM) + # workaround for cr flag being passed to LINK.exe which considers it an input file and thus fails + # doesn't always seem to happen. The Windows CI builds were fine, but locally I would run into this problem + # possibly related to https://gitlab.kitware.com/cmake/cmake/-/issues/18889 + set(CMAKE_ASM_MASM_CREATE_STATIC_LIBRARY " /OUT: ") + + set_property(TARGET CemuAsm PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") + + else() + + # NASM + if (APPLE) + set(CMAKE_ASM_NASM_COMPILE_OBJECT " -g -Fdwarf -f macho64 --prefix _ -o ") + else() + set(CMAKE_ASM_NASM_COMPILE_OBJECT " -g -Fdwarf -f elf64 -o ") + endif() + set(CMAKE_ASM_NASM_LINK_EXECUTABLE "ld -fPIC -o ") + + enable_language(C ASM_NASM) + + add_library(CemuAsm x64util_nasm.asm) + set_source_files_properties(x64util_nasm.asm PROPERTIES LANGUAGE ASM_NASM) + + if (APPLE) + set_target_properties(CemuAsm PROPERTIES NASM_OBJ_FORMAT macho64) + else() + set_target_properties(CemuAsm PROPERTIES NASM_OBJ_FORMAT elf64) + endif() + set_target_properties(CemuAsm PROPERTIES LINKER_LANGUAGE C) + + endif() + +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "(aarch64)|(AARCH64)") + add_library(CemuAsm stub.cpp) else() - - # NASM - if (APPLE) - set(CMAKE_ASM_NASM_COMPILE_OBJECT " -g -Fdwarf -f macho64 --prefix _ -o ") - else() - set(CMAKE_ASM_NASM_COMPILE_OBJECT " -g -Fdwarf -f elf64 -o ") - endif() - set(CMAKE_ASM_NASM_LINK_EXECUTABLE "ld -fPIC -o ") - - enable_language(C ASM_NASM) - - add_library(CemuAsm x64util_nasm.asm) - set_source_files_properties(x64util_nasm.asm PROPERTIES LANGUAGE ASM_NASM) - - if (APPLE) - set_target_properties(CemuAsm PROPERTIES NASM_OBJ_FORMAT macho64) - else() - set_target_properties(CemuAsm PROPERTIES NASM_OBJ_FORMAT elf64) - endif() - set_target_properties(CemuAsm PROPERTIES LINKER_LANGUAGE C) - + message(STATUS "CemuAsm - Unsupported arch: ${CMAKE_SYSTEM_PROCESSOR}") endif() - -set_property(TARGET CemuAsm PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") diff --git a/src/asm/stub.cpp b/src/asm/stub.cpp new file mode 100644 index 00000000..8d1c8b69 --- /dev/null +++ b/src/asm/stub.cpp @@ -0,0 +1 @@ + diff --git a/src/asm/x64util.h b/src/asm/x64util.h index 32669b97..885c2f63 100644 --- a/src/asm/x64util.h +++ b/src/asm/x64util.h @@ -1,4 +1,20 @@ #pragma once +#if defined(ARCH_X86_64) + extern "C" void recompiler_fres(); extern "C" void recompiler_frsqrte(); + +#else + +// stubbed on non-x86 for now +static void recompiler_fres() +{ + cemu_assert_unimplemented(); +} +static void recompiler_frsqrte() +{ + cemu_assert_unimplemented(); +} + +#endif diff --git a/src/config/CemuConfig.cpp b/src/config/CemuConfig.cpp index 3774c1fa..feb3ac14 100644 --- a/src/config/CemuConfig.cpp +++ b/src/config/CemuConfig.cpp @@ -64,6 +64,7 @@ void CemuConfig::Load(XMLConfigParser& parser) save_screenshot = parser.get("save_screenshot", save_screenshot); did_show_vulkan_warning = parser.get("vk_warning", did_show_vulkan_warning); did_show_graphic_pack_download = parser.get("gp_download", did_show_graphic_pack_download); + did_show_macos_disclaimer = parser.get("macos_disclaimer", did_show_macos_disclaimer); fullscreen = parser.get("fullscreen", fullscreen); proxy_server = parser.get("proxy_server", ""); @@ -365,6 +366,7 @@ void CemuConfig::Save(XMLConfigParser& parser) config.set("save_screenshot", save_screenshot); config.set("vk_warning", did_show_vulkan_warning); config.set("gp_download", did_show_graphic_pack_download); + config.set("macos_disclaimer", did_show_macos_disclaimer); config.set("fullscreen", fullscreen); config.set("proxy_server", proxy_server.GetValue().c_str()); diff --git a/src/config/CemuConfig.h b/src/config/CemuConfig.h index 4de2001b..f3469374 100644 --- a/src/config/CemuConfig.h +++ b/src/config/CemuConfig.h @@ -407,6 +407,7 @@ struct CemuConfig ConfigValue did_show_vulkan_warning{false}; ConfigValue did_show_graphic_pack_download{false}; + ConfigValue did_show_macos_disclaimer{false}; int game_list_style = 0; std::string game_list_column_order; diff --git a/src/gui/CemuApp.cpp b/src/gui/CemuApp.cpp index 02a360a9..4a5974e5 100644 --- a/src/gui/CemuApp.cpp +++ b/src/gui/CemuApp.cpp @@ -174,6 +174,23 @@ bool CemuApp::OnInit() SetTopWindow(m_mainFrame); m_mainFrame->Show(); + + // show warning on macOS about state of builds +#if BOOST_OS_MACOS + if (!GetConfig().did_show_macos_disclaimer) + { + const auto message = _( + "Thank you for testing the in-development build of Cemu for macOS.\n \n" + "The macOS port is currently purely experimental and should not be considered stable or ready for issue-free gameplay. " + "There are also known issues with degraded performance due to the use of MoltenVk and Rosetta for ARM Macs. We appreciate your patience while we improve Cemu for macOS."); + wxMessageDialog dialog(nullptr, message, "Preview version", wxCENTRE | wxOK | wxICON_WARNING); + dialog.SetOKLabel(_("I understand")); + dialog.ShowModal(); + GetConfig().did_show_macos_disclaimer = true; + g_config.Save(); + } +#endif + return true; } diff --git a/src/main.cpp b/src/main.cpp index 1566a83f..80e30855 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -19,6 +19,7 @@ #include "Cafe/TitleList/SaveList.h" #include "Common/ExceptionHandler/ExceptionHandler.h" +#include "Common/cpu_features.h" #include #include "util/helpers/helpers.h" @@ -37,8 +38,13 @@ #define SDL_MAIN_HANDLED #include -#if BOOST_OS_LINUX || BOOST_OS_MACOS +#if BOOST_OS_LINUX #define _putenv(__s) putenv((char*)(__s)) +#include +#elif BOOST_OS_MACOS +#define _putenv(__s) putenv((char*)(__s)) +#include +#include #endif #if BOOST_OS_WINDOWS @@ -49,41 +55,32 @@ extern "C" } #endif -bool _cpuExtension_SSSE3 = false; -bool _cpuExtension_SSE4_1 = false; -bool _cpuExtension_AVX2 = false; - std::atomic_bool g_isGPUInitFinished = false; std::wstring executablePath; void logCPUAndMemoryInfo() { - #if BOOST_OS_WINDOWS - int CPUInfo[4] = { -1 }; - unsigned nExIds, i = 0; - char CPUBrandString[0x40]; - // Get the information associated with each extended ID. - cpuid(CPUInfo, 0x80000000); - nExIds = CPUInfo[0]; - for (i = 0x80000000; i <= nExIds; ++i) - { - cpuid(CPUInfo, i); - // Interpret CPU brand string - if (i == 0x80000002) - memcpy(CPUBrandString, CPUInfo, sizeof(CPUInfo)); - else if (i == 0x80000003) - memcpy(CPUBrandString + 16, CPUInfo, sizeof(CPUInfo)); - else if (i == 0x80000004) - memcpy(CPUBrandString + 32, CPUInfo, sizeof(CPUInfo)); - } - forceLog_printf("CPU: %s", CPUBrandString); + std::string cpuName = g_CPUFeatures.GetCPUName(); + if (!cpuName.empty()) + cemuLog_log(LogType::Force, "CPU: {}", cpuName); + #if BOOST_OS_WINDOWS MEMORYSTATUSEX statex; statex.dwLength = sizeof(statex); GlobalMemoryStatusEx(&statex); uint32 memoryInMB = (uint32)(statex.ullTotalPhys / 1024LL / 1024LL); forceLog_printf("RAM: %uMB", memoryInMB); + #elif BOOST_OS_LINUX + struct sysinfo info {}; + sysinfo(&info); + cemuLog_log(LogType::Force, "RAM: {}MB", ((static_cast(info.totalram) * info.mem_unit) / 1024LL / 1024LL)); + #elif BOOST_OS_MACOS + int64_t totalRam; + size_t size = sizeof(totalRam); + int result = sysctlbyname("hw.memsize", &totalRam, &size, NULL, 0); + if (result == 0) + cemuLog_log(LogType::Force, "RAM: {}MB", (totalRam / 1024LL / 1024LL)); #endif } @@ -120,32 +117,7 @@ void infoLog_cemuStartup() checkForWine(); // CPU and RAM info logCPUAndMemoryInfo(); - // extensions that Cemu uses - char cpuExtensionStr[256]; - strcpy(cpuExtensionStr, ""); - if (_cpuExtension_SSSE3) - { - strcat(cpuExtensionStr, "SSSE3"); - } - if (_cpuExtension_SSE4_1) - { - if (cpuExtensionStr[0] != '\0') - strcat(cpuExtensionStr, ", "); - strcat(cpuExtensionStr, "SSE4.1"); - } - if (_cpuExtension_AVX2) - { - if (cpuExtensionStr[0] != '\0') - strcat(cpuExtensionStr, ", "); - strcat(cpuExtensionStr, "AVX2"); - } - if (AES128_useAESNI()) - { - if (cpuExtensionStr[0] != '\0') - strcat(cpuExtensionStr, ", "); - strcat(cpuExtensionStr, "AES-NI"); - } - cemuLog_force("Used CPU extensions: {}", cpuExtensionStr); + cemuLog_log(LogType::Force, "Used CPU extensions: {}", g_CPUFeatures.GetCommaSeparatedExtensionList()); } // some implementations of _putenv dont copy the string and instead only store a pointer @@ -194,14 +166,6 @@ void mainEmulatorCommonInit() AES128_init(); // init PPC timer (call this as early as possible because it measures frequency of RDTSC using an asynchronous thread over 3 seconds) PPCTimer_init(); - // check available CPU extensions - int cpuInfo[4]; - cpuid(cpuInfo, 0x1); - _cpuExtension_SSSE3 = ((cpuInfo[2] >> 9) & 1) != 0; - _cpuExtension_SSE4_1 = ((cpuInfo[2] >> 19) & 1) != 0; - - cpuidex(cpuInfo, 0x7, 0); - _cpuExtension_AVX2 = ((cpuInfo[1] >> 5) & 1) != 0; #if BOOST_OS_WINDOWS executablePath.resize(4096); @@ -382,4 +346,4 @@ int main(int argc, char *argv[]) extern "C" DLLEXPORT uint64 gameMeta_getTitleId() { return CafeSystem::GetForegroundTitleId(); -} \ No newline at end of file +} diff --git a/src/util/crypto/aes128.cpp b/src/util/crypto/aes128.cpp index c8da6c8e..8ab7b608 100644 --- a/src/util/crypto/aes128.cpp +++ b/src/util/crypto/aes128.cpp @@ -10,6 +10,7 @@ /* Includes: */ /*****************************************************************************/ #include "aes128.h" +#include "Common/cpu_features.h" /*****************************************************************************/ /* Defines: */ @@ -23,8 +24,6 @@ // The number of rounds in AES Cipher. #define Nr 10 -bool useAESNI = false; - typedef uint8 state_t[4][4]; typedef struct @@ -601,6 +600,7 @@ void AES128_CBC_decrypt_updateIV(uint8* output, uint8* input, uint32 length, con memcpy(iv, newIv, KEYLEN); } +#if defined(ARCH_X86_64) inline __m128i AESNI128_ASSIST( __m128i temp1, __m128i temp2) @@ -792,6 +792,7 @@ void __aesni__AES128_ECB_encrypt(uint8* input, const uint8* key, uint8* output) feedback = _mm_aesenclast_si128(feedback, ((__m128i*)expandedKey)[10]); _mm_storeu_si128(&((__m128i*)output)[0], feedback); } +#endif void(*AES128_ECB_encrypt)(uint8* input, const uint8* key, uint8* output); void (*AES128_CBC_decrypt)(uint8* output, uint8* input, uint32 length, const uint8* key, const uint8* iv) = nullptr; @@ -836,10 +837,8 @@ void AES128_init() lookupTable_multiply[i] = (vE << 0) | (v9 << 8) | (vD << 16) | (vB << 24); } // check if AES-NI is available - int v[4]; - cpuid(v, 1); - useAESNI = (v[2] & 0x2000000) != 0; - if (useAESNI) + #if defined(ARCH_X86_64) + if (g_CPUFeatures.x86.aesni) { // AES-NI implementation AES128_CBC_decrypt = __aesni__AES128_CBC_decrypt; @@ -851,9 +850,8 @@ void AES128_init() AES128_CBC_decrypt = __soft__AES128_CBC_decrypt; AES128_ECB_encrypt = __soft__AES128_ECB_encrypt; } + #else + AES128_CBC_decrypt = __soft__AES128_CBC_decrypt; + AES128_ECB_encrypt = __soft__AES128_ECB_encrypt; + #endif } - -bool AES128_useAESNI() -{ - return useAESNI; -} \ No newline at end of file diff --git a/src/util/crypto/aes128.h b/src/util/crypto/aes128.h index fbe56541..7993329c 100644 --- a/src/util/crypto/aes128.h +++ b/src/util/crypto/aes128.h @@ -2,7 +2,6 @@ #define _AES_H_ void AES128_init(); -bool AES128_useAESNI(); extern void(*AES128_ECB_encrypt)(uint8* input, const uint8* key, uint8* output); diff --git a/src/util/helpers/fspinlock.h b/src/util/helpers/fspinlock.h index 4fa642f4..3994299f 100644 --- a/src/util/helpers/fspinlock.h +++ b/src/util/helpers/fspinlock.h @@ -19,7 +19,8 @@ public: { if (!m_lockBool.exchange(true, std::memory_order_acquire)) break; - while (m_lockBool.load(std::memory_order_relaxed)) _mm_pause(); + while (m_lockBool.load(std::memory_order_relaxed)) + _mm_pause(); } } @@ -36,4 +37,4 @@ public: private: mutable std::atomic m_lockBool = false; -}; \ No newline at end of file +};