Make codebase more CPU-agnostic + MacOS disclaimer (#559)

This commit is contained in:
Exzap 2022-12-07 00:48:24 +00:00 committed by GitHub
parent 445b0afa95
commit 2c81d240a5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
26 changed files with 416 additions and 272 deletions

View File

@ -2,10 +2,6 @@
project ("ih264d") project ("ih264d")
set(LIBAVCDEC_X86_INCLUDES "common/x86" "decoder/x86")
include_directories("common/" "decoder/" ${LIBAVCDEC_X86_INCLUDES})
add_library (ih264d add_library (ih264d
"common/ih264_buf_mgr.c" "common/ih264_buf_mgr.c"
"common/ih264_buf_mgr.h" "common/ih264_buf_mgr.h"
@ -53,21 +49,6 @@ add_library (ih264d
"common/ih264_weighted_pred.h" "common/ih264_weighted_pred.h"
"common/ithread.c" "common/ithread.c"
"common/ithread.h" "common/ithread.h"
"common/x86/ih264_chroma_intra_pred_filters_ssse3.c"
"common/x86/ih264_deblk_chroma_ssse3.c"
"common/x86/ih264_deblk_luma_ssse3.c"
"common/x86/ih264_ihadamard_scaling_sse42.c"
"common/x86/ih264_ihadamard_scaling_ssse3.c"
"common/x86/ih264_inter_pred_filters_ssse3.c"
"common/x86/ih264_iquant_itrans_recon_dc_ssse3.c"
"common/x86/ih264_iquant_itrans_recon_sse42.c"
"common/x86/ih264_iquant_itrans_recon_ssse3.c"
"common/x86/ih264_luma_intra_pred_filters_ssse3.c"
"common/x86/ih264_mem_fns_ssse3.c"
"common/x86/ih264_padding_ssse3.c"
"common/x86/ih264_platform_macros.h"
"common/x86/ih264_resi_trans_quant_sse42.c"
"common/x86/ih264_weighted_pred_sse42.c"
"decoder/ih264d.h" "decoder/ih264d.h"
"decoder/ih264d_api.c" "decoder/ih264d_api.c"
"decoder/ih264d_bitstrm.c" "decoder/ih264d_bitstrm.c"
@ -134,10 +115,71 @@ add_library (ih264d
"decoder/ih264d_vui.h" "decoder/ih264d_vui.h"
"decoder/iv.h" "decoder/iv.h"
"decoder/ivd.h" "decoder/ivd.h"
)
if (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "amd64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64")
set(LIBAVCDEC_X86_INCLUDES "common/x86" "decoder/x86")
include_directories("common/" "decoder/" ${LIBAVCDEC_X86_INCLUDES})
target_sources(ih264d PRIVATE
"common/x86/ih264_chroma_intra_pred_filters_ssse3.c"
"common/x86/ih264_deblk_chroma_ssse3.c"
"common/x86/ih264_deblk_luma_ssse3.c"
"common/x86/ih264_ihadamard_scaling_sse42.c"
"common/x86/ih264_ihadamard_scaling_ssse3.c"
"common/x86/ih264_inter_pred_filters_ssse3.c"
"common/x86/ih264_iquant_itrans_recon_dc_ssse3.c"
"common/x86/ih264_iquant_itrans_recon_sse42.c"
"common/x86/ih264_iquant_itrans_recon_ssse3.c"
"common/x86/ih264_luma_intra_pred_filters_ssse3.c"
"common/x86/ih264_mem_fns_ssse3.c"
"common/x86/ih264_padding_ssse3.c"
"common/x86/ih264_platform_macros.h"
"common/x86/ih264_resi_trans_quant_sse42.c"
"common/x86/ih264_weighted_pred_sse42.c"
"decoder/x86/ih264d_function_selector.c" "decoder/x86/ih264d_function_selector.c"
"decoder/x86/ih264d_function_selector_sse42.c" "decoder/x86/ih264d_function_selector_sse42.c"
"decoder/x86/ih264d_function_selector_ssse3.c" "decoder/x86/ih264d_function_selector_ssse3.c"
) )
elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
enable_language( C CXX ASM )
set(LIBAVCDEC_ARM_INCLUDES "common/armv8" "decoder/arm")
include_directories("common/" "decoder/" ${LIBAVCDEC_ARM_INCLUDES})
target_sources(ih264d PRIVATE
"common/armv8/ih264_deblk_chroma_av8.s"
"common/armv8/ih264_deblk_luma_av8.s"
"common/armv8/ih264_default_weighted_pred_av8.s"
"common/armv8/ih264_ihadamard_scaling_av8.s"
"common/armv8/ih264_inter_pred_chroma_av8.s"
"common/armv8/ih264_inter_pred_filters_luma_horz_av8.s"
"common/armv8/ih264_inter_pred_filters_luma_vert_av8.s"
"common/armv8/ih264_inter_pred_luma_copy_av8.s"
"common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s"
"common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s"
"common/armv8/ih264_inter_pred_luma_horz_qpel_av8.s"
"common/armv8/ih264_inter_pred_luma_horz_qpel_vert_hpel_av8.s"
"common/armv8/ih264_inter_pred_luma_horz_qpel_vert_qpel_av8.s"
"common/armv8/ih264_inter_pred_luma_vert_qpel_av8.s"
"common/armv8/ih264_intra_pred_chroma_av8.s"
"common/armv8/ih264_intra_pred_luma_16x16_av8.s"
"common/armv8/ih264_intra_pred_luma_4x4_av8.s"
"common/armv8/ih264_intra_pred_luma_8x8_av8.s"
"common/armv8/ih264_iquant_itrans_recon_av8.s"
"common/armv8/ih264_iquant_itrans_recon_dc_av8.s"
"common/armv8/ih264_mem_fns_neon_av8.s"
"common/armv8/ih264_neon_macros.s"
"common/armv8/ih264_padding_neon_av8.s"
"common/armv8/ih264_platform_macros.h"
"common/armv8/ih264_resi_trans_quant_av8.s"
"common/armv8/ih264_weighted_bi_pred_av8.s"
"common/armv8/ih264_weighted_pred_av8.s"
"decoder/arm/ih264d_function_selector_a9q.c"
"decoder/arm/ih264d_function_selector_av8.c"
"decoder/arm/ih264d_function_selector.c"
)
target_compile_options(ih264d PRIVATE -DARMV8)
else()
message(FATAL_ERROR "ih264d unknown architecture: ${CMAKE_SYSTEM_PROCESSOR}")
endif()
if(MSVC) if(MSVC)
set_property(TARGET ih264d PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>") set_property(TARGET ih264d PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>")

View File

@ -83,7 +83,7 @@ void hleExport_xcx_enterCriticalSection(PPCInterpreter_t* hCPU)
osLib_returnFromFunction(hCPU, 0); osLib_returnFromFunction(hCPU, 0);
return; return;
} }
_mm_pause(); _mm_pause();
} }
PPCCore_switchToScheduler(); PPCCore_switchToScheduler();
} }

View File

@ -203,7 +203,6 @@ extern uint64 ppcMainThreadDECCycleStart; // at which cycle the dec register was
void PPCTimer_init(); void PPCTimer_init();
void PPCTimer_waitForInit(); void PPCTimer_waitForInit();
uint64 PPCTimer_getFromRDTSC(); uint64 PPCTimer_getFromRDTSC();
bool PPCTimer_hasInvariantRDTSCSupport();
uint64 PPCTimer_microsecondsToTsc(uint64 us); uint64 PPCTimer_microsecondsToTsc(uint64 us);
uint64 PPCTimer_tscToMicroseconds(uint64 us); uint64 PPCTimer_tscToMicroseconds(uint64 us);

View File

@ -1,9 +1,14 @@
#include "Cafe/HW/Espresso/Const.h" #include "Cafe/HW/Espresso/Const.h"
#include <immintrin.h>
#include "asm/x64util.h" #include "asm/x64util.h"
#include "config/ActiveSettings.h" #include "config/ActiveSettings.h"
#include "util/helpers/fspinlock.h" #include "util/helpers/fspinlock.h"
#include "util/highresolutiontimer/HighResolutionTimer.h" #include "util/highresolutiontimer/HighResolutionTimer.h"
#include "Common/cpu_features.h"
#if defined(ARCH_X86_64)
#include <immintrin.h>
#pragma intrinsic(__rdtsc)
#endif
uint64 _rdtscLastMeasure = 0; uint64 _rdtscLastMeasure = 0;
uint64 _rdtscFrequency = 0; uint64 _rdtscFrequency = 0;
@ -18,8 +23,6 @@ static_assert(sizeof(uint128_t) == 16);
uint128_t _rdtscAcc{}; uint128_t _rdtscAcc{};
#pragma intrinsic(__rdtsc)
uint64 muldiv64(uint64 a, uint64 b, uint64 d) uint64 muldiv64(uint64 a, uint64 b, uint64 d)
{ {
uint64 diva = a / d; uint64 diva = a / d;
@ -29,17 +32,12 @@ uint64 muldiv64(uint64 a, uint64 b, uint64 d)
return diva * b + moda * divb + moda * modb / d; return diva * b + moda * divb + moda * modb / d;
} }
bool PPCTimer_hasInvariantRDTSCSupport()
{
uint32 cpuv[4];
cpuid((int*)cpuv, 0x80000007);
return ((cpuv[3] >> 8) & 1);
}
uint64 PPCTimer_estimateRDTSCFrequency() uint64 PPCTimer_estimateRDTSCFrequency()
{ {
if (PPCTimer_hasInvariantRDTSCSupport() == false) #if defined(ARCH_X86_64)
forceLog_printf("Invariant TSC not supported"); if (!g_CPUFeatures.x86.invariant_tsc)
cemuLog_log(LogType::Force, "Invariant TSC not supported");
#endif
_mm_mfence(); _mm_mfence();
uint64 tscStart = __rdtsc(); uint64 tscStart = __rdtsc();

View File

@ -8,11 +8,10 @@
#include "Cafe/OS/libs/coreinit/coreinit_CodeGen.h" #include "Cafe/OS/libs/coreinit/coreinit_CodeGen.h"
#include "config/ActiveSettings.h" #include "config/ActiveSettings.h"
#include "config/LaunchSettings.h" #include "config/LaunchSettings.h"
#include "util/helpers/fspinlock.h"
#include "Common/ExceptionHandler/ExceptionHandler.h" #include "Common/ExceptionHandler/ExceptionHandler.h"
#include "Common/cpu_features.h"
#include "util/helpers/fspinlock.h"
#include "util/helpers/helpers.h" #include "util/helpers/helpers.h"
#include "util/MemMapper/MemMapper.h" #include "util/MemMapper/MemMapper.h"
struct PPCInvalidationRange struct PPCInvalidationRange
@ -461,6 +460,20 @@ void PPCRecompiler_invalidateRange(uint32 startAddr, uint32 endAddr)
PPCRecompilerState.recompilerSpinlock.unlock(); PPCRecompilerState.recompilerSpinlock.unlock();
} }
#if defined(ARCH_X86_64)
void PPCRecompiler_initPlatform()
{
// mxcsr
ppcRecompilerInstanceData->_x64XMM_mxCsr_ftzOn = 0x1F80 | 0x8000;
ppcRecompilerInstanceData->_x64XMM_mxCsr_ftzOff = 0x1F80;
}
#else
void PPCRecompiler_initPlatform()
{
}
#endif
void PPCRecompiler_init() void PPCRecompiler_init()
{ {
if (ActiveSettings::GetCPUMode() == CPUMode::SinglecoreInterpreter) if (ActiveSettings::GetCPUMode() == CPUMode::SinglecoreInterpreter)
@ -569,21 +582,9 @@ void PPCRecompiler_init()
ppcRecompilerInstanceData->_psq_st_scale_ps0_ps1[(i + 32) * 2 + 1] = br; ppcRecompilerInstanceData->_psq_st_scale_ps0_ps1[(i + 32) * 2 + 1] = br;
} }
// mxcsr PPCRecompiler_initPlatform();
ppcRecompilerInstanceData->_x64XMM_mxCsr_ftzOn = 0x1F80 | 0x8000;
ppcRecompilerInstanceData->_x64XMM_mxCsr_ftzOff = 0x1F80; forceLog_printf("Recompiler initialized");
// query processor extensions
int cpuInfo[4];
cpuid(cpuInfo, 0x80000001);
hasLZCNTSupport = ((cpuInfo[2] >> 5) & 1) != 0;
cpuid(cpuInfo, 0x1);
hasMOVBESupport = ((cpuInfo[2] >> 22) & 1) != 0;
hasAVXSupport = ((cpuInfo[2] >> 28) & 1) != 0;
cpuidex(cpuInfo, 0x7, 0);
hasBMI2Support = ((cpuInfo[1] >> 8) & 1) != 0;
forceLog_printf("Recompiler initialized. CPU extensions: %s%s%s", hasLZCNTSupport ? "LZCNT " : "", hasMOVBESupport ? "MOVBE " : "", hasAVXSupport ? "AVX " : "");
ppcRecompilerEnabled = true; ppcRecompilerEnabled = true;

View File

@ -384,12 +384,6 @@ extern void ATTR_MS_ABI (*PPCRecompiler_leaveRecompilerCode_unvisited)();
#define PPC_REC_INVALID_FUNCTION ((PPCRecFunction_t*)-1) #define PPC_REC_INVALID_FUNCTION ((PPCRecFunction_t*)-1)
// CPUID
extern bool hasLZCNTSupport;
extern bool hasMOVBESupport;
extern bool hasBMI2Support;
extern bool hasAVXSupport;
// todo - move some of the stuff above into PPCRecompilerInternal.h // todo - move some of the stuff above into PPCRecompilerInternal.h
// recompiler interface // recompiler interface

View File

@ -3,14 +3,6 @@
#include "PPCRecompilerIml.h" #include "PPCRecompilerIml.h"
#include "Cafe/GameProfile/GameProfile.h" #include "Cafe/GameProfile/GameProfile.h"
bool hasSSE1Support = true;
bool hasSSE2Support = true;
bool hasSSE3Support = true;
bool hasLZCNTSupport = false;
bool hasMOVBESupport = false;
bool hasBMI2Support = false;
bool hasAVXSupport = false;
void PPCRecompilerImlGen_generateNewInstruction_fpr_r_memory(ppcImlGenContext_t* ppcImlGenContext, uint8 registerDestination, uint8 registerMemory, sint32 immS32, uint32 mode, bool switchEndian, uint8 registerGQR = PPC_REC_INVALID_REGISTER) void PPCRecompilerImlGen_generateNewInstruction_fpr_r_memory(ppcImlGenContext_t* ppcImlGenContext, uint8 registerDestination, uint8 registerMemory, sint32 immS32, uint32 mode, bool switchEndian, uint8 registerGQR = PPC_REC_INVALID_REGISTER)
{ {
// load from memory // load from memory
@ -145,8 +137,6 @@ void PPRecompilerImmGen_optionalRoundPairFPRToSinglePrecision(ppcImlGenContext_t
bool PPCRecompilerImlGen_LFS(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode) bool PPCRecompilerImlGen_LFS(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode)
{ {
if( hasSSE1Support == false )
return false;
sint32 rA, frD; sint32 rA, frD;
uint32 imm; uint32 imm;
PPC_OPC_TEMPL_D_SImm(opcode, frD, rA, imm); PPC_OPC_TEMPL_D_SImm(opcode, frD, rA, imm);
@ -167,8 +157,6 @@ bool PPCRecompilerImlGen_LFS(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode
bool PPCRecompilerImlGen_LFSU(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode) bool PPCRecompilerImlGen_LFSU(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode)
{ {
if( hasSSE1Support == false )
return false;
sint32 rA, frD; sint32 rA, frD;
uint32 imm; uint32 imm;
PPC_OPC_TEMPL_D_SImm(opcode, frD, rA, imm); PPC_OPC_TEMPL_D_SImm(opcode, frD, rA, imm);
@ -191,8 +179,6 @@ bool PPCRecompilerImlGen_LFSU(ppcImlGenContext_t* ppcImlGenContext, uint32 opcod
bool PPCRecompilerImlGen_LFSX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode) bool PPCRecompilerImlGen_LFSX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode)
{ {
if( hasSSE2Support == false )
return false;
sint32 rA, frD, rB; sint32 rA, frD, rB;
PPC_OPC_TEMPL_X(opcode, frD, rA, rB); PPC_OPC_TEMPL_X(opcode, frD, rA, rB);
if( rA == 0 ) if( rA == 0 )
@ -218,8 +204,6 @@ bool PPCRecompilerImlGen_LFSX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcod
bool PPCRecompilerImlGen_LFSUX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode) bool PPCRecompilerImlGen_LFSUX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode)
{ {
if( hasSSE2Support == false )
return false;
sint32 rA, frD, rB; sint32 rA, frD, rB;
PPC_OPC_TEMPL_X(opcode, frD, rA, rB); PPC_OPC_TEMPL_X(opcode, frD, rA, rB);
if( rA == 0 ) if( rA == 0 )
@ -247,8 +231,6 @@ bool PPCRecompilerImlGen_LFSUX(ppcImlGenContext_t* ppcImlGenContext, uint32 opco
bool PPCRecompilerImlGen_LFD(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode) bool PPCRecompilerImlGen_LFD(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode)
{ {
if( hasSSE1Support == false )
return false;
sint32 rA, frD; sint32 rA, frD;
uint32 imm; uint32 imm;
PPC_OPC_TEMPL_D_SImm(opcode, frD, rA, imm); PPC_OPC_TEMPL_D_SImm(opcode, frD, rA, imm);
@ -266,8 +248,6 @@ bool PPCRecompilerImlGen_LFD(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode
bool PPCRecompilerImlGen_LFDU(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode) bool PPCRecompilerImlGen_LFDU(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode)
{ {
if( hasSSE1Support == false )
return false;
sint32 rA, frD; sint32 rA, frD;
uint32 imm; uint32 imm;
PPC_OPC_TEMPL_D_SImm(opcode, frD, rA, imm); PPC_OPC_TEMPL_D_SImm(opcode, frD, rA, imm);
@ -288,8 +268,6 @@ bool PPCRecompilerImlGen_LFDU(ppcImlGenContext_t* ppcImlGenContext, uint32 opcod
bool PPCRecompilerImlGen_LFDX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode) bool PPCRecompilerImlGen_LFDX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode)
{ {
if( hasSSE2Support == false )
return false;
sint32 rA, frD, rB; sint32 rA, frD, rB;
PPC_OPC_TEMPL_X(opcode, frD, rA, rB); PPC_OPC_TEMPL_X(opcode, frD, rA, rB);
if( rA == 0 ) if( rA == 0 )
@ -308,8 +286,6 @@ bool PPCRecompilerImlGen_LFDX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcod
bool PPCRecompilerImlGen_LFDUX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode) bool PPCRecompilerImlGen_LFDUX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode)
{ {
if( hasSSE2Support == false )
return false;
sint32 rA, frD, rB; sint32 rA, frD, rB;
PPC_OPC_TEMPL_X(opcode, frD, rA, rB); PPC_OPC_TEMPL_X(opcode, frD, rA, rB);
if( rA == 0 ) if( rA == 0 )
@ -330,8 +306,6 @@ bool PPCRecompilerImlGen_LFDUX(ppcImlGenContext_t* ppcImlGenContext, uint32 opco
bool PPCRecompilerImlGen_STFS(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode) bool PPCRecompilerImlGen_STFS(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode)
{ {
if( hasSSE1Support == false )
return false;
sint32 rA, frD; sint32 rA, frD;
uint32 imm; uint32 imm;
PPC_OPC_TEMPL_D_SImm(opcode, frD, rA, imm); PPC_OPC_TEMPL_D_SImm(opcode, frD, rA, imm);
@ -346,8 +320,6 @@ bool PPCRecompilerImlGen_STFS(ppcImlGenContext_t* ppcImlGenContext, uint32 opcod
bool PPCRecompilerImlGen_STFSU(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode) bool PPCRecompilerImlGen_STFSU(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode)
{ {
if( hasSSE1Support == false )
return false;
sint32 rA, frD; sint32 rA, frD;
uint32 imm; uint32 imm;
PPC_OPC_TEMPL_D_SImm(opcode, frD, rA, imm); PPC_OPC_TEMPL_D_SImm(opcode, frD, rA, imm);
@ -364,8 +336,6 @@ bool PPCRecompilerImlGen_STFSU(ppcImlGenContext_t* ppcImlGenContext, uint32 opco
bool PPCRecompilerImlGen_STFSX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode) bool PPCRecompilerImlGen_STFSX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode)
{ {
if( hasSSE2Support == false )
return false;
sint32 rA, frS, rB; sint32 rA, frS, rB;
PPC_OPC_TEMPL_X(opcode, frS, rA, rB); PPC_OPC_TEMPL_X(opcode, frS, rA, rB);
if( rA == 0 ) if( rA == 0 )
@ -392,8 +362,6 @@ bool PPCRecompilerImlGen_STFSX(ppcImlGenContext_t* ppcImlGenContext, uint32 opco
bool PPCRecompilerImlGen_STFSUX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode) bool PPCRecompilerImlGen_STFSUX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode)
{ {
if( hasSSE2Support == false )
return false;
sint32 rA, frS, rB; sint32 rA, frS, rB;
PPC_OPC_TEMPL_X(opcode, frS, rA, rB); PPC_OPC_TEMPL_X(opcode, frS, rA, rB);
if( rA == 0 ) if( rA == 0 )
@ -415,8 +383,6 @@ bool PPCRecompilerImlGen_STFSUX(ppcImlGenContext_t* ppcImlGenContext, uint32 opc
bool PPCRecompilerImlGen_STFD(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode) bool PPCRecompilerImlGen_STFD(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode)
{ {
if( hasSSE1Support == false )
return false;
sint32 rA, frD; sint32 rA, frD;
uint32 imm; uint32 imm;
PPC_OPC_TEMPL_D_SImm(opcode, frD, rA, imm); PPC_OPC_TEMPL_D_SImm(opcode, frD, rA, imm);
@ -435,8 +401,6 @@ bool PPCRecompilerImlGen_STFD(ppcImlGenContext_t* ppcImlGenContext, uint32 opcod
bool PPCRecompilerImlGen_STFDU(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode) bool PPCRecompilerImlGen_STFDU(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode)
{ {
if( hasSSE1Support == false )
return false;
sint32 rA, frD; sint32 rA, frD;
uint32 imm; uint32 imm;
PPC_OPC_TEMPL_D_SImm(opcode, frD, rA, imm); PPC_OPC_TEMPL_D_SImm(opcode, frD, rA, imm);
@ -458,8 +422,6 @@ bool PPCRecompilerImlGen_STFDU(ppcImlGenContext_t* ppcImlGenContext, uint32 opco
bool PPCRecompilerImlGen_STFDX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode) bool PPCRecompilerImlGen_STFDX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode)
{ {
if( hasSSE2Support == false )
return false;
sint32 rA, frS, rB; sint32 rA, frS, rB;
PPC_OPC_TEMPL_X(opcode, frS, rA, rB); PPC_OPC_TEMPL_X(opcode, frS, rA, rB);
if( rA == 0 ) if( rA == 0 )
@ -485,8 +447,6 @@ bool PPCRecompilerImlGen_STFDX(ppcImlGenContext_t* ppcImlGenContext, uint32 opco
bool PPCRecompilerImlGen_STFIWX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode) bool PPCRecompilerImlGen_STFIWX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode)
{ {
if( hasSSE2Support == false )
return false;
sint32 rA, frS, rB; sint32 rA, frS, rB;
PPC_OPC_TEMPL_X(opcode, frS, rA, rB); PPC_OPC_TEMPL_X(opcode, frS, rA, rB);
// get memory gpr registers // get memory gpr registers
@ -959,10 +919,6 @@ bool PPCRecompilerImlGen_FCMPO(ppcImlGenContext_t* ppcImlGenContext, uint32 opco
sint32 crfD, frA, frB; sint32 crfD, frA, frB;
PPC_OPC_TEMPL_X(opcode, crfD, frA, frB); PPC_OPC_TEMPL_X(opcode, crfD, frA, frB);
crfD >>= 2; crfD >>= 2;
if( hasSSE2Support == false )
{
return false;
}
uint32 fprRegisterA = PPCRecompilerImlGen_loadFPRRegister(ppcImlGenContext, PPCREC_NAME_FPR0+frA); uint32 fprRegisterA = PPCRecompilerImlGen_loadFPRRegister(ppcImlGenContext, PPCREC_NAME_FPR0+frA);
uint32 fprRegisterB = PPCRecompilerImlGen_loadFPRRegister(ppcImlGenContext, PPCREC_NAME_FPR0+frB); uint32 fprRegisterB = PPCRecompilerImlGen_loadFPRRegister(ppcImlGenContext, PPCREC_NAME_FPR0+frB);
PPCRecompilerImlGen_generateNewInstruction_fpr_r_r(ppcImlGenContext, PPCREC_IML_OP_FPR_FCMPO_BOTTOM, fprRegisterA, fprRegisterB, crfD); PPCRecompilerImlGen_generateNewInstruction_fpr_r_r(ppcImlGenContext, PPCREC_IML_OP_FPR_FCMPO_BOTTOM, fprRegisterA, fprRegisterB, crfD);
@ -974,10 +930,6 @@ bool PPCRecompilerImlGen_FCMPU(ppcImlGenContext_t* ppcImlGenContext, uint32 opco
sint32 crfD, frA, frB; sint32 crfD, frA, frB;
PPC_OPC_TEMPL_X(opcode, crfD, frA, frB); PPC_OPC_TEMPL_X(opcode, crfD, frA, frB);
crfD >>= 2; crfD >>= 2;
if( hasSSE2Support == false )
{
return false;
}
uint32 fprRegisterA = PPCRecompilerImlGen_loadFPRRegister(ppcImlGenContext, PPCREC_NAME_FPR0+frA); uint32 fprRegisterA = PPCRecompilerImlGen_loadFPRRegister(ppcImlGenContext, PPCREC_NAME_FPR0+frA);
uint32 fprRegisterB = PPCRecompilerImlGen_loadFPRRegister(ppcImlGenContext, PPCREC_NAME_FPR0+frB); uint32 fprRegisterB = PPCRecompilerImlGen_loadFPRRegister(ppcImlGenContext, PPCREC_NAME_FPR0+frB);
PPCRecompilerImlGen_generateNewInstruction_fpr_r_r(ppcImlGenContext, PPCREC_IML_OP_FPR_FCMPU_BOTTOM, fprRegisterA, fprRegisterB, crfD); PPCRecompilerImlGen_generateNewInstruction_fpr_r_r(ppcImlGenContext, PPCREC_IML_OP_FPR_FCMPU_BOTTOM, fprRegisterA, fprRegisterB, crfD);
@ -1120,8 +1072,6 @@ bool PPCRecompilerImlGen_FCTIWZ(ppcImlGenContext_t* ppcImlGenContext, uint32 opc
bool PPCRecompilerImlGen_PSQ_L(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode) bool PPCRecompilerImlGen_PSQ_L(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode)
{ {
if (hasSSE2Support == false)
return false;
int rA, frD; int rA, frD;
uint32 immUnused; uint32 immUnused;
PPC_OPC_TEMPL_D_SImm(opcode, frD, rA, immUnused); PPC_OPC_TEMPL_D_SImm(opcode, frD, rA, immUnused);
@ -1146,8 +1096,6 @@ bool PPCRecompilerImlGen_PSQ_L(ppcImlGenContext_t* ppcImlGenContext, uint32 opco
bool PPCRecompilerImlGen_PSQ_LU(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode) bool PPCRecompilerImlGen_PSQ_LU(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode)
{ {
if (hasSSE2Support == false)
return false;
int rA, frD; int rA, frD;
uint32 immUnused; uint32 immUnused;
PPC_OPC_TEMPL_D_SImm(opcode, frD, rA, immUnused); PPC_OPC_TEMPL_D_SImm(opcode, frD, rA, immUnused);

View File

@ -5,8 +5,8 @@
#include "PPCRecompilerIml.h" #include "PPCRecompilerIml.h"
#include "PPCRecompilerX64.h" #include "PPCRecompilerX64.h"
#include "Cafe/OS/libs/coreinit/coreinit_Time.h" #include "Cafe/OS/libs/coreinit/coreinit_Time.h"
#include "util/MemMapper/MemMapper.h" #include "util/MemMapper/MemMapper.h"
#include "Common/cpu_features.h"
sint32 x64Gen_registerMap[12] = // virtual GPR to x64 register mapping sint32 x64Gen_registerMap[12] = // virtual GPR to x64 register mapping
{ {
@ -381,7 +381,7 @@ bool PPCRecompilerX64Gen_imlInstruction_load(PPCRecFunction_t* PPCRecFunction, p
{ {
x64Gen_lea_reg64Low32_reg64Low32PlusReg64Low32(x64GenContext, REG_RESV_TEMP, realRegisterMem, realRegisterMem2); x64Gen_lea_reg64Low32_reg64Low32PlusReg64Low32(x64GenContext, REG_RESV_TEMP, realRegisterMem, realRegisterMem2);
} }
if( hasMOVBESupport && switchEndian ) if( g_CPUFeatures.x86.movbe && switchEndian )
{ {
if (indexed) if (indexed)
{ {
@ -419,7 +419,7 @@ bool PPCRecompilerX64Gen_imlInstruction_load(PPCRecFunction_t* PPCRecFunction, p
{ {
x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
} }
if( hasMOVBESupport && switchEndian ) if( g_CPUFeatures.x86.movbe && switchEndian )
{ {
x64Gen_movBEZeroExtend_reg64Low16_mem16Reg64PlusReg64(x64GenContext, realRegisterData, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32); x64Gen_movBEZeroExtend_reg64Low16_mem16Reg64PlusReg64(x64GenContext, realRegisterData, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32);
if( indexed && realRegisterMem != realRegisterData ) if( indexed && realRegisterMem != realRegisterData )
@ -477,7 +477,7 @@ bool PPCRecompilerX64Gen_imlInstruction_load(PPCRecFunction_t* PPCRecFunction, p
assert_dbg(); assert_dbg();
if( indexed ) if( indexed )
x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); // can be replaced with LEA temp, [memReg1+memReg2] (this way we can avoid the SUB instruction after the move) x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); // can be replaced with LEA temp, [memReg1+memReg2] (this way we can avoid the SUB instruction after the move)
if( hasMOVBESupport ) if( g_CPUFeatures.x86.movbe )
{ {
x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, realRegisterData, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32); x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, realRegisterData, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32);
if( indexed && realRegisterMem != realRegisterData ) if( indexed && realRegisterMem != realRegisterData )
@ -537,7 +537,7 @@ bool PPCRecompilerX64Gen_imlInstruction_store(PPCRecFunction_t* PPCRecFunction,
if (indexed) if (indexed)
PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext); PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext);
uint32 valueRegister; uint32 valueRegister;
if ((swapEndian == false || hasMOVBESupport) && realRegisterMem != realRegisterData) if ((swapEndian == false || g_CPUFeatures.x86.movbe) && realRegisterMem != realRegisterData)
{ {
valueRegister = realRegisterData; valueRegister = realRegisterData;
} }
@ -546,11 +546,11 @@ bool PPCRecompilerX64Gen_imlInstruction_store(PPCRecFunction_t* PPCRecFunction,
x64Gen_mov_reg64_reg64(x64GenContext, REG_RESV_TEMP, realRegisterData); x64Gen_mov_reg64_reg64(x64GenContext, REG_RESV_TEMP, realRegisterData);
valueRegister = REG_RESV_TEMP; valueRegister = REG_RESV_TEMP;
} }
if (hasMOVBESupport == false && swapEndian) if (g_CPUFeatures.x86.movbe == false && swapEndian)
x64Gen_bswap_reg64Lower32bit(x64GenContext, valueRegister); x64Gen_bswap_reg64Lower32bit(x64GenContext, valueRegister);
if (indexed) if (indexed)
x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
if (hasMOVBESupport && swapEndian) if (g_CPUFeatures.x86.movbe && swapEndian)
x64Gen_movBETruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32, valueRegister); x64Gen_movBETruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32, valueRegister);
else else
x64Gen_movTruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32, valueRegister); x64Gen_movTruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32, valueRegister);
@ -802,8 +802,7 @@ bool PPCRecompilerX64Gen_imlInstruction_r_r(PPCRecFunction_t* PPCRecFunction, pp
// count leading zeros // count leading zeros
PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext); PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext);
cemu_assert_debug(imlInstruction->crRegister == PPC_REC_INVALID_REGISTER); cemu_assert_debug(imlInstruction->crRegister == PPC_REC_INVALID_REGISTER);
// LZCNT instruction (part of SSE4, CPUID.80000001H:ECX.ABM[Bit 5]) if( g_CPUFeatures.x86.lzcnt )
if( hasLZCNTSupport )
{ {
x64Gen_lzcnt_reg64Low32_reg64Low32(x64GenContext, tempToRealRegister(imlInstruction->op_r_r.registerResult), tempToRealRegister(imlInstruction->op_r_r.registerA)); x64Gen_lzcnt_reg64Low32_reg64Low32(x64GenContext, tempToRealRegister(imlInstruction->op_r_r.registerResult), tempToRealRegister(imlInstruction->op_r_r.registerA));
} }
@ -1521,12 +1520,12 @@ bool PPCRecompilerX64Gen_imlInstruction_r_r_r(PPCRecFunction_t* PPCRecFunction,
sint32 rRegOperand1 = tempToRealRegister(imlInstruction->op_r_r_r.registerA); sint32 rRegOperand1 = tempToRealRegister(imlInstruction->op_r_r_r.registerA);
sint32 rRegOperand2 = tempToRealRegister(imlInstruction->op_r_r_r.registerB); sint32 rRegOperand2 = tempToRealRegister(imlInstruction->op_r_r_r.registerB);
if (hasBMI2Support && imlInstruction->operation == PPCREC_IML_OP_SRW) if (g_CPUFeatures.x86.bmi2 && imlInstruction->operation == PPCREC_IML_OP_SRW)
{ {
// use BMI2 SHRX if available // use BMI2 SHRX if available
x64Gen_shrx_reg64_reg64_reg64(x64GenContext, rRegResult, rRegOperand1, rRegOperand2); x64Gen_shrx_reg64_reg64_reg64(x64GenContext, rRegResult, rRegOperand1, rRegOperand2);
} }
else if (hasBMI2Support && imlInstruction->operation == PPCREC_IML_OP_SLW) else if (g_CPUFeatures.x86.bmi2 && imlInstruction->operation == PPCREC_IML_OP_SLW)
{ {
// use BMI2 SHLX if available // use BMI2 SHLX if available
x64Gen_shlx_reg64_reg64_reg64(x64GenContext, rRegResult, rRegOperand1, rRegOperand2); x64Gen_shlx_reg64_reg64_reg64(x64GenContext, rRegResult, rRegOperand1, rRegOperand2);

View File

@ -2,6 +2,7 @@
#include "PPCRecompilerIml.h" #include "PPCRecompilerIml.h"
#include "PPCRecompilerX64.h" #include "PPCRecompilerX64.h"
#include "asm/x64util.h" #include "asm/x64util.h"
#include "Common/cpu_features.h"
void PPCRecompilerX64Gen_imlInstruction_fpr_r_name(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, PPCRecImlInstruction_t* imlInstruction) void PPCRecompilerX64Gen_imlInstruction_fpr_r_name(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, PPCRecImlInstruction_t* imlInstruction)
{ {
@ -86,7 +87,7 @@ void PPCRecompilerX64Gen_imlInstr_psq_load(ppcImlGenContext_t* ppcImlGenContext,
{ {
x64Gen_mov_reg64Low32_reg64Low32(x64GenContext, REG_RESV_TEMP, memRegEx); x64Gen_mov_reg64Low32_reg64Low32(x64GenContext, REG_RESV_TEMP, memRegEx);
x64Gen_add_reg64Low32_reg64Low32(x64GenContext, REG_RESV_TEMP, memReg); x64Gen_add_reg64Low32_reg64Low32(x64GenContext, REG_RESV_TEMP, memReg);
if (hasMOVBESupport) if (g_CPUFeatures.x86.movbe)
{ {
x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, REG_RESV_TEMP, memImmS32); x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, REG_RESV_TEMP, memImmS32);
} }
@ -98,7 +99,7 @@ void PPCRecompilerX64Gen_imlInstr_psq_load(ppcImlGenContext_t* ppcImlGenContext,
} }
else else
{ {
if (hasMOVBESupport) if (g_CPUFeatures.x86.movbe)
{ {
x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, memReg, memImmS32); x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, memReg, memImmS32);
} }
@ -108,7 +109,7 @@ void PPCRecompilerX64Gen_imlInstr_psq_load(ppcImlGenContext_t* ppcImlGenContext,
x64Gen_bswap_reg64Lower32bit(x64GenContext, REG_RESV_TEMP); x64Gen_bswap_reg64Lower32bit(x64GenContext, REG_RESV_TEMP);
} }
} }
if (hasAVXSupport) if (g_CPUFeatures.x86.avx)
{ {
x64Gen_movd_xmmReg_reg64Low32(x64GenContext, REG_RESV_FPR_TEMP, REG_RESV_TEMP); x64Gen_movd_xmmReg_reg64Low32(x64GenContext, REG_RESV_FPR_TEMP, REG_RESV_TEMP);
} }
@ -280,21 +281,21 @@ bool PPCRecompilerX64Gen_imlInstruction_fpr_load(PPCRecFunction_t* PPCRecFunctio
{ {
x64Gen_mov_reg64Low32_reg64Low32(x64GenContext, REG_RESV_TEMP, realRegisterMem2); x64Gen_mov_reg64Low32_reg64Low32(x64GenContext, REG_RESV_TEMP, realRegisterMem2);
x64Gen_add_reg64Low32_reg64Low32(x64GenContext, REG_RESV_TEMP, realRegisterMem); x64Gen_add_reg64Low32_reg64Low32(x64GenContext, REG_RESV_TEMP, realRegisterMem);
if( hasMOVBESupport ) if( g_CPUFeatures.x86.movbe )
x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, REG_RESV_TEMP, imlInstruction->op_storeLoad.immS32); x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, REG_RESV_TEMP, imlInstruction->op_storeLoad.immS32);
else else
x64Emit_mov_reg32_mem32(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, REG_RESV_TEMP, imlInstruction->op_storeLoad.immS32); x64Emit_mov_reg32_mem32(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, REG_RESV_TEMP, imlInstruction->op_storeLoad.immS32);
} }
else else
{ {
if( hasMOVBESupport ) if( g_CPUFeatures.x86.movbe )
x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, realRegisterMem, imlInstruction->op_storeLoad.immS32); x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, realRegisterMem, imlInstruction->op_storeLoad.immS32);
else else
x64Emit_mov_reg32_mem32(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, realRegisterMem, imlInstruction->op_storeLoad.immS32); x64Emit_mov_reg32_mem32(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, realRegisterMem, imlInstruction->op_storeLoad.immS32);
} }
if( hasMOVBESupport == false ) if( g_CPUFeatures.x86.movbe == false )
x64Gen_bswap_reg64Lower32bit(x64GenContext, REG_RESV_TEMP); x64Gen_bswap_reg64Lower32bit(x64GenContext, REG_RESV_TEMP);
if( hasAVXSupport ) if( g_CPUFeatures.x86.avx )
{ {
x64Gen_movd_xmmReg_reg64Low32(x64GenContext, realRegisterXMM, REG_RESV_TEMP); x64Gen_movd_xmmReg_reg64Low32(x64GenContext, realRegisterXMM, REG_RESV_TEMP);
} }
@ -316,7 +317,7 @@ bool PPCRecompilerX64Gen_imlInstruction_fpr_load(PPCRecFunction_t* PPCRecFunctio
} }
else if( mode == PPCREC_FPR_LD_MODE_DOUBLE_INTO_PS0 ) else if( mode == PPCREC_FPR_LD_MODE_DOUBLE_INTO_PS0 )
{ {
if( hasAVXSupport ) if( g_CPUFeatures.x86.avx )
{ {
if( indexed ) if( indexed )
{ {
@ -419,7 +420,7 @@ void PPCRecompilerX64Gen_imlInstr_psq_store(ppcImlGenContext_t* ppcImlGenContext
if (mode == PPCREC_FPR_ST_MODE_PSQ_FLOAT_PS0) if (mode == PPCREC_FPR_ST_MODE_PSQ_FLOAT_PS0)
{ {
x64Gen_cvtsd2ss_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, registerXMM); x64Gen_cvtsd2ss_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, registerXMM);
if (hasAVXSupport) if (g_CPUFeatures.x86.avx)
{ {
x64Gen_movd_reg64Low32_xmmReg(x64GenContext, REG_RESV_TEMP, REG_RESV_FPR_TEMP); x64Gen_movd_reg64Low32_xmmReg(x64GenContext, REG_RESV_TEMP, REG_RESV_FPR_TEMP);
} }
@ -428,14 +429,14 @@ void PPCRecompilerX64Gen_imlInstr_psq_store(ppcImlGenContext_t* ppcImlGenContext
x64Gen_movsd_memReg64_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, REG_RSP, offsetof(PPCInterpreter_t, temporaryFPR)); x64Gen_movsd_memReg64_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, REG_RSP, offsetof(PPCInterpreter_t, temporaryFPR));
x64Emit_mov_reg64_mem32(x64GenContext, REG_RESV_TEMP, REG_RSP, offsetof(PPCInterpreter_t, temporaryFPR)); x64Emit_mov_reg64_mem32(x64GenContext, REG_RESV_TEMP, REG_RSP, offsetof(PPCInterpreter_t, temporaryFPR));
} }
if (hasMOVBESupport == false) if (g_CPUFeatures.x86.movbe == false)
x64Gen_bswap_reg64Lower32bit(x64GenContext, REG_RESV_TEMP); x64Gen_bswap_reg64Lower32bit(x64GenContext, REG_RESV_TEMP);
if (indexed) if (indexed)
{ {
cemu_assert_debug(memReg != memRegEx); cemu_assert_debug(memReg != memRegEx);
x64Gen_add_reg64Low32_reg64Low32(x64GenContext, memReg, memRegEx); x64Gen_add_reg64Low32_reg64Low32(x64GenContext, memReg, memRegEx);
} }
if (hasMOVBESupport) if (g_CPUFeatures.x86.movbe)
x64Gen_movBETruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_R13, memReg, memImmS32, REG_RESV_TEMP); x64Gen_movBETruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_R13, memReg, memImmS32, REG_RESV_TEMP);
else else
x64Gen_movTruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_R13, memReg, memImmS32, REG_RESV_TEMP); x64Gen_movTruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_R13, memReg, memImmS32, REG_RESV_TEMP);
@ -604,7 +605,7 @@ bool PPCRecompilerX64Gen_imlInstruction_fpr_store(PPCRecFunction_t* PPCRecFuncti
if (imlInstruction->op_storeLoad.flags2.notExpanded) if (imlInstruction->op_storeLoad.flags2.notExpanded)
{ {
// value is already in single format // value is already in single format
if (hasAVXSupport) if (g_CPUFeatures.x86.avx)
{ {
x64Gen_movd_reg64Low32_xmmReg(x64GenContext, REG_RESV_TEMP, realRegisterXMM); x64Gen_movd_reg64Low32_xmmReg(x64GenContext, REG_RESV_TEMP, realRegisterXMM);
} }
@ -617,7 +618,7 @@ bool PPCRecompilerX64Gen_imlInstruction_fpr_store(PPCRecFunction_t* PPCRecFuncti
else else
{ {
x64Gen_cvtsd2ss_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, realRegisterXMM); x64Gen_cvtsd2ss_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, realRegisterXMM);
if (hasAVXSupport) if (g_CPUFeatures.x86.avx)
{ {
x64Gen_movd_reg64Low32_xmmReg(x64GenContext, REG_RESV_TEMP, REG_RESV_FPR_TEMP); x64Gen_movd_reg64Low32_xmmReg(x64GenContext, REG_RESV_TEMP, REG_RESV_FPR_TEMP);
} }
@ -627,7 +628,7 @@ bool PPCRecompilerX64Gen_imlInstruction_fpr_store(PPCRecFunction_t* PPCRecFuncti
x64Emit_mov_reg64_mem32(x64GenContext, REG_RESV_TEMP, REG_RSP, offsetof(PPCInterpreter_t, temporaryFPR)); x64Emit_mov_reg64_mem32(x64GenContext, REG_RESV_TEMP, REG_RSP, offsetof(PPCInterpreter_t, temporaryFPR));
} }
} }
if( hasMOVBESupport == false ) if( g_CPUFeatures.x86.movbe == false )
x64Gen_bswap_reg64Lower32bit(x64GenContext, REG_RESV_TEMP); x64Gen_bswap_reg64Lower32bit(x64GenContext, REG_RESV_TEMP);
if( indexed ) if( indexed )
{ {
@ -635,7 +636,7 @@ bool PPCRecompilerX64Gen_imlInstruction_fpr_store(PPCRecFunction_t* PPCRecFuncti
assert_dbg(); assert_dbg();
x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
} }
if( hasMOVBESupport ) if( g_CPUFeatures.x86.movbe )
x64Gen_movBETruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32, REG_RESV_TEMP); x64Gen_movBETruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32, REG_RESV_TEMP);
else else
x64Gen_movTruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32, REG_RESV_TEMP); x64Gen_movTruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32, REG_RESV_TEMP);
@ -668,7 +669,7 @@ bool PPCRecompilerX64Gen_imlInstruction_fpr_store(PPCRecFunction_t* PPCRecFuncti
} }
else if( mode == PPCREC_FPR_ST_MODE_UI32_FROM_PS0 ) else if( mode == PPCREC_FPR_ST_MODE_UI32_FROM_PS0 )
{ {
if( hasAVXSupport ) if( g_CPUFeatures.x86.avx )
{ {
x64Gen_movd_reg64Low32_xmmReg(x64GenContext, REG_RESV_TEMP, realRegisterXMM); x64Gen_movd_reg64Low32_xmmReg(x64GenContext, REG_RESV_TEMP, realRegisterXMM);
} }
@ -749,7 +750,7 @@ void PPCRecompilerX64Gen_imlInstruction_fpr_r_r(PPCRecFunction_t* PPCRecFunction
// unpack top to bottom and top // unpack top to bottom and top
x64Gen_unpckhpd_xmmReg_xmmReg(x64GenContext, imlInstruction->op_fpr_r_r.registerResult, imlInstruction->op_fpr_r_r.registerOperand); x64Gen_unpckhpd_xmmReg_xmmReg(x64GenContext, imlInstruction->op_fpr_r_r.registerResult, imlInstruction->op_fpr_r_r.registerOperand);
} }
//else if ( hasAVXSupport ) //else if ( g_CPUFeatures.x86.avx )
//{ //{
// // unpack top to bottom and top with non-destructive destination // // unpack top to bottom and top with non-destructive destination
// // update: On Ivy Bridge this causes weird stalls? // // update: On Ivy Bridge this causes weird stalls?
@ -1056,7 +1057,7 @@ void PPCRecompilerX64Gen_imlInstruction_fpr_r_r_r(PPCRecFunction_t* PPCRecFuncti
{ {
x64Gen_subpd_xmmReg_xmmReg(x64GenContext, imlInstruction->op_fpr_r_r_r.registerResult, imlInstruction->op_fpr_r_r_r.registerOperandB); x64Gen_subpd_xmmReg_xmmReg(x64GenContext, imlInstruction->op_fpr_r_r_r.registerResult, imlInstruction->op_fpr_r_r_r.registerOperandB);
} }
else if (hasAVXSupport) else if (g_CPUFeatures.x86.avx)
{ {
x64Gen_avx_VSUBPD_xmm_xmm_xmm(x64GenContext, imlInstruction->op_fpr_r_r_r.registerResult, imlInstruction->op_fpr_r_r_r.registerOperandA, imlInstruction->op_fpr_r_r_r.registerOperandB); x64Gen_avx_VSUBPD_xmm_xmm_xmm(x64GenContext, imlInstruction->op_fpr_r_r_r.registerResult, imlInstruction->op_fpr_r_r_r.registerOperandA, imlInstruction->op_fpr_r_r_r.registerOperandB);
} }

View File

@ -1,8 +1,9 @@
#include "Cafe/HW/Latte/Core/LatteConst.h" #include "Cafe/HW/Latte/Core/LatteConst.h"
#include "Cafe/HW/Latte/Renderer/Renderer.h" #include "Cafe/HW/Latte/Renderer/Renderer.h"
#include "Cafe/HW/Latte/ISA/RegDefines.h" #include "Cafe/HW/Latte/ISA/RegDefines.h"
#include "Common/cpu_features.h"
#if defined(ARCH_X86_64)
#if __GNUC__ #if __GNUC__
#include <immintrin.h> #include <immintrin.h>
#endif #endif
@ -14,6 +15,7 @@
#define ATTRIBUTE_AVX2 #define ATTRIBUTE_AVX2
#define ATTRIBUTE_SSE41 #define ATTRIBUTE_SSE41
#endif #endif
#endif
struct struct
{ {
@ -292,6 +294,7 @@ void LatteIndices_generateAutoLineLoopIndices(void* indexDataOutput, uint32 coun
indexMax = std::max(count, 1u) - 1; indexMax = std::max(count, 1u) - 1;
} }
#if defined(ARCH_X86_64)
ATTRIBUTE_AVX2 ATTRIBUTE_AVX2
void LatteIndices_fastConvertU16_AVX2(const void* indexDataInput, void* indexDataOutput, uint32 count, uint32& indexMin, uint32& indexMax) void LatteIndices_fastConvertU16_AVX2(const void* indexDataInput, void* indexDataOutput, uint32 count, uint32& indexMin, uint32& indexMax)
{ {
@ -487,6 +490,7 @@ void LatteIndices_fastConvertU32_AVX2(const void* indexDataInput, void* indexDat
indexMax = std::max(indexMax, _maxIndex); indexMax = std::max(indexMax, _maxIndex);
indexMin = std::min(indexMin, _minIndex); indexMin = std::min(indexMin, _minIndex);
} }
#endif
template<typename T> template<typename T>
void _LatteIndices_alternativeCalculateIndexMinMax(const void* indexData, uint32 count, uint32 primitiveRestartIndex, uint32& indexMin, uint32& indexMax) void _LatteIndices_alternativeCalculateIndexMinMax(const void* indexData, uint32 count, uint32 primitiveRestartIndex, uint32& indexMin, uint32& indexMax)
@ -669,19 +673,27 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32
{ {
if (indexType == LatteIndexType::U16_BE) if (indexType == LatteIndexType::U16_BE)
{ {
if (_cpuExtension_AVX2) #if defined(ARCH_X86_64)
if (g_CPUFeatures.x86.avx2)
LatteIndices_fastConvertU16_AVX2(indexData, indexOutputPtr, count, indexMin, indexMax); LatteIndices_fastConvertU16_AVX2(indexData, indexOutputPtr, count, indexMin, indexMax);
else if (_cpuExtension_SSE4_1 && _cpuExtension_SSSE3) else if (g_CPUFeatures.x86.sse4_1 && g_CPUFeatures.x86.ssse3)
LatteIndices_fastConvertU16_SSE41(indexData, indexOutputPtr, count, indexMin, indexMax); LatteIndices_fastConvertU16_SSE41(indexData, indexOutputPtr, count, indexMin, indexMax);
else else
LatteIndices_convertBE<uint16>(indexData, indexOutputPtr, count, indexMin, indexMax); LatteIndices_convertBE<uint16>(indexData, indexOutputPtr, count, indexMin, indexMax);
#else
LatteIndices_convertBE<uint16>(indexData, indexOutputPtr, count, indexMin, indexMax);
#endif
} }
else if (indexType == LatteIndexType::U32_BE) else if (indexType == LatteIndexType::U32_BE)
{ {
if (_cpuExtension_AVX2) #if defined(ARCH_X86_64)
if (g_CPUFeatures.x86.avx2)
LatteIndices_fastConvertU32_AVX2(indexData, indexOutputPtr, count, indexMin, indexMax); LatteIndices_fastConvertU32_AVX2(indexData, indexOutputPtr, count, indexMin, indexMax);
else else
LatteIndices_convertBE<uint32>(indexData, indexOutputPtr, count, indexMin, indexMax); LatteIndices_convertBE<uint32>(indexData, indexOutputPtr, count, indexMin, indexMax);
#else
LatteIndices_convertBE<uint32>(indexData, indexOutputPtr, count, indexMin, indexMax);
#endif
} }
else if (indexType == LatteIndexType::U16_LE) else if (indexType == LatteIndexType::U16_LE)
{ {
@ -714,4 +726,4 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32
LatteIndexCache.outputCount = outputCount; LatteIndexCache.outputCount = outputCount;
LatteIndexCache.indexBufferOffset = indexBufferOffset; LatteIndexCache.indexBufferOffset = indexBufferOffset;
LatteIndexCache.indexBufferIndex = indexBufferIndex; LatteIndexCache.indexBufferIndex = indexBufferIndex;
} }

View File

@ -2,6 +2,7 @@
#include "Cafe/HW/Latte/Core/LatteDraw.h" #include "Cafe/HW/Latte/Core/LatteDraw.h"
#include "Cafe/HW/Latte/Core/LatteTexture.h" #include "Cafe/HW/Latte/Core/LatteTexture.h"
#include "Cafe/HW/Latte/Renderer/Renderer.h" #include "Cafe/HW/Latte/Renderer/Renderer.h"
#include "Common/cpu_features.h"
std::unordered_set<LatteTexture*> g_allTextures; std::unordered_set<LatteTexture*> g_allTextures;
@ -146,7 +147,7 @@ uint32 LatteTexture_CalculateTextureDataHash(LatteTexture* hostTexture)
if( isCompressedFormat == false ) if( isCompressedFormat == false )
{ {
#if BOOST_OS_WINDOWS #if BOOST_OS_WINDOWS
if (_cpuExtension_AVX2) if (g_CPUFeatures.x86.avx2)
{ {
__m256i h256 = { 0 }; __m256i h256 = { 0 };
__m256i* readPtr = (__m256i*)texDataU32; __m256i* readPtr = (__m256i*)texDataU32;

View File

@ -1075,7 +1075,9 @@ namespace coreinit
{ {
OSHostThread* hostThread = (OSHostThread*)_thread; OSHostThread* hostThread = (OSHostThread*)_thread;
#if defined(ARCH_X86_64)
_mm_setcsr(_mm_getcsr() | 0x8000); // flush denormals to zero _mm_setcsr(_mm_getcsr() | 0x8000); // flush denormals to zero
#endif
uint32 lehmer_lcg = 12345; uint32 lehmer_lcg = 12345;
@ -1116,7 +1118,9 @@ namespace coreinit
{ {
SetThreadName(fmt::format("OSSchedulerThread[core={}]", (uintptr_t)_assignedCoreIndex).c_str()); SetThreadName(fmt::format("OSSchedulerThread[core={}]", (uintptr_t)_assignedCoreIndex).c_str());
t_assignedCoreIndex = (sint32)(uintptr_t)_assignedCoreIndex; t_assignedCoreIndex = (sint32)(uintptr_t)_assignedCoreIndex;
#if defined(ARCH_X86_64)
_mm_setcsr(_mm_getcsr() | 0x8000); // flush denormals to zero _mm_setcsr(_mm_getcsr() | 0x8000); // flush denormals to zero
#endif
t_schedulerFiber = Fiber::PrepareCurrentThread(); t_schedulerFiber = Fiber::PrepareCurrentThread();
// create scheduler idle fiber and switch to it // create scheduler idle fiber and switch to it

View File

@ -1,5 +1,7 @@
add_library(CemuCommon add_library(CemuCommon
betype.h betype.h
cpu_features.cpp
cpu_features.h
enumFlags.h enumFlags.h
ExceptionHandler/ExceptionHandler.h ExceptionHandler/ExceptionHandler.h
FileStream.h FileStream.h

101
src/Common/cpu_features.cpp Normal file
View File

@ -0,0 +1,101 @@
#include "cpu_features.h"
// wrappers with uniform prototype for implementation-specific x86 CPU id
#if defined(ARCH_X86_64)
#ifdef __GNUC__
#include <cpuid.h>
#endif
inline void cpuid(int cpuInfo[4], int functionId) {
#if defined(_MSC_VER)
__cpuid(cpuInfo, functionId);
#elif defined(__GNUC__)
__cpuid(functionId, cpuInfo[0], cpuInfo[1], cpuInfo[2], cpuInfo[3]);
#else
#error No definition for cpuid
#endif
}
inline void cpuidex(int cpuInfo[4], int functionId, int subFunctionId) {
#if defined(_MSC_VER)
__cpuidex(cpuInfo, functionId, subFunctionId);
#elif defined(__GNUC__)
__cpuid_count(functionId, subFunctionId, cpuInfo[0], cpuInfo[1], cpuInfo[2], cpuInfo[3]);
#else
#error No definition for cpuidex
#endif
}
#endif
CPUFeaturesImpl::CPUFeaturesImpl()
{
#if defined(ARCH_X86_64)
int cpuInfo[4];
cpuid(cpuInfo, 0x80000001);
x86.lzcnt = ((cpuInfo[2] >> 5) & 1) != 0;
cpuid(cpuInfo, 0x1);
x86.movbe = ((cpuInfo[2] >> 22) & 1) != 0;
x86.avx = ((cpuInfo[2] >> 28) & 1) != 0;
x86.aesni = ((cpuInfo[2] >> 25) & 1) != 0;
x86.ssse3 = ((cpuInfo[2] >> 9) & 1) != 0;
x86.sse4_1 = ((cpuInfo[2] >> 19) & 1) != 0;
cpuidex(cpuInfo, 0x7, 0);
x86.avx2 = ((cpuInfo[1] >> 5) & 1) != 0;
x86.bmi2 = ((cpuInfo[1] >> 8) & 1) != 0;
cpuid(cpuInfo, 0x80000007);
x86.invariant_tsc = ((cpuInfo[3] >> 8) & 1);
// get CPU brand name
uint32_t nExIds, i = 0;
memset(m_cpuBrandName, 0, sizeof(m_cpuBrandName));
cpuid(cpuInfo, 0x80000000);
nExIds = (uint32_t)cpuInfo[0];
for (uint32_t i = 0x80000000; i <= nExIds; ++i)
{
cpuid(cpuInfo, i);
if (i == 0x80000002)
memcpy(m_cpuBrandName, cpuInfo, sizeof(cpuInfo));
else if (i == 0x80000003)
memcpy(m_cpuBrandName + 16, cpuInfo, sizeof(cpuInfo));
else if (i == 0x80000004)
memcpy(m_cpuBrandName + 32, cpuInfo, sizeof(cpuInfo));
}
#endif
}
std::string CPUFeaturesImpl::GetCPUName()
{
return { m_cpuBrandName };
}
std::string CPUFeaturesImpl::GetCommaSeparatedExtensionList()
{
std::string tmp;
auto appendExt = [&tmp](const char* str)
{
if (!tmp.empty())
tmp.append(", ");
tmp.append(str);
};
if (x86.ssse3)
appendExt("SSSE3");
if (x86.sse4_1)
appendExt("SSE4.1");
if (x86.avx)
appendExt("AVX");
if (x86.avx2)
appendExt("AVX2");
if (x86.lzcnt)
appendExt("LZCNT");
if (x86.movbe)
appendExt("MOVBE");
if (x86.bmi2)
appendExt("BMI2");
if (x86.aesni)
appendExt("AES-NI");
if(x86.invariant_tsc)
appendExt("INVARIANT-TSC");
return tmp;
}
CPUFeaturesImpl g_CPUFeatures;

26
src/Common/cpu_features.h Normal file
View File

@ -0,0 +1,26 @@
class CPUFeaturesImpl
{
public:
CPUFeaturesImpl();
std::string GetCPUName(); // empty if not available
std::string GetCommaSeparatedExtensionList();
struct
{
bool ssse3{ false };
bool sse4_1{ false };
bool avx{ false };
bool avx2{ false };
bool lzcnt{ false };
bool movbe{ false };
bool bmi2{ false };
bool aesni{ false };
bool invariant_tsc{ false };
}x86;
private:
char m_cpuBrandName[0x40]{ 0 };
};
extern CPUFeaturesImpl g_CPUFeatures;

View File

@ -24,13 +24,22 @@
// } // }
// #endif // #endif
// arch defines
#if defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64)
#define ARCH_X86_64
#endif
// c includes // c includes
#include <cstdint> #include <cstdint>
#include <cstdlib> #include <cstdlib>
#include <cmath> #include <cmath>
#include <ctime> #include <ctime>
#include <cassert> #include <cassert>
#if defined(ARCH_X86_64)
#include <immintrin.h> #include <immintrin.h>
#endif
// c++ includes // c++ includes
#include <string> #include <string>
@ -106,11 +115,6 @@ using uint8le = uint8_t;
#include "Cemu/Logging/CemuDebugLogging.h" #include "Cemu/Logging/CemuDebugLogging.h"
#include "Cemu/Logging/CemuLogging.h" #include "Cemu/Logging/CemuLogging.h"
// CPU extensions
extern bool _cpuExtension_SSSE3;
extern bool _cpuExtension_SSE4_1;
extern bool _cpuExtension_AVX2;
// manual endian-swapping // manual endian-swapping
#if _MSC_VER #if _MSC_VER
@ -251,30 +255,35 @@ inline uint64 _udiv128(uint64 highDividend, uint64 lowDividend, uint64 divisor,
#define NOEXPORT __attribute__ ((visibility ("hidden"))) #define NOEXPORT __attribute__ ((visibility ("hidden")))
#endif #endif
#ifdef __GNUC__ // On aarch64 we handle some of the x86 intrinsics by implementing them as wrappers
#include <cpuid.h> #if defined(__aarch64__)
#endif
inline void cpuid(int cpuInfo[4], int functionId) { inline void _mm_pause()
#if defined(_MSC_VER) {
__cpuid(cpuInfo, functionId); asm volatile("yield");
#elif defined(__GNUC__)
__cpuid(functionId, cpuInfo[0], cpuInfo[1], cpuInfo[2], cpuInfo[3]);
#else
#error No definition for cpuid
#endif
} }
inline void cpuidex(int cpuInfo[4], int functionId, int subFunctionId) { inline uint64 __rdtsc()
#if defined(_MSC_VER) {
__cpuidex(cpuInfo, functionId, subFunctionId); uint64 t;
#elif defined(__GNUC__) asm volatile("mrs %0, cntvct_el0" : "=r" (t));
__cpuid_count(functionId, subFunctionId, cpuInfo[0], cpuInfo[1], cpuInfo[2], cpuInfo[3]); return t;
#else
#error No definition for cpuidex
#endif
} }
inline void _mm_mfence()
{
}
inline unsigned char _addcarry_u64(unsigned char carry, unsigned long long a, unsigned long long b, unsigned long long *result)
{
*result = a + b + (unsigned long long)carry;
if (*result < a)
return 1;
return 0;
}
#endif
// MEMPTR // MEMPTR
#include "Common/MemPtr.h" #include "Common/MemPtr.h"

View File

@ -1,39 +1,47 @@
project(CemuAsm C) project(CemuAsm C)
if (WIN32) if (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)")
enable_language(C ASM_MASM) if (WIN32)
add_library(CemuAsm x64util_masm.asm) enable_language(C ASM_MASM)
set_source_files_properties(x64util_masm.asm PROPERTIES LANGUAGE ASM_MASM)
# workaround for cr flag being passed to LINK.exe which considers it an input file and thus fails add_library(CemuAsm x64util_masm.asm)
# doesn't always seem to happen. The Windows CI builds were fine, but locally I would run into this problem set_source_files_properties(x64util_masm.asm PROPERTIES LANGUAGE ASM_MASM)
# possibly related to https://gitlab.kitware.com/cmake/cmake/-/issues/18889
set(CMAKE_ASM_MASM_CREATE_STATIC_LIBRARY "<CMAKE_AR> /OUT:<TARGET> <LINK_FLAGS> <OBJECTS>")
# workaround for cr flag being passed to LINK.exe which considers it an input file and thus fails
# doesn't always seem to happen. The Windows CI builds were fine, but locally I would run into this problem
# possibly related to https://gitlab.kitware.com/cmake/cmake/-/issues/18889
set(CMAKE_ASM_MASM_CREATE_STATIC_LIBRARY "<CMAKE_AR> /OUT:<TARGET> <LINK_FLAGS> <OBJECTS>")
set_property(TARGET CemuAsm PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>")
else()
# NASM
if (APPLE)
set(CMAKE_ASM_NASM_COMPILE_OBJECT "<CMAKE_ASM_NASM_COMPILER> -g -Fdwarf -f macho64 --prefix _ -o <OBJECT> <SOURCE>")
else()
set(CMAKE_ASM_NASM_COMPILE_OBJECT "<CMAKE_ASM_NASM_COMPILER> -g -Fdwarf -f elf64 -o <OBJECT> <SOURCE>")
endif()
set(CMAKE_ASM_NASM_LINK_EXECUTABLE "ld <FLAGS> <CMAKE_ASM_NASM_LINK_FLAGS> <LINK_FLAGS> -fPIC <OBJECTS> -o <TARGET> <LINK_LIBRARIES>")
enable_language(C ASM_NASM)
add_library(CemuAsm x64util_nasm.asm)
set_source_files_properties(x64util_nasm.asm PROPERTIES LANGUAGE ASM_NASM)
if (APPLE)
set_target_properties(CemuAsm PROPERTIES NASM_OBJ_FORMAT macho64)
else()
set_target_properties(CemuAsm PROPERTIES NASM_OBJ_FORMAT elf64)
endif()
set_target_properties(CemuAsm PROPERTIES LINKER_LANGUAGE C)
endif()
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "(aarch64)|(AARCH64)")
add_library(CemuAsm stub.cpp)
else() else()
message(STATUS "CemuAsm - Unsupported arch: ${CMAKE_SYSTEM_PROCESSOR}")
# NASM
if (APPLE)
set(CMAKE_ASM_NASM_COMPILE_OBJECT "<CMAKE_ASM_NASM_COMPILER> -g -Fdwarf -f macho64 --prefix _ -o <OBJECT> <SOURCE>")
else()
set(CMAKE_ASM_NASM_COMPILE_OBJECT "<CMAKE_ASM_NASM_COMPILER> -g -Fdwarf -f elf64 -o <OBJECT> <SOURCE>")
endif()
set(CMAKE_ASM_NASM_LINK_EXECUTABLE "ld <FLAGS> <CMAKE_ASM_NASM_LINK_FLAGS> <LINK_FLAGS> -fPIC <OBJECTS> -o <TARGET> <LINK_LIBRARIES>")
enable_language(C ASM_NASM)
add_library(CemuAsm x64util_nasm.asm)
set_source_files_properties(x64util_nasm.asm PROPERTIES LANGUAGE ASM_NASM)
if (APPLE)
set_target_properties(CemuAsm PROPERTIES NASM_OBJ_FORMAT macho64)
else()
set_target_properties(CemuAsm PROPERTIES NASM_OBJ_FORMAT elf64)
endif()
set_target_properties(CemuAsm PROPERTIES LINKER_LANGUAGE C)
endif() endif()
set_property(TARGET CemuAsm PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>")

1
src/asm/stub.cpp Normal file
View File

@ -0,0 +1 @@

View File

@ -1,4 +1,20 @@
#pragma once #pragma once
#if defined(ARCH_X86_64)
extern "C" void recompiler_fres(); extern "C" void recompiler_fres();
extern "C" void recompiler_frsqrte(); extern "C" void recompiler_frsqrte();
#else
// stubbed on non-x86 for now
static void recompiler_fres()
{
cemu_assert_unimplemented();
}
static void recompiler_frsqrte()
{
cemu_assert_unimplemented();
}
#endif

View File

@ -64,6 +64,7 @@ void CemuConfig::Load(XMLConfigParser& parser)
save_screenshot = parser.get("save_screenshot", save_screenshot); save_screenshot = parser.get("save_screenshot", save_screenshot);
did_show_vulkan_warning = parser.get("vk_warning", did_show_vulkan_warning); did_show_vulkan_warning = parser.get("vk_warning", did_show_vulkan_warning);
did_show_graphic_pack_download = parser.get("gp_download", did_show_graphic_pack_download); did_show_graphic_pack_download = parser.get("gp_download", did_show_graphic_pack_download);
did_show_macos_disclaimer = parser.get("macos_disclaimer", did_show_macos_disclaimer);
fullscreen = parser.get("fullscreen", fullscreen); fullscreen = parser.get("fullscreen", fullscreen);
proxy_server = parser.get("proxy_server", ""); proxy_server = parser.get("proxy_server", "");
@ -365,6 +366,7 @@ void CemuConfig::Save(XMLConfigParser& parser)
config.set<bool>("save_screenshot", save_screenshot); config.set<bool>("save_screenshot", save_screenshot);
config.set<bool>("vk_warning", did_show_vulkan_warning); config.set<bool>("vk_warning", did_show_vulkan_warning);
config.set<bool>("gp_download", did_show_graphic_pack_download); config.set<bool>("gp_download", did_show_graphic_pack_download);
config.set<bool>("macos_disclaimer", did_show_macos_disclaimer);
config.set<bool>("fullscreen", fullscreen); config.set<bool>("fullscreen", fullscreen);
config.set("proxy_server", proxy_server.GetValue().c_str()); config.set("proxy_server", proxy_server.GetValue().c_str());

View File

@ -407,6 +407,7 @@ struct CemuConfig
ConfigValue<bool> did_show_vulkan_warning{false}; ConfigValue<bool> did_show_vulkan_warning{false};
ConfigValue<bool> did_show_graphic_pack_download{false}; ConfigValue<bool> did_show_graphic_pack_download{false};
ConfigValue<bool> did_show_macos_disclaimer{false};
int game_list_style = 0; int game_list_style = 0;
std::string game_list_column_order; std::string game_list_column_order;

View File

@ -174,6 +174,23 @@ bool CemuApp::OnInit()
SetTopWindow(m_mainFrame); SetTopWindow(m_mainFrame);
m_mainFrame->Show(); m_mainFrame->Show();
// show warning on macOS about state of builds
#if BOOST_OS_MACOS
if (!GetConfig().did_show_macos_disclaimer)
{
const auto message = _(
"Thank you for testing the in-development build of Cemu for macOS.\n \n"
"The macOS port is currently purely experimental and should not be considered stable or ready for issue-free gameplay. "
"There are also known issues with degraded performance due to the use of MoltenVk and Rosetta for ARM Macs. We appreciate your patience while we improve Cemu for macOS.");
wxMessageDialog dialog(nullptr, message, "Preview version", wxCENTRE | wxOK | wxICON_WARNING);
dialog.SetOKLabel(_("I understand"));
dialog.ShowModal();
GetConfig().did_show_macos_disclaimer = true;
g_config.Save();
}
#endif
return true; return true;
} }

View File

@ -19,6 +19,7 @@
#include "Cafe/TitleList/SaveList.h" #include "Cafe/TitleList/SaveList.h"
#include "Common/ExceptionHandler/ExceptionHandler.h" #include "Common/ExceptionHandler/ExceptionHandler.h"
#include "Common/cpu_features.h"
#include <wx/setup.h> #include <wx/setup.h>
#include "util/helpers/helpers.h" #include "util/helpers/helpers.h"
@ -37,8 +38,13 @@
#define SDL_MAIN_HANDLED #define SDL_MAIN_HANDLED
#include <SDL.h> #include <SDL.h>
#if BOOST_OS_LINUX || BOOST_OS_MACOS #if BOOST_OS_LINUX
#define _putenv(__s) putenv((char*)(__s)) #define _putenv(__s) putenv((char*)(__s))
#include <sys/sysinfo.h>
#elif BOOST_OS_MACOS
#define _putenv(__s) putenv((char*)(__s))
#include <sys/types.h>
#include <sys/sysctl.h>
#endif #endif
#if BOOST_OS_WINDOWS #if BOOST_OS_WINDOWS
@ -49,41 +55,32 @@ extern "C"
} }
#endif #endif
bool _cpuExtension_SSSE3 = false;
bool _cpuExtension_SSE4_1 = false;
bool _cpuExtension_AVX2 = false;
std::atomic_bool g_isGPUInitFinished = false; std::atomic_bool g_isGPUInitFinished = false;
std::wstring executablePath; std::wstring executablePath;
void logCPUAndMemoryInfo() void logCPUAndMemoryInfo()
{ {
#if BOOST_OS_WINDOWS std::string cpuName = g_CPUFeatures.GetCPUName();
int CPUInfo[4] = { -1 }; if (!cpuName.empty())
unsigned nExIds, i = 0; cemuLog_log(LogType::Force, "CPU: {}", cpuName);
char CPUBrandString[0x40];
// Get the information associated with each extended ID.
cpuid(CPUInfo, 0x80000000);
nExIds = CPUInfo[0];
for (i = 0x80000000; i <= nExIds; ++i)
{
cpuid(CPUInfo, i);
// Interpret CPU brand string
if (i == 0x80000002)
memcpy(CPUBrandString, CPUInfo, sizeof(CPUInfo));
else if (i == 0x80000003)
memcpy(CPUBrandString + 16, CPUInfo, sizeof(CPUInfo));
else if (i == 0x80000004)
memcpy(CPUBrandString + 32, CPUInfo, sizeof(CPUInfo));
}
forceLog_printf("CPU: %s", CPUBrandString);
#if BOOST_OS_WINDOWS
MEMORYSTATUSEX statex; MEMORYSTATUSEX statex;
statex.dwLength = sizeof(statex); statex.dwLength = sizeof(statex);
GlobalMemoryStatusEx(&statex); GlobalMemoryStatusEx(&statex);
uint32 memoryInMB = (uint32)(statex.ullTotalPhys / 1024LL / 1024LL); uint32 memoryInMB = (uint32)(statex.ullTotalPhys / 1024LL / 1024LL);
forceLog_printf("RAM: %uMB", memoryInMB); forceLog_printf("RAM: %uMB", memoryInMB);
#elif BOOST_OS_LINUX
struct sysinfo info {};
sysinfo(&info);
cemuLog_log(LogType::Force, "RAM: {}MB", ((static_cast<uint64_t>(info.totalram) * info.mem_unit) / 1024LL / 1024LL));
#elif BOOST_OS_MACOS
int64_t totalRam;
size_t size = sizeof(totalRam);
int result = sysctlbyname("hw.memsize", &totalRam, &size, NULL, 0);
if (result == 0)
cemuLog_log(LogType::Force, "RAM: {}MB", (totalRam / 1024LL / 1024LL));
#endif #endif
} }
@ -120,32 +117,7 @@ void infoLog_cemuStartup()
checkForWine(); checkForWine();
// CPU and RAM info // CPU and RAM info
logCPUAndMemoryInfo(); logCPUAndMemoryInfo();
// extensions that Cemu uses cemuLog_log(LogType::Force, "Used CPU extensions: {}", g_CPUFeatures.GetCommaSeparatedExtensionList());
char cpuExtensionStr[256];
strcpy(cpuExtensionStr, "");
if (_cpuExtension_SSSE3)
{
strcat(cpuExtensionStr, "SSSE3");
}
if (_cpuExtension_SSE4_1)
{
if (cpuExtensionStr[0] != '\0')
strcat(cpuExtensionStr, ", ");
strcat(cpuExtensionStr, "SSE4.1");
}
if (_cpuExtension_AVX2)
{
if (cpuExtensionStr[0] != '\0')
strcat(cpuExtensionStr, ", ");
strcat(cpuExtensionStr, "AVX2");
}
if (AES128_useAESNI())
{
if (cpuExtensionStr[0] != '\0')
strcat(cpuExtensionStr, ", ");
strcat(cpuExtensionStr, "AES-NI");
}
cemuLog_force("Used CPU extensions: {}", cpuExtensionStr);
} }
// some implementations of _putenv dont copy the string and instead only store a pointer // some implementations of _putenv dont copy the string and instead only store a pointer
@ -194,14 +166,6 @@ void mainEmulatorCommonInit()
AES128_init(); AES128_init();
// init PPC timer (call this as early as possible because it measures frequency of RDTSC using an asynchronous thread over 3 seconds) // init PPC timer (call this as early as possible because it measures frequency of RDTSC using an asynchronous thread over 3 seconds)
PPCTimer_init(); PPCTimer_init();
// check available CPU extensions
int cpuInfo[4];
cpuid(cpuInfo, 0x1);
_cpuExtension_SSSE3 = ((cpuInfo[2] >> 9) & 1) != 0;
_cpuExtension_SSE4_1 = ((cpuInfo[2] >> 19) & 1) != 0;
cpuidex(cpuInfo, 0x7, 0);
_cpuExtension_AVX2 = ((cpuInfo[1] >> 5) & 1) != 0;
#if BOOST_OS_WINDOWS #if BOOST_OS_WINDOWS
executablePath.resize(4096); executablePath.resize(4096);
@ -382,4 +346,4 @@ int main(int argc, char *argv[])
extern "C" DLLEXPORT uint64 gameMeta_getTitleId() extern "C" DLLEXPORT uint64 gameMeta_getTitleId()
{ {
return CafeSystem::GetForegroundTitleId(); return CafeSystem::GetForegroundTitleId();
} }

View File

@ -10,6 +10,7 @@
/* Includes: */ /* Includes: */
/*****************************************************************************/ /*****************************************************************************/
#include "aes128.h" #include "aes128.h"
#include "Common/cpu_features.h"
/*****************************************************************************/ /*****************************************************************************/
/* Defines: */ /* Defines: */
@ -23,8 +24,6 @@
// The number of rounds in AES Cipher. // The number of rounds in AES Cipher.
#define Nr 10 #define Nr 10
bool useAESNI = false;
typedef uint8 state_t[4][4]; typedef uint8 state_t[4][4];
typedef struct typedef struct
@ -601,6 +600,7 @@ void AES128_CBC_decrypt_updateIV(uint8* output, uint8* input, uint32 length, con
memcpy(iv, newIv, KEYLEN); memcpy(iv, newIv, KEYLEN);
} }
#if defined(ARCH_X86_64)
inline __m128i AESNI128_ASSIST( inline __m128i AESNI128_ASSIST(
__m128i temp1, __m128i temp1,
__m128i temp2) __m128i temp2)
@ -792,6 +792,7 @@ void __aesni__AES128_ECB_encrypt(uint8* input, const uint8* key, uint8* output)
feedback = _mm_aesenclast_si128(feedback, ((__m128i*)expandedKey)[10]); feedback = _mm_aesenclast_si128(feedback, ((__m128i*)expandedKey)[10]);
_mm_storeu_si128(&((__m128i*)output)[0], feedback); _mm_storeu_si128(&((__m128i*)output)[0], feedback);
} }
#endif
void(*AES128_ECB_encrypt)(uint8* input, const uint8* key, uint8* output); void(*AES128_ECB_encrypt)(uint8* input, const uint8* key, uint8* output);
void (*AES128_CBC_decrypt)(uint8* output, uint8* input, uint32 length, const uint8* key, const uint8* iv) = nullptr; void (*AES128_CBC_decrypt)(uint8* output, uint8* input, uint32 length, const uint8* key, const uint8* iv) = nullptr;
@ -836,10 +837,8 @@ void AES128_init()
lookupTable_multiply[i] = (vE << 0) | (v9 << 8) | (vD << 16) | (vB << 24); lookupTable_multiply[i] = (vE << 0) | (v9 << 8) | (vD << 16) | (vB << 24);
} }
// check if AES-NI is available // check if AES-NI is available
int v[4]; #if defined(ARCH_X86_64)
cpuid(v, 1); if (g_CPUFeatures.x86.aesni)
useAESNI = (v[2] & 0x2000000) != 0;
if (useAESNI)
{ {
// AES-NI implementation // AES-NI implementation
AES128_CBC_decrypt = __aesni__AES128_CBC_decrypt; AES128_CBC_decrypt = __aesni__AES128_CBC_decrypt;
@ -851,9 +850,8 @@ void AES128_init()
AES128_CBC_decrypt = __soft__AES128_CBC_decrypt; AES128_CBC_decrypt = __soft__AES128_CBC_decrypt;
AES128_ECB_encrypt = __soft__AES128_ECB_encrypt; AES128_ECB_encrypt = __soft__AES128_ECB_encrypt;
} }
#else
AES128_CBC_decrypt = __soft__AES128_CBC_decrypt;
AES128_ECB_encrypt = __soft__AES128_ECB_encrypt;
#endif
} }
bool AES128_useAESNI()
{
return useAESNI;
}

View File

@ -2,7 +2,6 @@
#define _AES_H_ #define _AES_H_
void AES128_init(); void AES128_init();
bool AES128_useAESNI();
extern void(*AES128_ECB_encrypt)(uint8* input, const uint8* key, uint8* output); extern void(*AES128_ECB_encrypt)(uint8* input, const uint8* key, uint8* output);

View File

@ -19,7 +19,8 @@ public:
{ {
if (!m_lockBool.exchange(true, std::memory_order_acquire)) if (!m_lockBool.exchange(true, std::memory_order_acquire))
break; break;
while (m_lockBool.load(std::memory_order_relaxed)) _mm_pause(); while (m_lockBool.load(std::memory_order_relaxed))
_mm_pause();
} }
} }
@ -36,4 +37,4 @@ public:
private: private:
mutable std::atomic<bool> m_lockBool = false; mutable std::atomic<bool> m_lockBool = false;
}; };