diff --git a/Source/Core/Common/Src/Common.h b/Source/Core/Common/Src/Common.h index 6cc7cf0581..8f08d8d4a7 100644 --- a/Source/Core/Common/Src/Common.h +++ b/Source/Core/Common/Src/Common.h @@ -82,6 +82,7 @@ private: #define GC_ALIGNED16(x) __declspec(align(16)) x #define GC_ALIGNED32(x) __declspec(align(32)) x #define GC_ALIGNED64(x) __declspec(align(64)) x + #define GC_ALIGNED128(x) __declspec(align(128)) x #define GC_ALIGNED16_DECL(x) __declspec(align(16)) x #define GC_ALIGNED64_DECL(x) __declspec(align(64)) x @@ -129,6 +130,7 @@ private: #define GC_ALIGNED16(x) __attribute__((aligned(16))) x #define GC_ALIGNED32(x) __attribute__((aligned(32))) x #define GC_ALIGNED64(x) __attribute__((aligned(64))) x +#define GC_ALIGNED128(x) __attribute__((aligned(128))) x #define GC_ALIGNED16_DECL(x) __attribute__((aligned(16))) x #define GC_ALIGNED64_DECL(x) __attribute__((aligned(64))) x #endif diff --git a/Source/Core/VideoCommon/Src/DLCache.cpp b/Source/Core/VideoCommon/Src/DLCache.cpp index 8d1a9b829b..4c3490ed01 100644 --- a/Source/Core/VideoCommon/Src/DLCache.cpp +++ b/Source/Core/VideoCommon/Src/DLCache.cpp @@ -305,7 +305,7 @@ u8 AnalyzeAndRunDisplayList(u32 address, int size, CachedDisplayList *dl) u32 Cmd2 = DataReadU32(); int transfer_size = ((Cmd2 >> 16) & 15) + 1; u32 xf_address = Cmd2 & 0xFFFF; - u32 data_buffer[16]; + GC_ALIGNED128(u32 data_buffer[16]); DataReadU32xFuncs[transfer_size-1](data_buffer); LoadXFReg(transfer_size, xf_address, data_buffer); INCSTAT(stats.thisFrame.numXFLoads); @@ -455,10 +455,10 @@ bool CompileAndRunDisplayList(u32 address, int size, CachedDisplayList *dl) ReferencedDataRegion* NewRegion = new ReferencedDataRegion; NewRegion->MustClean = true; NewRegion->size = transfer_size * 4; - NewRegion->start_address = (u8*) new u8[NewRegion->size]; + NewRegion->start_address = (u8*) new u8[NewRegion->size+0xf]; // alignment NewRegion->hash = 0; dl->InsertRegion(NewRegion); - u32 *data_buffer = (u32*)NewRegion->start_address; + u32 *data_buffer = (u32*)(u8*)(((size_t)NewRegion->start_address+0xf)&~0xf); DataReadU32xFuncs[transfer_size-1](data_buffer); LoadXFReg(transfer_size, xf_address, data_buffer); INCSTAT(stats.thisFrame.numXFLoads); diff --git a/Source/Core/VideoCommon/Src/DataReader.h b/Source/Core/VideoCommon/Src/DataReader.h index 7feb066645..96cdfc0739 100644 --- a/Source/Core/VideoCommon/Src/DataReader.h +++ b/Source/Core/VideoCommon/Src/DataReader.h @@ -83,35 +83,29 @@ const __m128i mask4 = _mm_set_epi8(12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3); template void DataReadU32xN_SSSE3(u32 *bufx16) { - __m128i* store = (__m128i *)bufx16; - __m128i* load = (__m128i *)g_pVideoData; + memcpy(bufx16, g_pVideoData, sizeof(u32) * N); + __m128i* buf = (__m128i *)bufx16; switch(N) { - case 13: _mm_storeu_si128(store++, _mm_shuffle_epi8(_mm_loadu_si128(load++), mask4)); - case 9: _mm_storeu_si128(store++, _mm_shuffle_epi8(_mm_loadu_si128(load++), mask4)); - case 5: _mm_storeu_si128(store++, _mm_shuffle_epi8(_mm_loadu_si128(load++), mask4)); - case 1: // 1 U32 left: - ((u32 *)store)[0] = Common::swap32(((u32 *)load)[0]); + case 13: _mm_store_si128(buf, _mm_shuffle_epi8(_mm_load_si128(buf), mask4)); buf++; + case 9: _mm_store_si128(buf, _mm_shuffle_epi8(_mm_load_si128(buf), mask4)); buf++; + case 5: _mm_store_si128(buf, _mm_shuffle_epi8(_mm_load_si128(buf), mask4)); buf++; + case 1: _mm_store_si128(buf, _mm_shuffle_epi8(_mm_load_si128(buf), mask1)); break; - case 14: _mm_storeu_si128(store++, _mm_shuffle_epi8(_mm_loadu_si128(load++), mask4)); - case 10: _mm_storeu_si128(store++, _mm_shuffle_epi8(_mm_loadu_si128(load++), mask4)); - case 6: _mm_storeu_si128(store++, _mm_shuffle_epi8(_mm_loadu_si128(load++), mask4)); - case 2: // 2 U32s left: - ((u32 *)store)[0] = Common::swap32(((u32 *)load)[0]); - ((u32 *)store)[1] = Common::swap32(((u32 *)load)[1]); + case 14: _mm_store_si128(buf, _mm_shuffle_epi8(_mm_load_si128(buf), mask4)); buf++; + case 10: _mm_store_si128(buf, _mm_shuffle_epi8(_mm_load_si128(buf), mask4)); buf++; + case 6: _mm_store_si128(buf, _mm_shuffle_epi8(_mm_load_si128(buf), mask4)); buf++; + case 2: _mm_store_si128(buf, _mm_shuffle_epi8(_mm_load_si128(buf), mask2)); break; - case 15: _mm_storeu_si128(store++, _mm_shuffle_epi8(_mm_loadu_si128(load++), mask4)); - case 11: _mm_storeu_si128(store++, _mm_shuffle_epi8(_mm_loadu_si128(load++), mask4)); - case 7: _mm_storeu_si128(store++, _mm_shuffle_epi8(_mm_loadu_si128(load++), mask4)); - case 3: // 3 U32s left: - ((u32 *)store)[0] = Common::swap32(((u32 *)load)[0]); - ((u32 *)store)[1] = Common::swap32(((u32 *)load)[1]); - ((u32 *)store)[2] = Common::swap32(((u32 *)load)[2]); + case 15: _mm_store_si128(buf, _mm_shuffle_epi8(_mm_load_si128(buf), mask4)); buf++; + case 11: _mm_store_si128(buf, _mm_shuffle_epi8(_mm_load_si128(buf), mask4)); buf++; + case 7: _mm_store_si128(buf, _mm_shuffle_epi8(_mm_load_si128(buf), mask4)); buf++; + case 3: _mm_store_si128(buf, _mm_shuffle_epi8(_mm_load_si128(buf), mask3)); break; - case 16: _mm_storeu_si128(store++, _mm_shuffle_epi8(_mm_loadu_si128(load++), mask4)); - case 12: _mm_storeu_si128(store++, _mm_shuffle_epi8(_mm_loadu_si128(load++), mask4)); - case 8: _mm_storeu_si128(store++, _mm_shuffle_epi8(_mm_loadu_si128(load++), mask4)); - case 4: _mm_storeu_si128(store, _mm_shuffle_epi8(_mm_loadu_si128(load), mask4)); + case 16: _mm_store_si128(buf, _mm_shuffle_epi8(_mm_load_si128(buf), mask4)); buf++; + case 12: _mm_store_si128(buf, _mm_shuffle_epi8(_mm_load_si128(buf), mask4)); buf++; + case 8: _mm_store_si128(buf, _mm_shuffle_epi8(_mm_load_si128(buf), mask4)); buf++; + case 4: _mm_store_si128(buf, _mm_shuffle_epi8(_mm_load_si128(buf), mask4)); break; } g_pVideoData += (sizeof(u32) * N); diff --git a/Source/Core/VideoCommon/Src/OpcodeDecoding.cpp b/Source/Core/VideoCommon/Src/OpcodeDecoding.cpp index 3555f1fba1..8b7564158a 100644 --- a/Source/Core/VideoCommon/Src/OpcodeDecoding.cpp +++ b/Source/Core/VideoCommon/Src/OpcodeDecoding.cpp @@ -364,8 +364,7 @@ static void DecodeSemiNop() u32 Cmd2 = DataReadU32(); int transfer_size = ((Cmd2 >> 16) & 15) + 1; u32 address = Cmd2 & 0xFFFF; - // TODO - speed this up. pshufb? - u32 data_buffer[16]; + GC_ALIGNED128(u32 data_buffer[16]); DataReadU32xFuncs[transfer_size-1](data_buffer); LoadXFReg(transfer_size, address, data_buffer); INCSTAT(stats.thisFrame.numXFLoads);