From ef6f6a7fa9a90efa78d0ae557e5546dd6568bb74 Mon Sep 17 00:00:00 2001
From: degasus <wickmarkus@web.de>
Date: Wed, 3 Sep 2014 22:39:26 +0200
Subject: [PATCH] VideoCommon: remove XFReg copy optimization

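This code is just ugly, and I doubt that copying the data twice (first into
an aligned temporary buffer, then from that buffer into xfmem) can be faster
than byte-swapping it straight from the video data stream into xfmem.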
---
 Source/Core/VideoCommon/DataReader.h       | 40 ----------------
 Source/Core/VideoCommon/OpcodeDecoding.cpp | 53 +---------------------
 Source/Core/VideoCommon/XFMemory.h         |  2 +-
 Source/Core/VideoCommon/XFStructs.cpp      | 21 +++++----
 4 files changed, 15 insertions(+), 101 deletions(-)

diff --git a/Source/Core/VideoCommon/DataReader.h b/Source/Core/VideoCommon/DataReader.h
index 17c927a42b..85beec3a11 100644
--- a/Source/Core/VideoCommon/DataReader.h
+++ b/Source/Core/VideoCommon/DataReader.h
@@ -92,46 +92,6 @@ __forceinline u32 DataReadU32()
 	return DataRead<u32>();
 }
 
-#if _M_SSE >= 0x301
-const __m128i bs_mask = _mm_set_epi32(0x0C0D0E0FL, 0x08090A0BL, 0x04050607L, 0x00010203L);
-
-template<unsigned int N>
-void DataReadU32xN_SSSE3(u32 *bufx16)
-{
-	memcpy(bufx16, g_pVideoData, sizeof(u32) * N);
-	__m128i* buf = (__m128i *)bufx16;
-	if (N>12) { _mm_store_si128(buf, _mm_shuffle_epi8(_mm_load_si128(buf), bs_mask)); buf++; }
-	if (N>8)  { _mm_store_si128(buf, _mm_shuffle_epi8(_mm_load_si128(buf), bs_mask)); buf++; }
-	if (N>4)  { _mm_store_si128(buf, _mm_shuffle_epi8(_mm_load_si128(buf), bs_mask)); buf++; }
-	_mm_store_si128(buf, _mm_shuffle_epi8(_mm_load_si128(buf), bs_mask));
-	g_pVideoData += (sizeof(u32) * N);
-}
-
-#endif
-
-template<unsigned int N>
-void DataReadU32xN(u32 *bufx16)
-{
-	memcpy(bufx16, g_pVideoData, sizeof(u32) * N);
-	if (N >= 1) bufx16[0] = Common::swap32(bufx16[0]);
-	if (N >= 2) bufx16[1] = Common::swap32(bufx16[1]);
-	if (N >= 3) bufx16[2] = Common::swap32(bufx16[2]);
-	if (N >= 4) bufx16[3] = Common::swap32(bufx16[3]);
-	if (N >= 5) bufx16[4] = Common::swap32(bufx16[4]);
-	if (N >= 6) bufx16[5] = Common::swap32(bufx16[5]);
-	if (N >= 7) bufx16[6] = Common::swap32(bufx16[6]);
-	if (N >= 8) bufx16[7] = Common::swap32(bufx16[7]);
-	if (N >= 9) bufx16[8] = Common::swap32(bufx16[8]);
-	if (N >= 10) bufx16[9] = Common::swap32(bufx16[9]);
-	if (N >= 11) bufx16[10] = Common::swap32(bufx16[10]);
-	if (N >= 12) bufx16[11] = Common::swap32(bufx16[11]);
-	if (N >= 13) bufx16[12] = Common::swap32(bufx16[12]);
-	if (N >= 14) bufx16[13] = Common::swap32(bufx16[13]);
-	if (N >= 15) bufx16[14] = Common::swap32(bufx16[14]);
-	if (N >= 16) bufx16[15] = Common::swap32(bufx16[15]);
-	g_pVideoData += (sizeof(u32) * N);
-}
-
 __forceinline u32 DataReadU32Unswapped()
 {
 	u32 tmp = *(u32*)g_pVideoData;
diff --git a/Source/Core/VideoCommon/OpcodeDecoding.cpp b/Source/Core/VideoCommon/OpcodeDecoding.cpp
index fe644db21e..a5bd8a62ee 100644
--- a/Source/Core/VideoCommon/OpcodeDecoding.cpp
+++ b/Source/Core/VideoCommon/OpcodeDecoding.cpp
@@ -34,47 +34,6 @@
 u8* g_pVideoData = nullptr;
 bool g_bRecordFifoData = false;
 
-typedef void (*DataReadU32xNfunc)(u32 *buf);
-#if _M_SSE >= 0x301
-static DataReadU32xNfunc DataReadU32xFuncs_SSSE3[16] = {
-	DataReadU32xN_SSSE3<1>,
-	DataReadU32xN_SSSE3<2>,
-	DataReadU32xN_SSSE3<3>,
-	DataReadU32xN_SSSE3<4>,
-	DataReadU32xN_SSSE3<5>,
-	DataReadU32xN_SSSE3<6>,
-	DataReadU32xN_SSSE3<7>,
-	DataReadU32xN_SSSE3<8>,
-	DataReadU32xN_SSSE3<9>,
-	DataReadU32xN_SSSE3<10>,
-	DataReadU32xN_SSSE3<11>,
-	DataReadU32xN_SSSE3<12>,
-	DataReadU32xN_SSSE3<13>,
-	DataReadU32xN_SSSE3<14>,
-	DataReadU32xN_SSSE3<15>,
-	DataReadU32xN_SSSE3<16>
-};
-#endif
-
-static DataReadU32xNfunc DataReadU32xFuncs[16] = {
-	DataReadU32xN<1>,
-	DataReadU32xN<2>,
-	DataReadU32xN<3>,
-	DataReadU32xN<4>,
-	DataReadU32xN<5>,
-	DataReadU32xN<6>,
-	DataReadU32xN<7>,
-	DataReadU32xN<8>,
-	DataReadU32xN<9>,
-	DataReadU32xN<10>,
-	DataReadU32xN<11>,
-	DataReadU32xN<12>,
-	DataReadU32xN<13>,
-	DataReadU32xN<14>,
-	DataReadU32xN<15>,
-	DataReadU32xN<16>
-};
-
 static u32 InterpretDisplayList(u32 address, u32 size)
 {
 	u8* old_pVideoData = g_pVideoData;
@@ -182,9 +141,7 @@ static u32 Decode(u8* end, bool skipped_frame)
 				return 0;
 			cycles = 18 + 6 * transfer_size;
 			u32 xf_address = Cmd2 & 0xFFFF;
-			GC_ALIGNED128(u32 data_buffer[16]);
-			DataReadU32xFuncs[transfer_size-1](data_buffer);
-			LoadXFReg(transfer_size, xf_address, data_buffer);
+			LoadXFReg(transfer_size, xf_address);
 
 			INCSTAT(stats.thisFrame.numXFLoads);
 		}
@@ -296,14 +253,6 @@ static u32 Decode(u8* end, bool skipped_frame)
 void OpcodeDecoder_Init()
 {
 	g_pVideoData = GetVideoBufferStartPtr();
-
-#if _M_SSE >= 0x301
-	if (cpu_info.bSSSE3)
-	{
-		for (int i = 0; i < 16; ++i)
-			DataReadU32xFuncs[i] = DataReadU32xFuncs_SSSE3[i];
-	}
-#endif
 }
 
 
diff --git a/Source/Core/VideoCommon/XFMemory.h b/Source/Core/VideoCommon/XFMemory.h
index f36378b1e9..3646a70264 100644
--- a/Source/Core/VideoCommon/XFMemory.h
+++ b/Source/Core/VideoCommon/XFMemory.h
@@ -273,5 +273,5 @@ struct XFMemory
 
 extern XFMemory xfmem;
 
-void LoadXFReg(u32 transferSize, u32 address, u32 *pData);
+void LoadXFReg(u32 transferSize, u32 address);
 void LoadIndexedXF(u32 val, int array);
diff --git a/Source/Core/VideoCommon/XFStructs.cpp b/Source/Core/VideoCommon/XFStructs.cpp
index 235c76bea6..650a4a913b 100644
--- a/Source/Core/VideoCommon/XFStructs.cpp
+++ b/Source/Core/VideoCommon/XFStructs.cpp
@@ -5,6 +5,7 @@
 #include "Common/Common.h"
 #include "Core/HW/Memmap.h"
 #include "VideoCommon/CPMemory.h"
+#include "VideoCommon/DataReader.h"
 #include "VideoCommon/PixelShaderManager.h"
 #include "VideoCommon/VertexManagerBase.h"
 #include "VideoCommon/VertexShaderManager.h"
@@ -17,14 +18,14 @@ static void XFMemWritten(u32 transferSize, u32 baseAddress)
 	VertexShaderManager::InvalidateXFRange(baseAddress, baseAddress + transferSize);
 }
 
-static void XFRegWritten(int transferSize, u32 baseAddress, u32 *pData)
+static void XFRegWritten(int transferSize, u32 baseAddress)
 {
 	u32 address = baseAddress;
 	u32 dataIndex = 0;
 
 	while (transferSize > 0 && address < 0x1058)
 	{
-		u32 newValue = pData[dataIndex];
+		u32 newValue = DataPeek<u32>(dataIndex * sizeof(u32));
 		u32 nextAddress = address + 1;
 
 		switch (address)
@@ -191,7 +192,7 @@ static void XFRegWritten(int transferSize, u32 baseAddress, u32 *pData)
 	}
 }
 
-void LoadXFReg(u32 transferSize, u32 baseAddress, u32 *pData)
+void LoadXFReg(u32 transferSize, u32 baseAddress)
 {
 	// do not allow writes past registers
 	if (baseAddress + transferSize > 0x1058)
@@ -225,16 +226,20 @@ void LoadXFReg(u32 transferSize, u32 baseAddress, u32 *pData)
 		}
 
 		XFMemWritten(xfMemTransferSize, xfMemBase);
-		memcpy((u32*)(&xfmem) + xfMemBase, pData, xfMemTransferSize * 4);
-
-		pData += xfMemTransferSize;
+		for (u32 i = 0; i < xfMemTransferSize; i++)
+		{
+			((u32*)&xfmem)[xfMemBase + i] = DataRead<u32>();
+		}
 	}
 
 	// write to XF regs
 	if (transferSize > 0)
 	{
-		XFRegWritten(transferSize, baseAddress, pData);
-		memcpy((u32*)(&xfmem) + baseAddress, pData, transferSize * 4);
+		XFRegWritten(transferSize, baseAddress);
+		for (u32 i = 0; i < transferSize; i++)
+		{
+			((u32*)&xfmem)[baseAddress + i] = DataRead<u32>();
+		}
 	}
 }