From 608f9bcd6724c77fef9be8e45ec8f14b49e2b3fb Mon Sep 17 00:00:00 2001
From: comex <comexk@gmail.com>
Date: Mon, 1 Sep 2014 01:11:32 -0400
Subject: [PATCH] Refactor opcode decoding a bit to kill FifoCommandRunnable.

Separated out from my gpu-determinism branch by request.  It's not a big
commit; I just like to write long commit messages.

The main reason to kill it is hopefully a slight performance improvement
from avoiding the double switch (especially in single core mode);
however, this also improves cycle calculation, as described below.

- FifoCommandRunnable is removed; in its stead, Decode returns the
number of cycles (which only matters for "sync" GPU mode), or 0 if there
was not enough data, and is also responsible for unknown opcode alerts.

Decode and DecodeSemiNop are almost identical, so the latter is replaced
with a skipped_frame parameter to Decode.  Doesn't mean we can't improve
skipped_frame mode to do less work; if, at such a point, branching on it
has too much overhead (it certainly won't now), it can always be changed
to a template parameter.

- FifoCommandRunnable used a fixed, large cycle count for display lists,
regardless of the contents.  Presumably the actual hardware's processing
time is mostly the processing time of whatever commands are in the list,
and with this change InterpretDisplayList can just return the list's
cycle count to be added to the total.  (Since the calculation for this
is part of Decode, it didn't seem easy to split this change up.)

To facilitate this, Decode also gains an explicit 'end' parameter in
lieu of FifoCommandRunnable's call to GetVideoBufferEndPtr, which can
point to there or to the end of a display list (or elsewhere in
gpu-determinism, but that's another story).  Also, as a small
optimization, InterpretDisplayList now calls OpcodeDecoder_Run rather
than having its own Decode loop, to allow Decode to be inlined (haven't
checked whether this actually happens though).

skipped_frame mode still does not traverse display lists and uses the
old fake value of 45 cycles.  degasus has suggested that this hack is
not essential for performance and can be removed, but I want to separate
any potential performance impact of that from this commit.
---
 Source/Core/VideoCommon/Fifo.cpp              |   4 +-
 Source/Core/VideoCommon/OpcodeDecoding.cpp    | 370 ++++++------------
 Source/Core/VideoCommon/OpcodeDecoding.h      |   3 +-
 .../Core/VideoCommon/VertexLoaderManager.cpp  |  13 +-
 Source/Core/VideoCommon/VertexLoaderManager.h |   3 +-
 5 files changed, 123 insertions(+), 270 deletions(-)

diff --git a/Source/Core/VideoCommon/Fifo.cpp b/Source/Core/VideoCommon/Fifo.cpp
index 165cd8e239..fb47c73e49 100644
--- a/Source/Core/VideoCommon/Fifo.cpp
+++ b/Source/Core/VideoCommon/Fifo.cpp
@@ -173,7 +173,7 @@ void RunGpuLoop()
 
 				ReadDataFromFifo(uData, 32);
 
-				cyclesExecuted = OpcodeDecoder_Run(g_bSkipCurrentFrame);
+				cyclesExecuted = OpcodeDecoder_Run(g_bSkipCurrentFrame, GetVideoBufferEndPtr());
 
 				if (Core::g_CoreStartupParameter.bSyncGPU && Common::AtomicLoad(CommandProcessor::VITicks) > cyclesExecuted)
 					Common::AtomicAdd(CommandProcessor::VITicks, -(s32)cyclesExecuted);
@@ -235,7 +235,7 @@ void RunGpu()
 		FPURoundMode::SaveSIMDState();
 		FPURoundMode::LoadDefaultSIMDState();
 		ReadDataFromFifo(uData, 32);
-		OpcodeDecoder_Run(g_bSkipCurrentFrame);
+		OpcodeDecoder_Run(g_bSkipCurrentFrame, GetVideoBufferEndPtr());
 		FPURoundMode::LoadSIMDState();
 
 		//DEBUG_LOG(COMMANDPROCESSOR, "Fifo wraps to base");
diff --git a/Source/Core/VideoCommon/OpcodeDecoding.cpp b/Source/Core/VideoCommon/OpcodeDecoding.cpp
index ff9cd8e735..fe644db21e 100644
--- a/Source/Core/VideoCommon/OpcodeDecoding.cpp
+++ b/Source/Core/VideoCommon/OpcodeDecoding.cpp
@@ -75,13 +75,13 @@ static DataReadU32xNfunc DataReadU32xFuncs[16] = {
 	DataReadU32xN<16>
 };
 
-static void Decode();
-
-void InterpretDisplayList(u32 address, u32 size)
+static u32 InterpretDisplayList(u32 address, u32 size)
 {
 	u8* old_pVideoData = g_pVideoData;
 	u8* startAddress = Memory::GetPointer(address);
 
+	u32 cycles = 0;
+
 	// Avoid the crash if Memory::GetPointer failed ..
 	if (startAddress != nullptr)
 	{
@@ -91,10 +91,7 @@ void InterpretDisplayList(u32 address, u32 size)
 		Statistics::SwapDL();
 
 		u8 *end = g_pVideoData + size;
-		while (g_pVideoData < end)
-		{
-			Decode();
-		}
+		cycles = OpcodeDecoder_Run(false, end);
 		INCSTAT(stats.thisFrame.numDListsCalled);
 
 		// un-swap
@@ -103,186 +100,71 @@ void InterpretDisplayList(u32 address, u32 size)
 
 	// reset to the old pointer
 	g_pVideoData = old_pVideoData;
+
+	return cycles;
 }
 
-static u32 FifoCommandRunnable(u32 &command_size)
+static void UnknownOpcode(u8 cmd_byte, void *buffer, bool preprocess)
 {
-	u32 cycleTime = 0;
-	u32 buffer_size = (u32)(GetVideoBufferEndPtr() - g_pVideoData);
-	if (buffer_size == 0)
-		return 0;  // can't peek
-
-	u8 cmd_byte = DataPeek8(0);
-
-	switch (cmd_byte)
+	// TODO(Omega): Maybe dump FIFO to file on this error
+	std::string temp = StringFromFormat(
+		"GFX FIFO: Unknown Opcode (0x%x @ %p).\n"
+		"This means one of the following:\n"
+		"* The emulated GPU got desynced, disabling dual core can help\n"
+		"* Command stream corrupted by some spurious memory bug\n"
+		"* This really is an unknown opcode (unlikely)\n"
+		"* Some other sort of bug\n\n"
+		"Dolphin will now likely crash or hang. Enjoy." ,
+		cmd_byte,
+		buffer);
+	Host_SysMessage(temp.c_str());
+	INFO_LOG(VIDEO, "%s", temp.c_str());
 	{
-	case GX_NOP: // Hm, this means that we scan over nop streams pretty slowly...
-		command_size = 1;
-		cycleTime = 6;
-		break;
-	case GX_CMD_INVL_VC: // Invalidate Vertex Cache - no parameters
-		command_size = 1;
-		cycleTime = 6;
-		break;
-	case GX_CMD_UNKNOWN_METRICS: // zelda 4 swords calls it and checks the metrics registers after that
-		command_size = 1;
-		cycleTime = 6;
-		break;
+		SCPFifoStruct &fifo = CommandProcessor::fifo;
 
-	case GX_LOAD_BP_REG:
-		command_size = 5;
-		cycleTime = 12;
-		break;
+		std::string tmp = StringFromFormat(
+			"Illegal command %02x\n"
+			"CPBase: 0x%08x\n"
+			"CPEnd: 0x%08x\n"
+			"CPHiWatermark: 0x%08x\n"
+			"CPLoWatermark: 0x%08x\n"
+			"CPReadWriteDistance: 0x%08x\n"
+			"CPWritePointer: 0x%08x\n"
+			"CPReadPointer: 0x%08x\n"
+			"CPBreakpoint: 0x%08x\n"
+			"bFF_GPReadEnable: %s\n"
+			"bFF_BPEnable: %s\n"
+			"bFF_BPInt: %s\n"
+			"bFF_Breakpoint: %s\n"
+			,cmd_byte, fifo.CPBase, fifo.CPEnd, fifo.CPHiWatermark, fifo.CPLoWatermark, fifo.CPReadWriteDistance
+			,fifo.CPWritePointer, fifo.CPReadPointer, fifo.CPBreakpoint, fifo.bFF_GPReadEnable ? "true" : "false"
+			,fifo.bFF_BPEnable ? "true" : "false" ,fifo.bFF_BPInt ? "true" : "false"
+			,fifo.bFF_Breakpoint ? "true" : "false");
 
-	case GX_LOAD_CP_REG:
-		command_size = 6;
-		cycleTime = 12;
-		break;
-
-	case GX_LOAD_INDX_A:
-	case GX_LOAD_INDX_B:
-	case GX_LOAD_INDX_C:
-	case GX_LOAD_INDX_D:
-		command_size = 5;
-		cycleTime = 6; // TODO
-		break;
-
-	case GX_CMD_CALL_DL:
-		{
-			// FIXME: Calculate the cycle time of the display list.
-			//u32 address = DataPeek32(1);
-			//u32 size = DataPeek32(5);
-			//u8* old_pVideoData = g_pVideoData;
-			//u8* startAddress = Memory::GetPointer(address);
-
-			//// Avoid the crash if Memory::GetPointer failed ..
-			//if (startAddress != 0)
-			//{
-			//	g_pVideoData = startAddress;
-			//	u8 *end = g_pVideoData + size;
-			//	u32 step = 0;
-			//	while (g_pVideoData < end)
-			//	{
-			//		cycleTime += FifoCommandRunnable(step);
-			//		g_pVideoData += step;
-			//	}
-			//}
-			//else
-			//{
-			//	cycleTime = 45;
-			//}
-
-			//// reset to the old pointer
-			//g_pVideoData = old_pVideoData;
-			command_size = 9;
-			cycleTime = 45;  // This is unverified
-		}
-		break;
-
-	case GX_LOAD_XF_REG:
-		{
-			// check if we can read the header
-			if (buffer_size >= 5)
-			{
-				command_size = 1 + 4;
-				u32 Cmd2 = DataPeek32(1);
-				int transfer_size = ((Cmd2 >> 16) & 15) + 1;
-				command_size += transfer_size * 4;
-				cycleTime = 18 + 6 * transfer_size;
-			}
-			else
-			{
-				return 0;
-			}
-		}
-		break;
-
-	default:
-		if ((cmd_byte & 0xC0) == 0x80)
-		{
-			// check if we can read the header
-			if (buffer_size >= 3)
-			{
-				command_size = 1 + 2;
-				u16 numVertices = DataPeek16(1);
-				command_size += numVertices * VertexLoaderManager::GetVertexSize(cmd_byte & GX_VAT_MASK);
-				cycleTime = 1600; // This depends on the number of pixels rendered
-			}
-			else
-			{
-				return 0;
-			}
-		}
-		else
-		{
-			// TODO(Omega): Maybe dump FIFO to file on this error
-			std::string temp = StringFromFormat(
-				"GFX FIFO: Unknown Opcode (0x%x).\n"
-				"This means one of the following:\n"
-				"* The emulated GPU got desynced, disabling dual core can help\n"
-				"* Command stream corrupted by some spurious memory bug\n"
-				"* This really is an unknown opcode (unlikely)\n"
-				"* Some other sort of bug\n\n"
-				"Dolphin will now likely crash or hang. Enjoy." , cmd_byte);
-			Host_SysMessage(temp.c_str());
-			INFO_LOG(VIDEO, "%s", temp.c_str());
-			{
-				SCPFifoStruct &fifo = CommandProcessor::fifo;
-
-				std::string tmp = StringFromFormat(
-					"Illegal command %02x\n"
-					"CPBase: 0x%08x\n"
-					"CPEnd: 0x%08x\n"
-					"CPHiWatermark: 0x%08x\n"
-					"CPLoWatermark: 0x%08x\n"
-					"CPReadWriteDistance: 0x%08x\n"
-					"CPWritePointer: 0x%08x\n"
-					"CPReadPointer: 0x%08x\n"
-					"CPBreakpoint: 0x%08x\n"
-					"bFF_GPReadEnable: %s\n"
-					"bFF_BPEnable: %s\n"
-					"bFF_BPInt: %s\n"
-					"bFF_Breakpoint: %s\n"
-					,cmd_byte, fifo.CPBase, fifo.CPEnd, fifo.CPHiWatermark, fifo.CPLoWatermark, fifo.CPReadWriteDistance
-					,fifo.CPWritePointer, fifo.CPReadPointer, fifo.CPBreakpoint, fifo.bFF_GPReadEnable ? "true" : "false"
-					,fifo.bFF_BPEnable ? "true" : "false" ,fifo.bFF_BPInt ? "true" : "false"
-					,fifo.bFF_Breakpoint ? "true" : "false");
-
-				Host_SysMessage(tmp.c_str());
-				INFO_LOG(VIDEO, "%s", tmp.c_str());
-			}
-		}
-		break;
+		Host_SysMessage(tmp.c_str());
+		INFO_LOG(VIDEO, "%s", tmp.c_str());
 	}
-
-	if (command_size > buffer_size)
-		return 0;
-
-	// INFO_LOG("OP detected: cmd_byte 0x%x  size %i  buffer %i",cmd_byte, command_size, buffer_size);
-	if (cycleTime == 0)
-		cycleTime = 6;
-
-	return cycleTime;
 }
 
-static u32 FifoCommandRunnable()
-{
-	u32 command_size = 0;
-	return FifoCommandRunnable(command_size);
-}
-
-static void Decode()
+static u32 Decode(u8* end, bool skipped_frame)
 {
 	u8 *opcodeStart = g_pVideoData;
+	if (g_pVideoData == end)
+		return 0;
 
-	int cmd_byte = DataReadU8();
+	u8 cmd_byte = DataReadU8();
+	u32 cycles;
 	switch (cmd_byte)
 	{
 	case GX_NOP:
+		cycles = 6; // Hm, this means that we scan over nop streams pretty slowly...
 		break;
 
 	case GX_LOAD_CP_REG: //0x08
 		{
+			if (end - g_pVideoData < 1 + 4)
+				return 0;
+			cycles = 12;
 			u8 sub_cmd = DataReadU8();
 			u32 value = DataReadU32();
 			LoadCPReg(sub_cmd, value);
@@ -292,8 +174,13 @@ static void Decode()
 
 	case GX_LOAD_XF_REG:
 		{
+			if (end - g_pVideoData < 4)
+				return 0;
 			u32 Cmd2 = DataReadU32();
 			int transfer_size = ((Cmd2 >> 16) & 15) + 1;
+			if ((size_t) (end - g_pVideoData) < transfer_size * sizeof(u32))
+				return 0;
+			cycles = 18 + 6 * transfer_size;
 			u32 xf_address = Cmd2 & 0xFFFF;
 			GC_ALIGNED128(u32 data_buffer[16]);
 			DataReadU32xFuncs[transfer_size-1](data_buffer);
@@ -304,36 +191,60 @@ static void Decode()
 		break;
 
 	case GX_LOAD_INDX_A: //used for position matrices
+		if (end - g_pVideoData < 4)
+			return 0;
+		cycles = 6;
 		LoadIndexedXF(DataReadU32(), 0xC);
 		break;
 	case GX_LOAD_INDX_B: //used for normal matrices
+		if (end - g_pVideoData < 4)
+			return 0;
+		cycles = 6;
 		LoadIndexedXF(DataReadU32(), 0xD);
 		break;
 	case GX_LOAD_INDX_C: //used for postmatrices
+		if (end - g_pVideoData < 4)
+			return 0;
+		cycles = 6;
 		LoadIndexedXF(DataReadU32(), 0xE);
 		break;
 	case GX_LOAD_INDX_D: //used for lights
+		if (end - g_pVideoData < 4)
+			return 0;
+		cycles = 6;
 		LoadIndexedXF(DataReadU32(), 0xF);
 		break;
 
 	case GX_CMD_CALL_DL:
 		{
+			if (end - g_pVideoData < 8)
+				return 0;
 			u32 address = DataReadU32();
 			u32 count = DataReadU32();
-			InterpretDisplayList(address, count);
+			if (skipped_frame)
+				cycles = 45; // xxx
+			else
+				cycles = 6 + InterpretDisplayList(address, count);
 		}
 		break;
 
 	case GX_CMD_UNKNOWN_METRICS: // zelda 4 swords calls it and checks the metrics registers after that
+		cycles = 6;
 		DEBUG_LOG(VIDEO, "GX 0x44: %08x", cmd_byte);
 		break;
 
 	case GX_CMD_INVL_VC: // Invalidate Vertex Cache
+		cycles = 6;
 		DEBUG_LOG(VIDEO, "Invalidate (vertex cache?)");
 		break;
 
 	case GX_LOAD_BP_REG: //0x61
+		// In skipped_frame case: We have to let BP writes through because they set
+		// tokens and stuff.  TODO: Call a much simplified LoadBPReg instead.
 		{
+			if (end - g_pVideoData < 4)
+				return 0;
+			cycles = 12;
 			u32 bp_cmd = DataReadU32();
 			LoadBPReg(bp_cmd);
 			INCSTAT(stats.thisFrame.numBPLoads);
@@ -344,18 +255,33 @@ static void Decode()
 	default:
 		if ((cmd_byte & 0xC0) == 0x80)
 		{
-			// load vertices (use computed vertex size from FifoCommandRunnable above)
+			cycles = 1600;
+			// load vertices
+			if (end - g_pVideoData < 2)
+				return 0;
 			u16 numVertices = DataReadU16();
 
-			VertexLoaderManager::RunVertices(
-				cmd_byte & GX_VAT_MASK,   // Vertex loader index (0 - 7)
-				(cmd_byte & GX_PRIMITIVE_MASK) >> GX_PRIMITIVE_SHIFT,
-				numVertices);
+			if (skipped_frame)
+			{
+				size_t size = numVertices * VertexLoaderManager::GetVertexSize(cmd_byte & GX_VAT_MASK);
+				if ((size_t) (end - g_pVideoData) < size)
+					return 0;
+				DataSkip((u32)size);
+			}
+			else
+			{
+				if (!VertexLoaderManager::RunVertices(
+					cmd_byte & GX_VAT_MASK,   // Vertex loader index (0 - 7)
+					(cmd_byte & GX_PRIMITIVE_MASK) >> GX_PRIMITIVE_SHIFT,
+					numVertices,
+					end - g_pVideoData))
+					return 0;
+			}
 		}
 		else
 		{
-			ERROR_LOG(VIDEO, "OpcodeDecoding::Decode: Illegal command %02x", cmd_byte);
-			break;
+			UnknownOpcode(cmd_byte, opcodeStart, false);
+			cycles = 1;
 		}
 		break;
 	}
@@ -363,89 +289,8 @@ static void Decode()
 	// Display lists get added directly into the FIFO stream
 	if (g_bRecordFifoData && cmd_byte != GX_CMD_CALL_DL)
 		FifoRecorder::GetInstance().WriteGPCommand(opcodeStart, u32(g_pVideoData - opcodeStart));
-}
 
-static void DecodeSemiNop()
-{
-	u8 *opcodeStart = g_pVideoData;
-
-	int cmd_byte = DataReadU8();
-	switch (cmd_byte)
-	{
-	case GX_CMD_UNKNOWN_METRICS: // zelda 4 swords calls it and checks the metrics registers after that
-	case GX_CMD_INVL_VC: // Invalidate Vertex Cache
-	case GX_NOP:
-		break;
-
-	case GX_LOAD_CP_REG: //0x08
-		// We have to let CP writes through because they determine the size of vertices.
-		{
-			u8 sub_cmd = DataReadU8();
-			u32 value = DataReadU32();
-			LoadCPReg(sub_cmd, value);
-			INCSTAT(stats.thisFrame.numCPLoads);
-		}
-		break;
-
-	case GX_LOAD_XF_REG:
-		{
-			u32 Cmd2 = DataReadU32();
-			int transfer_size = ((Cmd2 >> 16) & 15) + 1;
-			u32 address = Cmd2 & 0xFFFF;
-			GC_ALIGNED128(u32 data_buffer[16]);
-			DataReadU32xFuncs[transfer_size-1](data_buffer);
-			LoadXFReg(transfer_size, address, data_buffer);
-			INCSTAT(stats.thisFrame.numXFLoads);
-		}
-		break;
-
-	case GX_LOAD_INDX_A: //used for position matrices
-		LoadIndexedXF(DataReadU32(), 0xC);
-		break;
-	case GX_LOAD_INDX_B: //used for normal matrices
-		LoadIndexedXF(DataReadU32(), 0xD);
-		break;
-	case GX_LOAD_INDX_C: //used for postmatrices
-		LoadIndexedXF(DataReadU32(), 0xE);
-		break;
-	case GX_LOAD_INDX_D: //used for lights
-		LoadIndexedXF(DataReadU32(), 0xF);
-		break;
-
-	case GX_CMD_CALL_DL:
-		// Hm, wonder if any games put tokens in display lists - in that case,
-		// we'll have to parse them too.
-		DataSkip(8);
-		break;
-
-	case GX_LOAD_BP_REG: //0x61
-		// We have to let BP writes through because they set tokens and stuff.
-		// TODO: Call a much simplified LoadBPReg instead.
-		{
-			u32 bp_cmd = DataReadU32();
-			LoadBPReg(bp_cmd);
-			INCSTAT(stats.thisFrame.numBPLoads);
-		}
-		break;
-
-	// draw primitives
-	default:
-		if ((cmd_byte & 0xC0) == 0x80)
-		{
-			// load vertices (use computed vertex size from FifoCommandRunnable above)
-			u16 numVertices = DataReadU16();
-			DataSkip(numVertices * VertexLoaderManager::GetVertexSize(cmd_byte & GX_VAT_MASK));
-		}
-		else
-		{
-			ERROR_LOG(VIDEO, "OpcodeDecoding::Decode: Illegal command %02x", cmd_byte);
-			break;
-		}
-		break;
-	}
-
-	if (g_bRecordFifoData && cmd_byte != GX_CMD_CALL_DL)
-		FifoRecorder::GetInstance().WriteGPCommand(opcodeStart, u32(g_pVideoData - opcodeStart));
+	return cycles;
 }
 
 void OpcodeDecoder_Init()
@@ -466,15 +311,18 @@ void OpcodeDecoder_Shutdown()
 {
 }
 
-u32 OpcodeDecoder_Run(bool skipped_frame)
+u32 OpcodeDecoder_Run(bool skipped_frame, u8* end)
 {
 	u32 totalCycles = 0;
 	while (true)
 	{
-		u32 cycles = FifoCommandRunnable();
+		u8* old = g_pVideoData;
+		u32 cycles = Decode(end, skipped_frame);
 		if (cycles == 0)
+		{
+			g_pVideoData = old;
 			break;
-		skipped_frame ? DecodeSemiNop() : Decode();
+		}
 		totalCycles += cycles;
 	}
 	return totalCycles;
diff --git a/Source/Core/VideoCommon/OpcodeDecoding.h b/Source/Core/VideoCommon/OpcodeDecoding.h
index baf44bd40f..2944863bbc 100644
--- a/Source/Core/VideoCommon/OpcodeDecoding.h
+++ b/Source/Core/VideoCommon/OpcodeDecoding.h
@@ -38,5 +38,4 @@ extern bool g_bRecordFifoData;
 
 void OpcodeDecoder_Init();
 void OpcodeDecoder_Shutdown();
-u32 OpcodeDecoder_Run(bool skipped_frame);
-void InterpretDisplayList(u32 address, u32 size);
+u32 OpcodeDecoder_Run(bool skipped_frame, u8* end);
diff --git a/Source/Core/VideoCommon/VertexLoaderManager.cpp b/Source/Core/VideoCommon/VertexLoaderManager.cpp
index b74fc029db..7ff21d103b 100644
--- a/Source/Core/VideoCommon/VertexLoaderManager.cpp
+++ b/Source/Core/VideoCommon/VertexLoaderManager.cpp
@@ -151,17 +151,21 @@ static VertexLoaderCacheItem RefreshLoader(int vtx_attr_group)
 	return s_VertexLoaders[vtx_attr_group];
 }
 
-void RunVertices(int vtx_attr_group, int primitive, int count)
+bool RunVertices(int vtx_attr_group, int primitive, int count, size_t buf_size)
 {
 	if (!count)
-		return;
+		return true;
 	auto loader = RefreshLoader(vtx_attr_group);
 
+	size_t size = count * loader.first->GetVertexSize();
+	if (buf_size < size)
+		return false;
+
 	if (bpmem.genMode.cullmode == GenMode::CULL_ALL && primitive < 5)
 	{
 		// if cull mode is CULL_ALL, ignore triangles and quads
-		DataSkip(count * loader.first->GetVertexSize());
-		return;
+		DataSkip((u32)size);
+		return true;
 	}
 
 	// If the native vertex format changed, force a flush.
@@ -178,6 +182,7 @@ void RunVertices(int vtx_attr_group, int primitive, int count)
 
 	ADDSTAT(stats.thisFrame.numPrims, count);
 	INCSTAT(stats.thisFrame.numPrimitiveJoins);
+	return true;
 }
 
 int GetVertexSize(int vtx_attr_group)
diff --git a/Source/Core/VideoCommon/VertexLoaderManager.h b/Source/Core/VideoCommon/VertexLoaderManager.h
index 17bf13c3ba..ff3485b9d5 100644
--- a/Source/Core/VideoCommon/VertexLoaderManager.h
+++ b/Source/Core/VideoCommon/VertexLoaderManager.h
@@ -17,7 +17,8 @@ namespace VertexLoaderManager
 	void MarkAllDirty();
 
 	int GetVertexSize(int vtx_attr_group);
-	void RunVertices(int vtx_attr_group, int primitive, int count);
+	// Returns false if buf_size is insufficient.
+	bool RunVertices(int vtx_attr_group, int primitive, int count, size_t buf_size);
 
 	// For debugging
 	void AppendListToString(std::string *dest);