diff --git a/Source/Core/Common/BlockingLoop.h b/Source/Core/Common/BlockingLoop.h
new file mode 100644
index 0000000000..d67e07d90e
--- /dev/null
+++ b/Source/Core/Common/BlockingLoop.h
@@ -0,0 +1,164 @@
+// Copyright 2015 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <atomic>
+#include <mutex>
+#include <thread>
+
+#include "Common/Event.h"
+#include "Common/Flag.h"
+
+namespace Common
+{
+
+// This class provides a synchronized loop: one thread executes Run(), other threads trigger and await it.
+// It's a thread-safe way to trigger a new iteration without busy loops.
+// It's optimized for heavily used loops that are usually still running when they get triggered, which happens often.
+class BlockingLoop
+{
+public:
+	BlockingLoop()
+	{
+		m_stopped.Set(); // Start out stopped, so Wait() returns immediately until Prepare() has been called.
+	}
+
+	~BlockingLoop()
+	{
+		Stop(); // Blocking variant. NOTE(review): looks like this never returns if Prepare() ran but Run() never does - confirm.
+	}
+
+	// Requests that the payload of the Run() function is executed at least once more.
+	// This function never blocks and is designed to finish as fast as possible.
+	void Wakeup()
+	{
+		// Already flagged as running, so the loop will iterate again anyway; no need for a wakeup.
+		if (m_is_running.IsSet())
+			return;
+
+		m_is_running.Set();
+		m_is_pending.Set();
+		m_new_work_event.Set();
+	}
+
+	// Waits for a complete payload run after the last Wakeup() call.
+	// If stopped, or if nothing is running or pending, this returns immediately.
+	void Wait()
+	{
+		// Allow the loop to go idle: Run() only clears m_is_running once m_may_sleep is set (or on shutdown).
+		m_may_sleep.Set();
+
+		if (m_stopped.IsSet() || (!m_is_running.IsSet() && !m_is_pending.IsSet()))
+			return;
+
+		// Setting m_done_event wakes up only one waiting thread, so use a mutex here to
+		// allow only one thread to wait on the event at a time. Threads queued on the mutex
+		// continue when the first waiter finishes, so all but the first need no event wakeup.
+		std::lock_guard<std::mutex> lk(m_wait_lock);
+
+		while (!m_stopped.IsSet() && (m_is_running.IsSet() || m_is_pending.IsSet()))
+		{
+			m_may_sleep.Set(); // Set it again on every pass: Run() clears this flag whenever it acts on it.
+			m_done_event.Wait();
+		}
+	}
+
+	// Half-starts the worker.
+	// Afterwards this object is in the running state and Wait() will block until Run() has completed a payload run.
+	// This may be called from any thread and is supposed to be called at least once before Wait() is used.
+	void Prepare()
+	{
+		// There is a race condition if other threads call this function while
+		// the loop thread is initializing. Taking this lock ensures a valid state.
+		std::lock_guard<std::mutex> lk(m_prepare_lock);
+
+		if (!m_stopped.TestAndClear())
+			return;
+		m_is_pending.Set();
+		m_shutdown.Clear();
+		m_may_sleep.Set();
+	}
+
+	// Main loop of this object. It calls Prepare() itself and returns once m_shutdown is set (see Stop()).
+	// The payload callback is called at least as often as needed to satisfy the Wakeup() requests.
+	template<class F> void Run(F payload)
+	{
+		Prepare();
+
+		while (!m_shutdown.IsSet())
+		{
+			payload();
+
+			m_is_pending.Clear();
+			m_done_event.Set();
+
+			if (m_is_running.IsSet())
+			{
+				if (m_may_sleep.IsSet())
+				{
+					m_is_pending.Set(); // One more payload run follows, so work may still be outstanding.
+					m_is_running.Clear();
+
+					// m_is_running is cleared now, so we'll sleep after the next iteration.
+					// Clear this flag as well, so sleeping again needs a new AllowSleep() or Wait() call.
+					m_may_sleep.Clear();
+				}
+			}
+			else
+			{
+				m_new_work_event.WaitFor(std::chrono::milliseconds(100)); // Idle until Wakeup() sets the event; 100 ms timeout.
+			}
+
+		}
+
+		m_is_running.Clear();
+		m_is_pending.Clear();
+		m_stopped.Set();
+
+		m_done_event.Set(); // Release a thread that is blocked in Wait().
+	}
+
+	// Quits the main loop.
+	// By default, this blocks until the main loop has quit.
+	// Be careful not to use the blocking variant from within the payload of the Run() method.
+	void Stop(bool block = true)
+	{
+		if (m_stopped.IsSet())
+			return;
+
+		m_shutdown.Set();
+		Wakeup();
+
+		if (block)
+			Wait();
+	}
+
+	bool IsRunning() const
+	{
+		return !m_stopped.IsSet() && !m_shutdown.IsSet(); // Neither stopped nor asked to shut down.
+	}
+
+	void AllowSleep()
+	{
+		m_may_sleep.Set(); // Lets Run() leave its busy loop and wait on m_new_work_event instead.
+	}
+
+private:
+	std::mutex m_wait_lock; // Held by the thread blocking in Wait(), so only one waits on m_done_event.
+	std::mutex m_prepare_lock; // Serializes Prepare().
+
+	Flag m_stopped; // If this one is set, Wait() shall not block.
+	Flag m_shutdown; // If this one is set, the loop shall quit.
+
+	Event m_new_work_event; // Set by Wakeup(); Run() waits on it while idle.
+	Flag m_is_running; // If this one is set, the payload will be called at least once again.
+
+	Event m_done_event; // Set by Run() after every payload run and on exit; Wait() blocks on it.
+	Flag m_is_pending; // If this one is set, there might still be work to do.
+
+	Flag m_may_sleep; // If this one is set, we fall back from the busy loop to event-based synchronization.
+};
+
+}
diff --git a/Source/Core/Common/Common.vcxproj b/Source/Core/Common/Common.vcxproj
index 7b2c278cd1..fe3a5f22dc 100644
--- a/Source/Core/Common/Common.vcxproj
+++ b/Source/Core/Common/Common.vcxproj
@@ -40,6 +40,7 @@
     <ClInclude Include="Atomic_Win32.h" />
     <ClInclude Include="BitField.h" />
     <ClInclude Include="BitSet.h" />
+    <ClInclude Include="BlockingLoop.h" />
     <ClInclude Include="BreakPoints.h" />
     <ClInclude Include="CDUtils.h" />
     <ClInclude Include="ChunkFile.h" />
diff --git a/Source/Core/Common/Common.vcxproj.filters b/Source/Core/Common/Common.vcxproj.filters
index ffaf7d6be8..712122b3d1 100644
--- a/Source/Core/Common/Common.vcxproj.filters
+++ b/Source/Core/Common/Common.vcxproj.filters
@@ -14,6 +14,7 @@
     <ClInclude Include="Atomic_Win32.h" />
     <ClInclude Include="BitField.h" />
     <ClInclude Include="BitSet.h" />
+    <ClInclude Include="BlockingLoop.h" />
     <ClInclude Include="BreakPoints.h" />
     <ClInclude Include="CDUtils.h" />
     <ClInclude Include="ChunkFile.h" />
@@ -126,4 +127,4 @@
   <ItemGroup>
     <Text Include="CMakeLists.txt" />
   </ItemGroup>
-</Project>
\ No newline at end of file
+</Project>
diff --git a/Source/Core/Core/CoreTiming.cpp b/Source/Core/Core/CoreTiming.cpp
index 8b92f72f35..fa106aa4b8 100644
--- a/Source/Core/Core/CoreTiming.cpp
+++ b/Source/Core/Core/CoreTiming.cpp
@@ -475,7 +475,7 @@ void Idle()
 {
 	//DEBUG_LOG(POWERPC, "Idle");
 
-	if (SConfig::GetInstance().m_LocalCoreStartupParameter.bSyncGPUOnSkipIdleHack)
+	if (SConfig::GetInstance().m_LocalCoreStartupParameter.bSyncGPUOnSkipIdleHack && !SConfig::GetInstance().m_LocalCoreStartupParameter.bSyncGPU)
 	{
 		//When the FIFO is processing data we must not advance because in this way
 		//the VI will be desynchronized. So, We are waiting until the FIFO finish and
diff --git a/Source/Core/Core/HW/SystemTimers.cpp b/Source/Core/Core/HW/SystemTimers.cpp
index 5c84966489..a44614d9e0 100644
--- a/Source/Core/Core/HW/SystemTimers.cpp
+++ b/Source/Core/Core/HW/SystemTimers.cpp
@@ -62,6 +62,7 @@ IPC_HLE_PERIOD: For the Wiimote this is the call schedule:
 #include "Core/PowerPC/PowerPC.h"
 
 #include "VideoCommon/CommandProcessor.h"
+#include "VideoCommon/Fifo.h"
 #include "VideoCommon/VideoBackendBase.h"
 
 
@@ -189,7 +190,7 @@ static void PatchEngineCallback(u64 userdata, int cyclesLate)
 static void ThrottleCallback(u64 last_time, int cyclesLate)
 {
 	// Allow the GPU thread to sleep. Setting this flag here limits the wakeups to 1 kHz.
-	CommandProcessor::s_gpuMaySleep.Set();
+	GpuMaySleep();
 
 	u32 time = Common::Timer::GetTimeMs();
 
diff --git a/Source/Core/VideoCommon/CommandProcessor.cpp b/Source/Core/VideoCommon/CommandProcessor.cpp
index 449e99982f..88b16afe7e 100644
--- a/Source/Core/VideoCommon/CommandProcessor.cpp
+++ b/Source/Core/VideoCommon/CommandProcessor.cpp
@@ -49,8 +49,6 @@ static std::atomic<bool> s_interrupt_finish_waiting;
 
 static std::atomic<u32> s_vi_ticks(CommandProcessor::m_cpClockOrigin);
 
-Common::Flag s_gpuMaySleep;
-
 static bool IsOnThread()
 {
 	return SConfig::GetInstance().m_LocalCoreStartupParameter.bCPUThread;
diff --git a/Source/Core/VideoCommon/CommandProcessor.h b/Source/Core/VideoCommon/CommandProcessor.h
index 0b33150ce4..ee130b82b6 100644
--- a/Source/Core/VideoCommon/CommandProcessor.h
+++ b/Source/Core/VideoCommon/CommandProcessor.h
@@ -17,7 +17,6 @@ namespace CommandProcessor
 {
 
 extern SCPFifoStruct fifo; //This one is shared between gfx thread and emulator thread.
-extern Common::Flag s_gpuMaySleep;
 
 // internal hardware addresses
 enum
diff --git a/Source/Core/VideoCommon/Fifo.cpp b/Source/Core/VideoCommon/Fifo.cpp
index 289a62d8e7..5e482f3670 100644
--- a/Source/Core/VideoCommon/Fifo.cpp
+++ b/Source/Core/VideoCommon/Fifo.cpp
@@ -5,6 +5,7 @@
 #include <atomic>
 
 #include "Common/Atomic.h"
+#include "Common/BlockingLoop.h"
 #include "Common/ChunkFile.h"
 #include "Common/CPUDetect.h"
 #include "Common/Event.h"
@@ -26,11 +27,13 @@
 #include "VideoCommon/OpcodeDecoding.h"
 #include "VideoCommon/PixelEngine.h"
 #include "VideoCommon/VertexLoaderManager.h"
+#include "VideoCommon/VertexManagerBase.h"
 #include "VideoCommon/VideoConfig.h"
 
 bool g_bSkipCurrentFrame = false;
 
-static std::atomic<bool> s_gpu_running_state;
+static Common::BlockingLoop s_gpu_mainloop;
+
 static std::atomic<bool> s_emu_running_state;
 
 // Most of this array is unlikely to be faulted in...
@@ -41,8 +44,6 @@ static u8* s_fifo_aux_read_ptr;
 bool g_use_deterministic_gpu_thread;
 
 // STATE_TO_SAVE
-static std::mutex s_video_buffer_lock;
-static std::condition_variable s_video_buffer_cond;
 static u8* s_video_buffer;
 static u8* s_video_buffer_read_ptr;
 static std::atomic<u8*> s_video_buffer_write_ptr;
@@ -60,12 +61,6 @@ static u8* s_video_buffer_pp_read_ptr;
 // polls, it's just atomic.
 // - The pp_read_ptr is the CPU preprocessing version of the read_ptr.
 
-static Common::Flag s_gpu_is_running; // If this one is set, the gpu loop will be called at least once again
-static Common::Event s_gpu_new_work_event;
-
-static Common::Flag s_gpu_is_pending; // If this one is set, there might still be work to do
-static Common::Event s_gpu_done_event;
-
 void Fifo_DoState(PointerWrap &p)
 {
 	p.DoArray(s_video_buffer, FIFO_SIZE);
@@ -102,13 +97,14 @@ void Fifo_Init()
 	// Padded so that SIMD overreads in the vertex loader are safe
 	s_video_buffer = (u8*)AllocateMemoryPages(FIFO_SIZE + 4);
 	ResetVideoBuffer();
-	s_gpu_running_state.store(false);
+	if (SConfig::GetInstance().m_LocalCoreStartupParameter.bCPUThread)
+		s_gpu_mainloop.Prepare();
 	CommandProcessor::SetVITicks(CommandProcessor::m_cpClockOrigin);
 }
 
 void Fifo_Shutdown()
 {
-	if (s_gpu_running_state.load())
+	if (s_gpu_mainloop.IsRunning())
 		PanicAlert("Fifo shutting down while active");
 
 	FreeMemoryPages(s_video_buffer, FIFO_SIZE + 4);
@@ -135,27 +131,22 @@ void ExitGpuLoop()
 	FlushGpu();
 
 	// Terminate GPU thread loop
-	s_gpu_running_state.store(false);
 	s_emu_running_state.store(true);
-	s_gpu_new_work_event.Set();
+	s_gpu_mainloop.Stop(false);
 }
 
 void EmulatorState(bool running)
 {
 	s_emu_running_state.store(running);
-	s_gpu_new_work_event.Set();
+	s_gpu_mainloop.Wakeup(); // Have the GPU loop run its payload again so it sees the new state.
 }
 
 void SyncGPU(SyncGPUReason reason, bool may_move_read_ptr)
 {
 	if (g_use_deterministic_gpu_thread)
 	{
-		std::unique_lock<std::mutex> lk(s_video_buffer_lock);
-		u8* write_ptr = s_video_buffer_write_ptr;
-		s_video_buffer_cond.wait(lk, [&]() {
-			return !s_gpu_running_state.load() || s_video_buffer_seen_ptr == write_ptr;
-		});
-		if (!s_gpu_running_state.load())
+		s_gpu_mainloop.Wait();
+		if (!s_gpu_mainloop.IsRunning())
 			return;
 
 		// Opportunistically reset FIFOs so we don't wrap around.
@@ -168,6 +159,8 @@ void SyncGPU(SyncGPUReason reason, bool may_move_read_ptr)
 
 		if (may_move_read_ptr)
 		{
+			u8* write_ptr = s_video_buffer_write_ptr;
+
 			// what's left over in the buffer
 			size_t size = write_ptr - s_video_buffer_pp_read_ptr;
 
@@ -188,7 +181,7 @@ void PushFifoAuxBuffer(void* ptr, size_t size)
 	if (size > (size_t) (s_fifo_aux_data + FIFO_SIZE - s_fifo_aux_write_ptr))
 	{
 		SyncGPU(SYNC_GPU_AUX_SPACE, /* may_move_read_ptr */ false);
-		if (!s_gpu_running_state.load())
+		if (!s_gpu_mainloop.IsRunning())
 		{
 			// GPU is shutting down
 			return;
@@ -243,9 +236,9 @@ static void ReadDataFromFifoOnCPU(u32 readPtr)
 		// We can't wrap around while the GPU is working on the data.
 		// This should be very rare due to the reset in SyncGPU.
 		SyncGPU(SYNC_GPU_WRAPAROUND);
-		if (!s_gpu_running_state.load())
+		if (!s_gpu_mainloop.IsRunning())
 		{
-			// GPU is shutting down
+			// GPU is shutting down, so the next asserts may fail
 			return;
 		}
 
@@ -283,18 +276,19 @@ void ResetVideoBuffer()
 // Purpose: Keep the Core HW updated about the CPU-GPU distance
 void RunGpuLoop()
 {
-	s_gpu_running_state.store(true);
-	SCPFifoStruct &fifo = CommandProcessor::fifo;
-	u32 cyclesExecuted = 0;
 
 	AsyncRequests::GetInstance()->SetEnable(true);
 	AsyncRequests::GetInstance()->SetPassthrough(false);
 
-	while (s_gpu_running_state.load())
-	{
+	s_gpu_mainloop.Run(
+	[] {
 		g_video_backend->PeekMessages();
 
-		if (g_use_deterministic_gpu_thread && s_emu_running_state.load())
+		// Do nothing while paused
+		if (!s_emu_running_state.load())
+			return;
+
+		if (g_use_deterministic_gpu_thread)
 		{
 			AsyncRequests::GetInstance()->PullEvents();
 
@@ -305,16 +299,13 @@ void RunGpuLoop()
 			if (write_ptr > seen_ptr)
 			{
 				s_video_buffer_read_ptr = OpcodeDecoder_Run(DataReader(s_video_buffer_read_ptr, write_ptr), nullptr, false);
-
-				{
-					std::lock_guard<std::mutex> vblk(s_video_buffer_lock);
-					s_video_buffer_seen_ptr = write_ptr;
-					s_video_buffer_cond.notify_all();
-				}
+				s_video_buffer_seen_ptr = write_ptr;
 			}
 		}
-		else if (s_emu_running_state.load())
+		else
 		{
+			SCPFifoStruct &fifo = CommandProcessor::fifo;
+
 			AsyncRequests::GetInstance()->PullEvents();
 
 			CommandProcessor::SetCPStatusFromGPU();
@@ -333,6 +324,7 @@ void RunGpuLoop()
 
 				if (!SConfig::GetInstance().m_LocalCoreStartupParameter.bSyncGPU || CommandProcessor::GetVITicks() > CommandProcessor::m_cpClockOrigin)
 				{
+					u32 cyclesExecuted = 0;
 					u32 readPtr = fifo.CPReadPointer;
 					ReadDataFromFifo(readPtr);
 
@@ -369,31 +361,15 @@ void RunGpuLoop()
 				// leading the CPU thread to wait in Video_BeginField or Video_AccessEFB thus slowing things down.
 				AsyncRequests::GetInstance()->PullEvents();
 			}
+			// The fifo is empty and it's unlikely we will get any more work in the near future.
+			// Make sure VertexManager finishes drawing any primitives it has stored in its buffer.
+			VertexManager::Flush();
 
 			// don't release the GPU running state on sync GPU waits
 			fifo.isGpuReadingData = !run_loop;
 		}
+	});
 
-		s_gpu_is_pending.Clear();
-		s_gpu_done_event.Set();
-
-		if (s_gpu_is_running.IsSet())
-		{
-			if (CommandProcessor::s_gpuMaySleep.IsSet())
-			{
-				// Reset the atomic flag. But as the CPU thread might have pushed some new data, we have to rerun the GPU loop
-				s_gpu_is_pending.Set();
-				s_gpu_is_running.Clear();
-				CommandProcessor::s_gpuMaySleep.Clear();
-			}
-		}
-		else
-		{
-			s_gpu_new_work_event.WaitFor(std::chrono::milliseconds(100));
-		}
-	}
-	// wake up SyncGPU if we were interrupted
-	s_video_buffer_cond.notify_all();
 	AsyncRequests::GetInstance()->SetEnable(false);
 	AsyncRequests::GetInstance()->SetPassthrough(true);
 }
@@ -403,11 +379,12 @@ void FlushGpu()
 	if (!SConfig::GetInstance().m_LocalCoreStartupParameter.bCPUThread || g_use_deterministic_gpu_thread)
 		return;
 
-	while (s_gpu_is_running.IsSet() || s_gpu_is_pending.IsSet())
-	{
-		CommandProcessor::s_gpuMaySleep.Set();
-		s_gpu_done_event.Wait();
-	}
+	s_gpu_mainloop.Wait();
+}
+
+void GpuMaySleep()
+{
+	s_gpu_mainloop.AllowSleep(); // Permit the GPU loop to go idle instead of staying in its busy loop.
 }
 
 bool AtBreakpoint()
@@ -429,6 +406,7 @@ void RunGpu()
 			if (g_use_deterministic_gpu_thread)
 			{
 				ReadDataFromFifoOnCPU(fifo.CPReadPointer);
+				s_gpu_mainloop.Wakeup();
 			}
 			else
 			{
@@ -460,11 +438,9 @@ void RunGpu()
 	}
 
 	// wake up GPU thread
-	if (SConfig::GetInstance().m_LocalCoreStartupParameter.bCPUThread && !s_gpu_is_running.IsSet())
+	if (SConfig::GetInstance().m_LocalCoreStartupParameter.bCPUThread)
 	{
-		s_gpu_is_pending.Set();
-		s_gpu_is_running.Set();
-		s_gpu_new_work_event.Set();
+		s_gpu_mainloop.Wakeup();
 	}
 }
 
diff --git a/Source/Core/VideoCommon/Fifo.h b/Source/Core/VideoCommon/Fifo.h
index b59004aa03..8a8a954fe0 100644
--- a/Source/Core/VideoCommon/Fifo.h
+++ b/Source/Core/VideoCommon/Fifo.h
@@ -43,6 +43,7 @@ void* PopFifoAuxBuffer(size_t size);
 
 void FlushGpu();
 void RunGpu();
+void GpuMaySleep();
 void RunGpuLoop();
 void ExitGpuLoop();
 void EmulatorState(bool running);