diff --git a/app/src/main/cpp/skyline/gpu/command_scheduler.cpp b/app/src/main/cpp/skyline/gpu/command_scheduler.cpp
index 0092db84..0c78c7fb 100644
--- a/app/src/main/cpp/skyline/gpu/command_scheduler.cpp
+++ b/app/src/main/cpp/skyline/gpu/command_scheduler.cpp
@@ -32,10 +32,11 @@ namespace skyline::gpu {
     }
 
     CommandScheduler::CommandBufferSlot::CommandBufferSlot(vk::raii::Device &device, vk::CommandBuffer commandBuffer, vk::raii::CommandPool &pool)
-        : device(device),
-          commandBuffer(device, static_cast<VkCommandBuffer>(commandBuffer), static_cast<VkCommandPool>(*pool)),
-          fence(device, vk::FenceCreateInfo{}),
-          cycle(std::make_shared<FenceCycle>(device, *fence)) {}
+        : device{device},
+          commandBuffer{device, static_cast<VkCommandBuffer>(commandBuffer), static_cast<VkCommandPool>(*pool)},
+          fence{device, vk::FenceCreateInfo{}},
+          semaphore{device, vk::SemaphoreCreateInfo{}},
+          cycle{std::make_shared<FenceCycle>(device, *fence, *semaphore)} {}
 
     CommandScheduler::CommandScheduler(const DeviceState &state, GPU &pGpu)
         : state{state},
@@ -55,7 +56,7 @@ namespace skyline::gpu {
             if (!slot.active.test_and_set(std::memory_order_acq_rel)) {
                 if (slot.cycle->Poll()) {
                     slot.commandBuffer.reset();
-                    slot.cycle = std::make_shared<FenceCycle>(slot.device, *slot.fence);
+                    slot.cycle = std::make_shared<FenceCycle>(*slot.cycle);
                     return {slot};
                 } else {
                     slot.active.clear(std::memory_order_release);
@@ -76,12 +77,29 @@ namespace skyline::gpu {
         return {pool->buffers.emplace_back(gpu.vkDevice, commandBuffer, pool->vkCommandPool)};
     }
 
-    void CommandScheduler::SubmitCommandBuffer(const vk::raii::CommandBuffer &commandBuffer, std::shared_ptr<FenceCycle> cycle) {
+    void CommandScheduler::SubmitCommandBuffer(const vk::raii::CommandBuffer &commandBuffer, std::shared_ptr<FenceCycle> cycle, span<vk::Semaphore> waitSemaphores, span<vk::Semaphore> signalSemaphores) {
+        boost::container::small_vector<vk::Semaphore, 3> fullWaitSemaphores{waitSemaphores.begin(), waitSemaphores.end()};
+        boost::container::small_vector<vk::PipelineStageFlags, 3> fullWaitStages{waitSemaphores.size(), vk::PipelineStageFlagBits::eAllCommands};
+
+        if (cycle->semaphoreSubmitWait) {
+            fullWaitSemaphores.push_back(cycle->semaphore);
+            // We don't need a full barrier since this is only done to ensure the semaphore is unsignalled
+            fullWaitStages.push_back(vk::PipelineStageFlagBits::eTopOfPipe);
+        }
+
+        boost::container::small_vector<vk::Semaphore, 2> fullSignalSemaphores{signalSemaphores.begin(), signalSemaphores.end()};
+        fullSignalSemaphores.push_back(cycle->semaphore);
+
         {
-            std::scoped_lock lock(gpu.queueMutex);
+            std::scoped_lock lock{gpu.queueMutex};
             gpu.vkQueue.submit(vk::SubmitInfo{
                 .commandBufferCount = 1,
                 .pCommandBuffers = &*commandBuffer,
+                .waitSemaphoreCount = static_cast<u32>(fullWaitSemaphores.size()), // Must count the full list, otherwise the cycle semaphore wait appended above is silently dropped
+                .pWaitSemaphores = fullWaitSemaphores.data(),
+                .pWaitDstStageMask = fullWaitStages.data(), // Parallel to pWaitSemaphores, holds one stage mask per wait semaphore
+                .signalSemaphoreCount = static_cast<u32>(fullSignalSemaphores.size()),
+                .pSignalSemaphores = fullSignalSemaphores.data(),
             }, cycle->fence);
         }
 
diff --git a/app/src/main/cpp/skyline/gpu/command_scheduler.h b/app/src/main/cpp/skyline/gpu/command_scheduler.h
index 02f19290..6d3c3e06 100644
--- a/app/src/main/cpp/skyline/gpu/command_scheduler.h
+++ b/app/src/main/cpp/skyline/gpu/command_scheduler.h
@@ -21,6 +21,7 @@ namespace skyline::gpu {
             const vk::raii::Device &device;
             vk::raii::CommandBuffer commandBuffer;
             vk::raii::Fence fence; //!< A fence used for tracking all submits of a buffer
+            vk::raii::Semaphore semaphore; //!< A semaphore used for tracking work status on the GPU
             std::shared_ptr<FenceCycle> cycle; //!< The latest cycle on the fence, all waits must be performed through this
 
             CommandBufferSlot(vk::raii::Device &device, vk::CommandBuffer commandBuffer, vk::raii::CommandPool &pool);
@@ -94,7 +95,7 @@ namespace skyline::gpu {
              */
             std::shared_ptr<FenceCycle> Reset() {
                 slot->cycle->Wait();
-                slot->cycle = std::make_shared<FenceCycle>(slot->device, *slot->fence);
+                slot->cycle = std::make_shared<FenceCycle>(*slot->cycle);
                 slot->commandBuffer.reset();
                 return slot->cycle;
             }
@@ -114,13 +115,15 @@ namespace skyline::gpu {
          * @note The supplied command buffer and cycle **must** be from AllocateCommandBuffer()
          * @note Any cycle submitted via this method does not need to destroy dependencies manually, the waiter thread will handle this
          */
-        void SubmitCommandBuffer(const vk::raii::CommandBuffer &commandBuffer, std::shared_ptr<FenceCycle> cycle);
+        void SubmitCommandBuffer(const vk::raii::CommandBuffer &commandBuffer, std::shared_ptr<FenceCycle> cycle, span<vk::Semaphore> waitSemaphores = {}, span<vk::Semaphore> signalSemaphores = {});
 
         /**
          * @brief Submits a command buffer recorded with the supplied function synchronously
+         * @param waitSemaphores A span of all semaphores (excluding the fence cycle's own semaphore) that should be waited on by the GPU before executing the command buffer
+         * @param signalSemaphores A span of all semaphores that should be signalled by the GPU after executing the command buffer
          */
         template<typename RecordFunction>
-        std::shared_ptr<FenceCycle> Submit(RecordFunction recordFunction) {
+        std::shared_ptr<FenceCycle> Submit(RecordFunction recordFunction, span<vk::Semaphore> waitSemaphores = {}, span<vk::Semaphore> signalSemaphores = {}) {
             auto commandBuffer{AllocateCommandBuffer()};
             try {
                 commandBuffer->begin(vk::CommandBufferBeginInfo{
@@ -130,29 +133,7 @@ namespace skyline::gpu {
                 commandBuffer->end();
 
                 auto cycle{commandBuffer.GetFenceCycle()};
-                SubmitCommandBuffer(*commandBuffer, cycle);
-                return cycle;
-            } catch (...) {
-                commandBuffer.GetFenceCycle()->Cancel();
-                std::rethrow_exception(std::current_exception());
-            }
-        }
-
-        /**
-         * @note Same as Submit but with FenceCycle as an argument rather than return value
-         */
-        template<typename RecordFunction>
-        std::shared_ptr<FenceCycle> SubmitWithCycle(RecordFunction recordFunction) {
-            auto commandBuffer{AllocateCommandBuffer()};
-            auto cycle{commandBuffer.GetFenceCycle()};
-            try {
-                commandBuffer->begin(vk::CommandBufferBeginInfo{
-                    .flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit,
-                });
-                recordFunction(*commandBuffer, cycle);
-                commandBuffer->end();
-
-                SubmitCommandBuffer(*commandBuffer, cycle);
+                SubmitCommandBuffer(*commandBuffer, cycle, waitSemaphores, signalSemaphores);
                 return cycle;
             } catch (...) {
                 commandBuffer.GetFenceCycle()->Cancel();
diff --git a/app/src/main/cpp/skyline/gpu/fence_cycle.h b/app/src/main/cpp/skyline/gpu/fence_cycle.h
index 98a3c035..c38a30f2 100644
--- a/app/src/main/cpp/skyline/gpu/fence_cycle.h
+++ b/app/src/main/cpp/skyline/gpu/fence_cycle.h
@@ -25,6 +25,10 @@ namespace skyline::gpu {
         std::condition_variable_any submitCondition;
         bool submitted{}; //!< If the fence has been submitted to the GPU
         vk::Fence fence;
+        vk::Semaphore semaphore; //!< Semaphore that will be signalled upon GPU completion of the fence
+        bool semaphoreSubmitWait{}; //!< If the semaphore needs to be waited on (on GPU) before the fence's command buffer begins. Used to ensure fences that wouldn't otherwise be unsignalled are unsignalled
+        bool nextSemaphoreSubmitWait{true}; //!< If the next fence cycle created from this one after it's signalled should wait on the semaphore to unsignal it
+        std::shared_ptr<FenceCycle> semaphoreUnsignalCycle{}; //!< If the semaphore is used on the GPU, the cycle for the submission that uses it, so it can be waited on before the fence is signalled to ensure the semaphore is unsignalled
 
         friend CommandScheduler;
 
@@ -41,11 +45,15 @@ namespace skyline::gpu {
         }
 
       public:
-        FenceCycle(const vk::raii::Device &device, vk::Fence fence, bool signalled = false) : signalled{signalled}, device{device}, fence{fence} {
+        FenceCycle(const vk::raii::Device &device, vk::Fence fence, vk::Semaphore semaphore, bool signalled = false) : signalled{signalled}, device{device}, fence{fence}, semaphore{semaphore}, nextSemaphoreSubmitWait{!signalled} {
             if (!signalled)
                 device.resetFences(fence);
         }
 
+        explicit FenceCycle(const FenceCycle &cycle) : signalled{false}, device{cycle.device}, fence{cycle.fence}, semaphore{cycle.semaphore}, semaphoreSubmitWait{cycle.nextSemaphoreSubmitWait} {
+            device.resetFences(fence);
+        }
+
         ~FenceCycle() {
             Wait();
         }
@@ -58,6 +66,33 @@ namespace skyline::gpu {
             DestroyDependencies();
         }
 
+        /**
+         * @brief Calls the supplied function so it can record a GPU-side wait on this cycle's semaphore, storing the cycle it returns as the semaphore's unsignal cycle. If the semaphore can't be handed out (it already has a recorded usage, or this cycle is already signalled) the function is passed a default-constructed semaphore instead, with a CPU-side wait on the fence being performed first in the former case
+         */
+        std::shared_ptr<FenceCycle> RecordSemaphoreWaitUsage(std::function<std::shared_ptr<FenceCycle>(vk::Semaphore sema)> &&func) {
+            // A wait on the semaphore can't be submitted until the submission that signals it has been submitted, so block on that first
+            WaitSubmit();
+
+            std::unique_lock lock{mutex};
+
+            // If we already have a semaphore usage, just wait on the fence (CPU-side) since we can't wait on the semaphore twice and have no way to add a wait after the fact
+            if (semaphoreUnsignalCycle) {
+                // Safe to unlock since semaphoreUnsignalCycle can never be reset
+                lock.unlock();
+
+                Wait();
+                return func({}); // The function receives a default-constructed semaphore as there's nothing left for it to wait on
+            }
+
+            // If we're already signalled then there's no need to wait on the semaphore
+            if (signalled.test(std::memory_order_relaxed))
+                return func({});
+
+            semaphoreUnsignalCycle = func(semaphore); // Called with the mutex still held, the returned cycle is what Wait()/Poll() check before marking this cycle as signalled
+            nextSemaphoreSubmitWait = false; // We don't need a semaphore wait on the next fence cycle to unsignal the semaphore anymore as the usage will do that
+            return semaphoreUnsignalCycle;
+        }
+
         /**
          * @brief Waits for submission of the command buffer associated with this cycle to the GPU
          */
@@ -65,11 +100,21 @@ namespace skyline::gpu {
             if (signalled.test(std::memory_order_consume))
                 return;
 
+            std::unique_lock lock{mutex};
+            if (submitted)
+                return;
+
+            if (signalled.test(std::memory_order_consume))
+                return;
+
+            lock.unlock();
             chainedCycles.Iterate([&](const auto &cycle) {
+                if (!cycle->Find(this))
+                    raise(SIGTRAP);
                 cycle->WaitSubmit();
             });
+            lock.lock();
 
-            std::unique_lock lock{mutex};
             submitCondition.wait(lock, [this] { return submitted; });
         }
 
@@ -84,19 +129,19 @@ namespace skyline::gpu {
                 return;
             }
 
-            chainedCycles.Iterate([shouldDestroy](auto &cycle) {
+            chainedCycles.Iterate([shouldDestroy, this](auto &cycle) {
                 cycle->Wait(shouldDestroy);
             });
 
             std::unique_lock lock{mutex};
-            submitCondition.wait(lock, [&] { return submitted; });
-
-            if (signalled.test(std::memory_order_consume)) {
+            if (signalled.test(std::memory_order_relaxed)) {
                 if (shouldDestroy)
                     DestroyDependencies();
                 return;
             }
 
+            submitCondition.wait(lock, [&] { return submitted; });
+
             vk::Result waitResult;
             while ((waitResult = (*device).waitForFences(1, &fence, false, std::numeric_limits<u64>::max(), *device.getDispatcher())) != vk::Result::eSuccess) {
                 if (waitResult == vk::Result::eTimeout)
@@ -110,68 +155,14 @@ namespace skyline::gpu {
                 throw exception("An error occurred while waiting for fence 0x{:X}: {}", static_cast<VkFence>(fence), vk::to_string(waitResult));
             }
 
-            signalled.test_and_set(std::memory_order_release);
+            if (semaphoreUnsignalCycle)
+                semaphoreUnsignalCycle->Wait();
+
+            signalled.test_and_set(std::memory_order_relaxed);
             if (shouldDestroy)
                 DestroyDependencies();
         }
 
-        /**
-         * @brief Wait on a fence cycle with a timeout in nanoseconds
-         * @param shouldDestroy If true, the dependencies of this cycle will be destroyed after the fence is signalled
-         * @return If the wait was successful or timed out
-         */
-        bool Wait(i64 timeoutNs, bool shouldDestroy = false) {
-            if (signalled.test(std::memory_order_consume)) {
-                if (shouldDestroy)
-                    DestroyDependencies();
-                return true;
-            }
-
-            i64 startTime{util::GetTimeNs()}, initialTimeout{timeoutNs};
-            if (!chainedCycles.AllOf([&](auto &cycle) {
-                if (!cycle->Wait(timeoutNs, shouldDestroy))
-                    return false;
-                timeoutNs = std::max<i64>(0, initialTimeout - (util::GetTimeNs() - startTime));
-                return true;
-            }))
-                return false;
-
-            std::unique_lock lock{mutex, std::defer_lock};
-            if (!lock.try_lock_for(std::chrono::nanoseconds{timeoutNs}))
-                return false;
-
-            if (!submitCondition.wait_for(lock, std::chrono::nanoseconds(timeoutNs), [&] { return submitted; }))
-                return false;
-
-            timeoutNs = std::max<i64>(0, initialTimeout - (util::GetTimeNs() - startTime));
-
-            vk::Result waitResult;
-            while ((waitResult = (*device).waitForFences(1, &fence, false, static_cast<u64>(timeoutNs), *device.getDispatcher())) != vk::Result::eSuccess) {
-                if (waitResult == vk::Result::eTimeout)
-                    break;
-
-                if (waitResult == vk::Result::eErrorInitializationFailed) {
-                    timeoutNs = std::max<i64>(0, initialTimeout - (util::GetTimeNs() - startTime));
-                    continue;
-                }
-
-                throw exception("An error occurred while waiting for fence 0x{:X}: {}", static_cast<VkFence>(fence), vk::to_string(waitResult));
-            }
-
-            if (waitResult == vk::Result::eSuccess) {
-                signalled.test_and_set(std::memory_order_release);
-                if (shouldDestroy)
-                    DestroyDependencies();
-                return true;
-            } else {
-                return false;
-            }
-        }
-
-        bool Wait(std::chrono::duration<i64, std::nano> timeout, bool shouldDestroy = false) {
-            return Wait(timeout.count(), shouldDestroy);
-        }
-
         /**
          * @param quick Skips the call to check the fence's status, just checking the signalled flag
          * @return If the fence is signalled currently or not
@@ -193,12 +184,21 @@ namespace skyline::gpu {
             if (!lock)
                 return false;
 
+            if (signalled.test(std::memory_order_relaxed)) {
+                if (shouldDestroy)
+                    DestroyDependencies();
+                return true;
+            }
+
             if (!submitted)
                 return false;
 
             auto status{(*device).getFenceStatus(fence, *device.getDispatcher())};
             if (status == vk::Result::eSuccess) {
-                signalled.test_and_set(std::memory_order_release);
+                if (semaphoreUnsignalCycle && !semaphoreUnsignalCycle->Poll())
+                    return false;
+
+                signalled.test_and_set(std::memory_order_relaxed);
                 if (shouldDestroy)
                     DestroyDependencies();
                 return true;
diff --git a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
index b43c62ff..0cda42d9 100644
--- a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
+++ b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
@@ -28,11 +28,12 @@ namespace skyline::gpu::interconnect {
           },
           commandBuffer{AllocateRaiiCommandBuffer(gpu, commandPool)},
           fence{gpu.vkDevice, vk::FenceCreateInfo{ .flags = vk::FenceCreateFlagBits::eSignaled }},
-          cycle{std::make_shared<FenceCycle>(gpu.vkDevice, *fence, true)} {}
+          semaphore{gpu.vkDevice, vk::SemaphoreCreateInfo{}},
+          cycle{std::make_shared<FenceCycle>(gpu.vkDevice, *fence, *semaphore, true)} {}
 
     std::shared_ptr<FenceCycle> CommandRecordThread::Slot::Reset(GPU &gpu) {
         cycle->Wait();
-        cycle = std::make_shared<FenceCycle>(gpu.vkDevice, *fence);
+        cycle = std::make_shared<FenceCycle>(*cycle);
         // Command buffer doesn't need to be reset since that's done implicitly by begin
         return cycle;
     }
diff --git a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h
index 72b3df0e..92da3c47 100644
--- a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h
+++ b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h
@@ -24,6 +24,7 @@ namespace skyline::gpu::interconnect {
             vk::raii::CommandPool commandPool; //!< Use one command pool per slot since command buffers from different slots may be recorded into on multiple threads at the same time
             vk::raii::CommandBuffer commandBuffer;
             vk::raii::Fence fence;
+            vk::raii::Semaphore semaphore;
             std::shared_ptr<FenceCycle> cycle;
             boost::container::stable_vector<node::NodeVariant> nodes;
             LinearAllocatorState<> allocator;