diff --git a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
index 6c4b5db5..81f295c3 100644
--- a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
+++ b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
@@ -672,23 +672,37 @@ namespace skyline::gpu::interconnect {
 
         executionTag = AllocateTag();
 
-        if (!slot->nodes.empty()) {
-            TRACE_EVENT("gpu", "CommandExecutor::Submit");
+        // Ensure all pushed callbacks wait for the submission to have finished GPU execution
+        if (!slot->nodes.empty())
+            waiterThread.Queue(cycle, {});
 
-            if (callback && *state.settings->useDirectMemoryImport)
-                waiterThread.Queue(cycle, std::move(callback));
-            else
-                waiterThread.Queue(cycle, {});
+        if (*state.settings->useDirectMemoryImport) {
+            // When DMI is in use, callbacks and deferred actions should be executed in sequence with the host GPU
+            for (auto &actionCb : pendingDeferredActions)
+                waiterThread.Queue(nullptr, std::move(actionCb));
 
-            SubmitInternal();
-            submissionNumber++;
-        } else {
-            if (callback && *state.settings->useDirectMemoryImport)
+            pendingDeferredActions.clear();
+
+            if (callback)
                 waiterThread.Queue(nullptr, std::move(callback));
         }
 
-        if (callback && !*state.settings->useDirectMemoryImport)
-            callback();
+        if (!slot->nodes.empty()) {
+            TRACE_EVENT("gpu", "CommandExecutor::Submit");
+            SubmitInternal();
+            submissionNumber++;
+        }
+
+        if (!*state.settings->useDirectMemoryImport) {
+            // When DMI is not in use, execute callbacks immediately after submission
+            for (auto &actionCb : pendingDeferredActions)
+                actionCb();
+
+            pendingDeferredActions.clear();
+
+            if (callback)
+                callback();
+        }
 
         ResetInternal();
 
@@ -710,6 +724,10 @@ namespace skyline::gpu::interconnect {
         }
     }
 
+    void CommandExecutor::AddDeferredAction(std::function<void()> &&callback) {
+        pendingDeferredActions.emplace_back(std::move(callback)); // Only stored here; Submit later either queues it on the waiter thread (DMI on) or calls it directly (DMI off), then clears the list
+    }
+
     void CommandExecutor::LockPreserve() {
         if (!preserveLocked) {
             preserveLocked = true;
diff --git a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h
index 1d8cde1f..ec89fa20 100644
--- a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h
+++ b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h
@@ -202,6 +202,8 @@ namespace skyline::gpu::interconnect {
         std::vector<std::function<void()>> flushCallbacks; //!< Set of persistent callbacks that will be called at the start of Execute in order to flush data required for recording
         std::vector<std::function<void()>> pipelineChangeCallbacks; //!< Set of persistent callbacks that will be called after any non-Maxwell 3D engine changes the active pipeline
 
+        std::vector<std::function<void()>> pendingDeferredActions; //!< Actions added via AddDeferredAction that Submit drains: queued on the waiter thread when DMI is in use, otherwise called right after submission
+
         u32 nextCheckpointId{}; //!< The ID of the next debug checkpoint to be allocated
 
         void RotateRecordSlot();
@@ -372,6 +374,11 @@ namespace skyline::gpu::interconnect {
          */
         void Submit(std::function<void()> &&callback = {}, bool wait = false);
 
+        /**
+         * @brief Adds an action for the next Submit to run: in sequence with the host GPU upon current cycle completion if DMI is on, or immediately after submission otherwise
+         */
+        void AddDeferredAction(std::function<void()> &&callback);
+
         /**
          * @brief Locks all preserve attached buffers/textures
          * @note This **MUST** be called before attaching any buffers/textures to an execution
diff --git a/app/src/main/cpp/skyline/soc/gm20b/engines/gpfifo.cpp b/app/src/main/cpp/skyline/soc/gm20b/engines/gpfifo.cpp
index ae59d9f4..67c47f5f 100644
--- a/app/src/main/cpp/skyline/soc/gm20b/engines/gpfifo.cpp
+++ b/app/src/main/cpp/skyline/soc/gm20b/engines/gpfifo.cpp
@@ -19,7 +19,7 @@ namespace skyline::soc::gm20b::engine {
             ENGINE_STRUCT_CASE(syncpoint, action, {
                 if (action.operation == Registers::Syncpoint::Operation::Incr) {
                     Logger::Debug("Increment syncpoint: {}", +action.index);
-                    channelCtx.executor.Submit([=, syncpoints = &this->syncpoints, index = action.index]() {
+                    channelCtx.executor.AddDeferredAction([=, syncpoints = &this->syncpoints, index = action.index]() {
                         syncpoints->at(index).host.Increment();
                     });
                     syncpoints.at(action.index).guest.Increment();
@@ -50,7 +50,7 @@ namespace skyline::soc::gm20b::engine {
                         channelCtx.Lock();
                         break;
                     case Registers::Semaphore::Operation::Release:
-                        channelCtx.executor.Submit([this, action, address, payload = registers.semaphore->payload] () {
+                        channelCtx.executor.AddDeferredAction([this, action, address, payload = registers.semaphore->payload] () {
                             // Write timestamp first to ensure ordering
                             if (action.releaseSize == Registers::Semaphore::ReleaseSize::SixteenBytes) {
                                 channelCtx.asCtx->gmmu.Write<u32>(address + 4, 0);
@@ -121,7 +121,7 @@ namespace skyline::soc::gm20b::engine {
                 channelCtx.executor.AddFullBarrier();
             })
             ENGINE_CASE(setReference, {
-                channelCtx.executor.Submit();
+                channelCtx.executor.AddFullBarrier();
             })
         }
     };
diff --git a/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_3d.cpp b/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_3d.cpp
index be2af8d6..5cc1cbaa 100644
--- a/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_3d.cpp
+++ b/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_3d.cpp
@@ -264,7 +264,7 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
 
             ENGINE_CASE(syncpointAction, {
                 Logger::Debug("Increment syncpoint: {}", static_cast<u16>(syncpointAction.id));
-                channelCtx.executor.Submit([=, syncpoints = &this->syncpoints, index = syncpointAction.id]() {
+                channelCtx.executor.AddDeferredAction([=, syncpoints = &this->syncpoints, index = syncpointAction.id]() {
                     syncpoints->at(index).host.Increment();
                 });
                 syncpoints.at(syncpointAction.id).guest.Increment();
@@ -399,7 +399,7 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
 
                 switch (info.op) {
                     case type::SemaphoreInfo::Op::Release:
-                        channelCtx.executor.Submit([=, this, semaphore = *registers.semaphore]() {
+                        channelCtx.executor.AddDeferredAction([=, this, semaphore = *registers.semaphore]() {
                             WriteSemaphoreResult(semaphore, semaphore.payload);
                         });
                         break;
@@ -407,7 +407,7 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
                     case type::SemaphoreInfo::Op::Counter: {
                         switch (info.counterType) {
                             case type::SemaphoreInfo::CounterType::Zero:
-                                channelCtx.executor.Submit([=, this, semaphore = *registers.semaphore]() {
+                                channelCtx.executor.AddDeferredAction([=, this, semaphore = *registers.semaphore]() {
                                     WriteSemaphoreResult(semaphore, semaphore.payload);
                                 });
                                 break;