From 8471ab754d9965c13d51b11cb2f9ff9db903f3cb Mon Sep 17 00:00:00 2001
From: Billy Laws
Date: Wed, 31 Aug 2022 16:38:04 +0100
Subject: [PATCH] Introduce a spin lock for resources locked at a very high
 frequency

Constant buffer updates result in a barrage of std::mutex calls that take
a lot of time even under no contention (around 5%). Using a custom spinlock
in cases like these allows the locking code to be inlined, reducing the cost
of uncontended locks to almost zero.
---
 app/CMakeLists.txt                            |  1 +
 .../main/cpp/skyline/common/address_space.h   |  3 +-
 app/src/main/cpp/skyline/common/spin_lock.cpp | 28 +++++++
 app/src/main/cpp/skyline/common/spin_lock.h   | 80 +++++++++++++++++++
 app/src/main/cpp/skyline/gpu/buffer.h         |  5 +-
 app/src/main/cpp/skyline/gpu/buffer_manager.h |  3 +-
 6 files changed, 116 insertions(+), 4 deletions(-)
 create mode 100644 app/src/main/cpp/skyline/common/spin_lock.cpp
 create mode 100644 app/src/main/cpp/skyline/common/spin_lock.h

diff --git a/app/CMakeLists.txt b/app/CMakeLists.txt
index 6ae4e087..0cd1a6ea 100644
--- a/app/CMakeLists.txt
+++ b/app/CMakeLists.txt
@@ -142,6 +142,7 @@ add_library(skyline SHARED
         ${source_DIR}/skyline/common/exception.cpp
         ${source_DIR}/skyline/common/logger.cpp
         ${source_DIR}/skyline/common/signal.cpp
+        ${source_DIR}/skyline/common/spin_lock.cpp
         ${source_DIR}/skyline/common/uuid.cpp
         ${source_DIR}/skyline/common/trace.cpp
         ${source_DIR}/skyline/nce/guest.S
diff --git a/app/src/main/cpp/skyline/common/address_space.h b/app/src/main/cpp/skyline/common/address_space.h
index feec6d91..e87d5b39 100644
--- a/app/src/main/cpp/skyline/common/address_space.h
+++ b/app/src/main/cpp/skyline/common/address_space.h
@@ -7,6 +7,7 @@
 #include
 #include
 #include "segment_table.h"
+#include "spin_lock.h"
 
 namespace skyline {
     template
@@ -54,7 +55,7 @@ namespace skyline {
             }
         };
 
-        std::mutex blockMutex;
+        SpinLock blockMutex;
         std::vector<Block> blocks{Block{}};
 
         /**
diff --git a/app/src/main/cpp/skyline/common/spin_lock.cpp b/app/src/main/cpp/skyline/common/spin_lock.cpp
new file mode 100644
index 00000000..3f18782f
--- /dev/null
+++ b/app/src/main/cpp/skyline/common/spin_lock.cpp
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: MPL-2.0
+// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
+
+#include <chrono>
+#include <thread>
+#include "spin_lock.h"
+
+namespace skyline {
+    static constexpr size_t LockAttemptsPerYield{256};
+    static constexpr size_t LockAttemptsPerSleep{1024};
+    static constexpr size_t SleepDurationUs{1000};
+
+    void __attribute__ ((noinline)) SpinLock::LockSlow() {
+        // We need to start with attempt = 1, otherwise
+        // attempt % LockAttemptsPerSleep is zero for the first iteration
+        for (size_t attempt{1}; ; attempt++) {
+            if (!locked.test_and_set(std::memory_order_acquire))
+                return;
+
+            // Fall back to yielding, and eventually sleeping, the longer the lock stays contended
+            if (attempt % LockAttemptsPerSleep == 0)
+                std::this_thread::sleep_for(std::chrono::microseconds(SleepDurationUs));
+            else if (attempt % LockAttemptsPerYield == 0)
+                std::this_thread::yield();
+        }
+    }
+
+}
diff --git a/app/src/main/cpp/skyline/common/spin_lock.h b/app/src/main/cpp/skyline/common/spin_lock.h
new file mode 100644
index 00000000..11823786
--- /dev/null
+++ b/app/src/main/cpp/skyline/common/spin_lock.h
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: MPL-2.0
+// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
+
+#pragma once
+
+#include <atomic>
+#include <thread>
+#include "base.h"
+
+namespace skyline {
+    /**
+     * @brief A simple spin lock implementation with a yield and sleep fallback
+     * @note This should *ONLY* be used in situations where it is provably better than an std::mutex due to spinlocks having worse performance under heavy contention
+     */
+    class SpinLock {
+      private:
+        std::atomic_flag locked{};
+
+        void LockSlow();
+
+      public:
+        void lock() {
+            if (!locked.test_and_set(std::memory_order_acquire)) [[likely]]
+                return;
+
+            LockSlow();
+        }
+
+        bool try_lock() {
+            return !locked.test_and_set(std::memory_order_acquire);
+        }
+
+        void unlock() {
+            locked.clear(std::memory_order_release);
+        }
+    };
+
+    /**
+     * @brief A recursive lock built on top of `SpinLock`
+     * @note This should *ONLY* be used in situations where it is provably better than an std::mutex due to spinlocks having worse performance under heavy contention
+     */
+    class RecursiveSpinLock {
+      private:
+        SpinLock backingLock;
+        u32 uses{};
+        std::thread::id tid{};
+
+      public:
+        void lock() {
+            if (tid == std::this_thread::get_id()) {
+                uses++;
+            } else {
+                backingLock.lock();
+                tid = std::this_thread::get_id();
+                uses = 1;
+            }
+        }
+
+        bool try_lock() {
+            if (tid == std::this_thread::get_id()) {
+                uses++;
+                return true;
+            } else {
+                if (backingLock.try_lock()) {
+                    tid = std::this_thread::get_id();
+                    uses = 1;
+                    return true;
+                }
+                return false;
+            }
+        }
+
+        void unlock() {
+            if (--uses == 0) {
+                tid = {};
+                backingLock.unlock();
+            }
+        }
+    };
+}
diff --git a/app/src/main/cpp/skyline/gpu/buffer.h b/app/src/main/cpp/skyline/gpu/buffer.h
index 00074463..fc2e8275 100644
--- a/app/src/main/cpp/skyline/gpu/buffer.h
+++ b/app/src/main/cpp/skyline/gpu/buffer.h
@@ -5,6 +5,7 @@
 #include
 #include
+#include <common/spin_lock.h>
 #include
 #include
 #include "megabuffer.h"
@@ -24,7 +25,7 @@ namespace skyline::gpu {
     class Buffer : public std::enable_shared_from_this<Buffer> {
       private:
        GPU &gpu;
-        std::mutex mutex; //!< Synchronizes any mutations to the buffer or its backing
+        SpinLock mutex; //!< Synchronizes any mutations to the buffer or its backing
         std::atomic<ContextTag> tag{}; //!< The tag associated with the last lock call
         memory::Buffer backing;
         std::optional<GuestBuffer> guest;
@@ -46,7 +47,7 @@ namespace skyline::gpu {
             SequencedWrites, //!< Sequenced writes must not modify the backing on the CPU due to it being read directly on the GPU, but non-sequenced writes can freely occur (SynchroniseHost etc)
             AllWrites //!< No CPU writes to the backing can be performed, all must be sequenced on the GPU or delayed till this is no longer the case
         } backingImmutability{}; //!< Describes how the buffer backing should be accessed by the current context
-        std::recursive_mutex stateMutex; //!< Synchronizes access to the dirty state and backing immutability
+        RecursiveSpinLock stateMutex; //!< Synchronizes access to the dirty state and backing immutability
         bool everHadInlineUpdate{}; //!< Whether the buffer has ever had an inline update since it was created, if this is set then megabuffering will be attempted by views to avoid the cost of inline GPU updates
diff --git a/app/src/main/cpp/skyline/gpu/buffer_manager.h b/app/src/main/cpp/skyline/gpu/buffer_manager.h
index f32b89fc..9eb7206a 100644
--- a/app/src/main/cpp/skyline/gpu/buffer_manager.h
+++ b/app/src/main/cpp/skyline/gpu/buffer_manager.h
@@ -5,6 +5,7 @@
 #include
 #include
+#include <common/spin_lock.h>
 #include "buffer.h"
 
 namespace skyline::gpu {
@@ -28,7 +29,7 @@ namespace skyline::gpu {
         struct LockedBuffer {
             std::shared_ptr<Buffer> buffer;
             ContextLock<Buffer> lock;
-            std::unique_lock<std::recursive_mutex> stateLock;
+            std::unique_lock<RecursiveSpinLock> stateLock;
 
             LockedBuffer(std::shared_ptr<Buffer> pBuffer, ContextTag tag);
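
Reviewer note, not part of the patch: both new classes expose lock()/try_lock()/unlock(), i.e. the standard Lockable interface, so they slot in behind std::scoped_lock and std::unique_lock exactly like the std::mutex and std::recursive_mutex they replace; only the member's type changes at each call site, as in the buffer_manager.h hunk above. A minimal usage sketch along those lines follows; the Example type and its methods are illustrative names only and do not exist in the tree, and the include path simply mirrors the one added by this patch.

    // Illustrative sketch only, assuming the spin_lock.h added by this patch
    #include <mutex>
    #include <common/spin_lock.h>

    namespace skyline::example {
        struct Example {
            SpinLock mutex;               //!< Hot-path lock that is rarely contended
            RecursiveSpinLock stateMutex; //!< May be re-acquired by the thread already holding it

            void ReadState() {
                // Re-entrant acquisition: if Update() already holds stateMutex on this
                // thread, only the recursion counter is bumped, no spinning occurs
                std::unique_lock stateLock{stateMutex};
                // ... read state ...
            }

            void Update() {
                std::scoped_lock guard{mutex};          // Uncontended case: a single inlined test_and_set
                std::unique_lock stateLock{stateMutex}; // First acquisition takes the backing SpinLock
                ReadState();                            // Recursing on stateMutex is safe here
                // ... mutate state ...
            }
        };
    }

The uncontended fast path of SpinLock::lock() is a single test_and_set that the compiler can inline at every call site, while the yield/sleep falloff stays out of line in LockSlow(); that split is where the near-zero uncontended cost claimed in the commit message comes from.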