From f361fddd3e4f4d3c92e92cc47240ddf00c098d06 Mon Sep 17 00:00:00 2001
From: Mr-Wiseguy <mrwiseguyromhacking@gmail.com>
Date: Mon, 23 Oct 2023 15:32:30 -0400
Subject: [PATCH] More WIP linux work, upgraded libultra to include changes
 from BT recomp

---
 include/recomp.h              |  22 +-
 include/rsp.h                 |  13 +-
 include/rt64_layer.h          |  18 +-
 portultra/audio.cpp           |   4 +-
 portultra/events.cpp          |  34 +++-
 portultra/mesgqueue.cpp       | 296 ++++++++++++++++-----------
 portultra/multilibultra.hpp   |  63 ++++--
 portultra/scheduler.cpp       | 273 ++++++++++++++++++-------
 portultra/threads.cpp         |  64 +++---
 portultra/timer.cpp           |   3 +-
 portultra/ultra64.h           |  12 +-
 portultra/ultrainit.cpp       |   2 +-
 src/dp.cpp                    |  43 +++-
 src/eep.cpp                   |  38 +++-
 src/main/main.cpp             | 366 ++++++++++++++++++++++++++++++++++
 src/overlays.cpp              |  40 +++-
 src/pi.cpp                    |   2 +-
 src/portultra_translation.cpp |   6 +-
 src/recomp.cpp                | 114 ++++++++---
 src/rt64_layer.cpp            |  12 +-
 src/vi.cpp                    |   9 +
 21 files changed, 1108 insertions(+), 326 deletions(-)
 create mode 100644 src/main/main.cpp

diff --git a/include/recomp.h b/include/recomp.h
index b33645d..bb40c0a 100644
--- a/include/recomp.h
+++ b/include/recomp.h
@@ -46,8 +46,8 @@ typedef uint64_t gpr;
     //(*(uint8_t*)(rdram + ((((reg) + (offset)) ^ 3) & 0x3FFFFFF)))
 
 #define SD(val, offset, reg) { \
-    *(uint32_t*)(rdram + ((((reg) + (offset) + 4)) - 0xFFFFFFFF80000000)) = (uint32_t)((val) >> 0); \
-    *(uint32_t*)(rdram + ((((reg) + (offset) + 0)) - 0xFFFFFFFF80000000)) = (uint32_t)((val) >> 32); \
+    *(uint32_t*)(rdram + ((((reg) + (offset) + 4)) - 0xFFFFFFFF80000000)) = (uint32_t)((gpr)(val) >> 0); \
+    *(uint32_t*)(rdram + ((((reg) + (offset) + 0)) - 0xFFFFFFFF80000000)) = (uint32_t)((gpr)(val) >> 32); \
 }
 
 //#define SD(val, offset, reg) { \
@@ -239,15 +239,26 @@ typedef struct {
         r8,  r9,  r10, r11, r12, r13, r14, r15,
         r16, r17, r18, r19, r20, r21, r22, r23,
         r24, r25, r26, r27, r28, r29, r30, r31;
-    fpr f0,  f2,  f4,  f6,  f8,  f10, f12, f14,
-        f16, f18, f20, f22, f24, f26, f28, f30;
+    fpr f0,  f1,  f2,  f3,  f4,  f5,  f6,  f7,
+        f8,  f9,  f10, f11, f12, f13, f14, f15,
+        f16, f17, f18, f19, f20, f21, f22, f23,
+        f24, f25, f26, f27, f28, f29, f30, f31;
     uint64_t hi, lo;
+    uint32_t* f_odd;
+    uint32_t status_reg;
+    uint8_t mips3_float_mode;
 } recomp_context;
 
+// Checks if the target is an even float register or that mips3 float mode is enabled
+#define CHECK_FR(ctx, idx) \
+    assert(((idx) & 1) == 0 || (ctx)->mips3_float_mode)
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+void cop0_status_write(recomp_context* ctx, gpr value);
+gpr cop0_status_read(recomp_context* ctx);
 void switch_error(const char* func, uint32_t vram, uint32_t jtbl);
 void do_break(uint32_t vram);
 
@@ -272,6 +283,9 @@ extern int32_t section_addresses[];
 #define RELOC_LO16(section_index, offset) \
     LO16(section_addresses[section_index] + (offset))
 
+// For Banjo-Tooie
+void recomp_syscall_handler(uint8_t* rdram, recomp_context* ctx, int32_t instruction_vram);
+
 // For the Mario Party games (not working)
 //// This has to be in this file so it can be inlined
 //struct jmp_buf_storage {
diff --git a/include/rsp.h b/include/rsp.h
index a5e53fb..394920f 100644
--- a/include/rsp.h
+++ b/include/rsp.h
@@ -8,7 +8,8 @@ enum class RspExitReason {
     Invalid,
     Broke,
     ImemOverrun,
-    UnhandledJumpTarget
+    UnhandledJumpTarget,
+    Unsupported
 };
 
 extern uint8_t dmem[];
@@ -16,19 +17,19 @@ extern uint16_t rspReciprocals[512];
 extern uint16_t rspInverseSquareRoots[512];
 
 #define RSP_MEM_W(offset, addr) \
-    (*reinterpret_cast<uint32_t*>(dmem + (offset) + (addr)))
+    (*reinterpret_cast<uint32_t*>(dmem + (0xFFF & ((offset) + (addr)))))
 
 #define RSP_MEM_H(offset, addr) \
-    (*reinterpret_cast<int16_t*>(dmem + (((offset) + (addr)) ^ 2)))
+    (*reinterpret_cast<int16_t*>(dmem + (0xFFF & (((offset) + (addr)) ^ 2))))
 
 #define RSP_MEM_HU(offset, addr) \
-    (*reinterpret_cast<uint16_t*>(dmem + (((offset) + (addr)) ^ 2)))
+    (*reinterpret_cast<uint16_t*>(dmem + (0xFFF & (((offset) + (addr)) ^ 2))))
 
 #define RSP_MEM_B(offset, addr) \
-    (*reinterpret_cast<int8_t*>(dmem + (((offset) + (addr)) ^ 3)))
+    (*reinterpret_cast<int8_t*>(dmem + (0xFFF & (((offset) + (addr)) ^ 3))))
 
 #define RSP_MEM_BU(offset, addr) \
-    (*reinterpret_cast<uint8_t*>(dmem + (((offset) + (addr)) ^ 3)))
+    (*reinterpret_cast<uint8_t*>(dmem + (0xFFF & (((offset) + (addr)) ^ 3))))
     
 #define RSP_ADD32(a, b) \
     ((int32_t)((a) + (b)))
diff --git a/include/rt64_layer.h b/include/rt64_layer.h
index c334c1c..91a0e64 100644
--- a/include/rt64_layer.h
+++ b/include/rt64_layer.h
@@ -1,11 +1,13 @@
 #ifndef __RT64_LAYER_H__
 #define __RT64_LAYER_H__
 
-typedef struct {
-    void* hWnd;
-    void* hStatusBar;
+#include "../portultra/multilibultra.hpp"
 
-    int Reserved;
+typedef struct {
+    // void* hWnd;
+    // void* hStatusBar;
+
+    // int Reserved;
 
     unsigned char* HEADER;  /* This is the rom header (first 40h bytes of the rom) */
     unsigned char* RDRAM;
@@ -40,9 +42,9 @@ typedef struct {
 
     void (*CheckInterrupts)(void);
 
-    // unsigned int version;
-    // unsigned int* SP_STATUS_REG;
-    // const unsigned int* RDRAM_SIZE;
+    unsigned int version;
+    unsigned int* SP_STATUS_REG;
+    const unsigned int* RDRAM_SIZE;
 } GFX_INFO;
 
 #ifdef _WIN32
@@ -61,7 +63,7 @@ typedef struct {
 //DLLEXPORT void (CALL *UpdateScreen)(void) = nullptr;
 //DLLEXPORT void (CALL *PumpEvents)(void) = nullptr;
 
-DLLIMPORT int InitiateGFX(GFX_INFO Gfx_Info);
+extern "C" int InitiateGFXLinux(GFX_INFO Gfx_Info, Window window, Display *display);
 DLLIMPORT void ProcessRDPList(void);
 DLLIMPORT void ProcessDList(void);
 DLLIMPORT void UpdateScreen(void);
diff --git a/portultra/audio.cpp b/portultra/audio.cpp
index 3dca8ad..dd51c04 100644
--- a/portultra/audio.cpp
+++ b/portultra/audio.cpp
@@ -64,8 +64,8 @@ float buffer_offset_frames = 0.5f;
 uint32_t Multilibultra::get_remaining_audio_bytes() {
 	// Get the number of remaining buffered audio bytes.
 	uint32_t buffered_byte_count;
-	if (audio_callbacks.get_samples_remaining()) {
-		buffered_byte_count = audio_callbacks.get_samples_remaining() * sizeof(int16_t);
+	if (audio_callbacks.get_frames_remaining != nullptr) {
+		buffered_byte_count = audio_callbacks.get_frames_remaining() * 2 * sizeof(int16_t);
 	}
 	else {
 		buffered_byte_count = 100;
diff --git a/portultra/events.cpp b/portultra/events.cpp
index eb8686f..f58a5e5 100644
--- a/portultra/events.cpp
+++ b/portultra/events.cpp
@@ -93,6 +93,8 @@ extern "C" void osViSetEvent(RDRAM_ARG PTR(OSMesgQueue) mq_, OSMesg msg, u32 ret
     events_context.vi.retrace_count = retrace_count;
 }
 
+uint64_t total_vis = 0;
+
 void vi_thread_func() {
     Multilibultra::set_native_thread_name("VI Thread");
     // This thread should be prioritized over every other thread in the application, as it's what allows
@@ -100,7 +102,6 @@ void vi_thread_func() {
     Multilibultra::set_native_thread_priority(Multilibultra::ThreadPriority::Critical);
     using namespace std::chrono_literals;
     
-    uint64_t total_vis = 0;
     int remaining_retraces = events_context.vi.retrace_count;
 
     while (true) {
@@ -157,7 +158,7 @@ void dp_complete() {
     osSendMesg(PASS_RDRAM events_context.dp.mq, events_context.dp.msg, OS_MESG_NOBLOCK);
 }
 
-void RT64Init(uint8_t* rom, uint8_t* rdram, void* window_handle);
+void RT64Init(uint8_t* rom, uint8_t* rdram, Multilibultra::WindowHandle window_handle);
 void RT64SendDL(uint8_t* rdram, const OSTask* task);
 void RT64UpdateScreen(uint32_t vi_origin);
 void RT64ChangeWindow();
@@ -235,12 +236,29 @@ void task_thread_func(uint8_t* rdram, uint8_t* rom, std::atomic_flag* thread_rea
     }
 }
 
-void gfx_thread_func(uint8_t* rdram, uint8_t* rom, std::atomic_flag* thread_ready, void* window_handle) {
+static Multilibultra::gfx_callbacks_t gfx_callbacks;
+
+void Multilibultra::set_gfx_callbacks(const gfx_callbacks_t* callbacks) {
+    if (callbacks != nullptr) {
+        gfx_callbacks = *callbacks;
+    }
+}
+
+void gfx_thread_func(uint8_t* rdram, uint8_t* rom, std::atomic_flag* thread_ready, Multilibultra::WindowHandle window_handle) {
     using namespace std::chrono_literals;
+    Multilibultra::gfx_callbacks_t::gfx_data_t gfx_data{};
 
     Multilibultra::set_native_thread_name("Gfx Thread");
     Multilibultra::set_native_thread_priority(Multilibultra::ThreadPriority::Normal);
 
+    if (gfx_callbacks.create_gfx != nullptr) {
+        gfx_data = gfx_callbacks.create_gfx();
+    }
+
+    if (gfx_callbacks.create_window != nullptr) {
+        window_handle = gfx_callbacks.create_window(gfx_data);
+    }
+
     RT64Init(rom, rdram, window_handle);
     
     rsp_constants_init();
@@ -263,15 +281,13 @@ void gfx_thread_func(uint8_t* rdram, uint8_t* rom, std::atomic_flag* thread_read
                 RT64SendDL(rdram, &task_action->task);
                 dp_complete();
             } else if (const auto* swap_action = std::get_if<SwapBuffersAction>(&action)) {
-                static volatile int i = 0;
-                if (i >= 100) {
-                    i = 0;
-                }
-                i++;
                 events_context.vi.current_buffer = events_context.vi.next_buffer;
                 RT64UpdateScreen(swap_action->origin);
             }
         }
+        if (gfx_callbacks.update_gfx != nullptr) {
+            gfx_callbacks.update_gfx(nullptr);
+        }
     }
 }
 
@@ -428,7 +444,7 @@ void Multilibultra::send_si_message() {
     osSendMesg(PASS_RDRAM events_context.si.mq, events_context.si.msg, OS_MESG_NOBLOCK);
 }
 
-void Multilibultra::init_events(uint8_t* rdram, uint8_t* rom, void* window_handle) {
+void Multilibultra::init_events(uint8_t* rdram, uint8_t* rom, Multilibultra::WindowHandle window_handle) {
     std::atomic_flag gfx_thread_ready;
     std::atomic_flag task_thread_ready;
     events_context.rdram = rdram;
diff --git a/portultra/mesgqueue.cpp b/portultra/mesgqueue.cpp
index fd7a0bb..b979c65 100644
--- a/portultra/mesgqueue.cpp
+++ b/portultra/mesgqueue.cpp
@@ -5,6 +5,58 @@
 #include "multilibultra.hpp"
 #include "recomp.h"
 
+#if defined(_M_X64)
+static inline void spinlock_pause() {
+    _mm_pause();
+}
+#elif defined(__x86_64__)
+static inline void spinlock_pause() {
+    __builtin_ia32_pause();
+}
+#else
+#error "No spinlock_pause implementation for current architecture"
+#endif
+
+template <typename T>
+class atomic_spinlock {
+    static_assert(sizeof(std::atomic<T>) == sizeof(T), "atomic_spinlock must be used with a type that is the same size as its atomic counterpart");
+    static_assert(std::atomic<T>::is_always_lock_free, "atomic_spinlock must be used with an always lock-free atomic type");
+    std::atomic_ref<T> locked_;
+public:
+    atomic_spinlock(T& flag) : locked_{ flag } {}
+
+    void lock() {
+        // Loop until the lock is acquired.
+        while (true) {
+            // Try to acquire the lock.
+            if (!locked_.exchange(true, std::memory_order_acquire)) {
+                // If it was acquired then exit the loop.
+                break;
+            }
+            // Otherwise, wait until the lock is no longer acquired.
+            // Doing this instead of constantly trying to acquire the lock reduces cache coherency traffic.
+            while (locked_.load(std::memory_order_relaxed)) {
+                // Add a platform-specific pause instruction to reduce load unit traffic.
+                spinlock_pause();
+            }
+        }
+    }
+
+    void unlock() {
+        // Release the lock by setting it to false.
+        locked_.store(false, std::memory_order_release);
+    }
+};
+
+class mesg_queue_lock {
+    OSMesgQueue* queue_;
+    atomic_spinlock<uint8_t> spinlock_;
+public:
+    mesg_queue_lock(OSMesgQueue* mq) : queue_{ mq }, spinlock_{ mq->lock } {}
+    void lock() { spinlock_.lock(); }
+    void unlock() { spinlock_.unlock(); }
+};
+
 extern "C" void osCreateMesgQueue(RDRAM_ARG PTR(OSMesgQueue) mq_, PTR(OSMesg) msg, s32 count) {
     OSMesgQueue *mq = TO_PTR(OSMesgQueue, mq_);
     mq->blocked_on_recv = NULLPTR;
@@ -13,6 +65,7 @@ extern "C" void osCreateMesgQueue(RDRAM_ARG PTR(OSMesgQueue) mq_, PTR(OSMesg) ms
     mq->msg = msg;
     mq->validCount = 0;
     mq->first = 0;
+    mq->lock = false;
 }
 
 s32 MQ_GET_COUNT(OSMesgQueue *mq) {
@@ -47,148 +100,159 @@ bool thread_queue_empty(RDRAM_ARG PTR(OSThread)* queue) {
     return *queue == NULLPTR;
 }
 
-extern "C" s32 osSendMesg(RDRAM_ARG PTR(OSMesgQueue) mq_, OSMesg msg, s32 flags) {
-    OSMesgQueue *mq = TO_PTR(OSMesgQueue, mq_);
-    
-    // Prevent accidentally blocking anything that isn't a game thread
-    if (!Multilibultra::is_game_thread()) {
-        flags = OS_MESG_NOBLOCK;
-    }
+std::mutex test_mutex{};
 
-    Multilibultra::disable_preemption();
+// Attempts to put a message into a queue.
+// If the queue is not full, returns true and pops a thread from the blocked on receive list. 
+// If the queue is full and this is a blocking send, places the current thread into the blocked on send list
+// for the message queue, marks the current thread as being blocked on a queue and returns false.
+bool mesg_queue_try_insert(RDRAM_ARG OSMesgQueue* mq, OSMesg msg, OSThread*& to_run, bool jam, bool blocking) {
+    //mesg_queue_lock lock{ mq };
+    std::lock_guard guard{ test_mutex };
 
-    if (flags == OS_MESG_NOBLOCK) {
-        // If non-blocking, fail if the queue is full
-        if (MQ_IS_FULL(mq)) {
-            Multilibultra::enable_preemption();
-            return -1;
-        }
-    } else {
-        // Otherwise, yield this thread until the queue has room
-        while (MQ_IS_FULL(mq)) {
-            debug_printf("[Message Queue] Thread %d is blocked on send\n", TO_PTR(OSThread, Multilibultra::this_thread())->id);
+    // If the queue is full, insert this thread into the blocked on send queue and return false.
+    if (MQ_IS_FULL(mq)) {
+        if (blocking) {
             thread_queue_insert(PASS_RDRAM &mq->blocked_on_send, Multilibultra::this_thread());
-            Multilibultra::enable_preemption();
-            Multilibultra::pause_self(PASS_RDRAM1);
-            Multilibultra::disable_preemption();
+            // TODO is it safe to use the schedule queue here while in the message queue lock?
+            Multilibultra::block_self(PASS_RDRAM1);
         }
+        to_run = nullptr;
+        return false;
     }
-    
-    s32 last = (mq->first + mq->validCount) % mq->msgCount;
-    TO_PTR(OSMesg, mq->msg)[last] = msg;
-    mq->validCount++;
-    
-    OSThread* to_run = nullptr;
 
+    // The queue wasn't full, so place the message into it.
+    if (jam) {
+        // Insert this message at the start of the queue.
+        mq->first = (mq->first + mq->msgCount - 1) % mq->msgCount;
+        TO_PTR(OSMesg, mq->msg)[mq->first] = msg;
+        mq->validCount++;
+    }
+    else {
+        // Insert this message at the end of the queue.
+        s32 last = (mq->first + mq->validCount) % mq->msgCount;
+        TO_PTR(OSMesg, mq->msg)[last] = msg;
+        mq->validCount++;
+    }
+
+    // Pop a thread from the blocked on recv queue to wake afterwards.
     if (!thread_queue_empty(PASS_RDRAM &mq->blocked_on_recv)) {
         to_run = thread_queue_pop(PASS_RDRAM &mq->blocked_on_recv);
     }
-    
-    Multilibultra::enable_preemption();
-    if (to_run) {
-        debug_printf("[Message Queue] Thread %d is unblocked\n", to_run->id);
-        if (Multilibultra::is_game_thread()) {
-            OSThread* self = TO_PTR(OSThread, Multilibultra::this_thread());
-            if (to_run->priority > self->priority) {
-                Multilibultra::swap_to_thread(PASS_RDRAM to_run);
-            } else {
-                Multilibultra::schedule_running_thread(to_run);
-            }
-        } else {
-            Multilibultra::schedule_running_thread(to_run);
-        }
-    }
-    return 0;
+
+    return true;
 }
 
-extern "C" s32 osJamMesg(RDRAM_ARG PTR(OSMesgQueue) mq_, OSMesg msg, s32 flags) {
-    OSMesgQueue *mq = TO_PTR(OSMesgQueue, mq_);
-    Multilibultra::disable_preemption();
+// Attempts to remove a message from a queue.
+// If the queue is not empty, returns true and pops a thread from the blocked on send list. 
+// If the queue is empty and this is a blocking receive, places the current thread into the blocked on receive list
+// for the message queue, marks the current thread as being blocked on a queue and returns false.
+bool mesg_queue_try_remove(RDRAM_ARG OSMesgQueue* mq, PTR(OSMesg) msg_out, OSThread*& to_run, bool blocking) {
+    //mesg_queue_lock lock{ mq };
+    std::lock_guard guard{ test_mutex };
 
-    if (flags == OS_MESG_NOBLOCK) {
-        // If non-blocking, fail if the queue is full
-        if (MQ_IS_FULL(mq)) {
-            Multilibultra::enable_preemption();
-            return -1;
-        }
-    } else {
-        // Otherwise, yield this thread in a loop until the queue is no longer full
-        while (MQ_IS_FULL(mq)) {
-            debug_printf("[Message Queue] Thread %d is blocked on jam\n", TO_PTR(OSThread, Multilibultra::this_thread())->id);
-            thread_queue_insert(PASS_RDRAM &mq->blocked_on_send, Multilibultra::this_thread());
-            Multilibultra::enable_preemption();
-            Multilibultra::pause_self(PASS_RDRAM1);
-            Multilibultra::disable_preemption();
-        }
-    }
-    
-    mq->first = (mq->first + mq->msgCount - 1) % mq->msgCount;
-    TO_PTR(OSMesg, mq->msg)[mq->first] = msg;
-    mq->validCount++;
-    
-    OSThread *to_run = nullptr;
-
-    if (!thread_queue_empty(PASS_RDRAM &mq->blocked_on_recv)) {
-        to_run = thread_queue_pop(PASS_RDRAM &mq->blocked_on_recv);
-    }
-    
-    Multilibultra::enable_preemption();
-    if (to_run) {
-        debug_printf("[Message Queue] Thread %d is unblocked\n", to_run->id);
-        OSThread *self = TO_PTR(OSThread, Multilibultra::this_thread());
-        if (to_run->priority > self->priority) {
-            Multilibultra::swap_to_thread(PASS_RDRAM to_run);
-        } else {
-            Multilibultra::schedule_running_thread(to_run);
-        }
-    }
-    return 0;
-}
-
-extern "C" s32 osRecvMesg(RDRAM_ARG PTR(OSMesgQueue) mq_, PTR(OSMesg) msg_, s32 flags) {
-    OSMesgQueue *mq = TO_PTR(OSMesgQueue, mq_);
-    OSMesg *msg = TO_PTR(OSMesg, msg_);
-    Multilibultra::disable_preemption();
-
-    if (flags == OS_MESG_NOBLOCK) {
-        // If non-blocking, fail if the queue is empty
-        if (MQ_IS_EMPTY(mq)) {
-            Multilibultra::enable_preemption();
-            return -1;
-        }
-    } else {
-        // Otherwise, yield this thread in a loop until the queue is no longer full
-        while (MQ_IS_EMPTY(mq)) {
-            debug_printf("[Message Queue] Thread %d is blocked on receive\n", TO_PTR(OSThread, Multilibultra::this_thread())->id);
+    // If the queue is empty, insert this thread into the blocked on receive queue and return false.
+    if (MQ_IS_EMPTY(mq)) {
+        if (blocking) {
             thread_queue_insert(PASS_RDRAM &mq->blocked_on_recv, Multilibultra::this_thread());
-            Multilibultra::enable_preemption();
-            Multilibultra::pause_self(PASS_RDRAM1);
-            Multilibultra::disable_preemption();
+            // TODO is it safe to use the schedule queue here while in the message queue lock?
+            Multilibultra::block_self(PASS_RDRAM1);
         }
+        to_run = nullptr;
+        return false;
     }
 
-    if (msg_ != NULLPTR) {
-        *msg = TO_PTR(OSMesg, mq->msg)[mq->first];
+    // The queue wasn't empty, so remove the first message from it.
+    if (msg_out != NULLPTR) {
+        *TO_PTR(OSMesg, msg_out) = TO_PTR(OSMesg, mq->msg)[mq->first];
     }
-    
     mq->first = (mq->first + 1) % mq->msgCount;
     mq->validCount--;
 
-    OSThread *to_run = nullptr;
-
+    // Pop a thread from the blocked on send queue to wake afterwards.
     if (!thread_queue_empty(PASS_RDRAM &mq->blocked_on_send)) {
         to_run = thread_queue_pop(PASS_RDRAM &mq->blocked_on_send);
     }
-    
-    Multilibultra::enable_preemption();
+
+    return true;
+}
+
+enum class MesgQueueActionType {
+    Send,
+    Jam,
+    Receive
+};
+
+s32 mesg_queue_action(RDRAM_ARG PTR(OSMesgQueue) mq_, OSMesg msg, PTR(OSMesg) msg_out, s32 flags, MesgQueueActionType action) {
+    OSMesgQueue* mq = TO_PTR(OSMesgQueue, mq_);
+    OSThread* this_thread = TO_PTR(OSThread, Multilibultra::this_thread());
+    bool is_blocking = flags != OS_MESG_NOBLOCK;
+
+    // Prevent accidentally blocking anything that isn't a game thread
+    if (!Multilibultra::is_game_thread()) {
+        is_blocking = false;
+    }
+
+    OSThread* to_run = nullptr;
+
+    // Repeatedly attempt the queue action (send, jam or receive) until it's successful.
+    while (true) {
+        // Try to insert/remove the message into the queue depending on the action.
+        bool success = false;
+        switch (action) {
+            case MesgQueueActionType::Send:
+                success = mesg_queue_try_insert(PASS_RDRAM mq, msg, to_run, false, is_blocking);
+                break;
+            case MesgQueueActionType::Jam:
+                success = mesg_queue_try_insert(PASS_RDRAM mq, msg, to_run, true, is_blocking);
+                break;
+            case MesgQueueActionType::Receive:
+                success = mesg_queue_try_remove(PASS_RDRAM mq, msg_out, to_run, is_blocking);
+                break;
+        }
+
+        // If successful, don't block.
+        if (success) {
+            //goto after;
+            break;
+        }
+
+        // Otherwise if the action was unsuccessful but wasn't blocking, return -1 to indicate a failure.
+        if (!is_blocking) {
+            return -1;
+        }
+
+        // The action failed, so pause this thread until unblocked by the queue.
+        debug_printf("[Message Queue] Thread %d is blocked on %s\n", this_thread->id, action == MesgQueueActionType::Receive ? "receive" : "send");
+
+        // Wait for this thread to be resumed.
+        Multilibultra::wait_for_resumed(PASS_RDRAM1);
+    }
+    //after:
+
+    // If the action popped a thread that was blocked on this queue, wake it.
     if (to_run) {
         debug_printf("[Message Queue] Thread %d is unblocked\n", to_run->id);
-        OSThread *self = TO_PTR(OSThread, Multilibultra::this_thread());
-        if (to_run->priority > self->priority) {
-            Multilibultra::swap_to_thread(PASS_RDRAM to_run);
-        } else {
-            Multilibultra::schedule_running_thread(to_run);
+        Multilibultra::unblock_thread(to_run);
+
+        // If the unblocked thread is higher priority than this one, pause this thread so it can take over.
+        if (Multilibultra::is_game_thread() && to_run->priority > this_thread->priority) {
+            Multilibultra::yield_self(PASS_RDRAM1);
+            Multilibultra::wait_for_resumed(PASS_RDRAM1);
         }
     }
+
     return 0;
 }
+
+extern "C" s32 osSendMesg(RDRAM_ARG PTR(OSMesgQueue) mq_, OSMesg msg, s32 flags) {
+    return mesg_queue_action(PASS_RDRAM mq_, msg, NULLPTR, flags, MesgQueueActionType::Send);
+}
+
+extern "C" s32 osJamMesg(RDRAM_ARG PTR(OSMesgQueue) mq_, OSMesg msg, s32 flags) {
+    return mesg_queue_action(PASS_RDRAM mq_, msg, NULLPTR, flags, MesgQueueActionType::Jam);
+}
+
+extern "C" s32 osRecvMesg(RDRAM_ARG PTR(OSMesgQueue) mq_, PTR(OSMesg) msg_out_, s32 flags) {
+    return mesg_queue_action(PASS_RDRAM mq_, NULLPTR, msg_out_, flags, MesgQueueActionType::Receive);
+}
diff --git a/portultra/multilibultra.hpp b/portultra/multilibultra.hpp
index 61607f1..58dde89 100644
--- a/portultra/multilibultra.hpp
+++ b/portultra/multilibultra.hpp
@@ -8,31 +8,57 @@
 
 #include "ultra64.h"
 
+#if defined(_WIN32)
+#   include <Windows.h>
+#elif defined(__ANDROID__)
+#   include "android/native_window.h"
+#elif defined(__linux__)
+#   include "X11/Xlib.h"
+#   undef None
+#   undef Status
+#   undef LockMask
+#endif
+
 struct UltraThreadContext {
     std::thread host_thread;
-    std::atomic_bool running;
+    std::atomic_bool scheduled;
+    std::atomic_bool descheduled;
     std::atomic_bool initialized;
 };
 
 namespace Multilibultra {
 
+#if defined(_WIN32)
+    // Native HWND handle to the target window.
+    using WindowHandle = HWND;
+#elif defined(__ANDROID__)
+    using WindowHandle = ANativeWindow*;
+#elif defined(__linux__)
+    struct WindowHandle {
+        Display* display;
+        Window window;
+    };
+#endif
+
 // We need a place in rdram to hold the PI handles, so pick an address in extended rdram
 constexpr int32_t cart_handle = 0x80800000;
 constexpr int32_t flash_handle = (int32_t)(cart_handle + sizeof(OSPiHandle));
 constexpr uint32_t save_size = 1024 * 1024 / 8; // Maximum save size, 1Mbit for flash
 
-void preinit(uint8_t* rdram, uint8_t* rom, void* window_handle);
+void preinit(uint8_t* rdram, uint8_t* rom, WindowHandle window_handle);
 void save_init();
 void init_scheduler();
-void init_events(uint8_t* rdram, uint8_t* rom, void* window_handle);
+void init_events(uint8_t* rdram, uint8_t* rom, WindowHandle window_handle);
 void init_timers(RDRAM_ARG1);
-void set_self_paused(RDRAM_ARG1);
+void yield_self(RDRAM_ARG1);
+void block_self(RDRAM_ARG1);
+void unblock_thread(OSThread* t);
 void wait_for_resumed(RDRAM_ARG1);
 void swap_to_thread(RDRAM_ARG OSThread *to);
-void pause_thread_impl(OSThread *t);
-void resume_thread_impl(OSThread *t);
+void resume_thread_impl(OSThread* t);
 void schedule_running_thread(OSThread *t);
-void pause_self(RDRAM_ARG1);
+void halt_self(RDRAM_ARG1);
+void stop_thread(OSThread *t);
 void cleanup_thread(OSThread *t);
 
 enum class ThreadPriority {
@@ -46,8 +72,6 @@ enum class ThreadPriority {
 void set_native_thread_name(const std::string& name);
 void set_native_thread_priority(ThreadPriority pri);
 PTR(OSThread) this_thread();
-void disable_preemption();
-void enable_preemption();
 void notify_scheduler();
 void reprioritize_thread(OSThread *t, OSPri pri);
 void set_main_thread();
@@ -69,7 +93,7 @@ struct audio_callbacks_t {
     using get_samples_remaining_t = size_t();
     using set_frequency_t = void(uint32_t);
     queue_samples_t* queue_samples;
-    get_samples_remaining_t* get_samples_remaining;
+    get_samples_remaining_t* get_frames_remaining;
     set_frequency_t* set_frequency;
 };
 void set_audio_callbacks(const audio_callbacks_t* callbacks);
@@ -81,19 +105,22 @@ struct input_callbacks_t {
 };
 void set_input_callbacks(const input_callbacks_t* callback);
 
-class preemption_guard {
-public:
-    preemption_guard();
-    ~preemption_guard();
-private:
-    std::lock_guard<std::mutex> lock;
+struct gfx_callbacks_t {
+    using gfx_data_t = void*;
+    using create_gfx_t = gfx_data_t();
+    using create_window_t = WindowHandle(gfx_data_t);
+    using update_gfx_t = void(gfx_data_t);
+    create_gfx_t* create_gfx;
+    create_window_t* create_window;
+    update_gfx_t* update_gfx;
 };
+void set_gfx_callbacks(const gfx_callbacks_t* callbacks);
 
 } // namespace Multilibultra
 
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 
-#define debug_printf(...)
-//#define debug_printf(...) printf(__VA_ARGS__);
+//#define debug_printf(...)
+#define debug_printf(...) printf(__VA_ARGS__);
 
 #endif
diff --git a/portultra/scheduler.cpp b/portultra/scheduler.cpp
index 69e50ef..2000678 100644
--- a/portultra/scheduler.cpp
+++ b/portultra/scheduler.cpp
@@ -3,6 +3,7 @@
 #include <atomic>
 #include <vector>
 #include <variant>
+#include <algorithm>
 
 #include "blockingconcurrentqueue.h"
 #include "multilibultra.hpp"
@@ -24,17 +25,27 @@ public:
             return false;
         }
 
-        if (it == this->c.begin()) {
-            // deque the top element
-            this->pop();
-        } else {
-            // remove element and re-heap
-            this->c.erase(it);
-            std::make_heap(this->c.begin(), this->c.end(), this->comp);
-        }
+        // remove element and re-heap
+        this->c.erase(it);
+        std::make_heap(this->c.begin(), this->c.end(), this->comp);
         
         return true;
     }
+
+    void print() {
+        std::vector<OSThread*> backup = this->c;
+        debug_printf("[Scheduler] Scheduled Threads:\n");
+        while (!empty()) {
+            OSThread* t = top();
+            pop();
+            debug_printf("              %d: pri %d state %d\n", t->id, t->priority, t->state);
+        }
+        this->c = backup;
+    }
+
+    bool contains(OSThread* t) {
+        return std::find(this->c.begin(), this->c.end(), t) != this->c.end();
+    }
 };
 
 struct NotifySchedulerAction {
@@ -58,26 +69,54 @@ struct ReprioritizeThreadAction {
     OSPri pri;
 };
 
-using ThreadAction = std::variant<NotifySchedulerAction, ScheduleThreadAction, StopThreadAction, CleanupThreadAction, ReprioritizeThreadAction>;
+struct YieldedThreadAction {
+    OSThread* t;
+};
+
+struct BlockedThreadAction {
+    OSThread* t;
+};
+
+struct UnblockThreadAction {
+    OSThread* t;
+};
+
+using ThreadAction = std::variant<std::monostate, NotifySchedulerAction, ScheduleThreadAction, StopThreadAction, CleanupThreadAction, ReprioritizeThreadAction, YieldedThreadAction, BlockedThreadAction, UnblockThreadAction>;
 
 static struct {
     moodycamel::BlockingConcurrentQueue<ThreadAction> action_queue{};
     OSThread* running_thread;
-
-    bool can_preempt;
-    std::mutex premption_mutex;
 } scheduler_context{};
 
 void handle_thread_queueing(thread_queue_t& running_thread_queue, const ScheduleThreadAction& action) {
     OSThread* to_schedule = action.t;
     debug_printf("[Scheduler] Scheduling thread %d\n", to_schedule->id);
-    running_thread_queue.push(to_schedule);
+
+    // Do not schedule the thread if it's waiting on a message queue
+    if (to_schedule->state == OSThreadState::BLOCKED_STOPPED) {
+        to_schedule->state = OSThreadState::BLOCKED_PAUSED;
+    }
+    else {
+        to_schedule->state = OSThreadState::PAUSED;
+        running_thread_queue.push(to_schedule);
+    }
 }
 
 void handle_thread_stopping(thread_queue_t& running_thread_queue, const StopThreadAction& action) {
     OSThread* to_stop = action.t;
     debug_printf("[Scheduler] Stopping thread %d\n", to_stop->id);
+
     running_thread_queue.remove(to_stop);
+    if (running_thread_queue.contains(to_stop)) {
+        assert(false);
+    }
+
+    if (to_stop->state == OSThreadState::BLOCKED_PAUSED) {
+        to_stop->state = OSThreadState::BLOCKED_STOPPED;
+    }
+    else {
+        to_stop->state = OSThreadState::STOPPED;
+    }
 }
 
 void handle_thread_cleanup(thread_queue_t& running_thread_queue, OSThread*& cur_running_thread, const CleanupThreadAction& action) {
@@ -111,17 +150,76 @@ void handle_thread_reprioritization(thread_queue_t& running_thread_queue, const
     running_thread_queue.push(to_reprioritize);
 }
 
+void handle_thread_yielded(thread_queue_t& running_thread_queue, const YieldedThreadAction& action) {
+    OSThread* yielded = action.t;
+    
+    debug_printf("[Scheduler] Thread %d has yielded\n", yielded->id);
+    // Remove the yielded thread from the thread queue. If it was in the queue then re-add it so that it's placed after any other threads with the same priority.
+    if (running_thread_queue.remove(yielded)) {
+        running_thread_queue.push(yielded);
+    }
+    yielded->state = OSThreadState::PAUSED; // NOTE(review): also overwrites STOPPED when a stop for this thread was handled just before (halt_self queues stop, then yield) - confirm intended
+    debug_printf("[Scheduler] Set thread %d to PAUSED\n", yielded->id);
+}
+
+void handle_thread_blocked(thread_queue_t& running_thread_queue, const BlockedThreadAction& action) {
+    OSThread* blocked = action.t;
+
+    debug_printf("[Scheduler] Thread %d has been blocked\n", blocked->id);
+    // Remove the thread from the running queue (once is enough; a second removal would find nothing).
+    running_thread_queue.remove(blocked);
+
+    // Update the thread's state accordingly: STOPPED -> BLOCKED_STOPPED, RUNNING -> BLOCKED_PAUSED.
+    if (blocked->state == OSThreadState::STOPPED) {
+        blocked->state = OSThreadState::BLOCKED_STOPPED;
+    }
+    else if (blocked->state == OSThreadState::RUNNING) {
+        blocked->state = OSThreadState::BLOCKED_PAUSED;
+    }
+    else {
+        // Any other state is treated as a logic error.
+        assert(false);
+    }
+}
+
+void handle_thread_unblocking(thread_queue_t& running_thread_queue, const UnblockThreadAction& action) {
+    OSThread* unblocked = action.t;
+
+    // Do nothing if this thread has already been unblocked (i.e. it is in neither BLOCKED_* state).
+    if (unblocked->state != OSThreadState::BLOCKED_STOPPED && unblocked->state != OSThreadState::BLOCKED_PAUSED) {
+        return;
+    }
+
+    debug_printf("[Scheduler] Thread %d has been unblocked\n", unblocked->id);
+    // Update the thread's state accordingly. A thread that was also stopped stays out of the running queue.
+    if (unblocked->state == OSThreadState::BLOCKED_STOPPED) {
+        unblocked->state = OSThreadState::STOPPED;
+    }
+    else if (unblocked->state == OSThreadState::BLOCKED_PAUSED) {
+        // The thread wasn't stopped, so put it back in the running queue now that it's been unblocked.
+        unblocked->state = OSThreadState::PAUSED;
+        running_thread_queue.push(unblocked);
+    }
+    else {
+        assert(false); // Not reachable: the early return above already filtered out every other state.
+    }
+}
+
 void swap_running_thread(thread_queue_t& running_thread_queue, OSThread*& cur_running_thread) {
     if (running_thread_queue.size() > 0) {
         OSThread* new_running_thread = running_thread_queue.top();
-        if (cur_running_thread != new_running_thread) {
+        // If the running thread has changed or the running thread is paused, run the running thread
+        if (cur_running_thread != new_running_thread || (cur_running_thread && cur_running_thread->state != OSThreadState::RUNNING)) {
             if (cur_running_thread && cur_running_thread->state == OSThreadState::RUNNING) {
                 debug_printf("[Scheduler] Need to wait for thread %d to pause itself\n", cur_running_thread->id);
                 return;
-            } else {
-                debug_printf("[Scheduler] Switching execution to thread %d (%d)\n", new_running_thread->id, new_running_thread->priority);
             }
+            debug_printf("[Scheduler] Switching execution to thread %d (%d)\n", new_running_thread->id, new_running_thread->priority);
             Multilibultra::resume_thread_impl(new_running_thread);
+            if (cur_running_thread) {
+                cur_running_thread->context->descheduled.store(true);
+                cur_running_thread->context->descheduled.notify_all();
+            }
             cur_running_thread = new_running_thread;
         } else if (cur_running_thread && cur_running_thread->state != OSThreadState::RUNNING) {
             Multilibultra::resume_thread_impl(cur_running_thread);
@@ -139,28 +237,41 @@ void scheduler_func() {
     Multilibultra::set_native_thread_priority(Multilibultra::ThreadPriority::VeryHigh);
 
     while (true) {
-        ThreadAction action;
+        using namespace std::chrono_literals;
+        ThreadAction action{};
         OSThread* old_running_thread = cur_running_thread;
+        //scheduler_context.action_queue.wait_dequeue_timed(action, 1ms);
         scheduler_context.action_queue.wait_dequeue(action);
 
-        std::lock_guard lock{scheduler_context.premption_mutex};
+        if (std::get_if<std::monostate>(&action) == nullptr) {
+            // Determine the action type and act on it
+            if (const auto* notify_action = std::get_if<NotifySchedulerAction>(&action)) {
+                // Nothing to do
+            }
+            else if (const auto* stop_action = std::get_if<StopThreadAction>(&action)) {
+                handle_thread_stopping(running_thread_queue, *stop_action);
+            }
+            else if (const auto* cleanup_action = std::get_if<CleanupThreadAction>(&action)) {
+                handle_thread_cleanup(running_thread_queue, cur_running_thread, *cleanup_action);
+            }
+            else if (const auto* schedule_action = std::get_if<ScheduleThreadAction>(&action)) {
+                handle_thread_queueing(running_thread_queue, *schedule_action);
+            }
+            else if (const auto* reprioritize_action = std::get_if<ReprioritizeThreadAction>(&action)) {
+                handle_thread_reprioritization(running_thread_queue, *reprioritize_action);
+            }
+            else if (const auto* yielded_action = std::get_if<YieldedThreadAction>(&action)) {
+                handle_thread_yielded(running_thread_queue, *yielded_action);
+            }
+            else if (const auto* blocked_action = std::get_if<BlockedThreadAction>(&action)) {
+                handle_thread_blocked(running_thread_queue, *blocked_action);
+            }
+            else if (const auto* unblock_action = std::get_if<UnblockThreadAction>(&action)) {
+                handle_thread_unblocking(running_thread_queue, *unblock_action);
+            }
+        }
 
-        // Determine the action type and act on it
-        if (const auto* cleanup_action = std::get_if<NotifySchedulerAction>(&action)) {
-            // Nothing to do
-        }
-        else if (const auto* stop_action = std::get_if<StopThreadAction>(&action)) {
-            handle_thread_stopping(running_thread_queue, *stop_action);
-        }
-        else if (const auto* cleanup_action = std::get_if<CleanupThreadAction>(&action)) {
-            handle_thread_cleanup(running_thread_queue, cur_running_thread, *cleanup_action);
-        }
-        else if (const auto* schedule_action = std::get_if<ScheduleThreadAction>(&action)) {
-            handle_thread_queueing(running_thread_queue, *schedule_action);
-        }
-        else if (const auto* reprioritize_action = std::get_if<ReprioritizeThreadAction>(&action)) {
-            handle_thread_reprioritization(running_thread_queue, *reprioritize_action);
-        }
+        running_thread_queue.print();
 
         // Determine which thread to run, stopping the current running thread if necessary
         swap_running_thread(running_thread_queue, cur_running_thread);
@@ -180,78 +291,90 @@ extern "C" void do_yield() {
 namespace Multilibultra {
 
 void init_scheduler() {
-    scheduler_context.can_preempt = true;
     std::thread scheduler_thread{scheduler_func};
     scheduler_thread.detach();
 }
 
 void schedule_running_thread(OSThread *t) {
-    debug_printf("[Scheduler] Queuing Thread %d to be scheduled\n", t->id);
+    debug_printf("[Thread] Queuing Thread %d to be scheduled\n", t->id);
     scheduler_context.action_queue.enqueue(ScheduleThreadAction{t});
 }
 
 void swap_to_thread(RDRAM_ARG OSThread *to) {
     OSThread *self = TO_PTR(OSThread, Multilibultra::this_thread());
-    debug_printf("[Scheduler] Scheduling swap from thread %d to %d\n", self->id, to->id);
+    debug_printf("[Thread] Scheduling swap from thread %d to %d\n", self->id, to->id);
     
-    Multilibultra::set_self_paused(PASS_RDRAM1);
-    scheduler_context.action_queue.enqueue(ScheduleThreadAction{to});
-    Multilibultra::wait_for_resumed(PASS_RDRAM1);
+    // Tell the scheduler that the swapped-to thread is ready to run and that this thread is yielding.
+    schedule_running_thread(to);
+    yield_self(PASS_RDRAM1);
+
+    // Wait for the scheduler to resume this thread.
+    wait_for_resumed(PASS_RDRAM1);
 }
 
 void reprioritize_thread(OSThread *t, OSPri pri) {
-    debug_printf("[Scheduler] Adjusting Thread %d priority to %d\n", t->id, pri);
+    debug_printf("[Thread] Adjusting Thread %d priority to %d\n", t->id, pri);
 
     scheduler_context.action_queue.enqueue(ReprioritizeThreadAction{t, pri});
 }
 
-void pause_self(RDRAM_ARG1) {
-    OSThread *self = TO_PTR(OSThread, Multilibultra::this_thread());
-    debug_printf("[Scheduler] Thread %d pausing itself\n", self->id);
+void stop_thread(OSThread *t) {
+    debug_printf("[Thread] Queueing stopping of thread %d\n", t->id);
 
-    Multilibultra::set_self_paused(PASS_RDRAM1);
-    scheduler_context.action_queue.enqueue(StopThreadAction{self});
-    Multilibultra::wait_for_resumed(PASS_RDRAM1);
+    scheduler_context.action_queue.enqueue(StopThreadAction{t});
+}
+
+void yield_self(RDRAM_ARG1) {
+    OSThread* self = TO_PTR(OSThread, Multilibultra::this_thread());
+    debug_printf("[Thread] Thread %d yielding itself\n", self->id);
+    // Only queues the notification for the scheduler; this call itself does not wait (callers pair it with wait_for_resumed()).
+    scheduler_context.action_queue.enqueue(YieldedThreadAction{ self });
+}
+
+void block_self(RDRAM_ARG1) {
+    OSThread* self = TO_PTR(OSThread, Multilibultra::this_thread());
+    debug_printf("[Thread] Thread %d has been blocked\n", self->id);
+
+    // Only queues the notification; the scheduler applies the state change in handle_thread_blocked().
+    scheduler_context.action_queue.enqueue(BlockedThreadAction{ self });
+}
+
+void unblock_thread(OSThread *t) {
+    debug_printf("[Thread] Unblocking thread %d\n", t->id);
+    // Only queues the request; the scheduler applies the state change in handle_thread_unblocking().
+    scheduler_context.action_queue.enqueue(UnblockThreadAction{ t });
+}
+
+void halt_self(RDRAM_ARG1) {
+    OSThread* self = TO_PTR(OSThread, Multilibultra::this_thread());
+    debug_printf("[Thread] Thread %d pausing itself\n", self->id);
+
+    stop_thread(self);
+    yield_self(PASS_RDRAM1);
+    wait_for_resumed(PASS_RDRAM1);
 }
 
 void cleanup_thread(OSThread *t) {
     scheduler_context.action_queue.enqueue(CleanupThreadAction{t});
 }
 
-void disable_preemption() {
-    scheduler_context.premption_mutex.lock();
-    if (Multilibultra::is_game_thread()) {
-        scheduler_context.can_preempt = false;
-    }
-}
-
-void enable_preemption() {
-    if (Multilibultra::is_game_thread()) {
-        scheduler_context.can_preempt = true;
-    }
-#pragma warning(push)
-#pragma warning( disable : 26110)
-    scheduler_context.premption_mutex.unlock();
-#pragma warning( pop ) 
-}
-
-// lock's constructor is called first, so can_preempt is set after locking
-preemption_guard::preemption_guard() : lock{scheduler_context.premption_mutex} {
-    scheduler_context.can_preempt = false;
-}
-
-// lock's destructor is called last, so can_preempt is set before unlocking
-preemption_guard::~preemption_guard() {
-    scheduler_context.can_preempt = true;
-}
-
 void notify_scheduler() {
     scheduler_context.action_queue.enqueue(NotifySchedulerAction{});
 }
 
+void resume_thread_impl(OSThread* t) {
+    if (t->state == OSThreadState::PREEMPTED) {
+        // Nothing to do here
+    }
+    t->state = OSThreadState::RUNNING;
+    debug_printf("[Scheduler] Set thread %d to RUNNING\n", t->id);
+    t->context->scheduled.store(true);
+    t->context->scheduled.notify_all();
+}
+
 }
 
 extern "C" void pause_self(uint8_t* rdram) {
-    Multilibultra::pause_self(rdram);
+    Multilibultra::halt_self(rdram);
 }
 
diff --git a/portultra/threads.cpp b/portultra/threads.cpp
index b8891e4..36cee41 100644
--- a/portultra/threads.cpp
+++ b/portultra/threads.cpp
@@ -7,10 +7,8 @@
 #include "multilibultra.hpp"
 
 // Native APIs only used to set thread names for easier debugging
-#if defined(_WIN32)
+#ifdef _WIN32
 #include <Windows.h>
-#elif defined(__linux__)
-#include <pthread.h>
 #endif
 
 extern "C" void bootproc();
@@ -127,7 +125,6 @@ static void _thread_func(RDRAM_ARG PTR(OSThread) self_, PTR(thread_func_t) entry
     Multilibultra::set_native_thread_priority(Multilibultra::ThreadPriority::High);
 
     // Set initialized to false to indicate that this thread can be started.
-    Multilibultra::set_self_paused(PASS_RDRAM1);
     self->context->initialized.store(true);
     self->context->initialized.notify_all();
 
@@ -153,7 +150,7 @@ extern "C" void osStartThread(RDRAM_ARG PTR(OSThread) t_) {
     OSThread* t = TO_PTR(OSThread, t_);
     debug_printf("[os] Start Thread %d\n", t->id);
 
-    // Wait until the thread is initialized to indicate that it's action_queued to be started.
+    // Wait until the thread is initialized to indicate that it's queued to be started.
     t->context->initialized.wait(false);
 
     debug_printf("[os] Thread %d is ready to be started\n", t->id);
@@ -178,20 +175,33 @@ extern "C" void osCreateThread(RDRAM_ARG PTR(OSThread) t_, OSId id, PTR(thread_f
     t->next = NULLPTR;
     t->priority = pri;
     t->id = id;
-    t->state = OSThreadState::PAUSED;
+    t->state = OSThreadState::STOPPED;
     t->sp = sp - 0x10; // Set up the first stack frame
     t->destroyed = false;
 
     // Spawn a new thread, which will immediately pause itself and wait until it's been started.
     t->context = new UltraThreadContext{};
     t->context->initialized.store(false);
-    t->context->running.store(false);
+    t->context->scheduled.store(false);
+    t->context->descheduled.store(true);
 
     t->context->host_thread = std::thread{_thread_func, PASS_RDRAM t_, entrypoint, arg};
 }
 
 extern "C" void osStopThread(RDRAM_ARG PTR(OSThread) t_) {
-    assert(false);
+    // If null is passed in as the thread then the calling thread is stopping itself.
+    if (t_ == NULLPTR) {
+        t_ = Multilibultra::this_thread();
+    }
+
+    // Remove the thread in question from the scheduler so it doesn't get scheduled again.
+    OSThread* t = TO_PTR(OSThread, t_);
+    Multilibultra::stop_thread(t);
+
+    // If a thread is stopping itself, tell the scheduler that it has yielded. NOTE(review): unlike halt_self(), no wait_for_resumed() follows - confirm the caller is meant to keep running.
+    if (t_ == Multilibultra::this_thread()) {
+        Multilibultra::yield_self(PASS_RDRAM1);
+    }
 }
 
 extern "C" void osDestroyThread(RDRAM_ARG PTR(OSThread) t_) {
@@ -207,6 +217,12 @@ extern "C" void osDestroyThread(RDRAM_ARG PTR(OSThread) t_) {
     }
 }
 
+// TODO make the thread queue stable to ensure correct yielding behavior
+extern "C" void osYieldThread(RDRAM_ARG1) {
+    Multilibultra::yield_self(PASS_RDRAM1);
+    Multilibultra::wait_for_resumed(PASS_RDRAM1);
+}
+
 extern "C" void osSetThreadPri(RDRAM_ARG PTR(OSThread) t, OSPri pri) {
     if (t == NULLPTR) {
         t = thread_self;
@@ -214,13 +230,12 @@ extern "C" void osSetThreadPri(RDRAM_ARG PTR(OSThread) t, OSPri pri) {
     bool pause_self = false;
     if (pri > TO_PTR(OSThread, thread_self)->priority) {
         pause_self = true;
-        Multilibultra::set_self_paused(PASS_RDRAM1);
     } else if (t == thread_self && pri < TO_PTR(OSThread, thread_self)->priority) {
         pause_self = true;
-        Multilibultra::set_self_paused(PASS_RDRAM1);
     }
     Multilibultra::reprioritize_thread(TO_PTR(OSThread, t), pri);
     if (pause_self) {
+        Multilibultra::yield_self(PASS_RDRAM1);
         Multilibultra::wait_for_resumed(PASS_RDRAM1);
     }
 }
@@ -239,15 +254,6 @@ extern "C" OSId osGetThreadId(RDRAM_ARG PTR(OSThread) t) {
     return TO_PTR(OSThread, t)->id;
 }
 
-// TODO yield thread, need a stable priority queue in the scheduler
-
-void Multilibultra::set_self_paused(RDRAM_ARG1) {
-    debug_printf("[Thread] Thread pausing itself: %d\n", TO_PTR(OSThread, thread_self)->id);
-    TO_PTR(OSThread, thread_self)->state = OSThreadState::PAUSED;
-    TO_PTR(OSThread, thread_self)->context->running.store(false);
-    TO_PTR(OSThread, thread_self)->context->running.notify_all();
-}
-
 void check_destroyed(OSThread* t) {
     if (t->destroyed) {
         throw thread_terminated{};
@@ -256,25 +262,13 @@ void check_destroyed(OSThread* t) {
 
 void Multilibultra::wait_for_resumed(RDRAM_ARG1) {
     check_destroyed(TO_PTR(OSThread, thread_self));
-    TO_PTR(OSThread, thread_self)->context->running.wait(false);
+    //TO_PTR(OSThread, thread_self)->context->descheduled.wait(false);
+    //TO_PTR(OSThread, thread_self)->context->descheduled.store(false);
+    TO_PTR(OSThread, thread_self)->context->scheduled.wait(false);
+    TO_PTR(OSThread, thread_self)->context->scheduled.store(false);
     check_destroyed(TO_PTR(OSThread, thread_self));
 }
 
-void Multilibultra::pause_thread_impl(OSThread* t) {
-    t->state = OSThreadState::PREEMPTED;
-    t->context->running.store(false);
-    t->context->running.notify_all();
-}
-
-void Multilibultra::resume_thread_impl(OSThread *t) {
-    if (t->state == OSThreadState::PREEMPTED) {
-        // Nothing to do here
-    }
-    t->state = OSThreadState::RUNNING;
-    t->context->running.store(true);
-    t->context->running.notify_all();
-}
-
 PTR(OSThread) Multilibultra::this_thread() {
     return thread_self;
 }
diff --git a/portultra/timer.cpp b/portultra/timer.cpp
index 2be4f1a..dce0c99 100644
--- a/portultra/timer.cpp
+++ b/portultra/timer.cpp
@@ -5,7 +5,6 @@
 
 #include "ultra64.h"
 #include "multilibultra.hpp"
-#include "recomp.h"
 
 // Start time for the program
 static std::chrono::system_clock::time_point start = std::chrono::system_clock::now();
@@ -24,7 +23,7 @@ struct OSTimer {
 };
 
 struct AddTimerAction {
-    PTR(OSTask) timer;
+    PTR(OSTimer) timer;
 };
 
 struct RemoveTimerAction {
diff --git a/portultra/ultra64.h b/portultra/ultra64.h
index 5587004..dc117b3 100644
--- a/portultra/ultra64.h
+++ b/portultra/ultra64.h
@@ -78,6 +78,9 @@ typedef struct UltraThreadContext UltraThreadContext;
 typedef enum {
     RUNNING,
     PAUSED,
+    STOPPED,
+    BLOCKED_PAUSED,
+    BLOCKED_STOPPED,
     PREEMPTED
 } OSThreadState;
 
@@ -101,9 +104,11 @@ typedef PTR(void) OSMesg;
 typedef struct OSMesgQueue {
     PTR(OSThread) blocked_on_recv; /* Linked list of threads blocked on receiving from this queue */
     PTR(OSThread) blocked_on_send; /* Linked list of threads blocked on sending to this queue */ 
-    s32 validCount;            /* Number of messages in the queue */
-    s32 first;                 /* Index of the first message in the ring buffer */
-    s32 msgCount;              /* Size of message buffer */
+    s32 validCount;                /* Number of messages in the queue */
+    s32 first;                     /* Index of the first message in the ring buffer */
+    uint8_t lock;                  /* Lock flag used to implement a spinlock */
+    uint8_t pad;                   /* Explicit padding (would be compiler-inserted otherwise) */
+    s16 msgCount;                  /* Size of message buffer (s32 in the original libultra, but s16 here to make room for the lock flag) */
     PTR(OSMesg) msg;               /* Pointer to circular buffer to store messages */
 } OSMesgQueue;
 
@@ -218,6 +223,7 @@ void osCreateThread(RDRAM_ARG PTR(OSThread) t, OSId id, PTR(thread_func_t) entry
 void osStartThread(RDRAM_ARG PTR(OSThread) t);
 void osStopThread(RDRAM_ARG PTR(OSThread) t);
 void osDestroyThread(RDRAM_ARG PTR(OSThread) t);
+void osYieldThread(RDRAM_ARG1);
 void osSetThreadPri(RDRAM_ARG PTR(OSThread) t, OSPri pri);
 OSPri osGetThreadPri(RDRAM_ARG PTR(OSThread) thread);
 OSId osGetThreadId(RDRAM_ARG PTR(OSThread) t);
diff --git a/portultra/ultrainit.cpp b/portultra/ultrainit.cpp
index 1abfbac..f8f4c93 100644
--- a/portultra/ultrainit.cpp
+++ b/portultra/ultrainit.cpp
@@ -1,7 +1,7 @@
 #include "ultra64.h"
 #include "multilibultra.hpp"
 
-void Multilibultra::preinit(uint8_t* rdram, uint8_t* rom, void* window_handle) {
+void Multilibultra::preinit(uint8_t* rdram, uint8_t* rom, Multilibultra::WindowHandle window_handle) {
     Multilibultra::set_main_thread();
     Multilibultra::init_events(rdram, rom, window_handle);
     Multilibultra::init_timers(rdram);
diff --git a/src/dp.cpp b/src/dp.cpp
index 3cf8e43..3ad5327 100644
--- a/src/dp.cpp
+++ b/src/dp.cpp
@@ -1,5 +1,44 @@
 #include "recomp.h"
 
-extern "C" void osDpSetNextBuffer_recomp(uint8_t* rdram, recomp_context* ctx) {
-    ;
+enum class RDPStatusBit {
+	XbusDmem = 0,
+	Freeze = 1,
+	Flush = 2,
+	CommandBusy = 6,
+	BufferReady = 7,
+	DmaBusy = 8,
+	EndValid = 9,
+	StartValid = 10,
+};
+
+constexpr void update_bit(uint32_t& state, uint32_t flags, RDPStatusBit bit) {
+	int reset_bit_pos = (int)bit * 2 + 0; // DPC status write: the even bit of each pair is the CLR flag (e.g. DPC_CLR_FREEZE = 0x0004)
+	int set_bit_pos = (int)bit * 2 + 1; // ...and the odd bit of the pair is the SET flag (e.g. DPC_SET_FREEZE = 0x0008)
+	bool set = (flags & (1U << set_bit_pos)) != 0;
+	bool reset = (flags & (1U << reset_bit_pos)) != 0;
+	// Only act when exactly one of set/clear is requested; both or neither leaves the status bit unchanged.
+	if (set ^ reset) {
+		if (set) {
+			state |= (1U << (int)bit);
+		}
+		else {
+			state &= ~(1U << (int)bit);
+		}
+	}
+}
+
+uint32_t rdp_state = 1 << (int)RDPStatusBit::BufferReady;
+
+extern "C" void osDpSetNextBuffer_recomp(uint8_t* rdram, recomp_context* ctx) {
+    assert(false);
+}
+
+extern "C" void osDpGetStatus_recomp(uint8_t* rdram, recomp_context* ctx) {
+	ctx->r2 = rdp_state;
+}
+
+extern "C" void osDpSetStatus_recomp(uint8_t* rdram, recomp_context* ctx) {
+	update_bit(rdp_state, ctx->r4, RDPStatusBit::XbusDmem);
+	update_bit(rdp_state, ctx->r4, RDPStatusBit::Freeze);
+	update_bit(rdp_state, ctx->r4, RDPStatusBit::Flush);
 }
diff --git a/src/eep.cpp b/src/eep.cpp
index b6b04f9..211a93a 100644
--- a/src/eep.cpp
+++ b/src/eep.cpp
@@ -1,21 +1,49 @@
 #include "recomp.h"
+#include "../portultra/ultra64.h"
+
+void save_write(uint8_t* rdram, gpr rdram_address, uint32_t offset, uint32_t count);
+void save_read(uint8_t* rdram, gpr rdram_address, uint32_t offset, uint32_t count);
+
+constexpr int eeprom_block_size = 8;
+constexpr int eep4_size = 4096;
+constexpr int eep4_block_count = eep4_size / eeprom_block_size;
+constexpr int eep16_size = 16384;
+constexpr int eep16_block_count = eep16_size / eeprom_block_size;
 
 extern "C" void osEepromProbe_recomp(uint8_t* rdram, recomp_context* ctx) {
-    ;
+    ctx->r2 = 0x02; // EEP16K
 }
 
 extern "C" void osEepromWrite_recomp(uint8_t* rdram, recomp_context* ctx) {
-    ;
+    assert(false);// ctx->r2 = 8; // CONT_NO_RESPONSE_ERROR
 }
 
 extern "C" void osEepromLongWrite_recomp(uint8_t* rdram, recomp_context* ctx) {
-    ;
+    uint8_t eep_address = ctx->r5;
+    gpr buffer = ctx->r6;
+    int32_t nbytes = ctx->r7;
+
+    assert(!(nbytes & 7));
+    assert(eep_address * eeprom_block_size + nbytes <= eep16_size);
+
+    save_write(rdram, buffer, eep_address * eeprom_block_size, nbytes);
+
+    ctx->r2 = 0;
 }
 
 extern "C" void osEepromRead_recomp(uint8_t* rdram, recomp_context* ctx) {
-    ;
+    assert(false);// ctx->r2 = 8; // CONT_NO_RESPONSE_ERROR
 }
 
 extern "C" void osEepromLongRead_recomp(uint8_t* rdram, recomp_context* ctx) {
-    ;
+    uint8_t eep_address = ctx->r5;
+    gpr buffer = ctx->r6;
+    int32_t nbytes = ctx->r7;
+
+    assert(!(nbytes & 7));
+    assert(eep_address * eeprom_block_size + nbytes <= eep16_size);
+
+    save_read(rdram, buffer, eep_address * eeprom_block_size, nbytes);
+
+    ctx->r2 = 0;
 }
diff --git a/src/main/main.cpp b/src/main/main.cpp
new file mode 100644
index 0000000..aa6a4b2
--- /dev/null
+++ b/src/main/main.cpp
@@ -0,0 +1,366 @@
+#include <cstdio>
+#include <cassert>
+#include <unordered_map>
+
+#include "../../portultra/ultra64.h"
+#include "../../portultra/multilibultra.hpp"
+#define SDL_MAIN_HANDLED
+#ifdef _WIN32
+#include "SDL.h"
+#else
+#include "SDL2/SDL.h"
+#include "SDL2/SDL_syswm.h"
+#endif
+
+#ifdef _WIN32
+#define NOMINMAX
+#define WIN32_LEAN_AND_MEAN
+#include <Windows.h>
+#include "SDL_syswm.h"
+#endif
+
+extern "C" void init();
+/*extern "C"*/ void start(Multilibultra::WindowHandle window_handle, const Multilibultra::audio_callbacks_t* audio_callbacks, const Multilibultra::input_callbacks_t* input_callbacks);
+
+template<typename... Ts>
+void exit_error(const char* str, Ts ...args) {
+    // TODO pop up an error. For now print the printf-style message once, with every argument forwarded to a single call.
+    fprintf(stderr, str, args...);
+    assert(false);
+    std::quick_exit(EXIT_FAILURE);
+}
+
+
+std::vector<std::pair<SDL_Scancode, int>> keyboard_button_map{
+    { SDL_Scancode::SDL_SCANCODE_LEFT,   0x0002 }, // c left
+    { SDL_Scancode::SDL_SCANCODE_RIGHT,  0x0001 }, // c right
+    { SDL_Scancode::SDL_SCANCODE_UP,     0x0008 }, // c up
+    { SDL_Scancode::SDL_SCANCODE_DOWN,   0x0004 }, // c down
+    { SDL_Scancode::SDL_SCANCODE_RETURN, 0x1000 }, // start
+    { SDL_Scancode::SDL_SCANCODE_SPACE,  0x8000 }, // a
+    { SDL_Scancode::SDL_SCANCODE_LSHIFT, 0x4000 }, // b
+    { SDL_Scancode::SDL_SCANCODE_Q,      0x2000 }, // z
+    { SDL_Scancode::SDL_SCANCODE_E,      0x0020 }, // l
+    { SDL_Scancode::SDL_SCANCODE_R,      0x0010 }, // r
+    { SDL_Scancode::SDL_SCANCODE_J,      0x0200 }, // dpad left
+    { SDL_Scancode::SDL_SCANCODE_L,      0x0100 }, // dpad right
+    { SDL_Scancode::SDL_SCANCODE_I,      0x0800 }, // dpad up
+    { SDL_Scancode::SDL_SCANCODE_K,      0x0400 }, // dpad down
+};
+
+struct GameControllerAxisMapping {
+    SDL_GameControllerAxis axis;
+    int threshold; // Positive or negative to indicate direction
+    uint16_t output_mask;
+};
+
+constexpr int controller_default_threshold = 20000;
+
+std::vector<GameControllerAxisMapping> controller_axis_map{
+    { SDL_GameControllerAxis::SDL_CONTROLLER_AXIS_RIGHTX,      -controller_default_threshold, 0x0002 }, // c left
+    { SDL_GameControllerAxis::SDL_CONTROLLER_AXIS_RIGHTX,       controller_default_threshold, 0x0001 }, // c right
+    { SDL_GameControllerAxis::SDL_CONTROLLER_AXIS_RIGHTY,      -controller_default_threshold, 0x0008 }, // c up
+    { SDL_GameControllerAxis::SDL_CONTROLLER_AXIS_RIGHTY,       controller_default_threshold, 0x0004 }, // c down
+    { SDL_GameControllerAxis::SDL_CONTROLLER_AXIS_TRIGGERLEFT, 10000,                         0x2000 }, // z
+    //{ SDL_Scancode::SDL_SCANCODE_RIGHT,  0x0001 }, // c right
+    //{ SDL_Scancode::SDL_SCANCODE_UP,     0x0008 }, // c up
+    //{ SDL_Scancode::SDL_SCANCODE_DOWN,   0x0004 }, // c down
+    //{ SDL_Scancode::SDL_SCANCODE_RETURN, 0x1000 }, // start
+    //{ SDL_Scancode::SDL_SCANCODE_SPACE,  0x8000 }, // a
+    //{ SDL_Scancode::SDL_SCANCODE_LSHIFT, 0x4000 }, // b
+    //{ SDL_Scancode::SDL_SCANCODE_Q,      0x2000 }, // z
+    //{ SDL_Scancode::SDL_SCANCODE_E,      0x0020 }, // l
+    //{ SDL_Scancode::SDL_SCANCODE_R,      0x0010 }, // r
+    //{ SDL_Scancode::SDL_SCANCODE_J,      0x0200 }, // dpad left
+    //{ SDL_Scancode::SDL_SCANCODE_L,      0x0100 }, // dpad right
+    //{ SDL_Scancode::SDL_SCANCODE_I,      0x0800 }, // dpad up
+    //{ SDL_Scancode::SDL_SCANCODE_K,      0x0400 }, // dpad down
+};
+
+struct GameControllerButtonMapping {
+    SDL_GameControllerButton button;
+    uint16_t output_mask;
+};
+std::vector<GameControllerButtonMapping> controller_button_map{
+    { SDL_GameControllerButton::SDL_CONTROLLER_BUTTON_START,         0x1000 }, // start
+    { SDL_GameControllerButton::SDL_CONTROLLER_BUTTON_A,             0x8000 }, // a
+    { SDL_GameControllerButton::SDL_CONTROLLER_BUTTON_B,             0x4000 }, // b
+    { SDL_GameControllerButton::SDL_CONTROLLER_BUTTON_X,             0x4000 }, // b
+    { SDL_GameControllerButton::SDL_CONTROLLER_BUTTON_LEFTSHOULDER,  0x0020 }, // l
+    { SDL_GameControllerButton::SDL_CONTROLLER_BUTTON_RIGHTSHOULDER, 0x0010 }, // r
+    { SDL_GameControllerButton::SDL_CONTROLLER_BUTTON_DPAD_LEFT,     0x0200 }, // dpad left
+    { SDL_GameControllerButton::SDL_CONTROLLER_BUTTON_DPAD_RIGHT,    0x0100 }, // dpad right
+    { SDL_GameControllerButton::SDL_CONTROLLER_BUTTON_DPAD_UP,       0x0800 }, // dpad up
+    { SDL_GameControllerButton::SDL_CONTROLLER_BUTTON_DPAD_DOWN,     0x0400 }, // dpad down
+};
+
+std::vector<SDL_JoystickID> controllers{};
+
+int sdl_event_filter(void* userdata, SDL_Event* event) {
+    switch (event->type) {
+    //case SDL_EventType::SDL_KEYUP:
+    //case SDL_EventType::SDL_KEYDOWN:
+    //    {
+    //        const Uint8* key_states = SDL_GetKeyboardState(nullptr);
+    //        int new_button = 0;
+
+    //        for (const auto& mapping : keyboard_button_map) {
+    //            if (key_states[mapping.first]) {
+    //                new_button |= mapping.second;
+    //            }
+    //        }
+
+    //        button = new_button;
+
+    //        stick_x = (100.0f / 100.0f) * (key_states[SDL_Scancode::SDL_SCANCODE_D] - key_states[SDL_Scancode::SDL_SCANCODE_A]);
+    //        stick_y = (100.0f / 100.0f) * (key_states[SDL_Scancode::SDL_SCANCODE_W] - key_states[SDL_Scancode::SDL_SCANCODE_S]);
+    //    }
+    //    break;
+    case SDL_EventType::SDL_CONTROLLERDEVICEADDED:
+        {
+            SDL_ControllerDeviceEvent* controller_event = (SDL_ControllerDeviceEvent*)event;
+            SDL_GameController* controller = SDL_GameControllerOpen(controller_event->which);
+            printf("Controller added: %d\n", controller_event->which);
+            if (controller != nullptr) {
+                printf("  Instance ID: %d\n", SDL_JoystickInstanceID(SDL_GameControllerGetJoystick(controller)));
+                controllers.push_back(SDL_JoystickInstanceID(SDL_GameControllerGetJoystick(controller)));
+            }
+        }
+        break;
+    case SDL_EventType::SDL_CONTROLLERDEVICEREMOVED:
+        {
+            SDL_ControllerDeviceEvent* controller_event = (SDL_ControllerDeviceEvent*)event;
+            printf("Controller removed: %d\n", controller_event->which);
+            controllers.erase(std::remove(controllers.begin(), controllers.end(), controller_event->which), controllers.end()); // std::remove only shifts elements; erase actually drops the ID
+        }
+        break;
+    case SDL_EventType::SDL_QUIT:
+        std::quick_exit(EXIT_SUCCESS);
+        break;
+    }
+    return 1;
+}
+
+Multilibultra::gfx_callbacks_t::gfx_data_t create_gfx() {
+    if (SDL_Init(SDL_INIT_VIDEO | SDL_INIT_GAMECONTROLLER) < 0) { // SDL_Init returns 0 on success and a negative error code on failure
+        exit_error("Failed to initialize SDL2: %s\n", SDL_GetError());
+    }
+    // Initializes SDL's video and game controller subsystems, then returns a value-initialized gfx_data_t.
+    return {};
+}
+
+Multilibultra::WindowHandle create_window(Multilibultra::gfx_callbacks_t::gfx_data_t) {
+    SDL_Window* window = SDL_CreateWindow("Majora's Mask", SDL_WINDOWPOS_CENTERED, SDL_WINDOWPOS_CENTERED, 1280, 720, SDL_WINDOW_RESIZABLE);
+    if (window == nullptr) {
+        exit_error("Failed to create window: %s\n", SDL_GetError());
+    }
+
+    SDL_SysWMinfo wmInfo;
+    SDL_VERSION(&wmInfo.version); // SDL_GetWindowWMInfo expects the version field to be filled in before the call
+    if (!SDL_GetWindowWMInfo(window, &wmInfo)) { // On failure wmInfo is not filled in, so don't read a native handle out of it
+        exit_error("Failed to get window manager info: %s\n", SDL_GetError());
+    }
+#if defined(_WIN32)
+    return wmInfo.info.win.window;
+#elif defined(__ANDROID__)
+    static_assert(false && "Unimplemented");
+#elif defined(__linux__)
+    return Multilibultra::WindowHandle{ wmInfo.info.x11.display, wmInfo.info.x11.window };
+#else
+    static_assert(false && "Unimplemented");
+#endif
+}
+
+void update_gfx(void*) {
+    // Handle events
+    constexpr int max_events_per_frame = 16;
+    SDL_Event cur_event;
+    int i = 0;
+    while (i++ < max_events_per_frame && SDL_PollEvent(&cur_event)) {
+        sdl_event_filter(nullptr, &cur_event);
+    }
+}
+
+void get_input(uint16_t* buttons_out, float* x_out, float* y_out) {
+    uint16_t cur_buttons = 0;
+    float cur_x = 0.0f;
+    float cur_y = 0.0f;
+
+    // Keyboard: OR together the mask of every mapped key that is currently held, then derive the stick from WASD.
+    const Uint8* key_states = SDL_GetKeyboardState(nullptr);
+    for (const auto& mapping : keyboard_button_map) {
+        if (key_states[mapping.first]) {
+            cur_buttons |= mapping.second;
+        }
+    }
+
+    cur_x += (100.0f / 100.0f) * (key_states[SDL_Scancode::SDL_SCANCODE_D] - key_states[SDL_Scancode::SDL_SCANCODE_A]);
+    cur_y += (100.0f / 100.0f) * (key_states[SDL_Scancode::SDL_SCANCODE_W] - key_states[SDL_Scancode::SDL_SCANCODE_S]);
+
+    for (SDL_JoystickID controller_id : controllers) {
+        SDL_GameController* controller = SDL_GameControllerFromInstanceID(controller_id);
+        if (controller == nullptr) {
+            continue; // No open controller for this ID; skip it instead of querying a null handle below
+        }
+        cur_x += SDL_GameControllerGetAxis(controller, SDL_GameControllerAxis::SDL_CONTROLLER_AXIS_LEFTX) * (1/32768.0f);
+        cur_y -= SDL_GameControllerGetAxis(controller, SDL_GameControllerAxis::SDL_CONTROLLER_AXIS_LEFTY) * (1/32768.0f);
+
+        for (const auto& mapping : controller_axis_map) {
+            int input_value = SDL_GameControllerGetAxis(controller, mapping.axis);
+            if (mapping.threshold > 0) {
+                if (input_value > mapping.threshold) {
+                    cur_buttons |= mapping.output_mask;
+                }
+            }
+            else {
+                if (input_value < mapping.threshold) {
+                    cur_buttons |= mapping.output_mask;
+                }
+            }
+        }
+
+        for (const auto& mapping : controller_button_map) {
+            int input_value = SDL_GameControllerGetButton(controller, mapping.button);
+            if (input_value) {
+                cur_buttons |= mapping.output_mask;
+            }
+        }
+    }
+
+    *buttons_out = cur_buttons;
+    cur_x = std::clamp(cur_x, -1.0f, 1.0f);
+    cur_y = std::clamp(cur_y, -1.0f, 1.0f);
+    *x_out = cur_x;
+    *y_out = cur_y;
+}
+
+static SDL_AudioDeviceID audio_device = 0; // 0 until set_frequency has opened a device
+static uint32_t sample_rate = 48000; // Replaced by set_frequency with the rate it was given
+
+void queue_samples(int16_t* audio_data, size_t sample_count) {
+    // Buffer for holding the output of swapping the audio channels. This is reused across
+    // calls to reduce runtime allocations.
+    static std::vector<float> swap_buffer;
+
+    // Samples are consumed in pairs, so a trailing unpaired sample is dropped; otherwise the
+    // loop below would read and write one element past the end of both buffers.
+    const size_t paired_count = sample_count - (sample_count % 2);
+    if (paired_count > swap_buffer.size()) {
+        swap_buffer.resize(paired_count);
+    }
+
+    // Convert the audio from 16-bit values to floats at half amplitude and swap the audio channels
+    // into the swap buffer to correct for the address xor caused by endianness handling.
+    for (size_t i = 0; i < paired_count; i += 2) {
+        swap_buffer[i + 0] = audio_data[i + 1] * (0.5f / 32768.0f);
+        swap_buffer[i + 1] = audio_data[i + 0] * (0.5f / 32768.0f);
+    }
+    SDL_QueueAudio(audio_device, swap_buffer.data(), paired_count * sizeof(swap_buffer[0])); // Queue the swapped audio data.
+}
+
+constexpr int channel_count = 2; // Channel count requested when set_frequency opens the audio device
+constexpr int bytes_per_frame = channel_count * sizeof(float); // One float per channel (the device is opened as AUDIO_F32)
+
+size_t get_frames_remaining() {
+    // Returns how many audio frames are still queued on the device, reduced by a safety margin
+    // worth buffer_offset_frames refreshes of audio.
+    constexpr float buffer_offset_frames = 1.0f;
+    // Get the number of remaining buffered audio bytes.
+    uint32_t buffered_byte_count = SDL_GetQueuedAudioSize(audio_device);
+
+    // Adjust the reported count to be some number of refreshes in the future, which helps ensure that
+    // there are enough samples even if the audio thread experiences a small amount of lag. This prevents
+    // audio popping on games that use the buffered audio byte count to determine how many samples
+    // to generate. The margin is converted to whole bytes once so that the byte count itself is
+    // never round-tripped through float.
+    const uint32_t frames_per_vi = (sample_rate / 60);
+    const uint32_t margin_bytes = static_cast<uint32_t>(buffer_offset_frames * bytes_per_frame * frames_per_vi);
+    buffered_byte_count = (buffered_byte_count > margin_bytes) ? (buffered_byte_count - margin_bytes) : 0;
+
+    // Convert from byte count to frame count.
+    return buffered_byte_count / bytes_per_frame;
+}
+
+void set_frequency(uint32_t freq) {
+    // Close any previously opened device, as it is about to be replaced.
+    if (audio_device != 0) {
+        SDL_CloseAudioDevice(audio_device);
+    }
+
+    // Every field not assigned below (silence, padding, size, callback, userdata) stays zeroed.
+    // No callback is installed because audio is pushed to the device with SDL_QueueAudio instead.
+    SDL_AudioSpec spec_desired{};
+    spec_desired.freq = (int)freq;
+    spec_desired.format = AUDIO_F32;
+    spec_desired.channels = channel_count;
+    spec_desired.samples = 0x100; // Fairly small sample count to reduce the latency of internal buffering
+
+    // Open the default playback device (null name, not capture) without allowing format changes.
+    audio_device = SDL_OpenAudioDevice(nullptr, false, &spec_desired, nullptr, 0);
+    if (audio_device == 0) {
+        exit_error("SDL error opening audio device: %s\n", SDL_GetError());
+    }
+
+    SDL_PauseAudioDevice(audio_device, 0); // Unpause the device so that queued audio plays
+    sample_rate = freq;
+}
+
+int main(int argc, char** argv) {
+#ifdef _WIN32
+    // Set up console output to accept UTF-8 on windows
+    SetConsoleOutputCP(CP_UTF8);
+
+    // Change to a font that supports Japanese characters
+    CONSOLE_FONT_INFOEX cfi;
+    cfi.cbSize = sizeof cfi;
+    cfi.nFont = 0;
+    cfi.dwFontSize.X = 0;
+    cfi.dwFontSize.Y = 16;
+    cfi.FontFamily = FF_DONTCARE;
+    cfi.FontWeight = FW_NORMAL;
+    wcscpy_s(cfi.FaceName, L"NSimSun");
+    SetCurrentConsoleFontEx(GetStdHandle(STD_OUTPUT_HANDLE), FALSE, &cfi);
+#else
+    std::setlocale(LC_ALL, "en_US.UTF-8");
+#endif
+
+    // Initialize SDL audio. The audio device is opened right after this, so failure is fatal.
+    if (SDL_InitSubSystem(SDL_INIT_AUDIO) < 0) {
+        exit_error("Failed to initialize SDL audio: %s\n", SDL_GetError());
+    }
+    // Pick an initial dummy sample rate; this will be set by the game later to the true sample rate.
+    set_frequency(sample_rate);
+    init();
+
+    Multilibultra::gfx_callbacks_t gfx_callbacks{
+        .create_gfx = create_gfx,
+        .create_window = create_window,
+        .update_gfx = update_gfx,
+    };
+
+    Multilibultra::audio_callbacks_t audio_callbacks{
+        .queue_samples = queue_samples,
+        .get_frames_remaining = get_frames_remaining,
+        .set_frequency = set_frequency,
+    };
+
+    Multilibultra::input_callbacks_t input_callbacks{
+        .get_input = get_input,
+    };
+
+    //create_gfx();
+    //void* window_handle = create_window(nullptr);
+
+    Multilibultra::set_gfx_callbacks(&gfx_callbacks);
+    start(Multilibultra::WindowHandle{}, &audio_callbacks, &input_callbacks);
+
+    // Do nothing forever: start() runs the game on a detached thread, so main must not return
+    while (1) {
+        using namespace std::chrono_literals;
+        std::this_thread::sleep_for(10ms);
+        //update_gfx(nullptr);
+        //std::this_thread::sleep_for(1ms);
+    }
+
+    return EXIT_SUCCESS;
+}
diff --git a/src/overlays.cpp b/src/overlays.cpp
index fb3d9f7..aaf7a08 100644
--- a/src/overlays.cpp
+++ b/src/overlays.cpp
@@ -59,6 +59,42 @@ extern "C" void load_overlays(uint32_t rom, int32_t ram_addr, uint32_t size) {
     }
 }
 
+extern "C" void unload_overlays(int32_t ram_addr, uint32_t size);
+
+extern "C" void unload_overlay_by_id(uint32_t id) {
+    // Unloads the overlay section registered under the given id; does nothing if it is not loaded.
+    const uint32_t table_index = overlay_sections_by_index[id];
+    const SectionTableEntry& entry = section_table[table_index];
+    const auto is_target = [table_index](const LoadedSection& s) { return s.section_table_index == table_index; };
+    auto loaded_it = std::find_if(loaded_sections.begin(), loaded_sections.end(), is_target);
+    if (loaded_it == loaded_sections.end()) {
+        return;
+    }
+    // Erase each of the section's functions from the function map, keyed by where it was loaded to
+    for (size_t i = 0; i < entry.num_funcs; i++) {
+        const uint32_t func_address = entry.funcs[i].offset + loaded_it->loaded_ram_addr;
+        func_map.erase(func_address);
+    }
+    // Restore the section's slot in the address table to its ram_addr from the section table
+    section_addresses[entry.index] = entry.ram_addr;
+    // Drop the section from the collection of loaded sections
+    loaded_sections.erase(loaded_it);
+}
+
+extern "C" void load_overlay_by_id(uint32_t id, uint32_t ram_addr) {
+    const uint32_t table_index = overlay_sections_by_index[id];
+    const SectionTableEntry& entry = section_table[table_index];
+    const int32_t current_address = section_addresses[entry.index];
+    // A slot still holding the section's table ram_addr presumably means it is not loaded yet
+    if (current_address == entry.ram_addr) {
+        load_overlay(table_index, ram_addr);
+        return;
+    }
+    const int32_t moved_address = current_address + ram_addr; // In this case ram_addr is added to the current address
+    unload_overlay_by_id(id);
+    load_overlay(table_index, moved_address);
+}
+
 extern "C" void unload_overlays(int32_t ram_addr, uint32_t size) {
     for (auto it = loaded_sections.begin(); it != loaded_sections.end();) {
         const auto& section = section_table[it->section_table_index];
@@ -72,6 +108,7 @@ extern "C" void unload_overlays(int32_t ram_addr, uint32_t size) {
                     "  rom: 0x%08X size: 0x%08X loaded_addr: 0x%08X\n"
                     "  unloaded_ram: 0x%08X unloaded_size : 0x%08X\n",
                         section.rom_addr, section.size, it->loaded_ram_addr, ram_addr, size);
+                assert(false);
                 std::exit(EXIT_FAILURE);
             }
             // Determine where each function was loaded to and remove that entry from the function map
@@ -81,7 +118,7 @@ extern "C" void unload_overlays(int32_t ram_addr, uint32_t size) {
                 func_map.erase(func_address);
             }
             // Reset the section's address in the address table
-            section_addresses[section.index] = 0;
+            section_addresses[section.index] = section.ram_addr;
             // Remove the section from the loaded section map
             it = loaded_sections.erase(it);
             // Skip incrementing the iterator
@@ -108,6 +145,7 @@ extern "C" recomp_func_t * get_function(int32_t addr) {
     auto func_find = func_map.find(addr);
     if (func_find == func_map.end()) {
         fprintf(stderr, "Failed to find function at 0x%08X\n", addr);
+        assert(false);
         std::exit(EXIT_FAILURE);
     }
     return func_find->second;
diff --git a/src/pi.cpp b/src/pi.cpp
index 86366d8..39bfdad 100644
--- a/src/pi.cpp
+++ b/src/pi.cpp
@@ -138,7 +138,7 @@ extern "C" void osPiStartDma_recomp(uint8_t* rdram, recomp_context* ctx) {
     uint32_t mb = ctx->r4;
     uint32_t pri = ctx->r5;
     uint32_t direction = ctx->r6;
-    uint32_t devAddr = ctx->r7;
+    uint32_t devAddr = ctx->r7 | rom_base;
     gpr dramAddr = MEM_W(0x10, ctx->r29);
     uint32_t size = MEM_W(0x14, ctx->r29);
     PTR(OSMesgQueue) mq = MEM_W(0x18, ctx->r29);
diff --git a/src/portultra_translation.cpp b/src/portultra_translation.cpp
index f2f9df2..e32ca68 100644
--- a/src/portultra_translation.cpp
+++ b/src/portultra_translation.cpp
@@ -28,6 +28,10 @@ extern "C" void osDestroyThread_recomp(uint8_t * rdram, recomp_context * ctx) {
     osDestroyThread(rdram, (int32_t)ctx->r4);
 }
 
+extern "C" void osYieldThread_recomp(uint8_t * rdram, recomp_context * ctx) {
+    osYieldThread(rdram); // Forwards to osYieldThread; nothing is read from or written to ctx
+}
+
 extern "C" void osSetThreadPri_recomp(uint8_t* rdram, recomp_context* ctx) {
     osSetThreadPri(rdram, (int32_t)ctx->r4, (OSPri)ctx->r5);
 }
@@ -85,7 +89,7 @@ extern "C" void osStopTimer_recomp(uint8_t * rdram, recomp_context * ctx) {
 }
 
 extern "C" void osVirtualToPhysical_recomp(uint8_t * rdram, recomp_context * ctx) {
-    ctx->r2 = osVirtualToPhysical((int32_t)ctx->r2);
+    ctx->r2 = osVirtualToPhysical((int32_t)ctx->r4);
 }
 
 extern "C" void osInvalDCache_recomp(uint8_t * rdram, recomp_context * ctx) {
diff --git a/src/recomp.cpp b/src/recomp.cpp
index 6c8de60..dcec2bf 100644
--- a/src/recomp.cpp
+++ b/src/recomp.cpp
@@ -1,4 +1,4 @@
-﻿#ifdef _WIN32
+#ifdef _WIN32
 #include <Windows.h>
 #endif
 #include <cstdio>
@@ -41,13 +41,58 @@ extern "C" void osGetMemSize_recomp(uint8_t * rdram, recomp_context * ctx) {
     ctx->r2 = 8 * 1024 * 1024;
 }
 
+enum class StatusReg {
+    FR = 0x04000000, // Bit 26; cop0_status_write switches the float register mode when this bit changes
+};
+
+extern "C" void cop0_status_write(recomp_context* ctx, gpr value) {
+    // Handles a write to the status register stored in the context. A change to the FR bit
+    // switches the float register mode; a change to any other bit is fatal.
+    constexpr uint32_t fr_mask = (uint32_t)StatusReg::FR;
+    const uint32_t written = (uint32_t)value;
+    uint32_t toggled = ctx->status_reg ^ written;
+
+    // Handle a flip of the FR bit
+    if (toggled & fr_mask) {
+        if (written & fr_mask) {
+            // FR = 1, odd single floats point to their own registers
+            ctx->mips3_float_mode = true;
+            ctx->f_odd = &ctx->f1.u32l;
+        }
+        else {
+            // FR = 0, odd single floats point to the upper half of the previous register
+            ctx->mips3_float_mode = false;
+            ctx->f_odd = &ctx->f0.u32h;
+        }
+
+        // FR has been dealt with, so it no longer counts as an outstanding change
+        toggled &= ~fr_mask;
+    }
+
+    // Whatever is still marked as toggled is a bit that isn't handled currently
+    if (toggled != 0) {
+        printf("Unhandled status register bits changed: 0x%08X\n", toggled);
+        assert(false);
+        exit(EXIT_FAILURE);
+    }
+
+    // Store the written value as the new status register contents
+    ctx->status_reg = written;
+}
+
+extern "C" gpr cop0_status_read(recomp_context* ctx) {
+    return (gpr)(int32_t)ctx->status_reg; // Going through int32_t sign-extends the value to the width of gpr
+}
+
 extern "C" void switch_error(const char* func, uint32_t vram, uint32_t jtbl) {
     printf("Switch-case out of bounds in %s at 0x%08X for jump table at 0x%08X\n", func, vram, jtbl);
+    assert(false);
     exit(EXIT_FAILURE);
 }
 
 extern "C" void do_break(uint32_t vram) {
     printf("Encountered break at original vram 0x%08X\n", vram);
+    assert(false);
     exit(EXIT_FAILURE);
 }
 
@@ -55,6 +100,8 @@ void run_thread_function(uint8_t* rdram, uint64_t addr, uint64_t sp, uint64_t ar
     recomp_context ctx{};
     ctx.r29 = sp;
     ctx.r4 = arg;
+    ctx.mips3_float_mode = 0;
+    ctx.f_odd = &ctx.f0.u32h;
     recomp_func_t* func = get_function(addr);
     func(rdram, &ctx);
 }
@@ -72,10 +119,6 @@ void init_overlays();
 extern "C" void load_overlays(uint32_t rom, int32_t ram_addr, uint32_t size);
 extern "C" void unload_overlays(int32_t ram_addr, uint32_t size);
 
-#ifdef _WIN32
-#include <Windows.h>
-#endif
-
 std::unique_ptr<uint8_t[]> rdram_buffer;
 recomp_context context{};
 
@@ -124,6 +167,10 @@ EXPORT extern "C" void init() {
     // Set up stack pointer
     context.r29 = 0xFFFFFFFF803FFFF0u;
 
+    // Set up context floats
+    context.f_odd = &context.f0.u32h;
+    context.mips3_float_mode = false;
+
     // Initialize variables normally set by IPL3
     constexpr int32_t osTvType = 0x80000300;
     constexpr int32_t osRomType = 0x80000304;
@@ -140,10 +187,37 @@ EXPORT extern "C" void init() {
     MEM_W(osMemSize, 0) = 8 * 1024 * 1024; // 8MB
 }
 
-EXPORT extern "C" void start(void* window_handle, const Multilibultra::audio_callbacks_t* audio_callbacks, const Multilibultra::input_callbacks_t* input_callbacks) {
+// LRESULT CALLBACK WindowProc(HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM lParam) {
+//     return DefWindowProc(hwnd, uMsg, wParam, lParam);
+// }
+
+/*EXPORT extern "C"*/ void start(Multilibultra::WindowHandle window_handle, const Multilibultra::audio_callbacks_t* audio_callbacks, const Multilibultra::input_callbacks_t* input_callbacks) {
     Multilibultra::set_audio_callbacks(audio_callbacks);
     Multilibultra::set_input_callbacks(input_callbacks);
-    std::thread game_thread{[](void* window_handle) {
+
+    //// Register window class.
+    //WNDCLASS wc;
+    //memset(&wc, 0, sizeof(WNDCLASS));
+    //wc.lpfnWndProc = WindowProc;
+    //wc.hInstance = GetModuleHandle(0);
+    //wc.hbrBackground = (HBRUSH)(COLOR_BACKGROUND);
+    //wc.lpszClassName = "RT64Sample";
+    //RegisterClass(&wc);
+
+    //// Create window.
+    //const int Width = 1280;
+    //const int Height = 720;
+    //RECT rect;
+    //UINT dwStyle = WS_OVERLAPPEDWINDOW | WS_VISIBLE;
+    //rect.left = (GetSystemMetrics(SM_CXSCREEN) - Width) / 2;
+    //rect.top = (GetSystemMetrics(SM_CYSCREEN) - Height) / 2;
+    //rect.right = rect.left + Width;
+    //rect.bottom = rect.top + Height;
+    //AdjustWindowRectEx(&rect, dwStyle, 0, 0);
+
+    //HWND hwnd = CreateWindow(wc.lpszClassName, "Recomp", dwStyle, rect.left, rect.top, rect.right - rect.left, rect.bottom - rect.top, 0, 0, wc.hInstance, NULL);
+
+    std::thread game_thread{[](Multilibultra::WindowHandle window_handle) {
         debug_printf("[Recomp] Starting\n");
         
         Multilibultra::set_native_thread_name("Game Start Thread");
@@ -157,29 +231,3 @@ EXPORT extern "C" void start(void* window_handle, const Multilibultra::audio_cal
 
     game_thread.detach();
 }
-
-int main(int argc, char **argv) {
-
-#ifdef _WIN32
-    // Set up console output to accept UTF-8 on windows
-    SetConsoleOutputCP(CP_UTF8);
-
-    // Change to a font that supports Japanese characters
-    CONSOLE_FONT_INFOEX cfi;
-    cfi.cbSize = sizeof cfi;
-    cfi.nFont = 0;
-    cfi.dwFontSize.X = 0;
-    cfi.dwFontSize.Y = 16;
-    cfi.FontFamily = FF_DONTCARE;
-    cfi.FontWeight = FW_NORMAL;
-    wcscpy_s(cfi.FaceName, L"NSimSun");
-    SetCurrentConsoleFontEx(GetStdHandle(STD_OUTPUT_HANDLE), FALSE, &cfi);
-#else
-    std::setlocale(LC_ALL, "en_US.UTF-8");
-#endif
-
-    init();
-    start(nullptr, nullptr, nullptr);
-
-    return EXIT_SUCCESS;
-}
diff --git a/src/rt64_layer.cpp b/src/rt64_layer.cpp
index a101b39..ab719bb 100644
--- a/src/rt64_layer.cpp
+++ b/src/rt64_layer.cpp
@@ -44,7 +44,7 @@ void dummy_check_interrupts() {
 
 }
 
-void RT64Init(uint8_t* rom, uint8_t* rdram, void* window_handle) {
+void RT64Init(uint8_t* rom, uint8_t* rdram, Multilibultra::WindowHandle window_handle) {
     // Dynamic loading
     //auto RT64 = LoadLibrary("RT64.dll");
     //if (RT64 == 0) {
@@ -57,8 +57,8 @@ void RT64Init(uint8_t* rom, uint8_t* rdram, void* window_handle) {
     //GET_FUNC(RT64, UpdateScreen);
 
     GFX_INFO gfx_info{};
-    gfx_info.hWnd = window_handle;
-    gfx_info.hStatusBar = nullptr;
+    // gfx_info.hWnd = window_handle;
+    // gfx_info.hStatusBar = nullptr;
 
     gfx_info.HEADER = rom;
     gfx_info.RDRAM = rdram;
@@ -93,7 +93,11 @@ void RT64Init(uint8_t* rom, uint8_t* rdram, void* window_handle) {
 
     gfx_info.CheckInterrupts = dummy_check_interrupts;
 
-	InitiateGFX(gfx_info);
+    gfx_info.version = 2;
+    gfx_info.SP_STATUS_REG = &SP_STATUS_REG;
+    gfx_info.RDRAM_SIZE = &RDRAM_SIZE;
+
+	InitiateGFXLinux(gfx_info, window_handle.window, window_handle.display);
 }
 
 void RT64SendDL(uint8_t* rdram, const OSTask* task) {
diff --git a/src/vi.cpp b/src/vi.cpp
index 4301d11..a38cabe 100644
--- a/src/vi.cpp
+++ b/src/vi.cpp
@@ -36,3 +36,12 @@ extern "C" void osViSwapBuffer_recomp(uint8_t* rdram, recomp_context* ctx) {
 extern "C" void osViSetMode_recomp(uint8_t* rdram, recomp_context* ctx) {
     osViSetMode(rdram, (int32_t)ctx->r4);
 }
+
+extern uint64_t total_vis;
+
+extern "C" void wait_one_frame(uint8_t* rdram, recomp_context* ctx) {
+    uint64_t cur_vis = total_vis; // Snapshot the counter on entry; neither parameter is used
+    while (cur_vis == total_vis) { // NOTE(review): plain non-atomic read of a counter presumably written by another thread - confirm this is safe
+        std::this_thread::yield(); // Busy-wait, giving up the time slice, until the counter differs from the snapshot
+    }
+}