Introduce `ThreadLocal` Class + Fix Several GPU Bugs

* Fix `AddClearColorSubpass` bug where it would not generate a `vkCmdNextSubpass` when an attachment clear was utilized
* Fix `AddSubpass` bug where the Depth Stencil texture would not be synced
* Respect `VkCommandPool` external synchronization requirements by making it thread-local with a custom RAII wrapper
* Fix linear RT width calculation as it's provided in terms of bytes rather than format units
* Fix `AllocateStagingBuffer` bug where it would not supply `eTransferDst` as a usage flag
* Fix `AllocateMappedImage` where `VkMemoryPropertyFlags` were not respected resulting in non-`eHostVisible` memory being utilized
* Change feature requirement in `AndroidManifest.xml` to Vulkan 1.1 from OpenGL ES 3.1 as this was incorrect
This commit is contained in:
PixelyIon 2021-10-11 09:13:25 +05:30 committed by Billy Laws
parent eb25f60033
commit 9b9bf8d300
17 changed files with 248 additions and 38 deletions

3
.gitignore vendored
View File

@ -92,3 +92,6 @@ lint/reports/
# Discord plugin for IntelliJ IDEA
.idea/discord.xml
# Adreno Validation Layer
libVkLayer_adreno.so

View File

@ -4,11 +4,13 @@
package="emu.skyline">
<uses-permission android:name="android.permission.INTERNET" />
<uses-permission android:name="android.permission.VIBRATE" />
<uses-feature
android:glEsVersion="0x00030001"
android:required="true" />
<uses-permission android:name="android.permission.VIBRATE" />
android:name="android.hardware.vulkan.version"
android:required="true"
android:version="0x401000" />
<application
android:name=".SkylineApplication"
android:allowBackup="true"

View File

@ -8,7 +8,7 @@
#include "gpu.h"
#include "audio.h"
#include "input.h"
#include "kernel/types/KThread.h"
#include "kernel/types/KProcess.h"
namespace skyline {
Logger::Logger(const std::string &path, LogLevel configLevel) : configLevel(configLevel), start(util::GetTimeNs() / constant::NsInMillisecond) {
@ -56,4 +56,9 @@ namespace skyline {
scheduler = std::make_shared<kernel::Scheduler>(*this);
input = std::make_shared<input::Input>(*this);
}
/**
 * @brief Clears the handle table of the process, if one is set, when the device state is destroyed
 */
DeviceState::~DeviceState() {
    if (!process)
        return; // No process is attached so there is no handle table to clear

    process->ClearHandleTable();
}
}

View File

@ -711,19 +711,21 @@ namespace skyline {
struct DeviceState {
DeviceState(kernel::OS *os, std::shared_ptr<JvmManager> jvmManager, std::shared_ptr<Settings> settings, std::shared_ptr<Logger> logger);
~DeviceState();
kernel::OS *os;
std::shared_ptr<JvmManager> jvm;
std::shared_ptr<Settings> settings;
std::shared_ptr<Logger> logger;
std::shared_ptr<loader::Loader> loader;
std::shared_ptr<kernel::type::KProcess> process{};
static thread_local inline std::shared_ptr<kernel::type::KThread> thread{}; //!< The KThread of the thread which accesses this object
static thread_local inline nce::ThreadContext *ctx{}; //!< The context of the guest thread for the corresponding host thread
std::shared_ptr<gpu::GPU> gpu;
std::shared_ptr<soc::SOC> soc;
std::shared_ptr<audio::Audio> audio;
std::shared_ptr<nce::NCE> nce;
std::shared_ptr<kernel::Scheduler> scheduler;
std::shared_ptr<kernel::type::KProcess> process;
static thread_local inline std::shared_ptr<kernel::type::KThread> thread{}; //!< The KThread of the thread which accesses this object
static thread_local inline nce::ThreadContext *ctx{}; //!< The context of the guest thread for the corresponding host thread
std::shared_ptr<input::Input> input;
};
}

View File

@ -0,0 +1,129 @@
// SPDX-License-Identifier: MPL-2.0
// Copyright © 2021 Skyline Team and Contributors (https://github.com/skyline-emu/)
#pragma once
#include <pthread.h>
#include <atomic>
#include <cstring>
#include <functional>
#include <type_traits>
#include <utility>
namespace skyline {
/**
* @brief A thread-local RAII-bound wrapper class which unlike `thread_local` doesn't require the member to be static
* @note Caller must ensure any arguments passed into the constructor remain valid throughout its lifetime
* @note Caller must ensure the destructor of the object doesn't have any thread-local dependencies as it might be called from another thread
* @note RAII-bound means that *all* thread-local instances of the object will be destroyed after this class is destroyed but can also be destroyed when a thread owning an instance dies
*/
template<typename Type, bool TrivialDestructor = std::is_trivially_destructible_v<Type>>
class ThreadLocal;
/**
 * @brief The specialization for trivially destructible types, instances only need their storage released so they aren't tracked in any list
 * @note The instance of a thread is freed when that thread exits while this object is alive, instances belonging to threads which outlive this object are not freed as they are not tracked
 */
template<typename Type>
class ThreadLocal<Type, true> {
  private:
    pthread_key_t key;
    std::function<Type *()> constructor; //!< Allocates a new instance from copies of the arguments which were supplied to the constructor of this class

    /**
     * @brief The destructor registered with the pthread key, it releases the storage of the instance owned by the exiting thread
     */
    static void FreeInstance(void *object) {
        delete static_cast<Type *>(object);
    }

  public:
    /**
     * @param args The arguments which are copied and later passed to the constructor of every thread-local instance
     * @throws exception If the pthread key cannot be created
     */
    template<typename... Args>
    ThreadLocal(Args &&... args) : constructor([args...]() { return new Type(args...); }) {
        int result;
        if ((result = pthread_key_create(&key, &FreeInstance)))
            throw exception("Cannot create pthread_key: {}", strerror(result));
    }

    // The pthread key is owned by exactly one object, a copy would delete the same key twice
    ThreadLocal(const ThreadLocal &) = delete;

    ThreadLocal &operator=(const ThreadLocal &) = delete;

    /**
     * @return A pointer to the instance of the calling thread, it is constructed on the first access from that thread
     * @throws exception If the newly constructed instance cannot be associated with the pthread key
     */
    Type *operator->() {
        auto pointer{pthread_getspecific(key)};
        if (pointer)
            return static_cast<Type *>(pointer);

        int result;
        Type *object{constructor()}; // The stored callable takes no arguments
        if ((result = pthread_setspecific(key, object))) {
            delete object; // The instance was never registered with the key so nothing else would free it
            throw exception("Cannot set pthread_key to constructed type: {}", strerror(result));
        }

        return object;
    }

    Type &operator*() {
        return *operator->();
    }

    ~ThreadLocal() {
        pthread_key_delete(key);
    }
};
/**
 * @brief The specialization for non-trivially destructible types, every instance is linked into an intrusive list so that it can be destroyed together with this object
 * @note NOTE(review): unlinking a node which isn't the head of the list writes to a non-atomic `next` pointer, this does not appear to be synchronized against other threads unlinking or pushing nodes at the same time — confirm that callers cannot hit this
 */
template<typename Type>
class ThreadLocal<Type, false> {
  private:
    struct IntrusiveTypeNode {
        Type object;
        ThreadLocal &threadLocal;
        IntrusiveTypeNode *next{};

        template<typename... Args>
        IntrusiveTypeNode(ThreadLocal &threadLocal, Args &&... args) : object(std::forward<Args>(args)...), threadLocal(threadLocal) {}

        /**
         * @brief Unlinks this node from the list of the owning ThreadLocal, `object` is destroyed afterwards as a regular member
         */
        ~IntrusiveTypeNode() {
            auto current{threadLocal.list.load(std::memory_order_acquire)};
            // While this node is the head, attempt to swing the head over to the successor
            while (current == this)
                if (threadLocal.list.compare_exchange_strong(current, next, std::memory_order_release, std::memory_order_consume))
                    return;

            // Otherwise look for the predecessor of this node and bypass this node
            while (current) {
                if (current->next == this) {
                    current->next = next;
                    return;
                }
                current = current->next;
            }
        }
    };

    pthread_key_t key;
    std::function<IntrusiveTypeNode *(ThreadLocal &)> constructor; //!< Allocates a new node from copies of the arguments which were supplied to the constructor of this class
    std::atomic<IntrusiveTypeNode *> list{nullptr}; //!< An atomic intrusive linked list of all instances of the object to call non-trivial destructors for the objects

    /**
     * @brief The destructor registered with the pthread key, it unlinks, destroys and frees the node owned by the exiting thread
     */
    static void DestroyNode(void *object) {
        delete static_cast<IntrusiveTypeNode *>(object);
    }

  public:
    /**
     * @param args The arguments which are copied and later passed to the constructor of every thread-local instance
     * @throws exception If the pthread key cannot be created
     */
    template<typename... Args>
    ThreadLocal(Args &&... args) : constructor([args...](ThreadLocal &threadLocal) { return new IntrusiveTypeNode(threadLocal, args...); }) {
        int result;
        if ((result = pthread_key_create(&key, &DestroyNode)))
            throw exception("Cannot create pthread_key: {}", strerror(result));
    }

    // Nodes hold a reference to this object and the pthread key has a single owner, so copying isn't possible
    ThreadLocal(const ThreadLocal &) = delete;

    ThreadLocal &operator=(const ThreadLocal &) = delete;

    /**
     * @return A pointer to the instance of the calling thread, it is constructed and linked into the list on the first access from that thread
     * @throws exception If the newly constructed node cannot be associated with the pthread key
     */
    Type *operator->() {
        auto pointer{pthread_getspecific(key)};
        if (pointer)
            return &static_cast<IntrusiveTypeNode *>(pointer)->object;

        int result;
        IntrusiveTypeNode *node{constructor(*this)};
        if ((result = pthread_setspecific(key, node))) {
            delete node; // The node is neither registered with the key nor linked into the list, nothing else would free it
            throw exception("Cannot set pthread_key to constructed type: {}", strerror(result));
        }

        // Push the node onto the head of the list
        auto next{list.load(std::memory_order_acquire)};
        do {
            node->next = next;
        } while (!list.compare_exchange_strong(next, node, std::memory_order_release, std::memory_order_consume));

        return &node->object;
    }

    Type &operator*() {
        return *operator->();
    }

    ~ThreadLocal() {
        // Delete the key first so that threads exiting from here on no longer run the key destructor on nodes which are freed below
        pthread_key_delete(key);

        auto current{list.exchange(nullptr, std::memory_order_acquire)};
        while (current) {
            auto next{current->next}; // Must be read prior to the node being freed
            delete current; // Runs the destructor of the object and releases the storage of the node
            current = next;
        }
    }
};
}

View File

@ -20,21 +20,20 @@ namespace skyline::gpu {
return false;
}
CommandScheduler::CommandScheduler(GPU &pGpu) : gpu(pGpu), vkCommandPool(pGpu.vkDevice, vk::CommandPoolCreateInfo{
CommandScheduler::CommandScheduler(GPU &pGpu) : gpu(pGpu), pool(std::ref(pGpu.vkDevice), vk::CommandPoolCreateInfo{
.flags = vk::CommandPoolCreateFlagBits::eTransient | vk::CommandPoolCreateFlagBits::eResetCommandBuffer,
.queueFamilyIndex = pGpu.vkQueueFamilyIndex,
}) {}
CommandScheduler::ActiveCommandBuffer CommandScheduler::AllocateCommandBuffer() {
std::scoped_lock lock(mutex);
auto slot{std::find_if(commandBuffers.begin(), commandBuffers.end(), CommandBufferSlot::AllocateIfFree)};
auto slotId{std::distance(commandBuffers.begin(), slot)};
if (slot != commandBuffers.end())
auto slot{std::find_if(pool->buffers.begin(), pool->buffers.end(), CommandBufferSlot::AllocateIfFree)};
auto slotId{std::distance(pool->buffers.begin(), slot)};
if (slot != pool->buffers.end())
return ActiveCommandBuffer(*slot);
vk::CommandBuffer commandBuffer;
vk::CommandBufferAllocateInfo commandBufferAllocateInfo{
.commandPool = *vkCommandPool,
.commandPool = *pool->vkCommandPool,
.level = vk::CommandBufferLevel::ePrimary,
.commandBufferCount = 1,
};
@ -42,7 +41,7 @@ namespace skyline::gpu {
auto result{(*gpu.vkDevice).allocateCommandBuffers(&commandBufferAllocateInfo, &commandBuffer, *gpu.vkDevice.getDispatcher())};
if (result != vk::Result::eSuccess)
vk::throwResultException(result, __builtin_FUNCTION());
return ActiveCommandBuffer(commandBuffers.emplace_back(gpu.vkDevice, commandBuffer, vkCommandPool));
return ActiveCommandBuffer(pool->buffers.emplace_back(gpu.vkDevice, commandBuffer, pool->vkCommandPool));
}
void CommandScheduler::SubmitCommandBuffer(const vk::raii::CommandBuffer &commandBuffer, vk::Fence fence) {

View File

@ -3,6 +3,7 @@
#pragma once
#include <common/thread_local.h>
#include "fence_cycle.h"
namespace skyline::gpu {
@ -62,9 +63,19 @@ namespace skyline::gpu {
};
GPU &gpu;
std::mutex mutex; //!< Synchronizes mutations to the command pool due to allocations
vk::raii::CommandPool vkCommandPool;
std::list<CommandBufferSlot> commandBuffers;
/**
* @brief A command pool designed to be thread-local to respect external synchronization for all command buffers and the associated pool
* @note If we utilized a single global pool there would need to be a mutex around command buffer recording which would incur significant costs
*/
struct CommandPool {
    vk::raii::CommandPool vkCommandPool; //!< The Vulkan command pool held by this object
    std::list<CommandBufferSlot> buffers; //!< The command buffer slots kept alongside the pool

    /**
     * @brief Forwards all supplied arguments to the constructor of the underlying vk::raii::CommandPool, the slot list starts out empty
     */
    template<typename... PoolArgs>
    constexpr CommandPool(PoolArgs &&... poolArgs) : vkCommandPool(std::forward<PoolArgs>(poolArgs)...), buffers() {}
};
ThreadLocal<CommandPool> pool;
/**
* @brief Allocates an existing or new primary command buffer from the pool

View File

@ -20,16 +20,18 @@ namespace skyline::gpu::interconnect {
}
void CommandExecutor::AddSubpass(const std::function<void(vk::raii::CommandBuffer &, const std::shared_ptr<FenceCycle> &, GPU &)> &function, vk::Rect2D renderArea, std::vector<TextureView> inputAttachments, std::vector<TextureView> colorAttachments, std::optional<TextureView> depthStencilAttachment) {
for (const auto& attachments : {inputAttachments, colorAttachments})
for (const auto& attachment : attachments)
for (const auto &attachments : {inputAttachments, colorAttachments})
for (const auto &attachment : attachments)
syncTextures.emplace(attachment.backing.get());
if (depthStencilAttachment)
syncTextures.emplace(depthStencilAttachment->backing.get());
bool newRenderpass{CreateRenderpass(renderArea)};
renderpass->AddSubpass(inputAttachments, colorAttachments, depthStencilAttachment ? &*depthStencilAttachment : nullptr);
if (newRenderpass)
nodes.emplace_back(std::in_place_type_t<node::FunctionNode>(), function);
else
nodes.emplace_back(std::in_place_type_t<node::NextSubpassNode>(), function);
nodes.emplace_back(std::in_place_type_t<node::NextSubpassFunctionNode>(), function);
}
void CommandExecutor::AddClearColorSubpass(TextureView attachment, const vk::ClearColorValue &value) {
@ -38,7 +40,10 @@ namespace skyline::gpu::interconnect {
})};
renderpass->AddSubpass({}, attachment, nullptr);
if (!renderpass->ClearColorAttachment(0, value)) {
if (renderpass->ClearColorAttachment(0, value)) {
if (!newRenderpass)
nodes.emplace_back(std::in_place_type_t<node::NextSubpassNode>());
} else {
auto function{[scissor = attachment.backing->dimensions, value](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &, GPU &) {
commandBuffer.clearAttachments(vk::ClearAttachment{
.aspectMask = vk::ImageAspectFlagBits::eColor,
@ -54,7 +59,7 @@ namespace skyline::gpu::interconnect {
if (newRenderpass)
nodes.emplace_back(std::in_place_type_t<node::FunctionNode>(), function);
else
nodes.emplace_back(std::in_place_type_t<node::NextSubpassNode>(), function);
nodes.emplace_back(std::in_place_type_t<node::NextSubpassFunctionNode>(), function);
}
}
@ -73,12 +78,15 @@ namespace skyline::gpu::interconnect {
using namespace node;
for (NodeVariant &node : nodes) {
#define NODE(name) [&](name& node) { node(commandBuffer, cycle, gpu); }
std::visit(VariantVisitor{
[&](FunctionNode &node) { node(commandBuffer, cycle, gpu); },
[&](RenderpassNode &node) { node(commandBuffer, cycle, gpu); },
[&](NextSubpassNode &node) { node(commandBuffer, cycle, gpu); },
[&](RenderpassEndNode &node) { node(commandBuffer, cycle, gpu); },
NODE(FunctionNode),
NODE(RenderpassNode),
NODE(NextSubpassNode),
NODE(NextSubpassFunctionNode),
NODE(RenderpassEndNode),
}, node);
#undef NODE
}
for (auto texture : syncTextures)

View File

@ -286,10 +286,19 @@ namespace skyline::gpu::interconnect::node {
}
};
/**
* @brief A node which progresses to the next subpass during a renderpass
*/
struct NextSubpassNode {
    /**
     * @brief Records a transition to the next subpass with inline contents into the supplied command buffer
     * @note The fence cycle and GPU arguments are not used by this node
     */
    void operator()(vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &, GPU &) {
        constexpr auto contents{vk::SubpassContents::eInline};
        commandBuffer.nextSubpass(contents);
    }
};
/**
* @brief A FunctionNode which progresses to the next subpass prior to calling the function
*/
struct NextSubpassNode : private FunctionNode {
struct NextSubpassFunctionNode : private FunctionNode {
using FunctionNode::FunctionNode;
void operator()(vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &cycle, GPU &gpu) {
@ -307,5 +316,5 @@ namespace skyline::gpu::interconnect::node {
}
};
using NodeVariant = std::variant<FunctionNode, RenderpassNode, NextSubpassNode, RenderpassEndNode>; //!< A variant encompassing all command nodes types
using NodeVariant = std::variant<FunctionNode, RenderpassNode, NextSubpassNode, NextSubpassFunctionNode, RenderpassEndNode>; //!< A variant encompassing all command nodes types
}

View File

@ -32,6 +32,7 @@ namespace skyline::gpu::interconnect {
u32 gpuAddressHigh;
};
};
u32 widthBytes; //!< The width in bytes for linear textures
GuestTexture guest;
std::optional<TextureView> view;
@ -74,6 +75,9 @@ namespace skyline::gpu::interconnect {
void SetRenderTargetWidth(size_t index, u32 value) {
auto &renderTarget{renderTargets.at(index)};
renderTarget.widthBytes = value;
if (renderTarget.guest.tileConfig.mode == texture::TileMode::Linear && renderTarget.guest.format)
value /= renderTarget.guest.format->bpb; // Width is in bytes rather than format units for linear textures
renderTarget.guest.dimensions.width = value;
renderTarget.view.reset();
}
@ -134,6 +138,10 @@ namespace skyline::gpu::interconnect {
throw exception("Cannot translate the supplied RT format: 0x{:X}", static_cast<u32>(format));
}
}();
if (renderTarget.guest.tileConfig.mode == texture::TileMode::Linear && renderTarget.guest.format)
renderTarget.guest.dimensions.width = renderTarget.widthBytes / renderTarget.guest.format->bpb;
renderTarget.disabled = !renderTarget.guest.format;
renderTarget.view.reset();
}
@ -142,8 +150,17 @@ namespace skyline::gpu::interconnect {
auto &renderTarget{renderTargets.at(index)};
auto &config{renderTarget.guest.tileConfig};
if (mode.isLinear) {
if (config.mode != texture::TileMode::Linear && renderTarget.guest.format) {
// Width is provided in bytes rather than format units for linear textures
renderTarget.widthBytes = renderTarget.guest.dimensions.width;
renderTarget.guest.dimensions.width /= renderTarget.guest.format->bpb;
}
config.mode = texture::TileMode::Linear;
} else [[likely]] {
if (config.mode == texture::TileMode::Linear && renderTarget.guest.format)
renderTarget.guest.dimensions.width = renderTarget.widthBytes;
config = texture::TileConfig{
.mode = texture::TileMode::Block,
.blockHeight = static_cast<u8>(1U << mode.blockHeightLog2),

View File

@ -78,7 +78,7 @@ namespace skyline::gpu::memory {
std::shared_ptr<StagingBuffer> MemoryManager::AllocateStagingBuffer(vk::DeviceSize size) {
vk::BufferCreateInfo bufferCreateInfo{
.size = size,
.usage = vk::BufferUsageFlagBits::eTransferSrc,
.usage = vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eTransferDst,
.sharingMode = vk::SharingMode::eExclusive,
.queueFamilyIndexCount = 1,
.pQueueFamilyIndices = &gpu.vkQueueFamilyIndex,
@ -112,7 +112,7 @@ namespace skyline::gpu::memory {
Image MemoryManager::AllocateMappedImage(const vk::ImageCreateInfo &createInfo) {
VmaAllocationCreateInfo allocationCreateInfo{
.usage = VMA_MEMORY_USAGE_UNKNOWN,
.memoryTypeBits = static_cast<u32>(vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eDeviceLocal),
.requiredFlags = static_cast<VkMemoryPropertyFlags>(vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eDeviceLocal),
};
VkImage image;

View File

@ -484,6 +484,10 @@ namespace skyline::gpu {
cycle = lCycle;
}
/**
 * @brief Calls WaitOnFence prior to any members of the texture being destroyed
 */
Texture::~Texture() {
    this->WaitOnFence();
}
TextureView::TextureView(std::shared_ptr<Texture> backing, vk::ImageViewType type, vk::ImageSubresourceRange range, texture::Format format, vk::ComponentMapping mapping) : backing(std::move(backing)), type(type), format(format), mapping(mapping), range(range) {}
vk::ImageView TextureView::GetView() {

View File

@ -371,6 +371,8 @@ namespace skyline::gpu {
*/
Texture(GPU &gpu, texture::Dimensions dimensions, texture::Format format, vk::ImageLayout initialLayout = vk::ImageLayout::eGeneral, vk::ImageUsageFlags usage = {}, vk::ImageTiling tiling = vk::ImageTiling::eOptimal, u32 mipLevels = 1, u32 layerCount = 1, vk::SampleCountFlagBits sampleCount = vk::SampleCountFlagBits::e1);
~Texture();
/**
* @note The handle returned is nullable and the appropriate precautions should be taken
*/

View File

@ -32,14 +32,8 @@ namespace skyline::kernel::type {
if (all) {
for (const auto &thread : threads)
thread->Kill(join);
} else {
std::shared_ptr<KThread> thread;
try {
thread = threads.at(0);
} catch (const std::out_of_range &) {
return;
}
thread->Kill(join);
} else if (!threads.empty()) {
threads[0]->Kill(join);
}
}
@ -101,6 +95,11 @@ namespace skyline::kernel::type {
return std::nullopt;
}
/**
 * @brief Removes every entry from the handle table of the process
 */
void KProcess::ClearHandleTable() {
    // Clearing mutates the table so the mutex must be held exclusively, a shared lock can be held by several threads at once and would not exclude them from accessing the table while it is modified
    std::unique_lock lock(handleMutex);
    handles.clear();
}
constexpr u32 HandleWaitersBit{1UL << 30}; //!< A bit which denotes if a mutex pseudo-handle has waiters or not
Result KProcess::MutexLock(u32 *mutex, KHandle ownerHandle, KHandle tag) {

View File

@ -201,6 +201,12 @@ namespace skyline {
handles.at(handle - constant::BaseHandleIndex) = nullptr;
}
/**
* @brief Clear the process handle table
* @note A handle created prior to clearing must not be retrieved after this is run
*/
void ClearHandleTable();
/**
* @brief Locks the mutex at the specified address
* @param ownerHandle The pseudo-handle of the current mutex owner

View File

@ -8,6 +8,13 @@
namespace skyline::service::am {
ISelfController::ISelfController(const DeviceState &state, ServiceManager &manager) : libraryAppletLaunchableEvent(std::make_shared<type::KEvent>(state, false)), accumulatedSuspendedTickChangedEvent(std::make_shared<type::KEvent>(state, false)), hosbinder(manager.CreateOrGetService<hosbinder::IHOSBinderDriver>("dispdrv")), BaseService(state, manager) {}
/**
 * @brief Kills the process without joining when the calling thread has a non-zero ID, then jumps back to the original context of the calling thread
 * @note The trailing return is never reached as std::longjmp does not return
 */
Result ISelfController::Exit(type::KSession &session, ipc::IpcRequest &request, ipc::IpcResponse &response) {
    auto &thread{state.thread};

    if (thread->id)
        state.process->Kill(false);

    std::longjmp(thread->originalCtx, true);
    return {};
}
// Performs no work and ignores all of its arguments, it only returns a default-constructed Result
Result ISelfController::LockExit(type::KSession &session, ipc::IpcRequest &request, ipc::IpcResponse &response) {
return {};
}

View File

@ -23,6 +23,12 @@ namespace skyline::service::am {
public:
ISelfController(const DeviceState &state, ServiceManager &manager);
/**
* @brief Exits the current applet
* @url https://switchbrew.org/wiki/Applet_Manager_services#Exit
*/
Result Exit(type::KSession &session, ipc::IpcRequest &request, ipc::IpcResponse &response);
/**
* @brief Function prevents the running application from being quit via the home button
* @url https://switchbrew.org/wiki/Applet_Manager_services#LockExit
@ -90,6 +96,7 @@ namespace skyline::service::am {
Result GetAccumulatedSuspendedTickChangedEvent(type::KSession &session, ipc::IpcRequest &request, ipc::IpcResponse &response);
SERVICE_DECL(
SFUNC(0x0, ISelfController, Exit),
SFUNC(0x1, ISelfController, LockExit),
SFUNC(0x2, ISelfController, UnlockExit),
SFUNC(0x9, ISelfController, GetLibraryAppletLaunchableEvent),