Implement Asynchronous Presentation

We currently present all frames synchronously on the thread that calls into SurfaceFlinger functions. This is suboptimal as it doesn't match guest behavior, which can delay the guest from working on the next frame. This commit makes queuing up frames non-blocking and handles all waiting and then presenting of the frame on a dedicated thread.
This commit is contained in:
PixelyIon 2022-06-09 22:16:15 +05:30
parent 6e09dc5204
commit 3139889a09
No known key found for this signature in database
GPG Key ID: 11BC6C3201BC2C05
4 changed files with 213 additions and 121 deletions

View File

@ -7,6 +7,7 @@
#include <common/signal.h>
#include <jvm.h>
#include <gpu.h>
#include <soc.h>
#include <loader/loader.h>
#include <kernel/types/KProcess.h>
#include "presentation_engine.h"
@ -21,12 +22,13 @@ namespace skyline::gpu {
using namespace service::hosbinder;
PresentationEngine::PresentationEngine(const DeviceState &state, GPU &gpu)
: state(state),
gpu(gpu),
acquireFence(gpu.vkDevice, vk::FenceCreateInfo{}),
presentationTrack(static_cast<u64>(trace::TrackIds::Presentation), perfetto::ProcessTrack::Current()),
choreographerThread(&PresentationEngine::ChoreographerThread, this),
vsyncEvent(std::make_shared<kernel::type::KEvent>(state, true)) {
: state{state},
gpu{gpu},
acquireFence{gpu.vkDevice, vk::FenceCreateInfo{}},
presentationTrack{static_cast<u64>(trace::TrackIds::Presentation), perfetto::ProcessTrack::Current()},
choreographerThread{&PresentationEngine::ChoreographerThread, this},
presentationThread{&PresentationEngine::PresentationThread, this},
vsyncEvent{std::make_shared<kernel::type::KEvent>(state, true)} {
auto desc{presentationTrack.Serialize()};
desc.set_name("Presentation");
perfetto::TrackEvent::SetTrackDescriptor(presentationTrack, desc);
@ -88,6 +90,158 @@ namespace skyline::gpu {
}
}
/**
 * @brief Presents a single queued frame: waits for a surface and the frame's fence, applies any changed window state, copies the texture into a swapchain image, presents it and updates the frame time statistics
 * @param frame The frame to present, its texture is locked for the remainder of the function
 * @note This locks 'PresentationEngine::mutex' for its entire duration, including the wait on the frame's fence
 */
void PresentationEngine::PresentFrame(const PresentableFrame &frame) {
    std::unique_lock lock(mutex);
    surfaceCondition.wait(lock, [this]() { return vkSurface.has_value(); });

    frame.fence.Wait(state.soc->host1x);

    std::scoped_lock textureLock(*frame.texture);
    if (frame.texture->format != swapchainFormat || frame.texture->dimensions != swapchainExtent)
        UpdateSwapchain(frame.texture->format, frame.texture->dimensions);

    // Window state is only pushed to the native window when it differs from the last submitted value
    int result;
    if (frame.crop && frame.crop != windowCrop) {
        if ((result = window->perform(window, NATIVE_WINDOW_SET_CROP, &frame.crop)))
            throw exception("Setting the layer crop to ({}-{})x({}-{}) failed with {}", frame.crop.left, frame.crop.right, frame.crop.top, frame.crop.bottom, result);
        windowCrop = frame.crop;
    }

    if (frame.scalingMode != NativeWindowScalingMode::Freeze && windowScalingMode != frame.scalingMode) {
        if ((result = window->perform(window, NATIVE_WINDOW_SET_SCALING_MODE, static_cast<i32>(frame.scalingMode))))
            throw exception("Setting the layer scaling mode to '{}' failed with {}", ToString(frame.scalingMode), result);
        windowScalingMode = frame.scalingMode;
    }

    if (frame.transform != windowTransform) {
        if ((result = window->perform(window, NATIVE_WINDOW_SET_BUFFERS_TRANSFORM, static_cast<i32>(frame.transform))))
            throw exception("Setting the buffer transform to '{}' failed with {}", ToString(frame.transform), result);
        windowTransform = frame.transform;
    }

    gpu.vkDevice.resetFences(*acquireFence);

    std::pair<vk::Result, u32> nextImage;
    while (nextImage = vkSwapchain->acquireNextImage(std::numeric_limits<u64>::max(), {}, *acquireFence), nextImage.first != vk::Result::eSuccess) [[unlikely]] {
        if (nextImage.first == vk::Result::eSuboptimalKHR)
            surfaceCondition.wait(lock, [this]() { return vkSurface.has_value(); });
        else
            throw exception("vkAcquireNextImageKHR returned an unhandled result '{}'", vk::to_string(nextImage.first));
    }

    std::ignore = gpu.vkDevice.waitForFences(*acquireFence, true, std::numeric_limits<u64>::max());

    frame.texture->SynchronizeHost();
    images.at(nextImage.second)->CopyFrom(frame.texture, vk::ImageSubresourceRange{
        .aspectMask = vk::ImageAspectFlagBits::eColor,
        .levelCount = 1,
        .layerCount = 1,
    });

    auto getMonotonicNsNow{[]() -> i64 {
        timespec time;
        if (clock_gettime(CLOCK_MONOTONIC, &time))
            throw exception("Failed to clock_gettime with '{}'", strerror(errno));
        return (time.tv_sec * constant::NsInSecond) + time.tv_nsec;
    }};

    i64 timestamp{frame.timestamp};
    if (timestamp) {
        // If the timestamp is specified, we need to convert it from the util::GetTimeNs base to the CLOCK_MONOTONIC one
        // We do so by getting an offset from the current time in nanoseconds and then adding it to the current time in CLOCK_MONOTONIC
        // Note: It's important we do this right before present as going past the timestamp could lead to fewer Binder IPC calls
        i64 current{util::GetTimeNs()};
        if (current < timestamp) {
            timestamp = getMonotonicNsNow() + (timestamp - current);
        } else {
            timestamp = 0;
        }
    }

    if (frame.swapInterval) {
        // If we have a swap interval, we have to adjust the timestamp to emulate the swap interval
        i64 lastFramePresentTime{util::AlignUpNpot(windowLastTimestamp, refreshCycleDuration)};
        if (lastFramePresentTime > lastChoreographerTime)
            // If the last frame was presented after the last choreographer callback, calculate the new frame's timestamp relative to it
            timestamp = std::max(timestamp, lastFramePresentTime + (refreshCycleDuration * frame.swapInterval));
        else
            // If there has been a choreographer callback since the last frame, calculate the new frame's timestamp relative to it
            timestamp = std::max(timestamp, lastChoreographerTime + (2 * refreshCycleDuration * frame.swapInterval));
    }

    i64 lastTimestamp{std::exchange(windowLastTimestamp, timestamp)};
    if (!timestamp && lastTimestamp)
        // We need to nullify the timestamp if it transitioned from being specified (non-zero) to unspecified (zero)
        timestamp = NativeWindowTimestampAuto;

    if (timestamp && (result = window->perform(window, NATIVE_WINDOW_SET_BUFFERS_TIMESTAMP, timestamp)))
        throw exception("Setting the buffer timestamp to {} failed with {}", timestamp, result);

    // The frame ID is only queried to validate that the call succeeds, its value isn't used any further here
    u64 frameId{};
    if ((result = window->perform(window, NATIVE_WINDOW_GET_NEXT_FRAME_ID, &frameId)))
        throw exception("Retrieving the next frame's ID failed with {}", result);

    {
        std::scoped_lock queueLock{gpu.queueMutex};
        std::ignore = gpu.vkQueue.presentKHR(vk::PresentInfoKHR{
            .swapchainCount = 1,
            .pSwapchains = &**vkSwapchain,
            .pImageIndices = &nextImage.second,
        }); // We don't care about suboptimal images as they are caused by not respecting the transform hint, we handle transformations externally
    }

    // The frame time statistics need a real point in time: an unspecified (zero) timestamp and the NativeWindowTimestampAuto sentinel
    // are both substituted with the current time, otherwise the sentinel would be treated as a time and corrupt this and the next sample
    if (!timestamp || timestamp == NativeWindowTimestampAuto)
        timestamp = getMonotonicNsNow();

    if (frameTimestamp) {
        i64 sampleWeight{Fps ? Fps : 1}; //!< The weight of each sample in calculating the average, we want to roughly average the past second

        auto weightedAverage{[](auto weight, auto previousAverage, auto current) {
            return (((weight - 1) * previousAverage) + current) / weight;
        }}; //!< Modified moving average (https://en.wikipedia.org/wiki/Moving_average#Modified_moving_average)

        i64 currentFrametime{timestamp - frameTimestamp};
        averageFrametimeNs = weightedAverage(sampleWeight, averageFrametimeNs, currentFrametime);
        AverageFrametimeMs = static_cast<jfloat>(averageFrametimeNs) / constant::NsInMillisecond;

        i64 currentFrametimeDeviation{std::abs(averageFrametimeNs - currentFrametime)};
        averageFrametimeDeviationNs = weightedAverage(sampleWeight, averageFrametimeDeviationNs, currentFrametimeDeviation);
        AverageFrametimeDeviationMs = static_cast<jfloat>(averageFrametimeDeviationNs) / constant::NsInMillisecond;

        // A non-positive average has no meaningful rate and dividing by zero would make the conversion to an integer undefined, report 0 FPS instead
        if (averageFrametimeNs > 0)
            Fps = static_cast<jint>(std::round(static_cast<float>(constant::NsInSecond) / static_cast<float>(averageFrametimeNs)));
        else
            Fps = 0;

        TRACE_EVENT_INSTANT("gpu", "Present", presentationTrack, "FrameTimeNs", currentFrametime, "Fps", Fps);

        frameTimestamp = timestamp;
    } else {
        // The first presented frame only establishes the baseline for the next frame's delta
        frameTimestamp = timestamp;
    }
}
void PresentationEngine::PresentationThread() {
if (int result{pthread_setname_np(pthread_self(), "Sky-Present")})
Logger::Warn("Failed to set the thread name: {}", strerror(result));
try {
signal::SetSignalHandler({SIGINT, SIGILL, SIGTRAP, SIGBUS, SIGFPE, SIGSEGV}, signal::ExceptionalSignalHandler);
presentQueue.Process([this](const PresentableFrame& frame) {
PresentFrame(frame);
frame.presentCallback(); // We're calling the callback here as it's outside of all the locks in PresentFrame
});
} catch (const signal::SignalException &e) {
Logger::Error("{}\nStack Trace:{}", e.what(), state.loader->GetStackTrace(e.frames));
if (state.process)
state.process->Kill(false);
else
std::rethrow_exception(std::current_exception());
} catch (const std::exception &e) {
Logger::Error(e.what());
if (state.process)
state.process->Kill(false);
else
std::rethrow_exception(std::current_exception());
}
}
NativeWindowTransform GetAndroidTransform(vk::SurfaceTransformFlagBitsKHR transform) {
using NativeWindowTransform = NativeWindowTransform;
switch (transform) {
@ -219,115 +373,23 @@ namespace skyline::gpu {
}
}
void PresentationEngine::Present(const std::shared_ptr<Texture> &texture, i64 timestamp, u64 swapInterval, AndroidRect crop, NativeWindowScalingMode scalingMode, NativeWindowTransform transform, u64 &frameId) {
u64 PresentationEngine::Present(const std::shared_ptr<Texture> &texture, i64 timestamp, i64 swapInterval, AndroidRect crop, NativeWindowScalingMode scalingMode, NativeWindowTransform transform, skyline::service::hosbinder::AndroidFence fence, const std::function<void()>& presentCallback) {
std::unique_lock lock(mutex);
surfaceCondition.wait(lock, [this]() { return vkSurface.has_value(); });
std::scoped_lock textureLock(*texture);
if (texture->format != swapchainFormat || texture->dimensions != swapchainExtent)
UpdateSwapchain(texture->format, texture->dimensions);
int result;
if (crop && crop != windowCrop) {
if ((result = window->perform(window, NATIVE_WINDOW_SET_CROP, &crop)))
throw exception("Setting the layer crop to ({}-{})x({}-{}) failed with {}", crop.left, crop.right, crop.top, crop.bottom, result);
windowCrop = crop;
}
if (scalingMode != NativeWindowScalingMode::Freeze && windowScalingMode != scalingMode) {
if ((result = window->perform(window, NATIVE_WINDOW_SET_SCALING_MODE, static_cast<i32>(scalingMode))))
throw exception("Setting the layer scaling mode to '{}' failed with {}", ToString(scalingMode), result);
windowScalingMode = scalingMode;
}
if (transform != windowTransform) {
if ((result = window->perform(window, NATIVE_WINDOW_SET_BUFFERS_TRANSFORM, static_cast<i32>(transform))))
throw exception("Setting the buffer transform to '{}' failed with {}", ToString(transform), result);
windowTransform = transform;
}
gpu.vkDevice.resetFences(*acquireFence);
std::pair<vk::Result, u32> nextImage;
while (nextImage = vkSwapchain->acquireNextImage(std::numeric_limits<u64>::max(), {}, *acquireFence), nextImage.first != vk::Result::eSuccess) [[unlikely]] {
if (nextImage.first == vk::Result::eSuboptimalKHR)
surfaceCondition.wait(lock, [this]() { return vkSurface.has_value(); });
else
throw exception("vkAcquireNextImageKHR returned an unhandled result '{}'", vk::to_string(nextImage.first));
}
std::ignore = gpu.vkDevice.waitForFences(*acquireFence, true, std::numeric_limits<u64>::max());
texture->SynchronizeHost();
images.at(nextImage.second)->CopyFrom(texture, vk::ImageSubresourceRange{
.aspectMask = vk::ImageAspectFlagBits::eColor,
.levelCount = 1,
.layerCount = 1,
presentQueue.Push(PresentableFrame{
texture,
fence,
timestamp,
swapInterval,
presentCallback,
nextFrameId,
crop,
scalingMode,
transform
});
if (timestamp) {
// If the timestamp is specified, we need to convert it from the util::GetTimeNs base to the CLOCK_MONOTONIC one
// We do so by getting an offset from the current time in nanoseconds and then adding it to the current time in CLOCK_MONOTONIC
// Note: It's important we do this right before present as going past the timestamp could lead to fewer Binder IPC calls
i64 current{util::GetTimeNs()};
if (current < timestamp) {
timespec time;
if (clock_gettime(CLOCK_MONOTONIC, &time))
throw exception("Failed to clock_gettime with '{}'", strerror(errno));
timestamp = ((time.tv_sec * constant::NsInSecond) + time.tv_nsec) + (timestamp - current);
} else {
timestamp = 0;
}
}
if (swapInterval > 1)
// If we have a swap interval above 1 we have to adjust the timestamp to emulate the swap interval
timestamp = std::max(timestamp, lastChoreographerTime + (refreshCycleDuration * static_cast<i64>(swapInterval) * 2));
auto lastTimestamp{std::exchange(windowLastTimestamp, timestamp)};
if (!timestamp && lastTimestamp)
// We need to nullify the timestamp if it transitioned from being specified (non-zero) to unspecified (zero)
timestamp = NativeWindowTimestampAuto;
if (timestamp && (result = window->perform(window, NATIVE_WINDOW_SET_BUFFERS_TIMESTAMP, timestamp)))
throw exception("Setting the buffer timestamp to {} failed with {}", timestamp, result);
if ((result = window->perform(window, NATIVE_WINDOW_GET_NEXT_FRAME_ID, &frameId)))
throw exception("Retrieving the next frame's ID failed with {}", result);
{
std::scoped_lock queueLock{gpu.queueMutex};
std::ignore = gpu.vkQueue.presentKHR(vk::PresentInfoKHR{
.swapchainCount = 1,
.pSwapchains = &**vkSwapchain,
.pImageIndices = &nextImage.second,
}); // We don't care about suboptimal images as they are caused by not respecting the transform hint, we handle transformations externally
}
if (frameTimestamp) {
i64 now{util::GetTimeNs()};
i64 sampleWeight{swapInterval ? constant::NsInSecond / (refreshCycleDuration * static_cast<i64>(swapInterval)) : 10}; //!< The weight of each sample in calculating the average, we arbitrarily average 10 samples for unlocked FPS
auto weightedAverage{[](auto weight, auto previousAverage, auto current) {
return (((weight - 1) * previousAverage) + current) / weight;
}}; //!< Modified moving average (https://en.wikipedia.org/wiki/Moving_average#Modified_moving_average)
i64 currentFrametime{now - frameTimestamp};
averageFrametimeNs = weightedAverage(sampleWeight, averageFrametimeNs, currentFrametime);
AverageFrametimeMs = static_cast<jfloat>(averageFrametimeNs) / constant::NsInMillisecond;
i64 currentFrametimeDeviation{std::abs(averageFrametimeNs - currentFrametime)};
averageFrametimeDeviationNs = weightedAverage(sampleWeight, averageFrametimeDeviationNs, currentFrametimeDeviation);
AverageFrametimeDeviationMs = static_cast<jfloat>(averageFrametimeDeviationNs) / constant::NsInMillisecond;
Fps = static_cast<jint>(std::round(static_cast<float>(constant::NsInSecond) / static_cast<float>(averageFrametimeNs)));
TRACE_EVENT_INSTANT("gpu", "Present", presentationTrack, "FrameTimeNs", now - frameTimestamp, "Fps", Fps);
frameTimestamp = now;
} else {
frameTimestamp = util::GetTimeNs();
}
return nextFrameId++;
}
NativeWindowTransform PresentationEngine::GetTransformHint() {

View File

@ -6,6 +6,7 @@
#include <jni.h>
#include <android/looper.h>
#include <common/trace.h>
#include <common/circular_queue.h>
#include <kernel/types/KEvent.h>
#include <services/hosbinder/GraphicBufferProducer.h>
#include "texture/texture.h"
@ -28,7 +29,7 @@ namespace skyline::gpu {
service::hosbinder::AndroidRect windowCrop{}; //!< A rectangle with the bounds of the current crop performed on the image prior to presentation
service::hosbinder::NativeWindowScalingMode windowScalingMode{service::hosbinder::NativeWindowScalingMode::ScaleToWindow}; //!< The mode in which the cropped image is scaled up to the surface
service::hosbinder::NativeWindowTransform windowTransform{}; //!< The transformation performed on the image prior to presentation
u64 windowLastTimestamp{}; //!< The last timestamp submitted to the window, 0 or CLOCK_MONOTONIC value
i64 windowLastTimestamp{}; //!< The last timestamp submitted to the window, 0 or CLOCK_MONOTONIC value
std::optional<vk::raii::SurfaceKHR> vkSurface; //!< The Vulkan Surface object that is backed by ANativeWindow
vk::SurfaceCapabilitiesKHR vkSurfaceCapabilities; //!< The capabilities of the current Vulkan Surface
@ -52,6 +53,24 @@ namespace skyline::gpu {
i64 refreshCycleDuration{}; //!< The duration of a single refresh cycle for the display in nanoseconds
bool choreographerStop{}; //!< If the Choreographer thread should stop on the next ALooper_wake()
/**
 * @brief All the state required to present a single frame, instances are pushed onto the presentation queue and consumed by PresentFrame
 * @note The member order matters as frames are constructed with positional aggregate initialization
 */
struct PresentableFrame {
std::shared_ptr<Texture> texture{}; //!< The texture that is copied into the swapchain image for this frame
skyline::service::hosbinder::AndroidFence fence{}; //!< The fence that must be waited on prior to using the texture
i64 timestamp{}; //!< The earliest timestamp (relative to ARM CPU timer) that this frame must be presented at
i64 swapInterval{}; //!< The interval between frames in terms of 60Hz display refreshes (1/60th of a second)
std::function<void()> presentCallback; //!< A user-defined callback to use after presenting a frame
size_t id{}; //!< The ID of this frame, it is used to correlate the frame in other operations
service::hosbinder::AndroidRect crop{}; //!< A rectangle with bounds that the image will be cropped to
service::hosbinder::NativeWindowScalingMode scalingMode{}; //!< The mode by which the image must be scaled up to the surface
service::hosbinder::NativeWindowTransform transform{}; //!< A transformation that should be performed on the image
};
std::thread presentationThread; //!< A thread for asynchronously presenting queued frames after their corresponding fences are signalled
static constexpr size_t PresentQueueFrameCount{5}; //!< The amount of frames the presentation queue can hold
CircularQueue<PresentableFrame> presentQueue{PresentQueueFrameCount}; //!< A circular queue containing all the frames that we can present
size_t nextFrameId{1}; //!< The frame ID to use for the next frame, it is returned and then incremented by Present
/**
* @url https://developer.android.com/ndk/reference/group/choreographer#achoreographer_postframecallback64
*/
@ -62,6 +81,16 @@ namespace skyline::gpu {
*/
void ChoreographerThread();
/**
* @brief Submits a single frame to the host API for presentation with the appropriate waits and copies
*/
void PresentFrame(const PresentableFrame& frame);
/**
* @brief The thread that handles presentation of frames submitted to it
*/
void PresentationThread();
/**
* @note 'PresentationEngine::mutex' **must** be locked prior to calling this
*/
@ -86,10 +115,12 @@ namespace skyline::gpu {
* @param crop A rectangle with bounds that the image will be cropped to
* @param scalingMode The mode by which the image must be scaled up to the surface
* @param transform A transformation that should be performed on the image
* @param frameId The ID of this frame for correlating it with presentation timing readouts
* @param fence The fence to wait on prior to presenting the texture
* @param presentCallback The callback to be called when the texture is presented to the surface
* @return The ID of this frame for correlating it with presentation timing readouts
* @note The texture **must** be locked prior to calling this
*/
void Present(const std::shared_ptr<Texture> &texture, i64 timestamp, u64 swapInterval, service::hosbinder::AndroidRect crop, service::hosbinder::NativeWindowScalingMode scalingMode, service::hosbinder::NativeWindowTransform transform, u64 &frameId);
u64 Present(const std::shared_ptr<Texture> &texture, i64 timestamp, i64 swapInterval, service::hosbinder::AndroidRect crop, service::hosbinder::NativeWindowScalingMode scalingMode, service::hosbinder::NativeWindowTransform transform, skyline::service::hosbinder::AndroidFence fence, const std::function<void()>& presentCallback);
/**
* @return A transform that the application should render with to elide costly transforms later

View File

@ -384,16 +384,15 @@ namespace skyline::service::hosbinder {
throw exception("Application attempting to perform unknown sticky transformation: {:#b}", static_cast<u32>(stickyTransform));
}
fence.Wait(state.soc->host1x);
state.gpu->presentation.Present(buffer.texture, isAutoTimestamp ? 0 : timestamp, swapInterval, crop, scalingMode, transform, fence, [this, &buffer] {
std::scoped_lock lock(mutex);
{
u64 frameId;
state.gpu->presentation.Present(buffer.texture, isAutoTimestamp ? 0 : timestamp, swapInterval, crop, scalingMode, transform, frameId);
}
buffer.state = BufferState::Free;
bufferEvent->Signal();
});
buffer.state = BufferState::Queued;
buffer.frameNumber = ++frameNumber;
buffer.state = BufferState::Free;
bufferEvent->Signal();
width = defaultWidth;
height = defaultHeight;

View File

@ -82,7 +82,7 @@ namespace skyline::service::hosbinder {
/**
* @return If the rectangle had any defined bounds
*/
constexpr operator bool() {
constexpr operator bool() const {
return left || top || right || bottom;
}